From 7c68757561c595a5966da718f9bc0bffcd84a373 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Thu, 1 Sep 2016 08:43:48 -0700 Subject: [PATCH 2/2] grep: avoid code duplication with -iF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * NEWS: Simplify description of -iF improvement. * src/dfa.c: Do not include wctype.h. (lonesome_lower, case_folded_counterparts): Move to localeinfo.c. (CASE_FOLDED_BUFSIZE): Move to localeinfo.h. * src/grep.c: Do not include wctype.h. (lonesome_lower): Remove. (fgrep_icase_available): Use case_folded_counterparts instead. Do not call it for the same character twice. Return false on wcrtomb failures (which should never happen). (fgrep_to_grep_pattern, main): Simplify. Let fgrep_to_grep’s caller fiddle with the global variables. * src/localeinfo.c: Include (lonesome_lower, case_folded_counterparts): Move here from src/dfa.c. Return int, not unsigned int. Verify that CASE_FOLDED_BUFSIZE is big enough. * src/localeinfo.h (CASE_FOLDED_BUFSIZE): Now 32, so that we don’t expose lonesome_lower’s size. * src/searchutils.c (kwsinit): Return new kwset instead of storing it via a pointer. All callers changed. Simplify a bit. --- NEWS | 9 ++-- src/dfa.c | 48 ------------------- src/dfasearch.c | 2 +- src/grep.c | 135 +++++++++++++++++++----------------------------------- src/kwsearch.c | 2 +- src/localeinfo.c | 47 +++++++++++++++++++ src/localeinfo.h | 7 +++ src/search.h | 2 +- src/searchutils.c | 24 ++++------ 9 files changed, 115 insertions(+), 161 deletions(-) diff --git a/NEWS b/NEWS index 65e663d..91f1bfc 100644 --- a/NEWS +++ b/NEWS @@ -10,18 +10,15 @@ GNU grep NEWS -*- outline -*- as it now uses the Aho-Corasick algorithm instead of the Commentz-Walter algorithm in that case. + grep -iF is typically much faster in a multibyte locale, if the + pattern and its case counterparts contain only single byte characters. + In multibyte locales that are not UTF-8, grep now handles leading "." in patterns more efficiently. grep now prints a "FILENAME:LINENO: " prefix when diagnosing an invalid regular expression that was read from an '-f'-specified file. - grep -F now tries fgrep matcher for case insensitive matching in - multibyte locale. It improves performance for a case which a pattern - is composed of only single byte characters and their all counterparts - are also single byte characters and the pattern does not have invalid - sequences. - * Noteworthy changes in release 2.25 (2016-04-21) [stable] diff --git a/src/dfa.c b/src/dfa.c index cc96740..0f5109e 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -50,7 +50,6 @@ #define _(str) gettext (str) #include -#include /* HPUX defines these as macros in sys/param.h. */ #ifdef setbit @@ -857,53 +856,6 @@ using_simple_locale (bool multibyte) # define MIN(a,b) ((a) < (b) ? (a) : (b)) #endif -/* The set of wchar_t values C such that there's a useful locale - somewhere where C != towupper (C) && C != towlower (towupper (C)). - For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because - towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and - towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */ -static short const lonesome_lower[] = - { - 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345, - 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1, - - /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase - counterpart in locales predating Unicode 4.0.0 (April 2003). */ - 0x03F2, - - 0x03F5, 0x1E9B, 0x1FBE, - }; - -/* Maximum number of characters that can be the case-folded - counterparts of a single character, not counting the character - itself. This is 1 for towupper, 1 for towlower, and 1 for each - entry in LONESOME_LOWER. */ -enum -{ CASE_FOLDED_BUFSIZE = 2 + sizeof lonesome_lower / sizeof *lonesome_lower }; - -/* Find the characters equal to C after case-folding, other than C - itself, and store them into FOLDED. Return the number of characters - stored. */ -static unsigned int -case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE]) -{ - unsigned int i; - unsigned int n = 0; - wint_t uc = towupper (c); - wint_t lc = towlower (uc); - if (uc != c) - folded[n++] = uc; - if (lc != uc && lc != c && towupper (lc) == uc) - folded[n++] = lc; - for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++) - { - wint_t li = lonesome_lower[i]; - if (li != lc && li != uc && li != c && towupper (li) == uc) - folded[n++] = li; - } - return n; -} - typedef int predicate (int); /* The following list maps the names of the Posix named character classes diff --git a/src/dfasearch.c b/src/dfasearch.c index 548ef08..61b1f70 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -89,7 +89,7 @@ kwsmusts (void) struct dfamust *dm = dfamust (dfa); if (!dm) return; - kwsinit (&kwset, false); + kwset = kwsinit (false); if (dm->exact) { /* Prepare a substring whose presence implies a match. diff --git a/src/grep.c b/src/grep.c index ae3b6e7..15c6dc6 100644 --- a/src/grep.c +++ b/src/grep.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -2265,91 +2264,53 @@ contains_encoding_error (char const *pat, size_t patlen) return false; } -/* The set of wchar_t values C such that there's a useful locale - somewhere where C != towupper (C) && C != towlower (towupper (C)). - For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because - towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and - towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */ -static short const lonesome_lower[] = - { - 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345, - 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1, - - /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase - counterpart in locales predating Unicode 4.0.0 (April 2003). */ - 0x03F2, - - 0x03F5, 0x1E9B, 0x1FBE - }; +/* Return true if the fgrep pattern PAT, of size PATLEN, matches only + single-byte characters including case folding, and so is suitable + to be converted to a grep pattern. */ static bool fgrep_icase_available (char const *pat, size_t patlen) { - for (size_t i = 0; i < patlen; ++i) - { - unsigned char c = pat[i]; - if (localeinfo.sbclen[c] > 1) - return false; - } + bool used[UCHAR_MAX + 1] = { 0, }; - for (size_t i = 0; i < patlen; ++i) + for (size_t i = 0; i < patlen; i++) { unsigned char c = pat[i]; - - wint_t wc = localeinfo.sbctowc[c]; - if (wc == WEOF) + if (localeinfo.sbctowc[c] == WEOF) return false; + used[c] = true; + } - wint_t uwc = towupper (wc); - if (uwc != wc) - { - char s[MB_LEN_MAX]; - mbstate_t mb_state = { 0 }; - size_t len = wcrtomb (s, uwc, &mb_state); - if (len > 1 && len != (size_t) -1) - return false; - } - - wint_t lwc = towlower (uwc); - if (lwc != uwc && lwc != wc && towupper (lwc) == uwc) - { - char s[MB_LEN_MAX]; - mbstate_t mb_state = { 0 }; - size_t len = wcrtomb (s, lwc, &mb_state); - if (len > 1 && len != (size_t) -1) - return false; - } + for (int c = 0; c <= UCHAR_MAX; c++) + if (used[c]) + { + wint_t wc = localeinfo.sbctowc[c]; + wchar_t folded[CASE_FOLDED_BUFSIZE]; + size_t nfolded = case_folded_counterparts (wc, folded); - for (size_t j = 0; lonesome_lower[j]; j++) - { - wint_t li = lonesome_lower[j]; - if (li != lwc && li != uwc && li != wc && towupper (li) == uwc) - { - char s[MB_LEN_MAX]; - mbstate_t mb_state = { 0 }; - size_t len = wcrtomb (s, li, &mb_state); - if (len > 1 && len != (size_t) -1) - return false; - } - } - } + for (size_t i = 0; i < nfolded; i++) + { + char s[MB_LEN_MAX]; + mbstate_t mb_state = { 0 }; + if (1 < wcrtomb (s, folded[i], &mb_state)) + return false; + } + } return true; } -/* Change a pattern for fgrep into grep. */ +/* Change the pattern *KEYS_P, of size *LEN_P, from fgrep to grep style. */ + static void fgrep_to_grep_pattern (char **keys_p, size_t *len_p) { - char *keys, *new_keys, *p; + size_t len = *len_p; + char *keys = *keys_p; mbstate_t mb_state = { 0 }; - size_t len, n; - - len = *len_p; - keys = *keys_p; - - new_keys = xnmalloc (len + 1, 2); - p = new_keys; + char *new_keys = xnmalloc (len + 1, 2); + char *p = new_keys; + size_t n; for (; len; keys += n, len -= n) { @@ -2365,14 +2326,15 @@ fgrep_to_grep_pattern (char **keys_p, size_t *len_p) case (size_t) -1: memset (&mb_state, 0, sizeof mb_state); + n = 1; /* Fall through. */ case 1: - *p = '\\'; - p += strchr ("$*.[\\^", *keys) != NULL; - /* Fall through. */ - case 0: + switch (*keys) + { + case '$': case '*': case '.': case '[': case '\\': case '^': + *p++ = '\\'; break; + } *p++ = *keys; - n = 1; break; } } @@ -2380,10 +2342,6 @@ fgrep_to_grep_pattern (char **keys_p, size_t *len_p) free (*keys_p); *keys_p = new_keys; *len_p = p - new_keys; - - matcher = "grep"; - compile = Gcompile; - execute = EGexecute; } int @@ -2814,19 +2772,18 @@ main (int argc, char **argv) /* In a unibyte locale, switch from fgrep to grep if the pattern matches words (where grep is typically faster). In a multibyte locale, switch from fgrep to grep if either - (1) case is ignored (where grep is typically faster), or - (2) the pattern has an encoding error (where fgrep might not work). */ - if (compile == Fcompile) + (1) the pattern has an encoding error (where fgrep might not work), or + (2) case is ignored and a fast fgrep ignore-case is not available. */ + if (compile == Fcompile + && (MB_CUR_MAX <= 1 + ? match_words + : (contains_encoding_error (keys, keycc) + || (match_icase && !fgrep_icase_available (keys, keycc))))) { - if (MB_CUR_MAX > 1) - { - if (contains_encoding_error (keys, keycc)) - fgrep_to_grep_pattern (&keys, &keycc); - else if (match_icase && !fgrep_icase_available (keys, keycc)) - fgrep_to_grep_pattern (&keys, &keycc); - } - else if (match_words) - fgrep_to_grep_pattern (&keys, &keycc); + fgrep_to_grep_pattern (&keys, &keycc); + matcher = "grep"; + compile = Gcompile; + execute = EGexecute; } compile (keys, keycc); diff --git a/src/kwsearch.c b/src/kwsearch.c index 7fe08aa..29d140c 100644 --- a/src/kwsearch.c +++ b/src/kwsearch.c @@ -38,7 +38,7 @@ Fcompile (char const *pattern, size_t size) { size_t total = size; - kwsinit (&kwset, true); + kwset = kwsinit (true); char const *p = pattern; do diff --git a/src/localeinfo.c b/src/localeinfo.c index 329d431..ca96afc 100644 --- a/src/localeinfo.c +++ b/src/localeinfo.c @@ -29,6 +29,7 @@ #include #include #include +#include /* The sbclen implementation relies on this. */ verify (MB_LEN_MAX <= SCHAR_MAX); @@ -64,3 +65,49 @@ init_localeinfo (struct localeinfo *localeinfo) localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF; } } + +/* The set of wchar_t values C such that there's a useful locale + somewhere where C != towupper (C) && C != towlower (towupper (C)). + For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because + towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and + towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */ +static short const lonesome_lower[] = + { + 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345, + 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1, + + /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase + counterpart in locales predating Unicode 4.0.0 (April 2003). */ + 0x03F2, + + 0x03F5, 0x1E9B, 0x1FBE, + }; + +/* Verify that the worst case fits. This is 1 for towupper, 1 for + towlower, and 1 for each entry in LONESOME_LOWER. */ +verify (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower + <= CASE_FOLDED_BUFSIZE); + +/* Find the characters equal to C after case-folding, other than C + itself, and store them into FOLDED. Return the number of characters + stored. */ + +int +case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE]) +{ + int i; + int n = 0; + wint_t uc = towupper (c); + wint_t lc = towlower (uc); + if (uc != c) + folded[n++] = uc; + if (lc != uc && lc != c && towupper (lc) == uc) + folded[n++] = lc; + for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++) + { + wint_t li = lonesome_lower[i]; + if (li != lc && li != uc && li != c && towupper (li) == uc) + folded[n++] = li; + } + return n; +} diff --git a/src/localeinfo.h b/src/localeinfo.h index 70b55a8..cf2f9a6 100644 --- a/src/localeinfo.h +++ b/src/localeinfo.h @@ -45,3 +45,10 @@ struct localeinfo }; extern void init_localeinfo (struct localeinfo *); + +/* Maximum number of characters that can be the case-folded + counterparts of a single character, not counting the character + itself. This is a generous upper bound. */ +enum { CASE_FOLDED_BUFSIZE = 32 }; + +extern int case_folded_counterparts (wchar_t, wchar_t[CASE_FOLDED_BUFSIZE]); diff --git a/src/search.h b/src/search.h index 534a49e..fb4e5c8 100644 --- a/src/search.h +++ b/src/search.h @@ -47,7 +47,7 @@ _GL_INLINE_HEADER_BEGIN typedef signed char mb_len_map_t; /* searchutils.c */ -extern void kwsinit (kwset_t *, bool); +extern kwset_t kwsinit (bool); extern ptrdiff_t mb_goback (char const **, char const *, char const *); extern wint_t mb_prev_wc (char const *, char const *, char const *); extern wint_t mb_next_wc (char const *, char const *); diff --git a/src/searchutils.c b/src/searchutils.c index 87f51a4..73d6c1c 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -24,42 +24,36 @@ #define NCHAR (UCHAR_MAX + 1) -void -kwsinit (kwset_t *kwset, bool mb_trans) +kwset_t +kwsinit (bool mb_trans) { static char trans[NCHAR]; - int i; + char *transptr = NULL; if (match_icase && (MB_CUR_MAX == 1 || mb_trans)) { if (MB_CUR_MAX == 1) - for (i = 0; i < NCHAR; ++i) + for (int i = 0; i < NCHAR; i++) trans[i] = toupper (i); else - for (i = 0; i < NCHAR; ++i) + for (int i = 0; i < NCHAR; i++) { wint_t wc = localeinfo.sbctowc[i]; wint_t uwc = towupper (wc); if (uwc != wc) { - char s[MB_LEN_MAX]; mbstate_t mbs = { 0 }; - size_t len = wcrtomb (s, uwc, &mbs); - if (len > 1) + size_t len = wcrtomb (&trans[i], uwc, &mbs); + if (len != 1) abort (); - trans[i] = s[0]; } else trans[i] = i; } - - *kwset = kwsalloc (trans, false); + transptr = trans; } - else - *kwset = kwsalloc (NULL, false); - if (!*kwset) - xalloc_die (); + return kwsalloc (transptr, false); } /* In the buffer *MB_START, return the number of bytes needed to go -- 2.7.4