From 290ca116c9172d97b2b026951fac722d3bd3ced9 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Tue, 20 Dec 2016 18:04:39 -0800 Subject: [PATCH 3/3] grep: fix performance with multiple patterns Problem reported by Jaroslav Skarvada (Bug#22357). * NEWS: Document this and other recent performance fixes. * src/grep.c (E_MATCHER_INDEX): New constant. (all_single_byte_after_folding): New function, split out from fgrep_icase_available. (fgrep_icase_available): Use it. (try_fgrep_pattern): New function, which also uses it. (main): With two or more patterns, use try_fgrep_pattern to fix performance regression. The number "two" here is just a heuristic. --- NEWS | 11 ++++++ src/grep.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 123 insertions(+), 20 deletions(-) diff --git a/NEWS b/NEWS index aa0483b..2e7d1ae 100644 --- a/NEWS +++ b/NEWS @@ -8,6 +8,17 @@ GNU grep NEWS -*- outline -*- /proc file system and standard output is /dev/null. [bug introduced in grep-2.27] +** Bug fixes + + Fix performance regression with multiple patterns, e.g., for -Fi in + a multi-byte locale, or for -Fw in a single-byte locale. + [bugs introduced in grep-2.19, grep-2.22 and grep-2.26] + +** Improvements + + Improve performance for -E or -G pattern lists that are easily + converted to -F format. + * Noteworthy changes in release 2.27 (2016-12-06) [stable] diff --git a/src/grep.c b/src/grep.c index 574a380..f36654c 100644 --- a/src/grep.c +++ b/src/grep.c @@ -2032,7 +2032,7 @@ static struct { "perl", 0, Pcompile, Pexecute, }, }; /* Keep these in sync with the 'matchers' table. */ -enum { F_MATCHER_INDEX = 2, G_MATCHER_INDEX = 0 }; +enum { E_MATCHER_INDEX = 1, F_MATCHER_INDEX = 2, G_MATCHER_INDEX = 0 }; /* Return the index of the matcher corresponding to M if available. MATCHER is the index of the previous matcher, or -1 if none. @@ -2245,23 +2245,12 @@ contains_encoding_error (char const *pat, size_t patlen) return false; } -/* Return true if the fgrep pattern PAT, of size PATLEN, matches only - single-byte characters including case folding, and so is suitable - to be converted to a grep pattern. */ +/* Return true if the set of single-byte characters USED contains only + characters whose case counterparts are also single-byte. */ static bool -fgrep_icase_available (char const *pat, size_t patlen) +all_single_byte_after_folding (bool used[UCHAR_MAX + 1]) { - bool used[UCHAR_MAX + 1] = { 0, }; - - for (size_t i = 0; i < patlen; i++) - { - unsigned char c = pat[i]; - if (localeinfo.sbctowc[c] == WEOF) - return false; - used[c] = true; - } - for (int c = 0; c <= UCHAR_MAX; c++) if (used[c]) { @@ -2281,6 +2270,26 @@ fgrep_icase_available (char const *pat, size_t patlen) return true; } +/* Return true if the -F patterns PAT, of size PATLEN, match only + single-byte characters including case folding, and so can be + processed by the -F matcher even if -i is given. */ + +static bool +fgrep_icase_available (char const *pat, size_t patlen) +{ + bool used[UCHAR_MAX + 1] = { 0, }; + + for (size_t i = 0; i < patlen; i++) + { + unsigned char c = pat[i]; + if (localeinfo.sbctowc[c] == WEOF) + return false; + used[c] = true; + } + + return all_single_byte_after_folding (used); +} + /* Change the pattern *KEYS_P, of size *LEN_P, from fgrep to grep style. */ static void @@ -2325,6 +2334,84 @@ fgrep_to_grep_pattern (char **keys_p, size_t *len_p) *len_p = p - new_keys; } +/* If it is easy, convert the MATCHER-style patterns KEYS (of size + *LEN_P) to -F style, update *LEN_P to a possibly-smaller value, and + return F_MATCHER_INDEX. If not, leave KEYS and *LEN_P alone and + return MATCHER. This function is conservative and sometimes misses + conversions, e.g., it does not convert the -E pattern "(a|a|[aa])" + to the -F pattern "a". */ + +static int +try_fgrep_pattern (int matcher, char *keys, size_t *len_p) +{ + int result = matcher; + size_t len = *len_p; + char *new_keys = xmalloc (len + 1); + char *p = new_keys; + mbstate_t mb_state = { 0 }; + size_t mblen_lim = match_icase ? 1 : -3; + bool used[UCHAR_MAX + 1] = { 0, }; + + while (len != 0) + { + switch (*keys) + { + case '$': case '*': case '.': case '[': case '^': + goto fail; + + case '(': case '+': case '?': case '{': case '|': + if (matcher != G_MATCHER_INDEX) + goto fail; + break; + + case '\\': + if (1 < len) + switch (keys[1]) + { + case '\n': + case 'B': case 'S': case 'W': case'\'': case '<': + case 'b': case 's': case 'w': case '`': case '>': + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + goto fail; + + case '(': case '+': case '?': case '{': case '|': + if (matcher == G_MATCHER_INDEX) + goto fail; + /* Fall through. */ + default: + keys++, len--; + break; + } + break; + } + + { + size_t n = mb_clen (keys, len, &mb_state); + if (mblen_lim < n) + goto fail; + used[to_uchar (keys[0])] = true; + p = mempcpy (p, keys, n); + keys += n; + len -= n; + } + } + + if (mblen_lim == 1 && !all_single_byte_after_folding (used)) + goto fail; + + if (*len_p != p - new_keys) + { + *len_p = p - new_keys; + memcpy (keys, new_keys, p - new_keys); + } + result = F_MATCHER_INDEX; + + fail: + free (new_keys); + return result; +} + int main (int argc, char **argv) { @@ -2758,11 +2845,11 @@ main (int argc, char **argv) if (matcher < 0) matcher = G_MATCHER_INDEX; - /* In a unibyte locale, switch from fgrep to grep if - the pattern matches words (where grep is typically faster). - In a multibyte locale, switch from fgrep to grep if either - (1) the pattern has an encoding error (where fgrep might not work), or - (2) case is ignored and a fast fgrep ignore-case is not available. */ + /* In a single-byte locale, switch from -F to -G if it is a single + pattern that matches words, where -G is typically faster. In a + multi-byte locale, switch if the patterns have an encoding error + (where -F does not work) or if -i and the patterns will not work + for -iF. */ if (matcher == F_MATCHER_INDEX && (MB_CUR_MAX <= 1 ? match_words @@ -2772,6 +2859,11 @@ main (int argc, char **argv) fgrep_to_grep_pattern (&keys, &keycc); matcher = G_MATCHER_INDEX; } + /* With two or more patterns, if -F works then switch from either -E + or -G, as -F is probably faster then. */ + else if ((matcher == G_MATCHER_INDEX || matcher == E_MATCHER_INDEX) + && 1 < n_patterns) + matcher = try_fgrep_pattern (matcher, keys, &keycc); execute = matchers[matcher].execute; matchers[matcher].compile (keys, keycc, matchers[matcher].syntax); -- 2.7.4