From 763af945ee2365ed57e4d0fdf1cf47f25db1a6ec Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Mon, 17 Nov 2014 08:26:53 +0900 Subject: [PATCH] dfa: speed up handling of long pattern DFA tries to find a long sequence of characters that must appear in any matching line. However, when a pattern is long (length N), it is very slow, because it makes O(N^2) strstr calls. This change reduces that to O(N) by processing each sequence of adjacent "regular" characters as a group. * src/dfa.c (dfamust): Process a string concatenated normal characters at a time. * NEWS (Improvement): Mention it. Prompted by a bug report and patch by Ivan Yanikov in http://bugs.gnu.org/15191#5 Compare the run times of this command before and after this change: (on a i7-4770S CPU @ 3.10GHz using rawhide (~fedora 22) and compiled with gcc 6.0.0 20150627) env time -f %e grep -f <(seq -s '' 9999 ) <<<1 Before: 0.85 After: 0.02 --- NEWS | 2 ++ src/dfa.c | 53 +++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/NEWS b/NEWS index 35c4aad..fb398c4 100644 --- a/NEWS +++ b/NEWS @@ -22,6 +22,8 @@ GNU grep NEWS -*- outline -*- 'grep -D skip PATTERN FILE' no longer hangs if FILE is a fifo. [bug introduced in grep-2.12] + Performance has improved for patterns containing very long strings. + * Noteworthy changes in release 2.21 (2014-11-23) [stable] diff --git a/src/dfa.c b/src/dfa.c index 2f82282..381676d 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -3967,13 +3967,13 @@ struct must }; static must * -allocmust (must *mp) +allocmust (must *mp, size_t size) { must *new_mp = xmalloc (sizeof *new_mp); new_mp->in = xzalloc (sizeof *new_mp->in); - new_mp->left = xzalloc (2); - new_mp->right = xzalloc (2); - new_mp->is = xzalloc (2); + new_mp->left = xzalloc (size); + new_mp->right = xzalloc (size); + new_mp->is = xzalloc (size); new_mp->begline = false; new_mp->endline = false; new_mp->prev = mp; @@ -4006,23 +4006,22 @@ dfamust (struct dfa const *d) { must *mp = NULL; char const *result = ""; - size_t ri; size_t i; bool exact = false; bool begline = false; bool endline = false; - for (ri = 0; ri < d->tindex; ++ri) + for (size_t ri = 0; ri < d->tindex; ++ri) { token t = d->tokens[ri]; switch (t) { case BEGLINE: - mp = allocmust (mp); + mp = allocmust (mp, 2); mp->begline = true; break; case ENDLINE: - mp = allocmust (mp); + mp = allocmust (mp, 2); mp->endline = true; break; case LPAREN: @@ -4037,7 +4036,7 @@ dfamust (struct dfa const *d) case BACKREF: case ANYCHAR: case MBCSET: - mp = allocmust (mp); + mp = allocmust (mp, 2); break; case STAR: @@ -4154,7 +4153,6 @@ dfamust (struct dfa const *d) goto done; default: - mp = allocmust (mp); if (CSET <= t) { /* If T is a singleton, or if case-folding in a unibyte @@ -4167,7 +4165,10 @@ dfamust (struct dfa const *d) if (tstbit (j, *ccl)) break; if (! (j < NOTCHAR)) - break; + { + mp = allocmust (mp, 2); + break; + } t = j; while (++j < NOTCHAR) if (tstbit (j, *ccl) @@ -4175,12 +4176,36 @@ dfamust (struct dfa const *d) && toupper (j) == toupper (t))) break; if (j < NOTCHAR) - break; + { + mp = allocmust (mp, 2); + break; + } } + + size_t rj = ri + 2; + if (d->tokens[ri + 1] == CAT) + { + for (; rj < d->tindex - 1; rj += 2) + { + if ((rj != ri && (d->tokens[rj] <= 0 + || NOTCHAR <= d->tokens[rj])) + || d->tokens[rj + 1] != CAT) + break; + } + } + mp = allocmust (mp, ((rj - ri) >> 1) + 1); mp->is[0] = mp->left[0] = mp->right[0] = case_fold && MB_CUR_MAX == 1 ? toupper (t) : t; - mp->is[1] = mp->left[1] = mp->right[1] = '\0'; - mp->in = enlist (mp->in, mp->is, 1); + + for (i = 1; ri + 2 < rj; i++) + { + ri += 2; + t = d->tokens[ri]; + mp->is[i] = mp->left[i] = mp->right[i] + = case_fold && MB_CUR_MAX == 1 ? toupper (t) : t; + } + mp->is[i] = mp->left[i] = mp->right[i] = '\0'; + mp->in = enlist (mp->in, mp->is, i - 1); break; } } -- 2.3.7