From 5bae88c4190bfcf38ae854a7c8eb6ccefb1b9d48 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Mon, 28 Apr 2014 21:13:57 +0900 Subject: [PATCH 1/2] grep: simplification for superset * src/dfa.h (dfahint): Remove it. (dfasuperset): New function. * src/dfa.c (dfahint): Remove it. (dfassbuild): Rename from dfasuperset. (dfasuperset): New function. it returns the superset oneself of D. * src/dfasearch.c: Substitute dfahint to dfasuperset and simplify. --- src/dfa.c | 26 ++++---------- src/dfa.h | 13 ++----- src/dfasearch.c | 106 +++++++++++++++++++++++++++++++------------------------- 3 files changed, 69 insertions(+), 76 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 362de2c..c9070fe 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -3391,24 +3391,12 @@ dfaexec (struct dfa *d, char const *begin, char *end, return (char *) p; } -/* Search through a buffer looking for a potential match for D. - Return the offset of the byte after the first potential match. - If there is no match, return (size_t) -1. If D lacks a superset - so it's not known whether there is a match, return (size_t) -2. - BEGIN points to the beginning of the buffer, and END points to the - first byte after its end. Store a sentinel byte (usually newline) - in *END, so the actual buffer must be one byte longer. If COUNT is - non-NULL, increment *COUNT once for each newline processed. */ -size_t -dfahint (struct dfa *d, char const *begin, char *end, size_t *count) +/* Return superset for D, which searchs through a buffer looking for a + potential match. */ +struct dfa * +dfasuperset (struct dfa *d) { - if (! d->superset) - return -2; - else - { - char const *match = dfaexec (d->superset, begin, end, 1, count, NULL); - return match ? match - begin : -1; - } + return d->superset; } bool @@ -3495,7 +3483,7 @@ dfaoptimize (struct dfa *d) } static void -dfasuperset (struct dfa *d) +dfassbuild (struct dfa *d) { size_t i, j; charclass ccl; @@ -3579,7 +3567,7 @@ dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) dfaparse (s, len, d); dfamust (d); dfaoptimize (d); - dfasuperset (d); + dfassbuild (d); dfaanalyze (d, searchflag); if (d->superset) dfaanalyze (d->superset, searchflag); diff --git a/src/dfa.h b/src/dfa.h index fbcb191..2de7ba8 100644 --- a/src/dfa.h +++ b/src/dfa.h @@ -71,16 +71,9 @@ extern void dfacomp (char const *, size_t, struct dfa *, int); extern char *dfaexec (struct dfa *d, char const *begin, char *end, int newline, size_t *count, int *backref); -/* Search through a buffer looking for a potential match for D. - Return the offset of the byte after the first potential match. - If there is no match, return (size_t) -1. If D lacks a superset - so it's not known whether there is a match, return (size_t) -2. - BEGIN points to the beginning of the buffer, and END points to the - first byte after its end. Store a sentinel byte (usually newline) - in *END, so the actual buffer must be one byte longer. If COUNT is - non-NULL, increment *COUNT once for each newline processed. */ -extern size_t dfahint (struct dfa *d, char const *begin, char *end, - size_t *count); +/* Return superset for D, which searchs through a buffer looking for a + potential match. */ +extern struct dfa *dfasuperset (struct dfa *d); /* Return true if the DFA is likely to be fast. */ extern bool dfaisfast (struct dfa const *) _GL_ATTRIBUTE_PURE; diff --git a/src/dfasearch.c b/src/dfasearch.c index 7edc96b..c1822b4 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -213,6 +213,7 @@ EGexecute (char const *buf, size_t size, size_t *match_size, size_t len, best_len; struct kwsmatch kwsm; size_t i; + struct dfa *superset = dfasuperset (dfa); bool dfafast = dfaisfast (dfa); mb_start = buf; @@ -220,26 +221,37 @@ EGexecute (char const *buf, size_t size, size_t *match_size, for (beg = end = buf; end < buflim; beg = end) { + end = buflim; + if (!start_ptr) { - /* We don't care about an exact match. */ + char const *next_beg, *dfa_beg = beg; + size_t count = 0; + bool narrowed = false; + + /* Try matching with KWset, if it's defined. */ if (kwset) { + char const *prev_beg; + /* Find a possible match using the KWset matcher. */ size_t offset = kwsexec (kwset, beg - begline, buflim - beg + begline, &kwsm); if (offset == (size_t) -1) goto failure; match = beg + offset; - - /* Narrow down to the line containing the candidate. */ + prev_beg = beg; + + /* Narrow down to the line we've found. */ beg = memrchr (buf, eol, match - buf); beg = beg ? beg + 1 : buf; - end = memchr (match, eol, buflim - match); - end = end ? end + 1 : buflim; + dfa_beg = beg; if (kwsm.index < kwset_exact_matches) { + end = memchr (match, eol, buflim - match); + end = end ? end + 1 : buflim; + if (MB_CUR_MAX == 1) goto success; if (mb_start < beg) @@ -250,61 +262,63 @@ EGexecute (char const *buf, size_t size, size_t *match_size, /* The matched line starts in the middle of a multibyte character. Perform the DFA search starting from the beginning of the next character. */ - if (! dfaexec (dfa, mb_start, (char *) end, 0, NULL, - &backref)) - continue; + dfa_beg = mb_start; + narrowed = true; } else if (!dfafast) { - if (dfahint (dfa, beg, (char *) end, NULL) == (size_t) -1) - continue; - if (! dfaexec (dfa, beg, (char *) end, 0, NULL, &backref)) - continue; + end = memchr (match, eol, buflim - match); + end = end ? end + 1 : buflim; + narrowed = true; } } - if (!kwset || dfafast) + + /* Try matching with the superset of DFA, if it's defined. */ + if (superset && !(kwset && kwsm.index < kwset_exact_matches)) { - /* No good fixed strings or DFA is fast; use DFA. */ - size_t offset, count; - char const *next_beg; - count = 0; - offset = dfahint (dfa, beg, (char *) buflim, &count); - if (offset == (size_t) -1) - goto failure; - if (offset == (size_t) -2) + next_beg = dfaexec (superset, dfa_beg, (char *) end, 1, &count, NULL); + /* If there's no match, or if we've matched the sentinel, + we're done. */ + if (next_beg == NULL || next_beg == end) + continue; + if (!narrowed) { - /* No use hint. */ - next_beg = dfaexec (dfa, beg, (char *) buflim, 0, - NULL, &backref); - /* If there's no match, or if we've matched the sentinel, - we're done. */ - if (next_beg == NULL || next_beg == buflim) - goto failure; + /* Narrow down to the line we've found. */ + if (count != 0) + { + beg = memrchr (buf, eol, next_beg - buf); + beg = beg ? beg + 1 : buf; + + /* If dfaexec may match in multiple lines, try to + match in one line. */ + end = beg; + continue; + } + end = memchr (next_beg, eol, buflim - next_beg); + end = end ? end + 1 : buflim; + narrowed = true; } - else - next_beg = beg + offset; + } + + /* Try matching with DFA. */ + next_beg = dfaexec (dfa, dfa_beg, (char *) end, 0, &count, &backref); + + /* If there's no match, or if we've matched the sentinel, + we're done. */ + if (next_beg == NULL || next_beg == end) + continue; + if (!narrowed) + { /* Narrow down to the line we've found. */ - beg = memrchr (buf, eol, next_beg - buf); - beg = beg ? beg + 1 : buf; if (count != 0) { - /* If dfahint may match in multiple lines, try to - match in one line. */ - end = beg; - continue; + beg = memrchr (buf, eol, next_beg - buf); + beg = beg ? beg + 1 : buf; } end = memchr (next_beg, eol, buflim - next_beg); end = end ? end + 1 : buflim; - if (offset != (size_t) -2) - { - next_beg = dfaexec (dfa, beg, (char *) end, 0, NULL, - &backref); - /* If there's no match, or if we've matched the sentinel, - we're done. */ - if (next_beg == NULL || next_beg == end) - continue; - } } + /* Successful, no backreferences encountered! */ if (!backref) goto success; @@ -314,8 +328,6 @@ EGexecute (char const *buf, size_t size, size_t *match_size, { /* We are looking for the leftmost (then longest) exact match. We will go through the outer loop only once. */ - beg = buf; - end = buflim; ptr = start_ptr; } -- 1.9.2 From 75014e37c4e66ed1a93b77eb9dcb033d6dff1448 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Mon, 28 Apr 2014 21:22:04 +0900 Subject: [PATCH 2/2] grep: [PATCH] grep: ajustment of timing back to kwset when dfaisfast is true If failed in DFA after succeed in kwset, doesn't return to kwset until reaches the end of the buffer or find a match. By that, thought some cases speed up, there is also a case slowdown. This patch ajusts timing back to kwset when dfaisfast is true. * src/dfasearch.c (EGexecute): Do it. --- src/dfasearch.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/dfasearch.c b/src/dfasearch.c index c1822b4..371f907 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -265,7 +265,23 @@ EGexecute (char const *buf, size_t size, size_t *match_size, dfa_beg = mb_start; narrowed = true; } - else if (!dfafast) + else if (dfafast && beg == prev_beg) + { + /* As a heuristic, prefer DFA to KWset temporarily, + when the letter doesn't advance much. */ + offset = (match - beg) * 8; + if (offset < 1024) + offset = 1024; + end = match + offset; + if (end < buflim) + { + end = memchr (end, eol, buflim - end); + end = end ? end + 1 : buflim; + } + else + end = buflim; + } + else { end = memchr (match, eol, buflim - match); end = end ? end + 1 : buflim; -- 1.9.2