From b31ebd2bb5aae54ba46ac3bc88161872b50f9513 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Thu, 11 Aug 2016 11:53:24 +0900 Subject: [PATCH 2/2] dfa: support not newline_anchor of regex * src/dfa.c (char_context): Define the context used when newline_anchor is not set. (dfasyntax): Add argument newline_anchor. Update all callers. (lex): Use cached values to check whether each character is a letter or not. (charclass_context): Avoid hard-coding the context for the EOL byte. (dfastate): Use cached values to check whether each character is a newline, a letter, or neither. (dfaexec_main): Define the transition taken after a newline is found in the input, and the acceptance condition used when newline_anchor is not set. --- src/dfa.c | 51 +++++++++++++++++++++++++++++++++------------------ src/dfa.h | 2 +- src/dfasearch.c | 2 +- tests/dfa-match-aux.c | 2 +- 4 files changed, 36 insertions(+), 21 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 59bb3bc..1609ad6 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -681,9 +681,9 @@ unibyte_word_constituent (unsigned char c) } static int -char_context (unsigned char c) +char_context (unsigned char c, bool newline_anchor) { - if (c == eolbyte) + if (c == eolbyte && newline_anchor) return CTX_NEWLINE; if (unibyte_word_constituent (c)) return CTX_LETTER; @@ -692,7 +692,7 @@ char_context (unsigned char c) /* Entry point to set syntax options. */ void -dfasyntax (reg_syntax_t bits, bool fold, unsigned char eol) +dfasyntax (reg_syntax_t bits, bool fold, unsigned char eol, bool newline_anchor) { int i; syntax_bits_set = true; @@ -709,7 +709,7 @@ dfasyntax (reg_syntax_t bits, bool fold, unsigned char eol) mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF; /* Now that mbrtowc_cache[uc] is set, use it to calculate sbit. 
*/ - sbit[uc] = char_context (uc); + sbit[uc] = char_context (uc, newline_anchor); switch (sbit[uc]) { case CTX_LETTER: @@ -1486,7 +1486,7 @@ lex (void) { zeroset (ccl); for (c2 = 0; c2 < NOTCHAR; ++c2) - if (unibyte_word_constituent (c2)) + if (sbit[c2] == CTX_LETTER) setbit (c2, ccl); if (c == 'W') notset (ccl); @@ -2221,11 +2221,10 @@ charclass_context (charclass c) int context = 0; unsigned int j; - if (tstbit (eolbyte, c)) - context |= CTX_NEWLINE; - for (j = 0; j < CHARCLASS_WORDS; ++j) { + if (c[j] & newline[j]) + context |= CTX_NEWLINE; if (c[j] & letters[j]) context |= CTX_LETTER; if (c[j] & ~(letters[j] | newline[j])) @@ -2736,8 +2735,9 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) state_letter = state; for (i = 0; i < NOTCHAR; ++i) - trans[i] = unibyte_word_constituent (i) ? state_letter : state; - trans[eolbyte] = state_newline; + trans[i] = sbit[i] == CTX_LETTER ? state_letter : state; + if (sbit[eolbyte] == CTX_NEWLINE) + trans[eolbyte] = state_newline; } else for (i = 0; i < NOTCHAR; ++i) @@ -2840,12 +2840,21 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) { int c = j * CHARCLASS_WORD_BITS + k; - if (c == eolbyte) - trans[c] = state_newline; - else if (unibyte_word_constituent (c)) - trans[c] = state_letter; - else if (c < NOTCHAR) - trans[c] = state; + if (c >= NOTCHAR) + break; + + switch (sbit[c]) + { + case CTX_NEWLINE: + trans[c] = state_newline; + break; + case CTX_LETTER: + trans[c] = state_letter; + break; + default: + trans[c] = state; + break; + } } } @@ -3276,11 +3285,17 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, nlcount++; mbp = p; - s = allow_nl ? d->newlines[s1] : 0; + s = (allow_nl ? d->newlines[s1] + : (sbit[eol] == CTX_NEWLINE ? 0 + : (sbit[eol] == CTX_LETTER ? 
d->min_trcount - 1 + : d->initstate_notbol))); } else if (d->fails[s]) { - if (d->success[s] & sbit[*p]) + if (d->success[s] & sbit[*p] + || ((char *) p == end + && ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NEWLINE, s, + *d))) goto done; if (multibyte && s < d->min_trcount) diff --git a/src/dfa.h b/src/dfa.h index 60da0e4..0e259bf 100644 --- a/src/dfa.h +++ b/src/dfa.h @@ -53,7 +53,7 @@ extern void dfamustfree (struct dfamust *); /* dfasyntax() takes three arguments; the first sets the syntax bits described earlier in this file, the second sets the case-folding flag, and the third specifies the line terminator. */ -extern void dfasyntax (reg_syntax_t, bool, unsigned char); +extern void dfasyntax (reg_syntax_t, bool, unsigned char, bool); /* Compile the given string of the given length into the given struct dfa. Final argument is a flag specifying whether to build a searching or an diff --git a/src/dfasearch.c b/src/dfasearch.c index 9a523c8..17d6a74 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -128,7 +128,7 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits) if (match_icase) syntax_bits |= RE_ICASE; re_set_syntax (syntax_bits); - dfasyntax (syntax_bits, match_icase, eolbyte); + dfasyntax (syntax_bits, match_icase, eolbyte, true); /* For GNU regex, pass the patterns separately to detect errors like "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and diff --git a/tests/dfa-match-aux.c b/tests/dfa-match-aux.c index af933ff..f8db72c 100644 --- a/tests/dfa-match-aux.c +++ b/tests/dfa-match-aux.c @@ -54,7 +54,7 @@ main (int argc, char **argv) setlocale (LC_ALL, ""); - dfasyntax (RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0, '\n'); + dfasyntax (RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0, '\n', 1); dfa = dfaalloc (); dfacomp (argv[1], strlen (argv[1]), dfa, 0); -- 1.7.1