>From a95ba0203afcc43feea0fa99e4d1b8fb923b1aeb Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Thu, 1 Sep 2016 11:45:18 -0700 Subject: [PATCH 1/4] dfa: use single-byte algorithm even in non-UTF-8 Even in non-UTF8 locales, if the current input character is single byte, we can use CSET to match ANYCHAR. * src/dfa.c (struct dfa): New member canychar. Cache index of CSET for ANYCHAR. (lex): Make CSET for ANYCHAR. (state_index): Simplify. (dfastate): Consider CSET for ANYCHAR. (transit_state_singlebyte, transit_state): Remove handling for eolbyte, as we assume that eolbyte does not appear at current position. (dfaexec_main): Use algorithm for single byte character to any single byte character in input text always. (dfasyntax): Initialize canychar. --- src/dfa.c | 115 ++++++++++++++++++++++++-------------------------------------- 1 file changed, 45 insertions(+), 70 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 0f5109e..e39d82a 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -401,6 +401,7 @@ struct dfa charclass *charclasses; /* Array of character sets for CSET tokens. */ size_t cindex; /* Index for adding new charclasses. */ size_t calloc; /* Number of charclasses allocated. */ + size_t canychar; /* Index of anychar class, or (size_t) -1. */ /* Scanner state */ struct lexer_state lex; @@ -1398,21 +1399,24 @@ lex (struct dfa *dfa) case '.': if (backslash) goto normal_char; - if (dfa->localeinfo.multibyte) + if (dfa->canychar == (size_t) -1) { - /* In multibyte environment period must match with a single - character not a byte. So we use ANYCHAR. */ - dfa->lex.laststart = false; - return dfa->lex.lasttok = ANYCHAR; + zeroset (ccl); + notset (ccl); + if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE)) + clrbit ('\n', ccl); + if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL) + clrbit ('\0', ccl); + if (dfa->localeinfo.multibyte) + for (c2 = 0; c2 < NOTCHAR; c2++) + if (dfa->localeinfo.sbctowc[c2] == WEOF) + clrbit (c2, ccl); + dfa->canychar = charclass_index (dfa, ccl); } - zeroset (ccl); - notset (ccl); - if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE)) - clrbit ('\n', ccl); - if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL) - clrbit ('\0', ccl); dfa->lex.laststart = false; - return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl); + return dfa->lex.lasttok = (dfa->localeinfo.multibyte + ? ANYCHAR + : CSET + dfa->canychar); case 's': case 'S': @@ -2077,7 +2081,7 @@ state_index (struct dfa *d, position_set const *s, int context) } else if (d->tokens[s->elems[j].index] == BACKREF) constraint = NO_CONSTRAINT; - if (d->localeinfo.multibyte && d->tokens[s->elems[j].index] == ANYCHAR) + if (d->tokens[s->elems[j].index] == ANYCHAR) { int acceptable = ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE) @@ -2554,13 +2558,15 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) setbit (d->tokens[pos.index], matches); else if (d->tokens[pos.index] >= CSET) copyset (d->charclasses[d->tokens[pos.index] - CSET], matches); - else if (d->localeinfo.multibyte && d->tokens[pos.index] == ANYCHAR) + else if (d->tokens[pos.index] == ANYCHAR) { - /* ANYCHAR must match a single character, so put it to - D->states[s].mbps which contains the positions which can - match with a single character not a byte. If all - positions with ANYCHAR do not depend on the context of - the next character, put its follows instead to + copyset (d->charclasses[d->canychar], matches); + + /* ANYCHAR must match with a single character, so we must put + it to D->states[s].mbps which contains the positions which + can match with a single character not a byte. If all + positions which has ANYCHAR does not depend on context of + next character, we put the follows instead of it to D->states[s].mbps to optimize. */ if (d->states[s].curr_dependent) { @@ -2572,7 +2578,8 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) d->states[s].context, CTX_ANY)) { if (d->states[s].mbps.nelem == 0) - alloc_position_set (&d->states[s].mbps, 1); + alloc_position_set (&d->states[s].mbps, + d->follows[pos.index].nelem); for (j = 0; j < d->follows[pos.index].nelem; j++) insert (d->follows[pos.index].elems[j], &d->states[s].mbps); } @@ -2950,16 +2957,6 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp) { state_num *t; - if (**pp == d->syntax.eolbyte) - { - /* S is always an initial state in transit_state, so the - transition table for the state must have been built already. */ - assert (d->trans[s] || d->fails[s]); - - ++*pp; - return d->newlines[s]; - } - if (d->trans[s]) t = d->trans[s]; else if (d->fails[s]) @@ -2986,15 +2983,12 @@ static state_num transit_state (struct dfa *d, state_num s, unsigned char const **pp, unsigned char const *end) { - state_num s1; + state_num s1, s2; wint_t wc; int separate_contexts; - state_num state, state_newline, mb_index; size_t i, j; int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d); - int context = wc == d->syntax.eolbyte ? CTX_NEWLINE : CTX_NONE; - bool context_newline = context == CTX_NEWLINE; /* This state has some operators which can match a multibyte character. */ d->mb_follows.nelem = 0; @@ -3016,7 +3010,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, for (i = 0; i < d->states[s1].mbps.nelem; i++) { if (!SUCCEEDS_IN_CONTEXT (d->states[s1].mbps.elems[i].constraint, - d->states[s1].context, context)) + d->states[s1].context, CTX_NONE)) continue; for (j = 0; j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++) @@ -3025,10 +3019,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, } separate_contexts = state_separate_contexts (&d->mb_follows); - if (context_newline && separate_contexts & CTX_NEWLINE) - s = state_index (d, &d->mb_follows, CTX_NEWLINE); - else - s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY); + s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY); realloc_trans_if_necessary (d, s); return s; @@ -3041,11 +3032,11 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, { if (MAX_TRCOUNT <= d->mb_trcount) { - state_num s2; - for (s2 = -1; s2 < d->tralloc; s2++) + state_num s3; + for (s3 = -1; s3 < d->tralloc; s3++) { - free (d->mb_trans[s2]); - d->mb_trans[s2] = NULL; + free (d->mb_trans[s3]); + d->mb_trans[s3] = NULL; } for (i = 0; i < d->sindex; i++) @@ -3055,22 +3046,16 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, d->states[s1].mb_trindex = d->mb_trcount++; } - mb_index = d->states[s1].mb_trindex * 2; - if (! d->mb_trans[s]) { enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] }; - enum { TRANSALLOC_SIZE = 2 * MAX_TRCOUNT * TRANSPTR_SIZE }; + enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE }; d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE); - for (i = 0; i < 2 * MAX_TRCOUNT; i++) + for (i = 0; i < MAX_TRCOUNT; i++) d->mb_trans[s][i] = -1; } - else - { - state = d->mb_trans[s][mb_index + context_newline]; - if (0 <= state) - return state; - } + else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0) + return d->mb_trans[s][d->states[s1].mb_trindex]; if (s < 0) copy (&d->states[s1].mbps, &d->mb_follows); @@ -3078,17 +3063,12 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows); separate_contexts = state_separate_contexts (&d->mb_follows); - state = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY); - if (separate_contexts & CTX_NEWLINE) - state_newline = state_index (d, &d->mb_follows, CTX_NEWLINE); - else - state_newline = state; - realloc_trans_if_necessary (d, state_newline); + s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY); + realloc_trans_if_necessary (d, s2); - d->mb_trans[s][mb_index] = state; - d->mb_trans[s][mb_index + 1] = state_newline; + d->mb_trans[s][d->states[s1].mb_trindex] = s2; - return context_newline ? state_newline : state; + return s2; } /* The initial state may encounter a byte which is not a single byte character @@ -3215,10 +3195,8 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, } } - if (d->states[s].mbps.nelem == 0 || (*p == eol && !allow_nl) - || (*p == '\n' && !(d->syntax.syntax_bits & RE_DOT_NEWLINE)) - || (*p == '\0' && (d->syntax.syntax_bits & RE_DOT_NOT_NULL)) - || (char *) p >= end) + if (d->states[s].mbps.nelem == 0 + || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end) { /* If an input character does not match ANYCHAR, do it like a single-byte character. */ @@ -3227,8 +3205,6 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, else { s = transit_state (d, s, &p, (unsigned char *) end); - if (s >= 0 && p[-1] == eol) - nlcount++; mbp = p; trans = d->trans; } @@ -3297,8 +3273,6 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, else { s = transit_state (d, s, &p, (unsigned char *) end); - if (s >= 0 && p[-1] == eol) - nlcount++; mbp = p; trans = d->trans; } @@ -4106,6 +4080,7 @@ dfasyntax (struct dfa *dfa, struct localeinfo const *linfo, dfa->fast = !dfa->localeinfo.multibyte; + dfa->canychar = -1; dfa->lex.cur_mb_len = 1; dfa->syntax.syntax_bits_set = true; dfa->syntax.syntax_bits = bits; -- 2.7.4