>From a95ba0203afcc43feea0fa99e4d1b8fb923b1aeb Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <address@hidden>
Date: Thu, 1 Sep 2016 11:45:18 -0700
Subject: [PATCH 1/4] dfa: use single-byte algorithm even in non-UTF-8

Even in non-UTF8 locales, if the current input character
is single byte, we can use CSET to match ANYCHAR.
* src/dfa.c (struct dfa): New member canychar.
Cache index of CSET for ANYCHAR.
(lex): Make CSET for ANYCHAR.
(state_index): Simplify.
(dfastate): Consider CSET for ANYCHAR.
(transit_state_singlebyte, transit_state): Remove handling for eolbyte,
as we assume that eolbyte does not appear at current position.
(dfaexec_main): Use algorithm for single byte character to any single
byte character in input text always.
(dfasyntax): Initialize canychar.
---
 src/dfa.c | 115 ++++++++++++++++++++++++--------------------------------------
 1 file changed, 45 insertions(+), 70 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 0f5109e..e39d82a 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -401,6 +401,7 @@ struct dfa
   charclass *charclasses;       /* Array of character sets for CSET tokens.  */
   size_t cindex;                /* Index for adding new charclasses.  */
   size_t calloc;                /* Number of charclasses allocated.  */
+  size_t canychar;              /* Index of anychar class, or (size_t) -1.  */
 
   /* Scanner state */
   struct lexer_state lex;
@@ -1398,21 +1399,24 @@ lex (struct dfa *dfa)
         case '.':
           if (backslash)
             goto normal_char;
-          if (dfa->localeinfo.multibyte)
+          if (dfa->canychar == (size_t) -1)
             {
-              /* In multibyte environment period must match with a single
-                 character not a byte.  So we use ANYCHAR.  */
-              dfa->lex.laststart = false;
-              return dfa->lex.lasttok = ANYCHAR;
+              zeroset (ccl);
+              notset (ccl);
+              if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
+                clrbit ('\n', ccl);
+              if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
+                clrbit ('\0', ccl);
+              if (dfa->localeinfo.multibyte)
+                for (c2 = 0; c2 < NOTCHAR; c2++)
+                  if (dfa->localeinfo.sbctowc[c2] == WEOF)
+                    clrbit (c2, ccl);
+              dfa->canychar = charclass_index (dfa, ccl);
             }
-          zeroset (ccl);
-          notset (ccl);
-          if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
-            clrbit ('\n', ccl);
-          if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
-            clrbit ('\0', ccl);
           dfa->lex.laststart = false;
-          return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
+          return dfa->lex.lasttok = (dfa->localeinfo.multibyte
+                                     ? ANYCHAR
+                                     : CSET + dfa->canychar);
 
         case 's':
         case 'S':
@@ -2077,7 +2081,7 @@ state_index (struct dfa *d, position_set const *s, int context)
         }
       else if (d->tokens[s->elems[j].index] == BACKREF)
         constraint = NO_CONSTRAINT;
-      if (d->localeinfo.multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
+      if (d->tokens[s->elems[j].index] == ANYCHAR)
         {
           int acceptable
             = ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE)
@@ -2554,13 +2558,15 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         setbit (d->tokens[pos.index], matches);
       else if (d->tokens[pos.index] >= CSET)
         copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
-      else if (d->localeinfo.multibyte && d->tokens[pos.index] == ANYCHAR)
+      else if (d->tokens[pos.index] == ANYCHAR)
         {
-          /* ANYCHAR must match a single character, so put it to
-             D->states[s].mbps which contains the positions which can
-             match with a single character not a byte.  If all
-             positions with ANYCHAR do not depend on the context of
-             the next character, put its follows instead to
+          copyset (d->charclasses[d->canychar], matches);
+
+          /* ANYCHAR must match with a single character, so we must put
+             it to D->states[s].mbps which contains the positions which
+             can match with a single character not a byte.  If all
+             positions which has ANYCHAR does not depend on context of
+             next character, we put the follows instead of it to
              D->states[s].mbps to optimize.  */
           if (d->states[s].curr_dependent)
             {
@@ -2572,7 +2578,8 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
                                         d->states[s].context, CTX_ANY))
             {
               if (d->states[s].mbps.nelem == 0)
-                alloc_position_set (&d->states[s].mbps, 1);
+                alloc_position_set (&d->states[s].mbps,
+                                    d->follows[pos.index].nelem);
               for (j = 0; j < d->follows[pos.index].nelem; j++)
                 insert (d->follows[pos.index].elems[j], &d->states[s].mbps);
             }
@@ -2950,16 +2957,6 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
 {
   state_num *t;
 
-  if (**pp == d->syntax.eolbyte)
-    {
-      /* S is always an initial state in transit_state, so the
-         transition table for the state must have been built already.  */
-      assert (d->trans[s] || d->fails[s]);
-
-      ++*pp;
-      return d->newlines[s];
-    }
-
   if (d->trans[s])
     t = d->trans[s];
   else if (d->fails[s])
@@ -2986,15 +2983,12 @@ static state_num
 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
                unsigned char const *end)
 {
-  state_num s1;
+  state_num s1, s2;
   wint_t wc;
   int separate_contexts;
-  state_num state, state_newline, mb_index;
   size_t i, j;
 
   int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
-  int context = wc == d->syntax.eolbyte ? CTX_NEWLINE : CTX_NONE;
-  bool context_newline = context == CTX_NEWLINE;
 
   /* This state has some operators which can match a multibyte character.  */
   d->mb_follows.nelem = 0;
@@ -3016,7 +3010,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
       for (i = 0; i < d->states[s1].mbps.nelem; i++)
         {
           if (!SUCCEEDS_IN_CONTEXT (d->states[s1].mbps.elems[i].constraint,
-                                    d->states[s1].context, context))
+                                    d->states[s1].context, CTX_NONE))
             continue;
           for (j = 0; j < d->follows[d->states[s1].mbps.elems[i].index].nelem;
                j++)
@@ -3025,10 +3019,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
         }
 
       separate_contexts = state_separate_contexts (&d->mb_follows);
-      if (context_newline && separate_contexts & CTX_NEWLINE)
-        s = state_index (d, &d->mb_follows, CTX_NEWLINE);
-      else
-        s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
+      s = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
       realloc_trans_if_necessary (d, s);
 
       return s;
@@ -3041,11 +3032,11 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
     {
       if (MAX_TRCOUNT <= d->mb_trcount)
         {
-          state_num s2;
-          for (s2 = -1; s2 < d->tralloc; s2++)
+          state_num s3;
+          for (s3 = -1; s3 < d->tralloc; s3++)
             {
-              free (d->mb_trans[s2]);
-              d->mb_trans[s2] = NULL;
+              free (d->mb_trans[s3]);
+              d->mb_trans[s3] = NULL;
             }
 
           for (i = 0; i < d->sindex; i++)
@@ -3055,22 +3046,16 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
       d->states[s1].mb_trindex = d->mb_trcount++;
     }
 
-  mb_index = d->states[s1].mb_trindex * 2;
-
   if (! d->mb_trans[s])
     {
       enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
-      enum { TRANSALLOC_SIZE = 2 * MAX_TRCOUNT * TRANSPTR_SIZE };
+      enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
       d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
-      for (i = 0; i < 2 * MAX_TRCOUNT; i++)
+      for (i = 0; i < MAX_TRCOUNT; i++)
         d->mb_trans[s][i] = -1;
     }
-  else
-    {
-      state = d->mb_trans[s][mb_index + context_newline];
-      if (0 <= state)
-        return state;
-    }
+  else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
+    return d->mb_trans[s][d->states[s1].mb_trindex];
 
   if (s < 0)
     copy (&d->states[s1].mbps, &d->mb_follows);
@@ -3078,17 +3063,12 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
     merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
 
   separate_contexts = state_separate_contexts (&d->mb_follows);
-  state = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
-  if (separate_contexts & CTX_NEWLINE)
-    state_newline = state_index (d, &d->mb_follows, CTX_NEWLINE);
-  else
-    state_newline = state;
-  realloc_trans_if_necessary (d, state_newline);
+  s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
+  realloc_trans_if_necessary (d, s2);
 
-  d->mb_trans[s][mb_index] = state;
-  d->mb_trans[s][mb_index + 1] = state_newline;
+  d->mb_trans[s][d->states[s1].mb_trindex] = s2;
 
-  return context_newline ? state_newline : state;
+  return s2;
 }
 
 /* The initial state may encounter a byte which is not a single byte character
@@ -3215,10 +3195,8 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
                     }
                 }
 
-              if (d->states[s].mbps.nelem == 0 || (*p == eol && !allow_nl)
-                  || (*p == '\n' && !(d->syntax.syntax_bits & RE_DOT_NEWLINE))
-                  || (*p == '\0' && (d->syntax.syntax_bits & RE_DOT_NOT_NULL))
-                  || (char *) p >= end)
+              if (d->states[s].mbps.nelem == 0
+                  || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
                 {
                   /* If an input character does not match ANYCHAR, do it
                      like a single-byte character.  */
@@ -3227,8 +3205,6 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
               else
                 {
                   s = transit_state (d, s, &p, (unsigned char *) end);
-                  if (s >= 0 && p[-1] == eol)
-                    nlcount++;
                   mbp = p;
                   trans = d->trans;
                 }
@@ -3297,8 +3273,6 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
           else
             {
               s = transit_state (d, s, &p, (unsigned char *) end);
-              if (s >= 0 && p[-1] == eol)
-                nlcount++;
               mbp = p;
               trans = d->trans;
             }
@@ -4106,6 +4080,7 @@ dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
 
   dfa->fast = !dfa->localeinfo.multibyte;
 
+  dfa->canychar = -1;
   dfa->lex.cur_mb_len = 1;
   dfa->syntax.syntax_bits_set = true;
   dfa->syntax.syntax_bits = bits;
-- 
2.7.4