bug-grep
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 10/11] dfa: introduce bufdelim context


From: Paolo Bonzini
Subject: [PATCH 10/11] dfa: introduce bufdelim context
Date: Wed, 4 Jan 2012 11:59:51 +0100

* src/dfa.c (CTX_DELIM): New.
(CTX_ANY): Update.
(MATCHES_DELIM_CONTEXT): New.
(MATCHES_NEWLINE_CONTEXT): Treat CTX_DELIM like CTX_NEWLINE.
(PREV_DELIM_DEPENDENT): New.
(NO_CONSTRAINT, BEGLINE_CONSTRAINT, ENDLINE_CONSTRAINT, BEGWORD_CONSTRAINT,
ENDWORD_CONSTRAINT, LIMWORD_CONSTRAINT, NOTLIMWORD_CONSTRAINT): Adjust.
(BEGBUF_CONSTRAIN, ENDBUF_CONSTRAINT): New.
(bufdelims): New.
(char_context, wchar_context, dfasyntax, state_index, charclass_context,
state_wanted_contexts, dfastate, build_state): Handle CTX_DELIM.
(dfaanalyze): Add CTX_DELIM to the possible contexts for state 0.
---
 src/dfa.c |   91 +++++++++++++++++++++++++++++++++++++++++++++++-------------
 1 files changed, 71 insertions(+), 20 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index cc5caf2..d94b408 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -106,7 +106,8 @@ static inline unsigned char to_uchar (char ch) { return ch; 
}
 #define CTX_NONE       1
 #define CTX_LETTER     2
 #define CTX_NEWLINE    4
-#define CTX_ANY                7
+#define CTX_DELIM      8
+#define CTX_ANY                15
 
 /* Sometimes characters can only be matched depending on the surrounding
    context.  Such context decisions depend on what the previous character
@@ -115,8 +116,10 @@ static inline unsigned char to_uchar (char ch) { return 
ch; }
    is set indicates that the constraint succeeds in the corresponding
    context.
 
-   bit 5 - previous need not be newline
-   bit 4 - current need not be newline
+   bit 7 - previous need not be bufdelim
+   bit 6 - current need not be bufdelim
+   bit 5 - previous need not be newline or bufdelim
+   bit 4 - current need not be newline or bufdelim
    bit 3 - previous and current are word-constituents
    bit 2 - previous was word-constituent, current isn't
    bit 1 - previous wasn't word-constituent, current is
@@ -126,10 +129,14 @@ static inline unsigned char to_uchar (char ch) { return 
ch; }
    succeeds in a particular context.  Prev is a bitmask of possible
    context values for the previous character, curr is the bitmask of
    possible context values for the lookahead character. */
+#define MATCHES_DELIM_CONTEXT(constraint, prev, curr) \
+  ((((constraint) & 0xc0) | \
+    ((prev & ~CTX_DELIM) ? 0 : 0x80) | \
+    ((curr & ~CTX_DELIM) ? 0 : 0x40)) == 0xc0)
 #define MATCHES_NEWLINE_CONTEXT(constraint, prev, curr) \
   ((((constraint) & 0x30) | \
-    ((prev & ~CTX_NEWLINE) ? 0 : 0x20) | \
-    ((curr & ~CTX_NEWLINE) ? 0 : 0x10)) == 0x30)
+    ((prev & ~(CTX_DELIM | CTX_NEWLINE)) ? 0 : 0x20) | \
+    ((curr & ~(CTX_DELIM | CTX_NEWLINE)) ? 0 : 0x10)) == 0x30)
 #define MATCHES_LETTER_CONTEXT(constraint, prev, curr) \
   ((constraint) & \
    1 << (((prev & ~CTX_LETTER) ? 0 : 2) + ((curr & ~CTX_LETTER) ? 0 : 1)))
@@ -138,6 +145,8 @@ static inline unsigned char to_uchar (char ch) { return ch; 
}
    && MATCHES_LETTER_CONTEXT(constraint, prev, curr))
 
 /* The following macros give information about what a constraint depends on. */
+#define PREV_DELIM_DEPENDENT(constraint) \
+  (((constraint) & 0x80) == 0)
 #define PREV_NEWLINE_DEPENDENT(constraint) \
   (((constraint) & 0x20) == 0)
 #define PREV_LETTER_DEPENDENT(constraint) \
@@ -147,13 +156,15 @@ static inline unsigned char to_uchar (char ch) { return 
ch; }
    work by applying that constraint to determine what may follow them,
    taking into account what has gone before.  The following values are
    the constraints corresponding to the special tokens previously defined. */
-#define NO_CONSTRAINT 0x3f
-#define BEGLINE_CONSTRAINT 0x1f
-#define ENDLINE_CONSTRAINT 0x2f
-#define BEGWORD_CONSTRAINT 0x32
-#define ENDWORD_CONSTRAINT 0x34
-#define LIMWORD_CONSTRAINT 0x36
-#define NOTLIMWORD_CONSTRAINT 0x39
+#define NO_CONSTRAINT 0xff
+#define BEGBUF_CONSTRAINT 0x7f
+#define ENDBUF_CONSTRAINT 0xbf
+#define BEGLINE_CONSTRAINT 0xdf
+#define ENDLINE_CONSTRAINT 0xef
+#define BEGWORD_CONSTRAINT 0xf2
+#define ENDWORD_CONSTRAINT 0xf4
+#define LIMWORD_CONSTRAINT 0xf6
+#define NOTLIMWORD_CONSTRAINT 0xf9
 
 /* The regexp is parsed into an array of tokens in postfix form.  Some tokens
    are operators and others are terminal symbols.  Most (but not all) of these
@@ -575,6 +586,9 @@ static charclass letters;
 /* Set of characters that are newline. */
 static charclass newline;
 
+/* Set of characters that are buffer delimiters. */
+static charclass bufdelims;
+
 /* Add this to the test for whether a byte is word-constituent, since on
    BSD-based systems, many values in the 128..255 range are classified as
    alphabetic, while on glibc-based systems, they are not.  */
@@ -591,7 +605,9 @@ static charclass newline;
 static int
 char_context(unsigned char c)
 {
-  if (c == bufdelim || c == 0)
+  if (c == bufdelim)
+    return CTX_DELIM;
+  if (c == '\n')
     return CTX_NEWLINE;
   if (IS_WORD_CONSTITUENT (c))
     return CTX_LETTER;
@@ -601,7 +617,9 @@ char_context(unsigned char c)
 static int
 wchar_context(wint_t wc)
 {
-  if (wc == (wchar_t)bufdelim || wc == 0)
+  if (wc == (wchar_t)bufdelim)
+    return CTX_DELIM;
+  if (wc == L'\n')
     return CTX_NEWLINE;
   if (wc == L'_' || iswalnum (wc))
     return CTX_LETTER;
@@ -624,6 +642,9 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char delim)
       sbit[i] = char_context (i);
       switch (sbit[i])
         {
+        case CTX_DELIM:
+          setbit(i, bufdelims);
+          break;
         case CTX_LETTER:
           setbit(i, letters);
           break;
@@ -1972,7 +1993,8 @@ delete (position p, position_set *s)
 
 /* Find the index of the state corresponding to the given position set with
    the given preceding context, or create a new state if there is no such
-   state.  Context tells whether we got here on a newline or letter. */
+   state. Context tells whether we got here on a newline, buffer delimiter
+   or letter. */
 static int
 state_index (struct dfa *d, position_set const *s, int context)
 {
@@ -2018,7 +2040,8 @@ state_index (struct dfa *d, position_set const *s, int 
context)
         constraint = s->elems[j].constraint;
         if (SUCCEEDS_IN_CONTEXT(constraint, context, CTX_NONE)
             || SUCCEEDS_IN_CONTEXT(constraint, context, CTX_NEWLINE)
-            || SUCCEEDS_IN_CONTEXT(constraint, context, CTX_LETTER))
+            || SUCCEEDS_IN_CONTEXT(constraint, context, CTX_LETTER)
+            || SUCCEEDS_IN_CONTEXT(constraint, context, CTX_DELIM))
           d->states[i].constraint |= constraint;
         if (! d->states[i].first_end)
           d->states[i].first_end = d->tokens[s->elems[j].index];
@@ -2107,9 +2130,12 @@ charclass_context(charclass c)
   int context = 0;
   int j;
 
-  if (tstbit(bufdelim, c))
+  if (tstbit('\n', c))
     context |= CTX_NEWLINE;
 
+  if (tstbit(bufdelim, c))
+    context |= CTX_DELIM;
+
   for (j = 0; j < CHARCLASS_INTS; ++j)
     if (c[j] & letters[j])
       {
@@ -2127,6 +2153,9 @@ int state_wanted_contexts(position_set *s, int 
possible_contexts)
 
   for (j = 0; j < s->nelem; ++j)
     {
+      if ((possible_contexts & CTX_DELIM)
+          && PREV_DELIM_DEPENDENT(s->elems[j].constraint))
+        wanted_context |= CTX_DELIM;
       if ((possible_contexts & CTX_NEWLINE)
           && PREV_NEWLINE_DEPENDENT(s->elems[j].constraint))
         wanted_context |= CTX_NEWLINE;
@@ -2396,7 +2425,7 @@ dfaanalyze (struct dfa *d, int searchflag)
   d->sindex = 0;
   MALLOC(d->states, d->salloc);
 
-  wanted_context = state_wanted_contexts(&merged, CTX_NEWLINE);
+  wanted_context = state_wanted_contexts(&merged, CTX_NEWLINE | CTX_DELIM);
   state_index(d, &merged, wanted_context ? wanted_context : CTX_ANY);
 
   free(o_nullable);
@@ -2457,6 +2486,7 @@ dfastate (int s, struct dfa *d, int trans[])
   int wanted_context;          /* Context that new state wants to know. */
   int state;                   /* New state. */
   int state_newline;           /* New state on a newline transition. */
+  int state_bufdelim;          /* New state on a delimiter transition. */
   int state_letter;            /* New state on a letter transition. */
   int next_isnt_1st_byte = 0;  /* Flag if we can't add state0.  */
   int i, j, k;
@@ -2493,6 +2523,14 @@ dfastate (int s, struct dfa *d, int trans[])
          they fail in the current context. */
       if (pos.constraint != NO_CONSTRAINT)
         {
+          if (! MATCHES_DELIM_CONTEXT(pos.constraint,
+                                       d->states[s].context, CTX_DELIM))
+            for (j = 0; j < CHARCLASS_INTS; ++j)
+              matches[j] &= ~bufdelims[j];
+          if (! MATCHES_DELIM_CONTEXT(pos.constraint,
+                                       d->states[s].context, ~CTX_DELIM))
+            for (j = 0; j < CHARCLASS_INTS; ++j)
+              matches[j] &= bufdelims[j];
           if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
                                         d->states[s].context, CTX_NEWLINE))
             for (j = 0; j < CHARCLASS_INTS; ++j)
@@ -2592,6 +2630,10 @@ dfastate (int s, struct dfa *d, int trans[])
       copy(&d->states[0].elems, &follows);
       wanted_context = state_wanted_contexts(&follows, CTX_ANY);
       state = state_index(d, &follows, wanted_context ^ CTX_ANY);
+      if (wanted_context & CTX_DELIM)
+        state_bufdelim = state_index(d, &follows, CTX_DELIM);
+      else
+        state_bufdelim = state;
       if (wanted_context & CTX_NEWLINE)
         state_newline = state_index(d, &follows, CTX_NEWLINE);
       else
@@ -2603,7 +2645,8 @@ dfastate (int s, struct dfa *d, int trans[])
 
       for (i = 0; i < NOTCHAR; ++i)
         trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
-      trans[bufdelim] = state_newline;
+      trans['\n'] = state_newline;
+      trans[bufdelim] = state_bufdelim;
     }
   else
     for (i = 0; i < NOTCHAR; ++i)
@@ -2664,6 +2707,10 @@ dfastate (int s, struct dfa *d, int trans[])
 
       /* Find the state(s) corresponding to the union of the follows. */
       state = state_index(d, &follows, wanted_context ^ CTX_ANY);
+      if (wanted_context & CTX_DELIM)
+        state_bufdelim = state_index(d, &follows, CTX_DELIM);
+      else
+        state_bufdelim = state;
       if (wanted_context & CTX_NEWLINE)
         state_newline = state_index(d, &follows, CTX_NEWLINE);
       else
@@ -2680,7 +2727,9 @@ dfastate (int s, struct dfa *d, int trans[])
             {
               int c = j * INTBITS + k;
 
-              if (c == bufdelim)
+             if (c == bufdelim)
+                trans[c] = state_bufdelim;
+              else if (c == '\n')
                 trans[c] = state_newline;
               else if (IS_WORD_CONSTITUENT(c))
                 trans[c] = state_letter;
@@ -2729,6 +2778,8 @@ build_state (int s, struct dfa *d)
 
   /* Set up the success bits for this state. */
   d->success[s] = 0;
+  if (ACCEPTS_IN_CONTEXT(d->states[s].context, CTX_DELIM, s, *d))
+    d->success[s] |= CTX_DELIM;
   if (ACCEPTS_IN_CONTEXT(d->states[s].context, CTX_NEWLINE, s, *d))
     d->success[s] |= CTX_NEWLINE;
   if (ACCEPTS_IN_CONTEXT(d->states[s].context, CTX_LETTER, s, *d))
-- 
1.7.7.1





reply via email to

[Prev in Thread] Current Thread [Next in Thread]