[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 02/11] dfa: introduce contexts for the values in d->success
From: |
Paolo Bonzini |
Subject: |
[PATCH 02/11] dfa: introduce contexts for the values in d->success |
Date: |
Wed, 4 Jan 2012 11:59:43 +0100 |
Also initialize all tables in a single place in dfasyntax.
* src/dfa.c (CTX_NONE, CTX_LETTER, CTX_NEWLINE, char_context): New.
(sbit, letters, newline): New.
(dfasyntax): Fill them.
(dfastate): Remove letters, newline, initialized.
(build_state): Use CTX_* constants.
(dfaexec): Remove sbit and sbit_init.
---
src/dfa.c | 101 +++++++++++++++++++++++++++++++++++--------------------------
1 files changed, 58 insertions(+), 43 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index 58505bf..2386f76 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -91,6 +91,13 @@ typedef int charclass[CHARCLASS_INTS];
errors that the cast doesn't. */
static inline unsigned char to_uchar (char ch) { return ch; }
+/* Contexts tell us whether a character is a newline or a word constituent.
+ Word-constituent characters are those that satisfy iswalnum(), plus '_'. */
+
+#define CTX_NONE 1
+#define CTX_LETTER 2
+#define CTX_NEWLINE 4
+
/* Sometimes characters can only be matched depending on the surrounding
context. Such context decisions depend on what the previous character
was, and the value of the current (lookahead) character. Context
@@ -107,8 +114,6 @@ static inline unsigned char to_uchar (char ch) { return ch;
}
bit 1 - previous wasn't word-constituent, current is
bit 0 - neither previous nor current is word-constituent
- Word-constituent characters are those that satisfy isalnum().
-
The macro SUCCEEDS_IN_CONTEXT determines whether a given constraint
succeeds in a particular context. Prevn is true if the previous character
was a newline, currn is true if the lookahead character is a newline.
@@ -553,14 +558,62 @@ static int case_fold;
/* End-of-line byte in data. */
static unsigned char eolbyte;
+/* Cache of char-context values. */
+static int sbit[NOTCHAR];
+
+/* Set of characters considered letters. */
+static charclass letters;
+
+/* Set of characters that are newline. */
+static charclass newline;
+
+/* Add this to the test for whether a byte is word-constituent, since on
+ BSD-based systems, many values in the 128..255 range are classified as
+ alphabetic, while on glibc-based systems, they are not. */
+#ifdef __GLIBC__
+# define is_valid_unibyte_character(c) 1
+#else
+# define is_valid_unibyte_character(c) (MBS_SUPPORT && btowc (c) != WEOF)
+#endif
+
+/* Return non-zero if C is a `word-constituent' byte; zero otherwise. */
+#define IS_WORD_CONSTITUENT(C) \
+ (is_valid_unibyte_character(C) && (isalnum(C) || (C) == '_'))
+
+static int
+char_context(unsigned char c)
+{
+ if (c == eolbyte || c == 0)
+ return CTX_NEWLINE;
+ if (IS_WORD_CONSTITUENT (c))
+ return CTX_LETTER;
+ return CTX_NONE;
+}
+
/* Entry point to set syntax options. */
void
dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
{
+ unsigned int i;
+
syntax_bits_set = 1;
syntax_bits = bits;
case_fold = fold;
eolbyte = eol;
+
+ for (i = 0; i < NOTCHAR; ++i)
+ {
+ sbit[i] = char_context (i);
+ switch (sbit[i])
+ {
+ case CTX_LETTER:
+ setbit(i, letters);
+ break;
+ case CTX_NEWLINE:
+ setbit(i, newline);
+ break;
+ }
+ }
}
/* Set a bit in the charclass for the given wchar_t. Do nothing if WC
@@ -1073,19 +1126,6 @@ parse_bracket_exp (void)
return CSET + charclass_index(ccl);
}
-/* Add this to the test for whether a byte is word-constituent, since on
- BSD-based systems, many values in the 128..255 range are classified as
- alphabetic, while on glibc-based systems, they are not. */
-#ifdef __GLIBC__
-# define is_valid_unibyte_character(c) 1
-#else
-# define is_valid_unibyte_character(c) (MBS_SUPPORT && btowc (c) != WEOF)
-#endif
-
-/* Return non-zero if C is a `word-constituent' byte; zero otherwise. */
-#define IS_WORD_CONSTITUENT(C) \
- (is_valid_unibyte_character(C) && (isalnum(C) || (C) == '_'))
-
static token
lex (void)
{
@@ -2364,8 +2404,6 @@ dfastate (int s, struct dfa *d, int trans[])
int intersectf; /* True if intersect is nonempty. */
charclass leftovers; /* Stuff in the label that didn't match. */
int leftoversf; /* True if leftovers is nonempty. */
- static charclass letters; /* Set of characters considered letters. */
- static charclass newline; /* Set of characters that are newline. */
position_set follows; /* Union of the follows of some group.
*/
position_set tmp; /* Temporary space for merging sets. */
int state; /* New state. */
@@ -2373,23 +2411,12 @@ dfastate (int s, struct dfa *d, int trans[])
int state_newline; /* New state on a newline transition. */
int wants_letter; /* New state wants to know letter context. */
int state_letter; /* New state on a letter transition. */
- static int initialized; /* Flag for static initialization. */
int next_isnt_1st_byte = 0; /* Flag if we can't add state0. */
int i, j, k;
MALLOC (grps, NOTCHAR);
MALLOC (labels, NOTCHAR);
- /* Initialize the set of letters, if necessary. */
- if (! initialized)
- {
- initialized = 1;
- for (i = 0; i < NOTCHAR; ++i)
- if (IS_WORD_CONSTITUENT(i))
- setbit(i, letters);
- setbit(eolbyte, newline);
- }
-
zeroset(matches);
for (i = 0; i < d->states[s].elems.nelem; ++i)
@@ -2674,13 +2701,13 @@ build_state (int s, struct dfa *d)
d->success[s] = 0;
if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
s, *d))
- d->success[s] |= 4;
+ d->success[s] |= CTX_NEWLINE;
if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
s, *d))
- d->success[s] |= 2;
+ d->success[s] |= CTX_LETTER;
if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
s, *d))
- d->success[s] |= 1;
+ d->success[s] |= CTX_NONE;
MALLOC(trans, NOTCHAR);
dfastate(s, d, trans);
@@ -3228,18 +3255,6 @@ dfaexec (struct dfa *d, char const *begin, char *end,
into a register. */
unsigned char eol = eolbyte; /* Likewise for eolbyte. */
unsigned char saved_end;
- static int sbit[NOTCHAR]; /* Table for anding with d->success. */
- static int sbit_init;
-
- if (! sbit_init)
- {
- unsigned int i;
-
- sbit_init = 1;
- for (i = 0; i < NOTCHAR; ++i)
- sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
- sbit[eol] = 4;
- }
if (! d->tralloc)
build_state_zero(d);
--
1.7.7.1
- [RFC/RFT PATCH 00/11] Differentiate ^/$ from \` and \' in grep -z mode, Paolo Bonzini, 2012/01/04
- [PATCH 01/11] dfa: fix corner case with anchors, Paolo Bonzini, 2012/01/04
- [PATCH 02/11] dfa: introduce contexts for the values in d->success,
Paolo Bonzini <=
- [PATCH 04/11] dfa: refactor common context computations, Paolo Bonzini, 2012/01/04
- [PATCH 03/11] dfa: change newline/letter to a single context value, Paolo Bonzini, 2012/01/04
- [PATCH 05/11] dfa: change meaning of a state context, Paolo Bonzini, 2012/01/04
- [PATCH 06/11] dfa: remove useless check, Paolo Bonzini, 2012/01/04
- [PATCH 07/11] dfa: make repetitive code *really* repetitive, Paolo Bonzini, 2012/01/04
- [PATCH 08/11] dfa: remove redundant line constraints, Paolo Bonzini, 2012/01/04
- [PATCH 11/11] dfa: introduce BEGBUF/ENDBUF, Paolo Bonzini, 2012/01/04
- [PATCH 09/11] dfa: rename "newline" to "buffer delimiter", Paolo Bonzini, 2012/01/04
- [PATCH 10/11] dfa: introduce bufdelim context, Paolo Bonzini, 2012/01/04
- Re: [RFC/RFT PATCH 00/11] Differentiate ^/$ from \` and \' in grep -z mode, Jim Meyering, 2012/01/04