grep-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Grep-devel] proposed new function for dfa


From: Arnold Robbins
Subject: Re: [Grep-devel] proposed new function for dfa
Date: Tue, 29 Nov 2016 20:40:20 +0200
User-agent: Heirloom mailx 12.5 6/20/10

Hi All.

Below is what I've pushed into gawk's master. The various bits
copied by dfacopysyntax (i.e., the settings that dfasyntax computes)
don't include any pointers; I looked at all of them.

If y'all want to take this, great! Or eventually do something else
similar, also OK. In the meantime, I'm going ahead with this. :-)

Thanks,

Arnold

P.S. I previously pushed the patch that removes DFA_CASE_FOLD, too. :-)
------------------------------------------------
diff --git a/ChangeLog b/ChangeLog
index 051e83e..b2f0e8c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,9 +1,22 @@
 2016-11-29         Arnold D. Robbins     <address@hidden>
 
+       Remove redundant flag from dfa:
+
        * dfa.c (dfasyntax): Use RE_ICASE instead of DFA_CASE_FOLD.
        * dfa.h (DFA_CASE_FOLD): Removed.
        * re.c (make_regexp): Use RE_ICASE for regex and dfa. Yay!
 
+       Unrelated: Don't have to recompute syntax stuff every time
+       we compile a regexp.
+
+       * dfa.c (dfacopysyntax): New function.
+       (dfaalloc): Zero out the newly allocated memory.
+       * dfa.h (dfacopysyntax): Declare it.
+       * re.c (make_regexp): Declare two static dfaregs, one for use
+       with ignorecase and one without. Compute the syntax once for each,
+       then use dfacopysyntax to copy the settings when compiling
+       a regexp.
+
 2016-11-28         Arnold D. Robbins     <address@hidden>
 
        Make gawk compile on HP-UX 11.33.
diff --git a/dfa.c b/dfa.c
index cd7dce6..0a23105 100644
--- a/dfa.c
+++ b/dfa.c
@@ -805,6 +805,23 @@ char_context (struct dfa const *dfa, unsigned char c)
   return CTX_NONE;
 }
 
+/* Copy the syntax settings from one dfa instance to another.
+   Saves considerable computation time if compiling many regular expressions
+   based on the same setting.  */
+void
+dfacopysyntax (struct dfa *to, const struct dfa *from)
+{
+  to->dfaexec = from->dfaexec;
+  to->simple_locale = from->simple_locale;
+  to->localeinfo = from->localeinfo;
+
+  to->fast = from->fast;
+
+  to->canychar = from->canychar;
+  to->lex.cur_mb_len = from->lex.cur_mb_len;
+  to->syntax = from->syntax;
+}
+
 /* Set a bit in the charclass for the given wchar_t.  Do nothing if WC
    is represented by a multi-byte sequence.  Even for MB_CUR_MAX == 1,
    this may happen when folding case in weird Turkish locales where
@@ -3999,7 +4016,12 @@ dfamustfree (struct dfamust *dm)
 struct dfa *
 dfaalloc (void)
 {
-  return xmalloc (sizeof (struct dfa));
+  void *p = xmalloc (sizeof (struct dfa));
+  if (p)
+    {
+      memset (p, 0, sizeof (struct dfa));
+    }
+  return p;
 }
 
 /* Initialize DFA.  */
diff --git a/dfa.h b/dfa.h
index 0fd9b2c..c68b4df 100644
--- a/dfa.h
+++ b/dfa.h
@@ -110,6 +110,11 @@ extern struct dfa *dfasuperset (struct dfa const *d) _GL_ATTRIBUTE_PURE;
 /* The DFA is likely to be fast.  */
 extern bool dfaisfast (struct dfa const *) _GL_ATTRIBUTE_PURE;
 
+/* Copy the syntax settings from one dfa instance to another.
+   Saves considerable computation time if compiling many regular expressions
+   based on the same setting.  */
+extern void dfacopysyntax (struct dfa *to, const struct dfa *from);
+
 /* Free the storage held by the components of a struct dfa. */
 extern void dfafree (struct dfa *);
 
diff --git a/re.c b/re.c
index 6c1e360..5be3d17 100644
--- a/re.c
+++ b/re.c
@@ -49,8 +49,8 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
        int c, c2;
        static bool first = true;
        static bool no_dfa = false;
-       reg_syntax_t dfa_syn;
        int i;
+       static struct dfa* dfaregs[2] = { NULL, NULL };
 
        /*
         * The number of bytes in the current multibyte character.
@@ -62,9 +62,9 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
        memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize.  */
 
        if (first) {
-               first = false;
                /* for debugging and testing */
                no_dfa = (getenv("GAWK_NO_DFA") != NULL);
+               /* don't set first to false here, we do it below */
        }
 
        /* always check */
@@ -202,9 +202,14 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
                syn &= ~RE_ICASE;
        }
 
-       dfa_syn = syn;
-       if (ignorecase)
-               dfa_syn |= RE_ICASE;
+       /* initialize dfas to hold syntax */
+       if (first) {
+               first = false;
+               dfaregs[0] = dfaalloc();
+               dfaregs[1] = dfaalloc();
+               dfasyntax(dfaregs[0], & localeinfo, syn, DFA_ANCHOR);
+               dfasyntax(dfaregs[1], & localeinfo, syn | RE_ICASE, DFA_ANCHOR);
+       }
 
        re_set_syntax(syn);
 
@@ -222,7 +227,7 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
        rp->pat.newline_anchor = false; /* don't get \n in middle of string */
        if (dfa && ! no_dfa) {
                rp->dfareg = dfaalloc();
-               dfasyntax(rp->dfareg, & localeinfo, dfa_syn, DFA_ANCHOR);
+               dfacopysyntax(rp->dfareg, dfaregs[ignorecase]);
                dfacomp(buf, len, rp->dfareg, true);
        } else
                rp->dfareg = NULL;



reply via email to

[Prev in Thread] Current Thread [Next in Thread]