bug-grep
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 1/2] dfa: convert to wide character line-by-line


From: Paolo Bonzini
Subject: [PATCH 1/2] dfa: convert to wide character line-by-line
Date: Tue, 4 May 2010 17:37:43 +0200

This provides a nice speedup for -m in general, but especially
it avoids quadratic complexity in case we have to go to glibc.

* NEWS: Document change.
* src/dfa.c (prepare_wc_buf): Extract out of dfaexec.  Convert
only up to the next newline.
(dfaexec): Exit multibyte processing loop if past buf_end.
Call prepare_wc_buf again after processing a newline.
---
 NEWS      |    5 +++
 src/dfa.c |   97 ++++++++++++++++++++++++++++++++++++------------------------
 2 files changed, 63 insertions(+), 39 deletions(-)

diff --git a/NEWS b/NEWS
index 1f3d00a..e64ec40 100644
--- a/NEWS
+++ b/NEWS
@@ -10,6 +10,11 @@ GNU grep NEWS                                    -*- outline 
-*-
   X{0,0} is implemented correctly.  It used to be a synonym of X{0,1}.
   [bug present since "the beginning"]
 
+  In multibyte locales, regular expressions including backreferences
+  no longer exhibit quadratic complexity (i.e., they are orders
+  of magnitude faster). [bug present since multi-byte character set
+  support was introduced in 2.5.2]
+
   In UTF-8 locales, regular expressions including "." can be orders
   of magnitude faster.  For example, "grep ." is now twice as fast
   as "grep -v ^$", instead of being immensely slower.  It remains
diff --git a/src/dfa.c b/src/dfa.c
index 340a4c6..44efc02 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3105,6 +3105,51 @@ transit_state (struct dfa *d, int s, unsigned char const 
**pp)
 
 #endif /* MBS_SUPPORT */
 
+/* Initialize mblen_buf and inputwcs with data from the next line.  */
+
+static void
+prepare_wc_buf (const char *begin, const char *end)
+{
+  unsigned char eol = eolbyte;
+  size_t remain_bytes, i;
+
+  buf_begin = (unsigned char *) begin;
+
+  remain_bytes = 0;
+  for (i = 0; i < end - begin + 1; i++)
+    {
+      if (remain_bytes == 0)
+        {
+          remain_bytes
+            = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
+          if (remain_bytes < 1
+              || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
+            {
+              remain_bytes = 0;
+              inputwcs[i] = (wchar_t)begin[i];
+              mblen_buf[i] = 0;
+              if (begin[i] == eol)
+                break;
+            }
+          else
+            {
+              mblen_buf[i] = remain_bytes;
+              remain_bytes--;
+            }
+        }
+      else
+        {
+          mblen_buf[i] = remain_bytes;
+          inputwcs[i] = 0;
+          remain_bytes--;
+        }
+    }
+
+  buf_end = (unsigned char *) (begin + i);
+  mblen_buf[i] = 0;
+  inputwcs[i] = 0; /* sentinel */
+}
+
 /* Search through a buffer looking for a match to the given struct dfa.
    Find the first occurrence of a string matching the regexp in the
    buffer, and the shortest possible version thereof.  Return a pointer to
@@ -3151,44 +3196,10 @@ dfaexec (struct dfa *d, char const *begin, char *end,
 #if MBS_SUPPORT
   if (d->mb_cur_max > 1)
     {
-      unsigned int i;
-      int remain_bytes;
-      buf_begin = (unsigned char *) begin;
-      buf_end = (unsigned char *) end;
-
-      /* initialize mblen_buf, and inputwcs.  */
       MALLOC(mblen_buf, unsigned char, end - begin + 2);
       MALLOC(inputwcs, wchar_t, end - begin + 2);
-      memset(&mbs, 0, sizeof mbs);
-      remain_bytes = 0;
-      for (i = 0; i < end - begin + 1; i++)
-        {
-          if (remain_bytes == 0)
-            {
-              remain_bytes
-                = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
-              if (remain_bytes < 1
-                || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
-                {
-                  remain_bytes = 0;
-                  inputwcs[i] = (wchar_t)begin[i];
-                  mblen_buf[i] = 0;
-                }
-              else
-                {
-                  mblen_buf[i] = remain_bytes;
-                  remain_bytes--;
-                }
-            }
-          else
-            {
-              mblen_buf[i] = remain_bytes;
-              inputwcs[i] = 0;
-              remain_bytes--;
-            }
-        }
-      mblen_buf[i] = 0;
-      inputwcs[i] = 0; /* sentinel */
+      memset(&mbs, 0, sizeof(mbstate_t));
+      prepare_wc_buf (p, end);
     }
 #endif /* MBS_SUPPORT */
 
@@ -3198,7 +3209,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
       if (d->mb_cur_max > 1)
         while ((t = trans[s]))
           {
-            if ((char *) p > end)
+            if (p > buf_end)
               break;
             s1 = s;
             SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
@@ -3258,8 +3269,16 @@ dfaexec (struct dfa *d, char const *begin, char *end,
         }
 
       /* If the previous character was a newline, count it. */
-      if (count && (char *) p <= end && p[-1] == eol)
-        ++*count;
+      if ((char *) p <= end && p[-1] == eol)
+        {
+          if (count)
+            ++*count;
+
+#if MBS_SUPPORT
+          if (d->mb_cur_max > 1)
+            prepare_wc_buf (p, end);
+#endif
+        }
 
       /* Check if we've run off the end of the buffer. */
       if ((char *) p > end)
-- 
1.6.6.1





reply via email to

[Prev in Thread] Current Thread [Next in Thread]