bug-grep
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 3/3] dfa: optimize UTF-8 period


From: Paolo Bonzini
Subject: [PATCH 3/3] dfa: optimize UTF-8 period
Date: Sat, 17 Apr 2010 03:34:54 +0200

* NEWS: Document improvement.
* src/dfa.c (struct dfa): Add utf8_anychar_classes.
(add_utf8_anychar): New.
(atom): Simplify if/else nesting.  Call add_utf8_anychar for ANYCHAR
in UTF-8 locales.
(dfaoptimize): Abort on ANYCHAR.
---
 NEWS      |    6 ++++++
 src/dfa.c |   46 +++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/NEWS b/NEWS
index bcca373..80074a7 100644
--- a/NEWS
+++ b/NEWS
@@ -9,6 +9,12 @@ GNU grep NEWS                                    -*- outline 
-*-
 
   X{0,0} is implemented correctly.  It used to be a synonym of X{0,1}.
   [bug present since "the beginning"]
+    
+  In UTF-8 locales, regular expressions including "." can be orders
+  of magnitude faster.  For example, "grep ." is now twice as fast
+  as "grep -v ^$", instead of being immensely slower.  It remains
+  slow in other multibyte locales. [bug present since multi-byte
+  character set support was introduced in 2.5.2]
 
 * Noteworthy changes in release 2.6.3 (2010-04-02) [stable]
 
diff --git a/src/dfa.c b/src/dfa.c
index 9b67240..6e57fb0 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -311,6 +311,7 @@ struct dfa
                                    with dfaparse(). */
 #if MBS_SUPPORT
   unsigned int mb_cur_max;     /* Cached value of MB_CUR_MAX.  */
+  int utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales.  */
 
   /* The following are used only if MB_CUR_MAX > 1.  */
 
@@ -1475,6 +1476,35 @@ addtok_wc (wint_t wc)
 }
 #endif
 
+static void
+add_utf8_anychar (void)
+{
+  static charclass utf8_classes[5] = {
+      {  0,  0,  0,  0, ~0, ~0, 0, 0 },
+      { ~0, ~0, ~0, ~0, 0, 0, 0, 0 },
+      {  0,  0,  0,  0,  0,  0, 0xfffffffcU, 0 },
+      {  0,  0,  0,  0,  0,  0,  0, 0xffff },
+      {  0,  0,  0,  0,  0,  0,  0, 0xff0000 }
+  };
+  const int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
+  int i;
+
+  if (dfa->utf8_anychar_classes[0] == 0)
+    for (i = 0; i < n; i++)
+      dfa->utf8_anychar_classes[i] = CSET + charclass_index(utf8_classes[i]);
+
+  /* (B|CA|DAA|EAAA) = (B|(C|(D|EA)A)A) =
+     B (C (D (E A CAT) OR A CAT) OR A CAT) OR */
+  for (i = 1; i < n; i++)
+    addtok (dfa->utf8_anychar_classes[i]);
+  while (--i > 1)
+    {
+      addtok (dfa->utf8_anychar_classes[0]);
+      addtok (CAT);
+      addtok (OR);
+    }
+}
+
 /* The grammar understood by the parser is as follows.
 
    regexp:
@@ -1513,8 +1543,11 @@ addtok_wc (wint_t wc)
 static void
 atom (void)
 {
+  if (0)
+    ;
+
 #if MBS_SUPPORT
-  if (tok == WCHAR)
+  else if (tok == WCHAR)
     {
       addtok_wc (case_fold ? towlower(wctok) : wctok);
 #ifndef GREP
@@ -1526,11 +1559,16 @@ atom (void)
 #endif
 
       tok = lex();
-      return;
+    }
+
+  else if (tok == ANYCHAR && using_utf8())
+    {
+      add_utf8_anychar();
+      tok = lex();
     }
 #endif /* MBS_SUPPORT  */
 
-  if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
+  else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
       || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
 #if MBS_SUPPORT
       || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
@@ -3307,6 +3345,8 @@ dfaoptimize (struct dfa *d)
       switch(d->tokens[i])
         {
         case ANYCHAR:
+          /* Lowered.  */
+          abort ();
         case MBCSET:
           /* Requires multi-byte algorithm.  */
           return;
-- 
1.6.6.1





reply via email to

[Prev in Thread] Current Thread [Next in Thread]