grep-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

grep branch, master, updated. v2.6.3-51-g7a0ad00


From: Paolo Bonzini
Subject: grep branch, master, updated. v2.6.3-51-g7a0ad00
Date: Mon, 19 Apr 2010 19:56:20 +0000

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".

The branch, master has been updated
       via  7a0ad00f634237b753572378289d76fa8f1c5942 (commit)
      from  0f4afc66be2e4c48d1f7b6b2c02122fbe5222921 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=7a0ad00f634237b753572378289d76fa8f1c5942


commit 7a0ad00f634237b753572378289d76fa8f1c5942
Author: Paolo Bonzini <address@hidden>
Date:   Mon Apr 19 14:50:23 2010 +0200

    dfa: optimize UTF-8 period
    
    * NEWS: Document improvement.
    * src/dfa.c (struct dfa): Add utf8_anychar_classes.
    (add_utf8_anychar): New.
    (atom): Simplify if/else nesting.  Call add_utf8_anychar for ANYCHAR
    in UTF-8 locales.
    (dfaoptimize): Abort on ANYCHAR.

diff --git a/NEWS b/NEWS
index bcca373..1bf9011 100644
--- a/NEWS
+++ b/NEWS
@@ -10,6 +10,12 @@ GNU grep NEWS                                    -*- outline 
-*-
   X{0,0} is implemented correctly.  It used to be a synonym of X{0,1}.
   [bug present since "the beginning"]
 
+  In UTF-8 locales, regular expressions including "." can be orders
+  of magnitude faster.  For example, "grep ." is now twice as fast
+  as "grep -v ^$", instead of being immensely slower.  It remains
+  slow in other multibyte locales. [bug present since multi-byte
+  character set support was introduced in 2.5.2]
+
 * Noteworthy changes in release 2.6.3 (2010-04-02) [stable]
 
 ** Bug fixes
diff --git a/src/dfa.c b/src/dfa.c
index fe093a8..34be712 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -308,6 +308,7 @@ struct dfa
                                    with dfaparse(). */
 #if MBS_SUPPORT
   unsigned int mb_cur_max;     /* Cached value of MB_CUR_MAX.  */
+  int utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales.  */
 
   /* The following are used only if MB_CUR_MAX > 1.  */
 
@@ -1470,6 +1471,44 @@ addtok_wc (wint_t wc)
 }
 #endif
 
+static void
+add_utf8_anychar (void)
+{
+  static const charclass utf8_classes[5] = {
+      {  0,  0,  0,  0, ~0, ~0, 0, 0 },            /* 80-bf: non-lead bytes */
+      { ~0, ~0, ~0, ~0, 0, 0, 0, 0 },              /* 00-7f: 1-byte sequence */
+      {  0,  0,  0,  0,  0,  0, 0xfffffffcU, 0 },  /* c2-df: 2-byte sequence */
+      {  0,  0,  0,  0,  0,  0,  0, 0xffff },      /* e0-ef: 3-byte sequence */
+      {  0,  0,  0,  0,  0,  0,  0, 0xff0000 }     /* f0-f7: 4-byte sequence */
+  };
+  const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
+  unsigned int i;
+
+  /* Define the five character classes that are needed below.  */
+  if (dfa->utf8_anychar_classes[0] == 0)
+    for (i = 0; i < n; i++)
+      dfa->utf8_anychar_classes[i] = CSET + charclass_index(utf8_classes[i]);
+
+  /* A valid UTF-8 character is
+
+          ([0x00-0x7f]
+           |[0xc2-0xdf][0x80-0xbf]
+           |[0xe0-0xef[0x80-0xbf][0x80-0xbf]
+           |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
+
+     which I'll write more concisely "B|CA|DAA|EAAA".  Factor the [0x00-0x7f]
+     and you get "B|(C|(D|EA)A)A".  And since the token buffer is in reverse
+     Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR".  */
+  for (i = 1; i < n; i++)
+    addtok (dfa->utf8_anychar_classes[i]);
+  while (--i > 1)
+    {
+      addtok (dfa->utf8_anychar_classes[0]);
+      addtok (CAT);
+      addtok (OR);
+    }
+}
+
 /* The grammar understood by the parser is as follows.
 
    regexp:
@@ -1508,8 +1547,12 @@ addtok_wc (wint_t wc)
 static void
 atom (void)
 {
+  if (0)
+    {
+      /* empty */
+    }
 #if MBS_SUPPORT
-  if (tok == WCHAR)
+  else if (tok == WCHAR)
     {
       addtok_wc (case_fold ? towlower(wctok) : wctok);
 #ifndef GREP
@@ -1521,16 +1564,28 @@ atom (void)
 #endif
 
       tok = lex();
-      return;
+    }
+
+  else if (tok == ANYCHAR && using_utf8())
+    {
+      /* For UTF-8 expand the period to a series of CSETs that define a valid
+        UTF-8 character.  This avoids using the slow multibyte path.  I'm
+        pretty sure it would be both profitable and correct to do it for
+        any encoding; however, the optimization must be done manually as
+        it is done above in add_utf8_anychar.  So, let's start with
+        UTF-8: it is the most used, and the structure of the encoding
+        makes the correctness more obvious.  */
+      add_utf8_anychar();
+      tok = lex();
     }
 #endif /* MBS_SUPPORT  */
 
-  if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
-      || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
+  else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
+                  || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
 #if MBS_SUPPORT
-      || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
+          || tok == ANYCHAR || tok == MBCSET
 #endif /* MBS_SUPPORT */
-      || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
+          || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
     {
       addtok(tok);
       tok = lex();
@@ -3297,6 +3352,8 @@ dfaoptimize (struct dfa *d)
       switch(d->tokens[i])
         {
         case ANYCHAR:
+          /* Lowered.  */
+          abort ();
         case MBCSET:
           /* Requires multi-byte algorithm.  */
           return;

-----------------------------------------------------------------------

Summary of changes:
 NEWS      |    6 +++++
 src/dfa.c |   69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 69 insertions(+), 6 deletions(-)


hooks/post-receive
-- 
grep




reply via email to

[Prev in Thread] Current Thread [Next in Thread]