From 763af945ee2365ed57e4d0fdf1cf47f25db1a6ec Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <address@hidden>
Date: Mon, 17 Nov 2014 08:26:53 +0900
Subject: [PATCH] dfa: speed up handling of long pattern

DFA tries to find a long sequence of characters that must appear
in any matching line.  However, when a pattern is long (length N),
it is very slow, because it makes O(N^2) strstr calls.
This change reduces that to O(N) by processing each sequence of
adjacent "regular" characters as a group.

* src/dfa.c (dfamust): Process a string concatenated normal
characters at a time.
* NEWS (Improvement): Mention it.
Prompted by a bug report and patch by Ivan Yanikov
in http://bugs.gnu.org/15191#5

Compare the run times of this command before and after this change:
(on a i7-4770S CPU @ 3.10GHz using rawhide (~fedora 22) and compiled
with gcc 6.0.0 20150627)
  env time -f %e grep -f <(seq -s '' 9999 ) <<<1
  Before: 0.85
   After: 0.02
---
 NEWS      |  2 ++
 src/dfa.c | 53 +++++++++++++++++++++++++++++++++++++++--------------
 2 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/NEWS b/NEWS
index 35c4aad..fb398c4 100644
--- a/NEWS
+++ b/NEWS
@@ -22,6 +22,8 @@ GNU grep NEWS                                    -*- outline -*-
   'grep -D skip PATTERN FILE' no longer hangs if FILE is a fifo.
   [bug introduced in grep-2.12]

+  Performance has improved for patterns containing very long strings.
+

 * Noteworthy changes in release 2.21 (2014-11-23) [stable]

diff --git a/src/dfa.c b/src/dfa.c
index 2f82282..381676d 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3967,13 +3967,13 @@ struct must
 };

 static must *
-allocmust (must *mp)
+allocmust (must *mp, size_t size)
 {
   must *new_mp = xmalloc (sizeof *new_mp);
   new_mp->in = xzalloc (sizeof *new_mp->in);
-  new_mp->left = xzalloc (2);
-  new_mp->right = xzalloc (2);
-  new_mp->is = xzalloc (2);
+  new_mp->left = xzalloc (size);
+  new_mp->right = xzalloc (size);
+  new_mp->is = xzalloc (size);
   new_mp->begline = false;
   new_mp->endline = false;
   new_mp->prev = mp;
@@ -4006,23 +4006,22 @@ dfamust (struct dfa const *d)
 {
   must *mp = NULL;
   char const *result = "";
-  size_t ri;
   size_t i;
   bool exact = false;
   bool begline = false;
   bool endline = false;

-  for (ri = 0; ri < d->tindex; ++ri)
+  for (size_t ri = 0; ri < d->tindex; ++ri)
     {
       token t = d->tokens[ri];
       switch (t)
         {
         case BEGLINE:
-          mp = allocmust (mp);
+          mp = allocmust (mp, 2);
           mp->begline = true;
           break;
         case ENDLINE:
-          mp = allocmust (mp);
+          mp = allocmust (mp, 2);
           mp->endline = true;
           break;
         case LPAREN:
@@ -4037,7 +4036,7 @@ dfamust (struct dfa const *d)
         case BACKREF:
         case ANYCHAR:
         case MBCSET:
-          mp = allocmust (mp);
+          mp = allocmust (mp, 2);
           break;

         case STAR:
@@ -4154,7 +4153,6 @@ dfamust (struct dfa const *d)
           goto done;

         default:
-          mp = allocmust (mp);
           if (CSET <= t)
             {
               /* If T is a singleton, or if case-folding in a unibyte
@@ -4167,7 +4165,10 @@ dfamust (struct dfa const *d)
                 if (tstbit (j, *ccl))
                   break;
               if (! (j < NOTCHAR))
-                break;
+                {
+                  mp = allocmust (mp, 2);
+                  break;
+                }
               t = j;
               while (++j < NOTCHAR)
                 if (tstbit (j, *ccl)
@@ -4175,12 +4176,36 @@ dfamust (struct dfa const *d)
                           && toupper (j) == toupper (t)))
                   break;
               if (j < NOTCHAR)
-                break;
+                {
+                  mp = allocmust (mp, 2);
+                  break;
+                }
             }
+
+          size_t rj = ri + 2;
+          if (d->tokens[ri + 1] == CAT)
+            {
+              for (; rj < d->tindex - 1; rj += 2)
+                {
+                  if ((rj != ri && (d->tokens[rj] <= 0
+                                    || NOTCHAR <= d->tokens[rj]))
+                      || d->tokens[rj + 1] != CAT)
+                    break;
+                }
+            }
+          mp = allocmust (mp, ((rj - ri) >> 1) + 1);
           mp->is[0] = mp->left[0] = mp->right[0]
             = case_fold && MB_CUR_MAX == 1 ? toupper (t) : t;
-          mp->is[1] = mp->left[1] = mp->right[1] = '\0';
-          mp->in = enlist (mp->in, mp->is, 1);
+
+          for (i = 1; ri + 2 < rj; i++)
+            {
+              ri += 2;
+              t = d->tokens[ri];
+              mp->is[i] = mp->left[i] = mp->right[i]
+                = case_fold && MB_CUR_MAX == 1 ? toupper (t) : t;
+            }
+          mp->is[i] = mp->left[i] = mp->right[i] = '\0';
+          mp->in = enlist (mp->in, mp->is, i - 1);
           break;
         }
     }
-- 
2.3.7