grep branch, master, updated. v2.22-17-gf6603c4

grep-commit
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
grep branch, master, updated. v2.22-17-gf6603c4

From:	Paul Eggert
Subject:	grep branch, master, updated. v2.22-17-gf6603c4
Date:	Wed, 06 Jan 2016 08:26:54 +0000
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".

The branch, master has been updated
       via  f6603c4e1e04dbb87a7232c4b44acc6afdf65fef (commit)
      from  71c206b5042a11c976c25a9f77aff04ebb29fcd9 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=f6603c4e1e04dbb87a7232c4b44acc6afdf65fef


commit f6603c4e1e04dbb87a7232c4b44acc6afdf65fef
Author: Paul Eggert <address@hidden>
Date:   Wed Jan 6 00:26:26 2016 -0800

    grep: restore -P PCRE_NO_UTF8_CHECK optimization
    
    On my platform in the en_US.utf8 locale, this makes 'grep -P "z.*a" k'
    220x faster, where k is created by the shell command:
    yes 'abcdefg hijklmn opqrstu vwxyz' | head -n 10000000 >k
    * src/dfasearch.c (EGexecute):
    * src/grep.c (execute_fp_t):
    * src/kwsearch.c (Fexecute):
    * src/pcresearch.c (Pexecute):
    First arg is now char *, not char const *, since Pexecute now
    temporarily modifies this argument.
    * src/grep.c, src/grep.h (buf_has_encoding_errors): Now extern.
    * src/pcresearch.c (Pexecute): Use it.  If the input is free of
    encoding errors, use a multiline search and the PCRE_NO_UTF8_CHECK
    option, as this is typically way faster.  This restores an
    optimization that was removed with the recent changes for binary
    file detection.

diff --git a/src/dfasearch.c b/src/dfasearch.c
index 0205011..a330eac 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -202,7 +202,7 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
 }
 
 size_t
-EGexecute (char const *buf, size_t size, size_t *match_size,
+EGexecute (char *buf, size_t size, size_t *match_size,
            char const *start_ptr)
 {
   char const *buflim, *beg, *end, *ptr, *match, *best_match, *mb_start;
diff --git a/src/grep.c b/src/grep.c
index f6fb0bc..10aabf9 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -462,7 +462,7 @@ enum { SEEK_HOLE = SEEK_SET };
 
 /* Functions we'll use to search. */
 typedef void (*compile_fp_t) (char const *, size_t);
-typedef size_t (*execute_fp_t) (char const *, size_t, size_t *, char const *);
+typedef size_t (*execute_fp_t) (char *, size_t, size_t *, char const *);
 static compile_fp_t compile;
 static execute_fp_t execute;
 
@@ -561,7 +561,7 @@ skip_easy_bytes (char const *buf)
 /* Return true if BUF, of size SIZE, has an encoding error.
    BUF must be followed by at least sizeof (uword) bytes,
    the first of which may be modified.  */
-static bool
+bool
 buf_has_encoding_errors (char *buf, size_t size)
 {
   if (! unibyte_mask)
diff --git a/src/grep.h b/src/grep.h
index 577fb72..75b7ef7 100644
--- a/src/grep.h
+++ b/src/grep.h
@@ -29,4 +29,6 @@ extern bool match_words;      /* -w */
 extern bool match_lines;       /* -x */
 extern char eolbyte;           /* -z */
 
+extern bool buf_has_encoding_errors (char *, size_t);
+
 #endif
diff --git a/src/kwsearch.c b/src/kwsearch.c
index e33caaf..e9966d4 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -78,7 +78,7 @@ Fcompile (char const *pattern, size_t size)
 }
 
 size_t
-Fexecute (char const *buf, size_t size, size_t *match_size,
+Fexecute (char *buf, size_t size, size_t *match_size,
           char const *start_ptr)
 {
   char const *beg, *try, *end, *mb_start;
diff --git a/src/pcresearch.c b/src/pcresearch.c
index a647514..8f3d935 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -174,7 +174,7 @@ Pcompile (char const *pattern, size_t size)
 }
 
 size_t
-Pexecute (char const *buf, size_t size, size_t *match_size,
+Pexecute (char *buf, size_t size, size_t *match_size,
           char const *start_ptr)
 {
 #if !HAVE_LIBPCRE
@@ -194,13 +194,31 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
      error.  */
   char const *subject = buf;
 
+  /* If the input is free of encoding errors a multiline search is
+     typically more efficient.  Otherwise, a single-line search is
+     typically faster, so that pcre_exec doesn't waste time validating
+     the entire input buffer.  */
+  bool multiline = ! buf_has_encoding_errors (buf, size - 1);
+  buf[size - 1] = eolbyte;
+
   for (; p < buf + size; p = line_start = line_end + 1)
     {
-      /* A single-line search is typically faster, so that
-         pcre_exec doesn't waste time validating the entire input
-         buffer.  */
-      line_end = memchr (p, eolbyte, buf + size - p);
-      if (INT_MAX < line_end - p)
+      bool too_big;
+
+      if (multiline)
+        {
+          size_t pcre_size_max = MIN (INT_MAX, SIZE_MAX - 1);
+          size_t scan_size = MIN (pcre_size_max + 1, buf + size - p);
+          line_end = memrchr (p, eolbyte, scan_size);
+          too_big = ! line_end;
+        }
+      else
+        {
+          line_end = memchr (p, eolbyte, buf + size - p);
+          too_big = INT_MAX < line_end - p;
+        }
+
+      if (too_big)
         error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
 
       for (;;)
@@ -228,11 +246,27 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
           int options = 0;
           if (!bol)
             options |= PCRE_NOTBOL;
+          if (multiline)
+            options |= PCRE_NO_UTF8_CHECK;
 
           e = jit_exec (subject, line_end - subject, search_offset,
                         options, sub);
           if (e != PCRE_ERROR_BADUTF8)
-            break;
+            {
+              if (0 < e && multiline && sub[1] - sub[0] != 0)
+                {
+                  char const *nl = memchr (subject + sub[0], eolbyte,
+                                           sub[1] - sub[0]);
+                  if (nl)
+                    {
+                      /* This match crosses a line boundary; reject it.  */
+                      p = subject + sub[0];
+                      line_end = nl;
+                      continue;
+                    }
+                }
+              break;
+            }
           int valid_bytes = sub[0];
 
           /* Try to match the string before the encoding error.  */
@@ -304,6 +338,15 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
           beg = matchbeg;
           end = matchend;
         }
+      else if (multiline)
+        {
+          char const *prev_nl = memrchr (line_start - 1, eolbyte,
+                                         matchbeg - (line_start - 1));
+          char const *next_nl = memchr (matchend, eolbyte,
+                                        line_end + 1 - matchend);
+          beg = prev_nl + 1;
+          end = next_nl + 1;
+        }
       else
         {
           beg = line_start;

-----------------------------------------------------------------------

Summary of changes:
 src/dfasearch.c  |    2 +-
 src/grep.c       |    4 +-
 src/grep.h       |    2 +
 src/kwsearch.c   |    2 +-
 src/pcresearch.c |   57 +++++++++++++++++++++++++++++++++++++++++++++++------
 5 files changed, 56 insertions(+), 11 deletions(-)


hooks/post-receive
-- 
grep
[Prev in Thread]
Current Thread
[Next in Thread]
grep branch, master, updated. v2.22-17-gf6603c4, Paul Eggert <=
Prev by Date: grep branch, master, updated. v2.22-16-g71c206b
Next by Date: grep branch, master, updated. v2.22-18-g4f04b82
Previous by thread: grep branch, master, updated. v2.22-16-g71c206b
Next by thread: grep branch, master, updated. v2.22-18-g4f04b82
Index(es):
- Date
- Thread