grep-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

grep branch, master, updated. v2.16-4-g97318f5


From: Jim Meyering
Subject: grep branch, master, updated. v2.16-4-g97318f5
Date: Fri, 10 Jan 2014 21:13:20 +0000

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".

The branch, master has been updated
       via  97318f5e59a1ef6feb8a378434a00932a3fc1e0b (commit)
      from  c53ed7be03da564fc45836048324ee184f4541f1 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=97318f5e59a1ef6feb8a378434a00932a3fc1e0b


commit 97318f5e59a1ef6feb8a378434a00932a3fc1e0b
Author: Jim Meyering <address@hidden>
Date:   Sun Nov 24 18:49:31 2013 -0800

    grep: make --ignore-case (-i) faster (sometimes 10x) in multibyte locales
    
    These days, nearly everyone uses a multibyte locale, and grep is often
    used with the --ignore-case (-i) option, but that option imposes a very
    high cost in order to handle some unusual cases in just a few multibyte
    locales.  This change gets most of the performance of using LC_ALL=C
    without eliminating the ability to search for multibyte strings.
    
    With the following example, I see an 11x speed-up with a 2.3GHz i7:
    Generate a 10M-line file, with each line consisting of 40 'j's:
    
        yes jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj | head -10000000 > k
    
    Time searching it for the simple/noexistent string "foobar",
    first with this patch (best-of-5 trials):
    
        LC_ALL=en_US.UTF-8 env time src/grep -i foobar k
            1.10 real         1.03 user         0.07 sys
    
    Back out that commit (temporarily), recompile, and rerun the experiment:
    
        git log -1 -p|patch -R -p1; make
        LC_ALL=en_US.UTF-8 env time src/grep -i foobar k
            12.50 real        12.41 user         0.08 sys
    
    The trick is to realize that for some search strings, it is easy
    to convert to an equivalent one that is handled much more efficiently.
    E.g., convert this command:
    
      grep -i foobar k
    
    to this:
    
      grep '[fF][oO][oO][bB][aA][rR]' k
    
    That allows the matcher to search in buffer mode, rather than having to
    extract/case-convert/search each line separately.  Currently, we perform
    this conversion only when search strings contain neither '\' nor '['.
    See the comments for more detail.
    
    * src/main.c (trivial_case_ignore): New function.
    (main): When possible, transform the regexp so we can drop the -i.
    * tests/turkish-eyes: New file.
    * tests/Makefile.am (TESTS): Use it.
    * NEWS (Improvements): Mention it.

diff --git a/NEWS b/NEWS
index 6859ca0..6e46684 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,11 @@ GNU grep NEWS                                    -*- outline 
-*-
 
 * Noteworthy changes in release ?.? (????-??-??) [?]
 
+** Improvements
+
+  grep -i in a multibyte locale is now typically 10 times faster
+  for patterns that do not contain \ or [.
+
 
 * Noteworthy changes in release 2.16 (2014-01-01) [stable]
 
diff --git a/src/main.c b/src/main.c
index 44090be..bfd0982 100644
--- a/src/main.c
+++ b/src/main.c
@@ -27,6 +27,7 @@
 #include <fcntl.h>
 #include <inttypes.h>
 #include <stdio.h>
+#include <assert.h>
 #include "system.h"
 
 #include "argmatch.h"
@@ -1644,13 +1645,14 @@ if any error occurs and -q is not given, the exit 
status is 2.\n"));
   exit (status);
 }
 
+static char const *matcher;
+
 /* If M is NULL, initialize the matcher to the default.  Otherwise set the
    matcher to M if available.  Exit in case of conflicts or if M is not
    available.  */
 static void
 setmatcher (char const *m)
 {
-  static char const *matcher;
   unsigned int i;
 
   if (!m)
@@ -1865,6 +1867,84 @@ parse_grep_colors (void)
       return;
 }
 
+#define MBRTOWC(pwc, s, n, ps) \
+  (MB_CUR_MAX == 1 ? \
+   (*(pwc) = btowc (*(unsigned char *) (s)), 1) : \
+   mbrtowc ((pwc), (s), (n), (ps)))
+
+#define WCRTOMB(s, wc, ps) \
+  (MB_CUR_MAX == 1 ? \
+   (*(s) = wctob ((wint_t) (wc)), 1) : \
+   wcrtomb ((s), (wc), (ps)))
+
+/* If the newline-separated regular expressions, KEYS (with length, LEN
+   and no trailing NUL byte), are amenable to transformation into
+   otherwise equivalent case-ignoring ones, perform the transformation,
+   put the result into malloc'd memory, *NEW_KEYS with length *NEW_LEN,
+   and return true.  Otherwise, return false.  */
+static bool
+trivial_case_ignore (size_t len, char const *keys,
+                     size_t *new_len, char **new_keys)
+{
+  /* FIXME: consider removing the following restriction:
+     Reject if KEYS contain ASCII '\\' or '['.  */
+  if (memchr (keys, '\\', len) || memchr (keys, '[', len))
+    return false;
+
+  /* Worst case is that each byte B of KEYS is ASCII alphabetic and each
+     other_case(B) character, C, occupies MB_CUR_MAX bytes, so each B
+     maps to [BC], which requires MB_CUR_MAX + 3 bytes.   */
+  *new_keys = xnmalloc (MB_CUR_MAX + 3, len + 1);
+  char *p = *new_keys;
+
+  mbstate_t mb_state;
+  memset (&mb_state, 0, sizeof mb_state);
+  while (len)
+    {
+      wchar_t wc;
+      int n = MBRTOWC (&wc, keys, len, &mb_state);
+
+      /* For an invalid, incomplete or L'\0', skip this optimization.  */
+      if (n <= 0)
+        {
+        skip_case_ignore_optimization:
+          free (*new_keys);
+          return false;
+        }
+
+      char const *orig = keys;
+      keys += n;
+      len -= n;
+
+      if (!iswalpha (wc))
+        {
+          memcpy (p, orig, n);
+          p += n;
+        }
+      else
+        {
+          *p++ = '[';
+          memcpy (p, orig, n);
+          p += n;
+
+          wchar_t wc2 = iswupper (wc) ? towlower (wc) : towupper (wc);
+          char buf[MB_CUR_MAX];
+          int n2 = WCRTOMB (buf, wc2, &mb_state);
+          if (n2 <= 0)
+            goto skip_case_ignore_optimization;
+          assert (n2 <= MB_CUR_MAX);
+          memcpy (p, buf, n2);
+          p += n2;
+
+          *p++ = ']';
+        }
+    }
+
+  *new_len = p - *new_keys;
+
+  return true;
+}
+
 int
 main (int argc, char **argv)
 {
@@ -2263,6 +2343,35 @@ main (int argc, char **argv)
   else
     usage (EXIT_TROUBLE);
 
+  /* As currently implemented, case-insensitive matching is expensive in
+     multi-byte locales because of a few outlier locales in which some
+     characters change size when converted to upper or lower case.  To
+     accommodate those, we revert to searching the input one line at a
+     time, rather than using the much more efficient buffer search.
+     However, if we have a regular expression, /foo/i, we can convert
+     it to an equivalent case-insensitive /[fF][oO][oO]/, and thus
+     avoid the expensive read-and-process-a-line-at-a-time requirement.
+     Optimize-away the "-i" option, when possible, converting each
+     candidate alpha, C, in the regexp to [Cc].  */
+  if (match_icase)
+    {
+      size_t new_keycc;
+      char *new_keys;
+      /* It is not possible with -F, not useful with -P (pcre) and there is no
+         point when there is no regexp.  It also depends on which constructs
+         appear in the regexp.  See trivial_case_ignore for those details.  */
+      if (keycc
+          && ! (matcher
+                && (STREQ (matcher, "fgrep") || STREQ (matcher, "pcre")))
+          && trivial_case_ignore (keycc, keys, &new_keycc, &new_keys))
+        {
+          match_icase = 0;
+          free (keys);
+          keys = new_keys;
+          keycc = new_keycc;
+        }
+    }
+
   compile (keys, keycc);
   free (keys);
 
diff --git a/tests/Makefile.am b/tests/Makefile.am
index f4580b5..a37a814 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -94,6 +94,7 @@ TESTS =                                               \
   status                                       \
   surrogate-pair                               \
   symlink                                      \
+  turkish-eyes                                 \
   turkish-I                                    \
   turkish-I-without-dot                                \
   warn-char-classes                            \
diff --git a/tests/turkish-eyes b/tests/turkish-eyes
new file mode 100755
index 0000000..323eb35
--- /dev/null
+++ b/tests/turkish-eyes
@@ -0,0 +1,44 @@
+#!/bin/sh
+# Ensure that case-insensitive matching works with all Turkish i's
+
+# Copyright (C) 2014 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+
+require_compiled_in_MB_support
+
+fail=0
+
+L=tr_TR.UTF-8
+
+# Check for a broken tr_TR.UTF-8 locale definition.
+# In this locale, 'i' is not a lower-case 'I'.
+echo I | LC_ALL=$L grep -i i > /dev/null \
+    && skip_ "your $L locale appears to be broken"
+
+# Ensure that this matches:
+# printf 'I:İ ı:i\n'|LC_ALL=tr_TR.utf8 grep -i 'ı:i I:İ'
+I=$(printf '\304\260') # capital I with dot
+i=$(printf '\304\261') # lowercase dotless i
+
+data=$(      printf "I:$I $i:i")
+search_str=$(printf "$i:i I:$I")
+printf "$data\n" > in || framework_failure_
+
+LC_ALL=$L grep -i "^$search_str\$" in > out || fail=1
+compare out in || fail=1
+
+Exit $fail

-----------------------------------------------------------------------

Summary of changes:
 NEWS                                 |    5 ++
 src/main.c                           |  111 +++++++++++++++++++++++++++++++++-
 tests/Makefile.am                    |    1 +
 tests/{dfa-coverage => turkish-eyes} |   22 +++++--
 4 files changed, 133 insertions(+), 6 deletions(-)
 copy tests/{dfa-coverage => turkish-eyes} (53%)


hooks/post-receive
-- 
grep



reply via email to

[Prev in Thread] Current Thread [Next in Thread]