grep branch, master, updated. v2.18-123-geb3292b

grep-commit
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
grep branch, master, updated. v2.18-123-geb3292b

From:	Paul Eggert
Subject:	grep branch, master, updated. v2.18-123-geb3292b
Date:	Tue, 06 May 2014 03:20:03 +0000
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".

The branch, master has been updated
       via  eb3292b3b205e50d0373f26ff0950ec82f49c14a (commit)
      from  17683df11fbea7aa01c9d60f1b45874c9ea5e26a (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=eb3292b3b205e50d0373f26ff0950ec82f49c14a


commit eb3292b3b205e50d0373f26ff0950ec82f49c14a
Author: Paul Eggert <address@hidden>
Date:   Mon May 5 20:19:19 2014 -0700

    grep: fix encoding-error incompatibilities among regex, DFA, KWset
    
    This follows up to http://bugs.gnu.org/17376 and fixes a different
    set of incompatibilities, namely between the regex matcher and the
    other matchers, when the pattern contains encoding errors.
    The GNU regex matcher is not consistent in this area: sometimes
    an encoding error matches only itself, and sometimes it
    matches part of a multibyte character.  There is no documentation
    for grep's behavior in this area and users don't seem to care,
    and it's simpler to defer to the regex matcher for problematic
    cases like these.
    * NEWS: Document this.
    * src/dfa.c (ctok): Remove.  All uses removed.
    (parse_bracket_exp, atom): Use BACKREF if a pattern contains
    an encoding error, so that the matcher will revert to regex.
    * src/dfasearch.c, src/grep.c, src/pcresearch.c, src/searchutils.c:
    Don't include dfa.h, since search.h now does that for us.
    * src/dfasearch.c (EGexecute):
    * src/kwsearch.c (Fexecute): In a UTF-8 locale, there's no need to
    worry about matching part of a multibyte character.
    * src/grep.c (contains_encoding_error): New static function.
    (main): Use it, so that grep -F is consistent with plain fgrep
    when the pattern contains an encoding error.
    * src/search.h: Include dfa.h, so that kwsearch.c can call using_utf8.
    * src/searchutils.c (is_mb_middle): Remove UTF-8-specific code.
    Callers now ensure that we are in a non-UTF-8 locale.
    The code was clearly wrong, anyway.
    * tests/fgrep-infloop, tests/invalid-multibyte-infloop:
    * tests/prefix-of-multibyte:
    Do not require that grep have a particular behavor for this test.
    It's OK to match (exit status 0), not match (exit status 1), or
    report an error (exit status 2), since the pattern contains an
    encoding error and grep's behavior is not specified for such
    patterns.  Test only that KWset, DFA, and regex agree.
    * tests/prefix-of-multibyte: Add tests for ABCABC and __..._ABCABC___.

diff --git a/NEWS b/NEWS
index 2d3e12a..3bf7b69 100644
--- a/NEWS
+++ b/NEWS
@@ -14,6 +14,11 @@ GNU grep NEWS                                    -*- outline 
-*-
   grep -f no longer mishandles patterns containing NUL bytes.
   [bug introduced in grep-2.11]
 
+  Plain grep, grep -E, and grep -F now treat encoding errors in patterns
+  the same way the GNU regular expression matcher treats them, with respect
+  to whether the errors can match parts of multibyte characters in data.
+  [bug present since "the beginning"]
+
   grep -P now reports an error and exits when given invalid UTF-8 data.
   Previously it was unreliable, and sometimes crashed or looped.
   [bug introduced in grep-2.16]
diff --git a/src/dfa.c b/src/dfa.c
index 5211087..273d3d1 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -840,13 +840,11 @@ static int minrep, maxrep;      /* Repeat counts for 
{m,n}.  */
 
 static int cur_mb_len = 1;      /* Length of the multibyte representation of
                                    wctok.  */
-/* These variables are used only if (MB_CUR_MAX > 1).  */
+
 static wint_t wctok;           /* Wide character representation of the current
                                   multibyte character, or WEOF if there was
-                                   an encoding error.  */
-static int ctok;               /* Current input byte if it is an entire
-                                   character or is an encoding error,
-                                   EOF otherwise.  */
+                                  an encoding error.  Used only if
+                                  MB_CUR_MAX > 1.  */
 
 
 /* Fetch the next lexical input character.  Set C (of type int) to the
@@ -1194,7 +1192,7 @@ parse_bracket_exp (void)
         }
 
       if (wc == WEOF)
-        setbit (c, ccl);
+        known_bracket_exp = false;
       else
         {
           wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
@@ -1255,8 +1253,7 @@ lex (void)
      "if (backslash) ...".  */
   for (i = 0; i < 2; ++i)
     {
-      FETCH_WC (ctok, wctok, NULL);
-      c = ctok;
+      FETCH_WC (c, wctok, NULL);
 
       switch (c)
         {
@@ -1786,7 +1783,7 @@ atom (void)
   if (tok == WCHAR)
     {
       if (wctok == WEOF)
-        addtok_mb (ctok, 3);
+        addtok (BACKREF);
       else
         {
           addtok_wc (wctok);
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 4b3c79a..fad3da1 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -21,7 +21,6 @@
 #include <config.h>
 #include "intprops.h"
 #include "search.h"
-#include "dfa.h"
 
 /* For -w, we also consider _ to be word constituent.  */
 #define WCHAR(C) (isalnum (C) || (C) == '_')
@@ -265,7 +264,7 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
 
               if (exact_kwset_match)
                 {
-                  if (MB_CUR_MAX == 1)
+                  if (MB_CUR_MAX == 1 || using_utf8 ())
                     goto success;
                   if (mb_start < beg)
                     mb_start = beg;
diff --git a/src/grep.c b/src/grep.c
index d58ef03..a661fc0 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -32,7 +32,6 @@
 #include "c-ctype.h"
 #include "closeout.h"
 #include "colorize.h"
-#include "dfa.h"
 #include "error.h"
 #include "exclude.h"
 #include "exitfail.h"
@@ -1888,6 +1887,22 @@ parse_grep_colors (void)
       return;
 }
 
+/* Return true if PAT (of length PATLEN) contains an encoding error.  */
+static bool
+contains_encoding_error (char const *pat, size_t patlen)
+{
+  mbstate_t mbs = { 0 };
+  size_t i, charlen;
+
+  for (i = 0; i < patlen; i += charlen + (charlen == 0))
+    {
+      charlen = mbrlen (pat + i, patlen - i, &mbs);
+      if ((size_t) -2 <= charlen)
+        return true;
+    }
+  return false;
+}
+
 /* Change a pattern for fgrep into grep.  */
 static void
 fgrep_to_grep_pattern (size_t len, char const *keys,
@@ -2318,9 +2333,11 @@ main (int argc, char **argv)
   else
     usage (EXIT_TROUBLE);
 
-  /* If case-insensitive fgrep in a multibyte locale, improve
-     performance by using grep instead.  */
-  if (match_icase && compile == Fcompile && MB_CUR_MAX > 1)
+  /* If fgrep in a multibyte locale, then use grep if either
+     (1) case is ignored (where grep is typically faster), or
+     (2) the pattern has an encoding error (where fgrep might not work).  */
+  if (compile == Fcompile && MB_CUR_MAX > 1
+      && (match_icase || contains_encoding_error (keys, keycc)))
     {
       size_t new_keycc;
       char *new_keys;
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 46569e9..5cbe5c0 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -126,7 +126,7 @@ Fexecute (char const *buf, size_t size, size_t *match_size,
       if (offset == (size_t) -1)
         goto failure;
       len = kwsmatch.size[0] - match_lines;
-      if (MB_CUR_MAX > 1 && !match_lines
+      if (!match_lines && MB_CUR_MAX > 1 && !using_utf8 ()
           && is_mb_middle (&mb_start, beg + offset, buf + size, len))
         {
           /* The match was a part of multibyte character, advance at least
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 9f63f37..820dd00 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -20,7 +20,6 @@
 
 #include <config.h>
 #include "search.h"
-#include "dfa.h"
 #if HAVE_PCRE_H
 # include <pcre.h>
 #elif HAVE_PCRE_PCRE_H
diff --git a/src/search.h b/src/search.h
index 871b7d5..bbf1fc2 100644
--- a/src/search.h
+++ b/src/search.h
@@ -30,6 +30,7 @@
 #include "system.h"
 #include "error.h"
 #include "grep.h"
+#include "dfa.h"
 #include "kwset.h"
 #include "xalloc.h"
 
diff --git a/src/searchutils.c b/src/searchutils.c
index 76005bf..ef2cc5a 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -19,7 +19,6 @@
 #include <config.h>
 #include <assert.h>
 #include "search.h"
-#include "dfa.h"
 
 #define NCHAR (UCHAR_MAX + 1)
 
@@ -230,16 +229,6 @@ is_mb_middle (const char **good, const char *buf, const 
char *end,
   const char *p = *good;
   mbstate_t cur_state;
 
-  if (using_utf8 () && buf - p > MB_CUR_MAX)
-    {
-      for (p = buf; buf - p > MB_CUR_MAX; p--)
-        if (mbclen_cache[to_uchar (*p)] != (size_t) -1)
-          break;
-
-      if (buf - p == MB_CUR_MAX)
-        p = buf;
-    }
-
   memset (&cur_state, 0, sizeof cur_state);
 
   while (p < buf)
diff --git a/tests/fgrep-infloop b/tests/fgrep-infloop
index 07a0ce0..015ec74 100755
--- a/tests/fgrep-infloop
+++ b/tests/fgrep-infloop
@@ -8,14 +8,20 @@ require_compiled_in_MB_support
 
 encode() { echo "$1" | tr ABC '\357\274\241'; }
 
+encode ABC > in || framework_failure_
 fail=0
 
 for LOC in en_US.UTF-8 $LOCALE_FR_UTF8; do
   out=out1-$LOC
-  encode ABC \
-    | LC_ALL=$LOC timeout 10s grep -F "$(encode BC)" > $out 2>&1
-  test $? = 1 || fail=1
-  compare /dev/null $out || fail=1
+  LC_ALL=$LOC timeout 10s grep -F "$(encode BC)" in > $out
+  status=$?
+  if test $status -eq 0; then
+    compare in $out
+  elif test $status -eq 1; then
+    compare_dev_null_ /dev/null $out
+  else
+    test $status -eq 2
+  fi || fail=1
 done
 
 Exit $fail
diff --git a/tests/invalid-multibyte-infloop b/tests/invalid-multibyte-infloop
index e98c170..b28bc53 100755
--- a/tests/invalid-multibyte-infloop
+++ b/tests/invalid-multibyte-infloop
@@ -14,7 +14,14 @@ encode AA > input
 fail=0
 
 # Before 2.15, this would infloop.
-LC_ALL=en_US.UTF-8 timeout 3 grep -F $(encode A) input > out || fail=1
-compare input out || fail=1
+LC_ALL=en_US.UTF-8 timeout 3 grep -F $(encode A) input > out
+status=$?
+if test $status -eq 0; then
+  compare input out
+elif test $status -eq 1; then
+  compare_dev_null_ /dev/null out
+else
+  test $status -eq 2
+fi || fail=1
 
 Exit $fail
diff --git a/tests/prefix-of-multibyte b/tests/prefix-of-multibyte
index 2ab9a99..2228a22 100755
--- a/tests/prefix-of-multibyte
+++ b/tests/prefix-of-multibyte
@@ -9,21 +9,35 @@ encode() { echo "$1" | tr ABC '\357\274\241'; }
 
 encode ABC >exp1
 encode aABC >exp2
+encode ABCABC >exp3
+encode _____________________ABCABC___ >exp4
 
 fail=0
 
 for LOC in en_US.UTF-8 $LOCALE_FR_UTF8; do
-  for type in dfa fgrep regex; do
-    case $type in
-      dfa) opt= prefix= ;;
-      fgrep) opt=-F prefix= ;;
-      regex) opt= prefix='\(\)\1' ;;
-    esac
-    out=out-$type-$LOC
-    LC_ALL=$LOC grep $opt "$prefix$(encode A)" exp1 >$out || fail=1
-    compare exp1 $out || fail=1
-    LC_ALL=$LOC grep $opt "$prefix$(encode aA)" exp2 >$out || fail=1
-    compare exp2 $out || fail=1
+  for pat in A aA BCA; do
+    for file in exp1 exp2 exp3 exp4; do
+      for type in regex dfa fgrep; do
+        case $type in
+          dfa) opt= prefix= ;;
+          fgrep) opt=-F prefix= ;;
+          regex) opt= prefix='\(\)\1' ;;
+        esac
+        pattern=$prefix$(encode $pat)
+        out=out-$type-$LOC
+        LC_ALL=$LOC grep $opt "$pattern" $file >$out
+        status=$?
+        echo $status >$out.status
+        if test $status -eq 0; then
+          compare $file $out
+        elif test $status -eq 1; then
+          compare_dev_null_ /dev/null $out
+        else
+          test $status -eq 2
+        fi || fail=1
+        compare out-regex-$LOC.status $out.status || fail=1
+      done
+    done
   done
 done
 

-----------------------------------------------------------------------

Summary of changes:
 NEWS                            |    5 +++++
 src/dfa.c                       |   15 ++++++---------
 src/dfasearch.c                 |    3 +--
 src/grep.c                      |   25 +++++++++++++++++++++----
 src/kwsearch.c                  |    2 +-
 src/pcresearch.c                |    1 -
 src/search.h                    |    1 +
 src/searchutils.c               |   11 -----------
 tests/fgrep-infloop             |   14 ++++++++++----
 tests/invalid-multibyte-infloop |   11 +++++++++--
 tests/prefix-of-multibyte       |   36 +++++++++++++++++++++++++-----------
 11 files changed, 79 insertions(+), 45 deletions(-)


hooks/post-receive
-- 
grep
[Prev in Thread]
Current Thread
[Next in Thread]
grep branch, master, updated. v2.18-123-geb3292b, Paul Eggert <=
Prev by Date: grep branch, master, updated. v2.18-122-g17683df
Next by Date: grep branch, master, updated. v2.18-125-g94555dd
Previous by thread: grep branch, master, updated. v2.18-122-g17683df
Next by thread: grep branch, master, updated. v2.18-125-g94555dd
Index(es):
- Date
- Thread