grep branch, master, updated. v2.22-15-g40ed879

grep-commit
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
grep branch, master, updated. v2.22-15-g40ed879

From:	Paul Eggert
Subject:	grep branch, master, updated. v2.22-15-g40ed879
Date:	Sat, 02 Jan 2016 05:17:14 +0000
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".

The branch, master has been updated
       via  40ed879db22d57516a31fefd1c39416974b74ec4 (commit)
      from  c65dcd72d12d26d8540e1153232e03fa29610137 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=40ed879db22d57516a31fefd1c39416974b74ec4


commit 40ed879db22d57516a31fefd1c39416974b74ec4
Author: Paul Eggert <address@hidden>
Date:   Fri Jan 1 21:16:12 2016 -0800

    grep: fix bug with invalid unibyte sequence
    
    This was introduced by the recent binary-data-detection changes.
    Problem reported by Norihiro Tanaka in: http://bugs.gnu.org/20526#86
    * src/grep.c (HIBYTE, easy_encoding, init_easy_encoding): Remove,
    replacing with ...
    (uword_max, unibyte_mask, initialize_unibyte_mask): ... this new
    constant, static var, and function.  All uses changed.  The
    unibyte_mask var generalizes the old local var hibyte_mask, which
    worked only for encodings where every byte with 0x80 turned off is
    a single-byte character.
    (buf_has_encoding_errors): Return false immediately if
    unibyte_mask is zero, not whether the current encoding is unibyte.
    The old test was incorrect in unibyte locales in which some bytes
    were encoding errors.
    * tests/pcre-z: Require UTF-8 locale, since the grep -z . test now
    needs this.  Use printf \0 rather than tr.  Port the 'grep -z .'
    test to platforms where the C locale says '\200' is an encoding
    error.  Use cmp rather than compare, as the file is binary and
    so non-GNU diff might not work.
    * tests/unibyte-binary: New file.
    * tests/Makefile.am (TESTS): Add it.

diff --git a/src/grep.c b/src/grep.c
index 1207a76..a5f1fa2 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -484,21 +484,6 @@ clean_up_stdout (void)
     close_stdout ();
 }
 
-/* The high-order bit of a byte.  */
-enum { HIBYTE = 0x80 };
-
-/* True if every byte with HIBYTE off is a single-byte character.
-   UTF-8 has this property.  */
-static bool easy_encoding;
-
-static void
-init_easy_encoding (void)
-{
-  easy_encoding = true;
-  for (int i = 0; i < HIBYTE; i++)
-    easy_encoding &= mbclen_cache[i] == 1;
-}
-
 /* A cast to TYPE of VAL.  Use this when TYPE is a pointer type, VAL
    is properly aligned for TYPE, and 'gcc -Wcast-align' cannot infer
    the alignment and would otherwise complain about the cast.  */
@@ -517,21 +502,33 @@ init_easy_encoding (void)
 /* An unsigned type suitable for fast matching.  */
 typedef uintmax_t uword;
 
+/* All bytes that are not unibyte characters, ANDed together, and then
+   with the pattern repeated to fill a uword.  For an encoding where
+   all bytes are unibyte characters, this is 0.  For UTF-8, this is
+   0x808080....  For encodings where unibyte characters have no useful
+   pattern, this is all 1s.  The unsigned char C is a unibyte
+   character if C & UNIBYTE_MASK is zero.  If the uword W is the
+   concatenation of bytes, the bytes are all unibyte characters
+   if W & UNIBYTE_MASK is zero.  */
+static uword unibyte_mask;
+
+static void
+initialize_unibyte_mask (void)
+{
+  unsigned char mask = UCHAR_MAX;
+  for (int i = 1; i <= UCHAR_MAX; i++)
+    if (mbclen_cache[i] != 1)
+      mask &= i;
+  uword uword_max = -1;
+  unibyte_mask = uword_max / UCHAR_MAX * mask;
+}
+
 /* Skip the easy bytes in a buffer that is guaranteed to have a sentinel
    that is not easy, and return a pointer to the first non-easy byte.
-   In easy encodings, the easy bytes all have HIBYTE off.
-   In other encodings, no byte is easy.  */
+   The easy bytes all have UNIBYTE_MASK off.  */
 static char const * _GL_ATTRIBUTE_PURE
 skip_easy_bytes (char const *buf)
 {
-  if (!easy_encoding)
-    return buf;
-
-  uword uword_max = -1;
-
-  /* 0x8080..., extended to be wide enough for uword.  */
-  uword hibyte_mask = uword_max / UCHAR_MAX * HIBYTE;
-
   /* Search a byte at a time until the pointer is aligned, then a
      uword at a time until a match is found, then a byte at a time to
      identify the exact byte.  The uword search may go slightly past
@@ -539,11 +536,11 @@ skip_easy_bytes (char const *buf)
   char const *p;
   uword const *s;
   for (p = buf; (uintptr_t) p % sizeof (uword) != 0; p++)
-    if (*p & HIBYTE)
+    if (to_uchar (*p) & unibyte_mask)
       return p;
-  for (s = CAST_ALIGNED (uword const *, p); ! (*s & hibyte_mask); s++)
+  for (s = CAST_ALIGNED (uword const *, p); ! (*s & unibyte_mask); s++)
     continue;
-  for (p = (char const *) s; ! (*p & HIBYTE); p++)
+  for (p = (char const *) s; ! (to_uchar (*p) & unibyte_mask); p++)
     continue;
   return p;
 }
@@ -554,7 +551,7 @@ skip_easy_bytes (char const *buf)
 static bool
 buf_has_encoding_errors (char *buf, size_t size)
 {
-  if (MB_CUR_MAX <= 1)
+  if (! unibyte_mask)
     return false;
 
   mbstate_t mbs = { 0 };
@@ -2592,7 +2589,7 @@ main (int argc, char **argv)
     usage (EXIT_TROUBLE);
 
   build_mbclen_cache ();
-  init_easy_encoding ();
+  initialize_unibyte_mask ();
 
   /* In a unibyte locale, switch from fgrep to grep if
      the pattern matches words (where grep is typically faster).
diff --git a/tests/Makefile.am b/tests/Makefile.am
index f349aa3..a38303c 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -133,6 +133,7 @@ TESTS =                                             \
   turkish-I-without-dot                                \
   turkish-eyes                                 \
   two-files                                    \
+  unibyte-binary                               \
   unibyte-bracket-expr                         \
   unibyte-negated-circumflex                   \
   utf8-bracket                                 \
diff --git a/tests/pcre-z b/tests/pcre-z
index 6bbde94..4ce9a93 100755
--- a/tests/pcre-z
+++ b/tests/pcre-z
@@ -2,10 +2,11 @@
 # Test Perl regex with NUL-separated input
 . "${srcdir=.}/init.sh"; path_prepend_ ../src
 require_pcre_
+require_en_utf8_locale_
 
 REGEX=a
 
-printf "%s\n0" abc def ghi aaa gah | tr 0 \\0 > in
+printf '%s\n\0' abc def ghi aaa gah > in || framework_failure_
 
 grep -z "$REGEX" in > exp 2>err || fail_ 'Cannot do BRE (grep -z) match.'
 compare /dev/null err || fail_ 'stderr not empty on grep -z.'
@@ -20,8 +21,8 @@ grep -Pz "$REGEX" in > out 2>err || fail=1
 compare exp out || fail=1
 compare /dev/null err || fail=1
 
-printf '\200\0' >in0
-LC_ALL=C grep -z . in0 >out || fail=1
-compare in0 out || fail=1
+printf '\303\200\0' >in0 # "À" followed by a NUL.
+LC_ALL=en_US.UTF-8 grep -z . in0 >out || fail=1
+cmp in0 out || fail=1
 
 Exit $fail
diff --git a/tests/unibyte-binary b/tests/unibyte-binary
new file mode 100755
index 0000000..78735b8
--- /dev/null
+++ b/tests/unibyte-binary
@@ -0,0 +1,28 @@
+#!/bin/sh
+# Test binary files in unibyte locales with encoding errors
+
+# Copyright 2016 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+require_unibyte_locale
+
+fail=0
+
+printf 'a\n\200\nb\n' >in || framework_failure_
+printf 'a\nBinary file in matches\n' >exp || framework_failure_
+grep . in >out || fail=1
+compare exp out || fail=1
+Exit $fail

-----------------------------------------------------------------------

Summary of changes:
 src/grep.c                                         |   57 +++++++++----------
 tests/Makefile.am                                  |    1 +
 tests/pcre-z                                       |    9 ++--
 .../{unibyte-negated-circumflex => unibyte-binary} |   11 ++--
 4 files changed, 39 insertions(+), 39 deletions(-)
 copy tests/{unibyte-negated-circumflex => unibyte-binary} (73%)


hooks/post-receive
-- 
grep
[Prev in Thread]
Current Thread
[Next in Thread]
grep branch, master, updated. v2.22-15-g40ed879, Paul Eggert <=
Prev by Date: grep branch, master, updated. v2.22-14-gc65dcd7
Next by Date: grep branch, master, updated. v2.22-16-g71c206b
Previous by thread: grep branch, master, updated. v2.22-14-gc65dcd7
Next by thread: grep branch, master, updated. v2.22-16-g71c206b
Index(es):
- Date
- Thread