grep-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

grep branch, master, updated. v2.14-35-gbd58cfd


From: Jim Meyering
Subject: grep branch, master, updated. v2.14-35-gbd58cfd
Date: Sun, 01 Sep 2013 15:25:12 +0000

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".

The branch, master has been updated
       via  bd58cfd715ae24de75103522e7af75c1909da0f1 (commit)
      from  12edf2b2f25673204267e8f742c76328efc877db (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=bd58cfd715ae24de75103522e7af75c1909da0f1


commit bd58cfd715ae24de75103522e7af75c1909da0f1
Author: Corinna Vinschen <address@hidden>
Date:   Fri Aug 16 20:50:12 2013 +0200

    fix Cygwin UTF-16 surrogate-pair handling with -i
    
    grep -i would segfault on systems using UTF-16-based wchar_t (Cygwin)
    when converting an input string containing certain 4-byte UTF-8
    sequences to lower case.  The conversions to wchar_t and back to
    a UTF-8 multibyte string did not take surrogate pairs into account.
    * src/searchutils.c (mbtolower) [__CYGWIN__]: Detect and handle
    surrogate pairs when converting.
    * NEWS (Bug fixes): Mention it.
    * tests/surrogate-pair: New test.
    * tests/Makefile.am (TESTS): Add it.
    Reported by: Jim Burwell

diff --git a/NEWS b/NEWS
index 8796a1b..fba3094 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,12 @@ GNU grep NEWS                                    -*- outline 
-*-
 
 ** Bug fixes
 
+  grep -i would segfault on systems using UTF-16-based wchar_t (Cygwin)
+  when converting an input string containing certain 4-byte UTF-8
+  sequences to lower case.  The conversions to wchar_t and back to
+  a UTF-8 multibyte string did not take surrogate pairs into account.
+  [bug present since at least grep-2.6, though the segfault is new with 2.13]
+
   grep -E would segfault when given a regexp like '([^.]*[M]){1,2}'
   for any multibyte character M. [bug introduced in grep-2.6, which would
   segfault, but 2.7 and 2.8 had no problem, and 2.9 through 2.14 would
diff --git a/src/searchutils.c b/src/searchutils.c
index 160f80d..f27a01b 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -111,6 +111,37 @@ mbtolower (const char *beg, size_t *n, mb_len_map_t 
**len_map_p)
     {
       wchar_t wc;
       size_t mbclen = mbrtowc (&wc, beg, end - beg, &is);
+#ifdef __CYGWIN__
+      /* Handle a UTF-8 sequence for a character beyond the base plane.
+         Cygwin's wchar_t is UTF-16, as in the underlying OS.  This
+         results in surrogate pairs which need some extra attention.  */
+      wint_t wci = 0;
+      if (mbclen == 3 && (wc & 0xdc00) == 0xd800)
+        {
+          /* We got the start of a 4 byte UTF-8 sequence.  This is returned
+             as a UTF-16 surrogate pair.  The first call to mbrtowc returned 3
+             and wc has been set to a high surrogate value, now we're going
+             to fetch the matching low surrogate.  This second call to mbrtowc
+             is supposed to return 1 to complete the 4 byte UTF-8 sequence.  */
+          wchar_t wc_2;
+          size_t mbclen_2 = mbrtowc (&wc_2, beg + mbclen, end - beg - mbclen,
+                                     &is);
+          if (mbclen_2 == 1 && (wc_2 & 0xdc00) == 0xdc00)
+            {
+              /* Match.  Convert this to a 4 byte wint_t which constitutes
+                 a 32-bit UTF-32 value.  */
+              wci = ( (((wint_t) (wc - 0xd800)) << 10)
+                     | ((wint_t) (wc_2 - 0xdc00)))
+                    + 0x10000;
+              ++mbclen;
+            }
+          else
+            {
+              /* Invalid UTF-8 sequence.  */
+              mbclen = (size_t) -1;
+            }
+        }
+#endif
       if (outlen + mb_cur_max >= outalloc)
         {
           size_t dm = m - len_map;
@@ -132,8 +163,35 @@ mbtolower (const char *beg, size_t *n, mb_len_map_t 
**len_map_p)
         }
       else
         {
+          size_t ombclen;
           beg += mbclen;
-          size_t ombclen = wcrtomb (p, towlower ((wint_t) wc), &os);
+#ifdef __CYGWIN__
+          /* Handle Unicode characters beyond the base plane.  */
+          if (mbclen == 4)
+            {
+              /* towlower, taking wint_t (4 bytes), handles UCS-4 values.  */
+              wci = towlower (wci);
+              if (wci >= 0x10000)
+                {
+                  wci -= 0x10000;
+                  wc = (wci >> 10) | 0xd800;
+                  /* No need to check the return value.  When reading the
+                     high surrogate, the return value will be 0 and only the
+                     mbstate indicates that we're in the middle of reading a
+                     surrogate pair.  The next wcrtomb call reading the low
+                     surrogate will then return 4 and reset the mbstate.  */
+                  wcrtomb (p, wc, &os);
+                  wc = (wci & 0x3ff) | 0xdc00;
+                }
+              else
+                {
+                  wc = (wchar_t) wci;
+                }
+              ombclen = wcrtomb (p, wc, &os);
+            }
+          else
+#endif
+          ombclen = wcrtomb (p, towlower ((wint_t) wc), &os);
           *m = mbclen - ombclen;
           memset (m + 1, 0, ombclen - 1);
           m += ombclen;
diff --git a/tests/Makefile.am b/tests/Makefile.am
index c96ce06..581f688 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -87,6 +87,7 @@ TESTS =                                               \
   spencer1                                     \
   spencer1-locale                              \
   status                                       \
+  surrogate-pair                               \
   symlink                                      \
   turkish-I                                    \
   turkish-I-without-dot                                \
diff --git a/tests/surrogate-pair b/tests/surrogate-pair
new file mode 100755
index 0000000..a589819
--- /dev/null
+++ b/tests/surrogate-pair
@@ -0,0 +1,44 @@
+#!/bin/sh
+# Trigger a segfault-inducing bug with -i in grep-2.14 on Cygwin.
+
+# Copyright (C) 2013 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+
+require_en_utf8_locale_
+require_compiled_in_MB_support
+
+fail=0
+
+printf '\xf0\x90\x90\x85\n' > in || framework_failure_
+
+LC_ALL=en_US.UTF-8
+export LC_ALL
+
+# On Cygwin, before grep-2.15, this would segfault.
+grep -i anything-else in > out 2>&1
+# Require not just non-zero exit status, but exactly 1.
+test $? = 1 || fail=1
+# Expect no output.
+compare /dev/null out || fail=1
+
+# Also test whether a surrogate-pair in the search string works.
+for opt in '' -i -E -F -iE -iF; do
+  grep --file=in $opt in > out 2>&1 || fail=1
+  compare out in || fail=1
+done
+
+Exit $fail

-----------------------------------------------------------------------

Summary of changes:
 NEWS                                   |    6 +++
 src/searchutils.c                      |   60 +++++++++++++++++++++++++++++++-
 tests/Makefile.am                      |    1 +
 tests/{dfa-coverage => surrogate-pair} |   22 +++++++++---
 4 files changed, 83 insertions(+), 6 deletions(-)
 copy tests/{dfa-coverage => surrogate-pair} (56%)


hooks/post-receive
-- 
grep



reply via email to

[Prev in Thread] Current Thread [Next in Thread]