grep branch, master, updated. v2.18-23-g4f88962

grep-commit
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
grep branch, master, updated. v2.18-23-g4f88962

From:	Paul Eggert
Subject:	grep branch, master, updated. v2.18-23-g4f88962
Date:	Sat, 08 Mar 2014 02:28:02 +0000
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".

The branch, master has been updated
       via  4f88962189867f78c5125bb6cc858ed96bc0880b (commit)
      from  bcdf8dea9955dfcc6c72058396350a137bd8e92d (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=4f88962189867f78c5125bb6cc858ed96bc0880b


commit 4f88962189867f78c5125bb6cc858ed96bc0880b
Author: Paul Eggert <address@hidden>
Date:   Fri Mar 7 18:27:28 2014 -0800

    grep: fix case-fold mismatches between DFA and regex
    
    The DFA code and the regex code didn't use the same semantics for
    case-folding.  The regex code says that the data char d matches
    the pattern char p if uc (d) == uc (p).  POSIX is unclear in this
    area; the simplest fix for now is to change the DFA code to agree
    with the regex code.  See <http://bugs.gnu.org/16919>.
    * src/dfa.c (static_assert): New macro, if not already defined.
    (setbit_case_fold_c): Assume MB_CUR_MAX is 1 and that case_fold
    is nonzero; all callers changed.
    (setbit_case_fold_c, parse_bracket_exp, lex, atom):
    Case-fold like the regex code does.
    (lonesome_lower): New constant.
    (case_folded_counterparts): New function.
    (parse_bracket_exp): Prefer plain setbit when case-folding is
    not needed.
    * src/dfa.h (CASE_FOLDED_BUFSIZE): New constant.
    (case_folded_counterparts): New function decl.
    * src/main.c (trivial_case_ignore): Case-fold like the regex code does.
    (main): Try to improve comment re trivial_case_ignore.
    * tests/case-fold-titlecase: Add lots more test cases.

diff --git a/src/dfa.c b/src/dfa.c
index 585a599..5910268 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -34,6 +34,12 @@
 #include <locale.h>
 #include <stdbool.h>
 
+/* Gawk doesn't use Gnulib, so don't assume static_assert is present.  */
+#ifndef static_assert
+# define static_assert(cond, diagnostic) \
+    extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })]
+#endif
+
 #define STREQ(a, b) (strcmp (a, b) == 0)
 
 /* ISASCIIDIGIT differs from isdigit, as follows:
@@ -710,34 +716,16 @@ setbit_wc (wint_t wc, charclass c)
 #endif
 }
 
-/* Set a bit for B in the charclass C, if B is a valid single byte
-   character in the current character set.  If case is folded, set B's
-   lower and upper case variants similarly.  If MB_CUR_MAX > 1, the
-   resulting charset is used only as an optimization, and the caller
-   should set the appropriate field of struct mb_char_classes.  */
+/* Set a bit for B and its case variants in the charclass C.
+   MB_CUR_MAX must be 1.  */
 static void
 setbit_case_fold_c (int b, charclass c)
 {
-  if (MB_CUR_MAX > 1)
-    {
-      wint_t wc = btowc (b);
-      if (wc == WEOF)
-        return;
-      if (case_fold)
-        {
-          setbit_wc (towlower (wc), c);
-          setbit_wc (towupper (wc), c);
-        }
-    }
-  else
-    {
-      if (case_fold)
-        {
-          setbit (tolower (b), c);
-          setbit (toupper (b), c);
-        }
-    }
-  setbit (b, c);
+  int ub = toupper (b);
+  int i;
+  for (i = 0; i < NOTCHAR; i++)
+    if (toupper (i) == ub)
+      setbit (i, c);
 }
 
 
@@ -898,6 +886,50 @@ static unsigned char const *buf_end;    /* reference to 
end in dfaexec.  */
 # define MIN(a,b) ((a) < (b) ? (a) : (b))
 #endif
 
+/* The set of wchar_t values C such that there's a useful locale
+   somewhere where C != towupper (C) && C != towlower (towupper (C)).
+   For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
+   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
+   towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
+static short const lonesome_lower[] =
+  {
+    0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
+    0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
+
+    /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
+       counterpart in locales predating Unicode 4.0.0 (April 2003).  */
+    0x03F2,
+
+    0x03F5, 0x1E9B, 0x1FBE,
+  };
+
+static_assert ((sizeof lonesome_lower / sizeof *lonesome_lower + 2
+                == CASE_FOLDED_BUFSIZE),
+               "CASE_FOLDED_BUFSIZE is wrong");
+
+/* Find the characters equal to C after case-folding, other than C
+   itself, and store them into FOLDED.  Return the number of characters
+   stored.  */
+int
+case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
+{
+  int i;
+  int n = 0;
+  wint_t uc = towupper (c);
+  wint_t lc = towlower (uc);
+  if (uc != c)
+    folded[n++] = uc;
+  if (lc != uc && lc != c && towupper (lc) == uc)
+    folded[n++] = lc;
+  for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
+    {
+      wint_t li = lonesome_lower[i];
+      if (li != lc && li != uc && li != c && towupper (li) == uc)
+        folded[n++] = li;
+    }
+  return n;
+}
+
 typedef int predicate (int);
 
 /* The following list maps the names of the Posix named character classes
@@ -1058,7 +1090,7 @@ parse_bracket_exp (void)
 
                   for (c2 = 0; c2 < NOTCHAR; ++c2)
                     if (pred->func (c2))
-                      setbit_case_fold_c (c2, ccl);
+                      setbit (c2, ccl);
                 }
               else
                 known_bracket_exp = false;
@@ -1125,8 +1157,21 @@ parse_bracket_exp (void)
                     }
                 }
               else if (using_simple_locale ())
-                for (; c <= c2; c++)
-                  setbit_case_fold_c (c, ccl);
+               {
+                  for (c1 = c; c1 <= c2; c1++)
+                    setbit (c1, ccl);
+                  if (case_fold)
+                    {
+                      int uc = toupper (c);
+                      int uc2 = toupper (c2);
+                      for (c1 = 0; c1 < NOTCHAR; c1++)
+                        {
+                          int uc1 = toupper (c1);
+                          if (uc <= uc1 && uc1 <= uc2)
+                            setbit (c1, ccl);
+                        }
+                    }
+                }
               else
                 known_bracket_exp = false;
 
@@ -1145,26 +1190,22 @@ parse_bracket_exp (void)
 
       if (MB_CUR_MAX == 1)
         {
-          setbit_case_fold_c (c, ccl);
+          if (case_fold)
+            setbit_case_fold_c (c, ccl);
+          else
+            setbit (c, ccl);
           continue;
         }
 
       if (case_fold)
         {
-          wint_t folded = towlower (wc);
-          if (folded != wc && !setbit_wc (folded, ccl))
-            {
-              REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
-                                    work_mbc->nchars + 1);
-              work_mbc->chars[work_mbc->nchars++] = folded;
-            }
-          folded = towupper (wc);
-          if (folded != wc && !setbit_wc (folded, ccl))
-            {
-              REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
-                                    work_mbc->nchars + 1);
-              work_mbc->chars[work_mbc->nchars++] = folded;
-            }
+          wchar_t folded[CASE_FOLDED_BUFSIZE];
+          int i, n = case_folded_counterparts (wc, folded);
+          REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
+                                work_mbc->nchars + n);
+          for (i = 0; i < n; i++)
+            if (!setbit_wc (folded[i], ccl))
+              work_mbc->chars[work_mbc->nchars++] = folded[i];
         }
       if (!setbit_wc (wc, ccl))
         {
@@ -1510,7 +1551,7 @@ lex (void)
           if (MB_CUR_MAX > 1)
             return lasttok = WCHAR;
 
-          if (case_fold && (tolower (c) != c || toupper (c) != c))
+          if (case_fold && isalpha (c))
             {
               zeroset (ccl);
               setbit_case_fold_c (c, ccl);
@@ -1757,18 +1798,14 @@ atom (void)
   if (MBS_SUPPORT && tok == WCHAR)
     {
       addtok_wc (wctok);
+
       if (case_fold)
         {
-          wint_t folded = towlower (wctok);
-          if (folded != wctok)
-            {
-              addtok_wc (folded);
-              addtok (OR);
-            }
-          folded = towupper (wctok);
-          if (folded != wctok)
+          wchar_t folded[CASE_FOLDED_BUFSIZE];
+          int i, n = case_folded_counterparts (wctok, folded);
+          for (i = 0; i < n; i++)
             {
-              addtok_wc (folded);
+              addtok_wc (folded[i]);
               addtok (OR);
             }
         }
diff --git a/src/dfa.h b/src/dfa.h
index 7e0674f..ad2b854 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -101,3 +101,11 @@ extern void dfawarn (const char *);
 extern _Noreturn void dfaerror (const char *);
 
 extern int using_utf8 (void);
+
+/* Maximum number of characters that can be the case-folded
+   counterparts of a single character, not counting the character
+   itself.  This is 1 for towupper, 1 for towlower, and 1 for each
+   entry in LONESOME_LOWER; see dfa.c.  */
+enum { CASE_FOLDED_BUFSIZE = 1 + 1 + 19 };
+
+int case_folded_counterparts (wchar_t, wchar_t[CASE_FOLDED_BUFSIZE]);
diff --git a/src/main.c b/src/main.c
index 808e47a..4f8f1ef 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1892,11 +1892,12 @@ trivial_case_ignore (size_t len, char const *keys,
     return false;
 
   /* Worst case is that each byte B of KEYS is ASCII alphabetic and
-     the two other_case(B) characters, C and D, each occupies
-     MB_CUR_MAX bytes, so each B maps to [BCD], which requires 2 *
-     MB_CUR_MAX + 3 bytes; this is bounded above by the constant
-     expression 2 * MB_LEN_MAX + 3.  */
-  *new_keys = xnmalloc (len + 1, 2 * MB_LEN_MAX + 3);
+     CASE_FOLDED_BUFSIZE other_case(B) characters, C through Z, each
+     occupying MB_CUR_MAX bytes, so each B maps to [BC...Z], which
+     requires CASE_FOLDED_BUFSIZE * MB_CUR_MAX + 3 bytes; this is
+     bounded above by the constant expression CASE_FOLDED_BUFSIZE *
+     MB_LEN_MAX + 3.  */
+  *new_keys = xnmalloc (len + 1, CASE_FOLDED_BUFSIZE * MB_LEN_MAX + 3);
   char *p = *new_keys;
 
   mbstate_t mb_state = { 0 };
@@ -1918,9 +1919,9 @@ trivial_case_ignore (size_t len, char const *keys,
       keys += n;
       len -= n;
 
-      wint_t lc = towlower (wc);
-      wint_t uc = towupper (wc);
-      if (lc == wc && uc == wc)
+      wchar_t folded[CASE_FOLDED_BUFSIZE];
+      int nfolded = case_folded_counterparts (wc, folded);
+      if (nfolded <= 0)
         {
           memcpy (p, orig, n);
           p += n;
@@ -1933,20 +1934,18 @@ trivial_case_ignore (size_t len, char const *keys,
           memcpy (p, orig, n);
           p += n;
 
-          if (lc != wc)
-            {
-              size_t lcbytes = WCRTOMB (p, lc, &mb_state);
-              if (lcbytes == (size_t) -1)
-                goto skip_case_ignore_optimization;
-              p += lcbytes;
-            }
-          if (uc != wc)
+          int i = 0;
+          do
             {
-              size_t ucbytes = WCRTOMB (p, uc, &mb_state);
-              if (ucbytes == (size_t) -1 || ! mbsinit (&mb_state))
+              size_t nbytes = WCRTOMB (p, folded[i], &mb_state);
+              if (nbytes == (size_t) -1)
                 goto skip_case_ignore_optimization;
-              p += ucbytes;
+              p += nbytes;
             }
+          while (++i < nfolded);
+
+          if (! mbsinit (&mb_state))
+            goto skip_case_ignore_optimization;
 
           *p++ = ']';
         }
@@ -2351,16 +2350,16 @@ main (int argc, char **argv)
   else
     usage (EXIT_TROUBLE);
 
-  /* As currently implemented, case-insensitive matching is expensive in
-     multi-byte locales because of a few outlier locales in which some
-     characters change size when converted to upper or lower case.  To
-     accommodate those, we revert to searching the input one line at a
-     time, rather than using the much more efficient buffer search.
-     However, if we have a regular expression, /foo/i, we can convert
-     it to an equivalent case-insensitive /[fF][oO][oO]/, and thus
-     avoid the expensive read-and-process-a-line-at-a-time requirement.
-     Optimize-away the "-i" option, when possible, converting each
-     candidate alpha, C, in the regexp to [Cc].  */
+  /* Case-insensitive matching is expensive in multibyte locales
+     because a few characters may change size when converted to upper
+     or lower case.  To accommodate those, search the input one line
+     at a time, rather than using the much more efficient buffer search.
+
+     Try to convert a regular expression 'foo' (ignoring case) to an
+     equivalent regular expression '[fF][oO][oO]' (where case matters).
+     Not only does this avoid the expensive requirement to read and
+     process a line at a time, it also allows use of the kwset engine,
+     a win in non-UTF-8 multibyte locales.  */
   if (match_icase)
     {
       size_t new_keycc;
diff --git a/tests/case-fold-titlecase b/tests/case-fold-titlecase
index 0ece5c8..f16022b 100755
--- a/tests/case-fold-titlecase
+++ b/tests/case-fold-titlecase
@@ -1,5 +1,5 @@
 #!/bin/sh
-# Check that case folding works even with titlecase characters.
+# Check that case folding works even with titlecase and similarly odd chars.
 
 # Copyright 2014 Free Software Foundation, Inc.
 
@@ -25,17 +25,162 @@ export LC_ALL
 
 fail=0
 
-LJ='\307\207' # U+01C7 LATIN CAPITAL LETTER LJ
-Lj='\307\210' # U+01C8 LATIN CAPITAL LETTER L WITH SMALL LETTER J
-lj='\307\211' # U+01C9 LATIN SMALL LETTER LJ
-pattern=$(printf "$Lj\n") || framework_failure_
-printf "$lj$lj\n$Lj$Lj\n$LJ$LJ\n" >in || framework_failure_
+for testcase in \
+  0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
+do
+  case $testcase in
+    0)
+      a='\302\265'     # U+00B5
+      b='\316\234'     # U+039C
+      c='\316\274'     # U+03BC
+      ;;
+    1)
+      a='\111'         # U+0049
+      b='\151'         # U+0069
+      c='\304\260'     # U+0130
+      ;;
+    2)
+      a='\111'         # U+0049
+      b='\151'         # U+0069
+      c='\304\261'     # U+0131
+      ;;
+    3)
+      a='\123'         # U+0053
+      b='\163'         # U+0073
+      c='\305\277'     # U+017F
+      ;;
+    4)
+      a='\307\204'     # U+01C4
+      b='\307\205'     # U+01C5
+      c='\307\206'     # U+01C6
+      ;;
+    5)
+      a='\307\207'     # U+01C7
+      b='\307\210'     # U+01C8
+      c='\307\211'     # U+01C9
+      ;;
+    6)
+      a='\307\212'     # U+01CA
+      b='\307\213'     # U+01CB
+      c='\307\214'     # U+01CC
+      ;;
+    7)
+      a='\307\261'     # U+01F1
+      b='\307\262'     # U+01F2
+      c='\307\263'     # U+01F3
+      ;;
+    8)
+      a='\315\205'     # U+0345
+      b='\316\231'     # U+0399
+      c='\316\271'     # U+03B9
+      ;;
+    9)
+      a='\316\243'     # U+03A3
+      b='\317\202'     # U+03C2
+      c='\317\203'     # U+03C3
+      ;;
+    10)
+      a='\316\222'     # U+0392
+      b='\316\262'     # U+03B2
+      c='\317\220'     # U+03D0
+      ;;
+    11)
+      a='\316\230'     # U+0398
+      b='\316\270'     # U+03B8
+      c='\317\221'     # U+03D1
+      ;;
+    12)
+      a='\316\246'     # U+03A6
+      b='\317\206'     # U+03C6
+      c='\317\225'     # U+03D5
+      ;;
+    13)
+      a='\316\240'     # U+03A0
+      b='\317\200'     # U+03C0
+      c='\317\226'     # U+03D6
+      ;;
+    14)
+      a='\316\232'     # U+039A
+      b='\316\272'     # U+03BA
+      c='\317\260'     # U+03F0
+      ;;
+    15)
+      a='\316\241'     # U+03A1
+      b='\317\201'     # U+03C1
+      c='\317\261'     # U+03F1
+      ;;
+    16)
+      a='\316\230'     # U+0398
+      b='\316\270'     # U+03B8
+      c='\317\264'     # U+03F4
+      ;;
+    17)
+      a='\316\225'     # U+0395
+      b='\316\265'     # U+03B5
+      c='\317\265'     # U+03F5
+      ;;
+    18)
+      a='\341\271\240' # U+1E60
+      b='\341\271\241' # U+1E61
+      c='\341\272\233' # U+1E9B
+      ;;
+    19)
+      a='\303\237'     # U+00DF
+      b='\303\237'     # U+00DF
+      c='\341\272\236' # U+1E9E
+      ;;
+    20)
+      a='\316\231'     # U+0399
+      b='\316\271'     # U+03B9
+      c='\341\276\276' # U+1FBE
+      ;;
+    21)
+      a='\316\251'     # U+03A9
+      b='\317\211'     # U+03C9
+      c='\342\204\246' # U+2126
+      ;;
+    22)
+      a='\113'         # U+004B
+      b='\153'         # U+006B
+      c='\342\204\252' # U+212A
+      ;;
+    23)
+      a='\303\205'     # U+00C5
+      b='\303\245'     # U+00E5
+      c='\342\204\253' # U+212B
+      ;;
+    24)
+      a='\316\243'     # U+03A3
+      b='\317\203'     # U+03C3
+      c='\317\262'     # U+03F2
+      ;;
+  esac
 
-grep -i "$pattern" in >out || fail=1
-compare in out || fail=1
+  printf "$a\\n$b\\n$c\\n" >in || framework_failure_
+  for pattern in "$a" "$b" "$c"; do
+     pat=$(printf "$pattern\\n") || framework_failure_
+     grep -i "\\(\\)\\1$pat" in >out-regex || fail=1
+     grep -i "$pat" in >out-dfa || fail=1
+     compare_ out-regex out-dfa || fail=1
+  done
+done
 
-pattern="($pattern)\\1"
-grep -Ei "$pattern" in >out || fail=1
-compare in out || fail=1
+# Try a unibyte test with ISO 8859-7, if available.
+if test "$(get-mb-cur-max el_GR.iso88597)" -eq 1; then
+  LC_ALL=el_GR.iso88597
+  export LC_ALL
+
+  a='\323' # SIGMA
+  b='\362' # stigma
+  c='\363' # sigma
+
+  printf "$a\\n$b\\n$c\\n" >in || framework_failure_
+  for pattern in "$a" "$b" "$c"; do
+     pat=$(printf "$pattern\\n") || framework_failure_
+     grep -i "\\(\\)\\1$pat" in >out-regex || fail=1
+     grep -i "$pat" in >out-dfa || fail=1
+     compare_ out-regex out-dfa || fail=1
+  done
+fi
 
 Exit $fail

-----------------------------------------------------------------------

Summary of changes:
 src/dfa.c                 |  143 ++++++++++++++++++++++++--------------
 src/dfa.h                 |    8 ++
 src/main.c                |   57 ++++++++--------
 tests/case-fold-titlecase |  167 ++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 282 insertions(+), 93 deletions(-)


hooks/post-receive
-- 
grep
[Prev in Thread]
Current Thread
[Next in Thread]
grep branch, master, updated. v2.18-23-g4f88962, Paul Eggert <=
Prev by Date: grep branch, master, updated. v2.18-22-gbcdf8de
Next by Date: grep branch, master, updated. v2.18-24-g370c374
Previous by thread: grep branch, master, updated. v2.18-22-gbcdf8de
Next by thread: grep branch, master, updated. v2.18-24-g370c374
Index(es):
- Date
- Thread