From f1a3831c32850859cd5faddb1749c095a89a2a84 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Fri, 28 Feb 2014 22:46:02 -0800 Subject: [PATCH] grep: fix bugs with -i and titlecase * NEWS: Document this. * src/dfa.c (setbit_wc): Simplify. (setbit_c): Remove; no longer used. (setbit_case_fold_c, parse_bracket_exp, atom): Don't mishandle titlecase. For 'atom', this removes the need for the refactoring of Bug#16729. (lex): Use the slower approach only for letters that have a differing case. * tests/case-fold-titlecase: New file. * tests/Makefile.am (TESTS): Add it. --- NEWS | 5 ++ src/dfa.c | 159 +++++++++++++++++++++++----------------------- tests/Makefile.am | 1 + tests/case-fold-titlecase | 41 ++++++++++++ 4 files changed, 127 insertions(+), 79 deletions(-) create mode 100755 tests/case-fold-titlecase diff --git a/NEWS b/NEWS index 6cfcaba..4b1364c 100644 --- a/NEWS +++ b/NEWS @@ -19,6 +19,11 @@ GNU grep NEWS -*- outline -*- echo a@@a| grep -w @@ would not. Now, they both fail to match, per the documentation on how grep's -w works. + grep -i no longer mishandles patterns containing titlecase characters. + For example, in a locale containing the titlecase character + 'Lj' (U+01C8 LATIN CAPITAL LETTER L WITH SMALL LETTER J), + 'grep -i Lj' now matches 'LJ' (U+01C7 LATIN CAPITAL LETTER LJ). + * Noteworthy changes in release 2.18 (2014-02-20) [stable] diff --git a/src/dfa.c b/src/dfa.c index 4708895..b3d9da8 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -694,42 +694,27 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) this may happen when folding case in weird Turkish locales where dotless i/dotted I are not included in the chosen character set. Return whether a bit was set in the charclass. 
*/ -#if MBS_SUPPORT static bool setbit_wc (wint_t wc, charclass c) { +#if MBS_SUPPORT int b = wctob (wc); if (b == EOF) return false; setbit (b, c); return true; -} - -/* Set a bit in the charclass for the given single byte character, - if it is valid in the current character set. */ -static void -setbit_c (int b, charclass c) -{ - /* Do nothing if b is invalid in this character set. */ - if (MB_CUR_MAX > 1 && btowc (b) == WEOF) - return; - setbit (b, c); -} #else -# define setbit_c setbit -static inline bool -setbit_wc (wint_t wc, charclass c) -{ abort (); /*NOTREACHED*/ return false; -} #endif +} -/* Like setbit_c, but if case is folded, set both cases of a letter. For - MB_CUR_MAX > 1, the resulting charset is only used as an optimization, - and the caller takes care of setting the appropriate field of struct - mb_char_classes. */ +/* Set a bit for B in the charclass C, if B is a valid single byte + character in the current character set. If case is folded, set B's + lower and upper case variants similarly. If MB_CUR_MAX > 1, the + resulting charset is used only as an optimization, and the caller + should set the appropriate field of struct mb_char_classes. */ static void setbit_case_fold_c (int b, charclass c) { @@ -738,16 +723,21 @@ setbit_case_fold_c (int b, charclass c) wint_t wc = btowc (b); if (wc == WEOF) return; - setbit (b, c); - if (case_fold && iswalpha (wc)) - setbit_wc (iswupper (wc) ? towlower (wc) : towupper (wc), c); + if (case_fold) + { + setbit_wc (towlower (wc), c); + setbit_wc (towupper (wc), c); + } } else { - setbit (b, c); - if (case_fold && isalpha (b)) - setbit_c (isupper (b) ? tolower (b) : toupper (b), c); + if (case_fold) + { + setbit (tolower (b), c); + setbit (toupper (b), c); + } } + setbit (b, c); } @@ -1104,52 +1094,51 @@ parse_bracket_exp (void) c2 = ']'; } - if (c2 == ']') + if (c2 != ']') { - /* In the case [x-], the - is an ordinary hyphen, - which is left in c1, the lookahead character. 
*/ - lexptr -= cur_mb_len; - lexleft += cur_mb_len; - } - } - - if (c1 == '-' && c2 != ']') - { - if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) - FETCH_WC (c2, wc2, _("unbalanced [")); + if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) + FETCH_WC (c2, wc2, _("unbalanced [")); - if (MB_CUR_MAX > 1) - { - /* When case folding map a range, say [m-z] (or even [M-z]) - to the pair of ranges, [m-z] [M-Z]. */ - REALLOC_IF_NECESSARY (work_mbc->range_sts, - range_sts_al, work_mbc->nranges + 1); - REALLOC_IF_NECESSARY (work_mbc->range_ends, - range_ends_al, work_mbc->nranges + 1); - work_mbc->range_sts[work_mbc->nranges] = - case_fold ? towlower (wc) : (wchar_t) wc; - work_mbc->range_ends[work_mbc->nranges++] = - case_fold ? towlower (wc2) : (wchar_t) wc2; - - if (case_fold && (iswalpha (wc) || iswalpha (wc2))) + if (MB_CUR_MAX > 1) { + /* When case folding map a range, say [m-z] (or even [M-z]) + to the pair of ranges, [m-z] [M-Z]. Although this code + is wrong in multiple ways, it's never used in practice. + FIXME: Remove this (and related) unused code. */ REALLOC_IF_NECESSARY (work_mbc->range_sts, range_sts_al, work_mbc->nranges + 1); - work_mbc->range_sts[work_mbc->nranges] = towupper (wc); REALLOC_IF_NECESSARY (work_mbc->range_ends, range_ends_al, work_mbc->nranges + 1); - work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2); + work_mbc->range_sts[work_mbc->nranges] = + case_fold ? towlower (wc) : (wchar_t) wc; + work_mbc->range_ends[work_mbc->nranges++] = + case_fold ? 
towlower (wc2) : (wchar_t) wc2; + + if (case_fold && (iswalpha (wc) || iswalpha (wc2))) + { + REALLOC_IF_NECESSARY (work_mbc->range_sts, + range_sts_al, work_mbc->nranges + 1); + work_mbc->range_sts[work_mbc->nranges] = towupper (wc); + REALLOC_IF_NECESSARY (work_mbc->range_ends, + range_ends_al, work_mbc->nranges + 1); + work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2); + } } + else if (using_simple_locale ()) + for (; c <= c2; c++) + setbit_case_fold_c (c, ccl); + else + known_bracket_exp = false; + + colon_warning_state |= 8; + FETCH_WC (c1, wc1, _("unbalanced [")); + continue; } - else if (using_simple_locale ()) - for (; c <= c2; c++) - setbit_case_fold_c (c, ccl); - else - known_bracket_exp = false; - colon_warning_state |= 8; - FETCH_WC (c1, wc1, _("unbalanced [")); - continue; + /* In the case [x-], the - is an ordinary hyphen, + which is left in c1, the lookahead character. */ + lexptr -= cur_mb_len; + lexleft += cur_mb_len; } colon_warning_state |= (c == ':') ? 2 : 4; @@ -1160,16 +1149,22 @@ parse_bracket_exp (void) continue; } - if (case_fold && iswalpha (wc)) + if (case_fold) { - wc = towlower (wc); - if (!setbit_wc (wc, ccl)) + wint_t folded = towlower (wc); + if (folded != wc && !setbit_wc (folded, ccl)) + { + REALLOC_IF_NECESSARY (work_mbc->chars, chars_al, + work_mbc->nchars + 1); + work_mbc->chars[work_mbc->nchars++] = folded; + } + folded = towupper (wc); + if (folded != wc && !setbit_wc (folded, ccl)) { REALLOC_IF_NECESSARY (work_mbc->chars, chars_al, work_mbc->nchars + 1); - work_mbc->chars[work_mbc->nchars++] = wc; + work_mbc->chars[work_mbc->nchars++] = folded; } - wc = towupper (wc); } if (!setbit_wc (wc, ccl)) { @@ -1515,7 +1510,7 @@ lex (void) if (MB_CUR_MAX > 1) return lasttok = WCHAR; - if (case_fold && isalpha (c)) + if (case_fold && (tolower (c) != c || toupper (c) != c)) { zeroset (ccl); setbit_case_fold_c (c, ccl); @@ -1759,17 +1754,23 @@ add_utf8_anychar (void) static void atom (void) { - if (0) + if (MBS_SUPPORT && tok == 
WCHAR) { - /* empty */ - } - else if (MBS_SUPPORT && tok == WCHAR) - { - addtok_wc (case_fold ? towlower (wctok) : wctok); - if (case_fold && iswalpha (wctok)) + addtok_wc (wctok); + if (case_fold) { - addtok_wc (towupper (wctok)); - addtok (OR); + wint_t folded = towlower (wctok); + if (folded != wctok) + { + addtok_wc (folded); + addtok (OR); + } + folded = towupper (wctok); + if (folded != wctok) + { + addtok_wc (folded); + addtok (OR); + } } tok = lex (); diff --git a/tests/Makefile.am b/tests/Makefile.am index 972ffc5..219e96a 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -47,6 +47,7 @@ TESTS = \ case-fold-char-class \ case-fold-char-range \ case-fold-char-type \ + case-fold-titlecase \ char-class-multibyte \ char-class-multibyte2 \ dfa-coverage \ diff --git a/tests/case-fold-titlecase b/tests/case-fold-titlecase new file mode 100755 index 0000000..0ece5c8 --- /dev/null +++ b/tests/case-fold-titlecase @@ -0,0 +1,41 @@ +#!/bin/sh +# Check that case folding works even with titlecase characters. + +# Copyright 2014 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +. 
"${srcdir=.}/init.sh"; path_prepend_ ../src + +require_en_utf8_locale_ +require_compiled_in_MB_support +LC_ALL=en_US.UTF-8 +export LC_ALL + +fail=0 + +LJ='\307\207' # U+01C7 LATIN CAPITAL LETTER LJ +Lj='\307\210' # U+01C8 LATIN CAPITAL LETTER L WITH SMALL LETTER J +lj='\307\211' # U+01C9 LATIN SMALL LETTER LJ +pattern=$(printf "$Lj\n") || framework_failure_ +printf "$lj$lj\n$Lj$Lj\n$LJ$LJ\n" >in || framework_failure_ + +grep -i "$pattern" in >out || fail=1 +compare in out || fail=1 + +pattern="($pattern)\\1" +grep -Ei "$pattern" in >out || fail=1 +compare in out || fail=1 + +Exit $fail -- 1.8.5.3