>From f11f0c9351fdd2bd65efdb469754096d1a237d61 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Thu, 27 Feb 2014 09:26:23 -0800 Subject: [PATCH] grep: fix multiple bugs with bracket expressions * NEWS: Document this. * src/dfa.c (using_simple_locale): New function. (parse_bracket_exp): Handle bracket expressions like [a-[.z.]] correctly. Don't assume that dfaexec handles expressions like [^a-z] correctly, as they can match multiple characters in some locales. * tests/posix-bracket: New file. * tests/Makefile.am (TESTS): Add it. --- NEWS | 4 ++ src/dfa.c | 129 +++++++++++++++++++++++++++++----------------------- tests/Makefile.am | 1 + tests/posix-bracket | 33 ++++++++++++++ 4 files changed, 110 insertions(+), 57 deletions(-) create mode 100755 tests/posix-bracket diff --git a/NEWS b/NEWS index 657f3d1..6cfcaba 100644 --- a/NEWS +++ b/NEWS @@ -4,6 +4,10 @@ GNU grep NEWS -*- outline -*- ** Bug fixes + grep no longer mishandles patterns like [a-[.z.]], and no longer + mishandles patterns like [^a] in locales that have multicharacter + collating sequences so that [^a] can match a string of two characters. + grep -P now works with -w and -x and backreferences. Before, echo aa|grep -Pw '(.)\1' would fail to match, yet echo aa|grep -Pw '(.)\2' would match. diff --git a/src/dfa.c b/src/dfa.c index 8906ed3..65ab5d6 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -182,7 +182,8 @@ enum EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches the empty string. */ - BACKREF, /* BACKREF is generated by \; it + BACKREF, /* BACKREF is generated by \ + or by any other construct that is not completely handled. If the scanner detects a transition on backref, it returns a kind of "semi-success" indicating that @@ -769,6 +770,45 @@ using_utf8 (void) return utf8; } +/* Return true if the current locale is known to be a unibyte locale + without multicharacter collating sequences and where range + comparisons simply use the native encoding. These locales can be + processed more efficiently. */ + +static bool +using_simple_locale (void) +{ + /* True if the native character set is known to be compatible with + the C locale. The following test isn't perfect, but it's good + enough in practice, as only ASCII and EBCDIC are in common use + and this test correctly accepts ASCII and rejects EBCDIC. */ + enum { native_c_charset = + ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12 + && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35 + && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41 + && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46 + && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59 + && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65 + && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94 + && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124 + && '}' == 125 && '~' == 126) + }; + + if (! native_c_charset || MB_CUR_MAX > 1) + return false; + else + { + static int unibyte_c = -1; + if (unibyte_c < 0) + { + char *locale = setlocale (LC_ALL, 0); + unibyte_c = (locale && (STREQ (locale, "C") + || STREQ (locale, "POSIX"))); + } + return unibyte_c; + } +} + /* Lexical analyzer. All the dross that deals with the obnoxious GNU Regex syntax bits is located here. The poor, suffering reader is referred to the GNU Regex documentation for the @@ -917,6 +957,10 @@ parse_bracket_exp (void) int c, c1, c2; charclass ccl; + /* True if this is a bracket expression that dfaexec is known to + process correctly. */ + bool known_bracket_exp = true; + /* Used to warn about [:space:]. Bit 0 = first character is a colon. Bit 1 = last character is a colon. @@ -958,6 +1002,7 @@ parse_bracket_exp (void) { FETCH_WC (c, wc, _("unbalanced [")); invert = 1; + known_bracket_exp = using_simple_locale (); } else invert = 0; @@ -972,16 +1017,14 @@ parse_bracket_exp (void) we just treat it as a bunch of ordinary characters. We can do this because we assume regex has checked for syntax errors before dfa is ever called. */ - if (c == '[' && (syntax_bits & RE_CHAR_CLASSES)) + if (c == '[') { #define MAX_BRACKET_STRING_LEN 32 char str[MAX_BRACKET_STRING_LEN + 1]; FETCH_WC (c1, wc1, _("unbalanced [")); - /* If pattern contains '[[:', '[[.', or '[[='. */ - if (c1 == ':' - /* TODO: handle '[[.' and '[[=' also for MB_CUR_MAX == 1. */ - || (MB_CUR_MAX > 1 && (c1 == '.' || c1 == '='))) + if ((c1 == ':' && syntax_bits & RE_CHAR_CLASSES) + || c1 == '.' || c1 == '=') { size_t len = 0; for (;;) @@ -1000,7 +1043,10 @@ parse_bracket_exp (void) /* Fetch bracket. */ FETCH_WC (c, wc, _("unbalanced [")); if (c1 == ':') - /* build character class. */ + /* Build character class. POSIX allows character + classes to match multicharacter collating elements, + but the regex code does not support that, so do not + worry about that possibility. */ { char const *class = (case_fold && (STREQ (str, "upper") @@ -1024,28 +1070,9 @@ parse_bracket_exp (void) if (pred->func (c2)) setbit_case_fold_c (c2, ccl); } + else + known_bracket_exp = false; - else if (MBS_SUPPORT && (c1 == '=' || c1 == '.')) - { - char *elem = xmemdup (str, len + 1); - - if (c1 == '=') - /* build equivalence class. */ - { - REALLOC_IF_NECESSARY (work_mbc->equivs, - equivs_al, work_mbc->nequivs + 1); - work_mbc->equivs[work_mbc->nequivs++] = elem; - } - - if (c1 == '.') - /* build collating element. */ - { - REALLOC_IF_NECESSARY (work_mbc->coll_elems, - coll_elems_al, - work_mbc->ncoll_elems + 1); - work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem; - } - } colon_warning_state |= 8; /* Fetch new lookahead character. */ @@ -1067,6 +1094,16 @@ parse_bracket_exp (void) /* build range characters. */ { FETCH_WC (c2, wc2, _("unbalanced [")); + + /* A bracket expression like [a-[.aa.]] matches an unknown set. + Treat it like [-a[.aa.]] while parsing it, and + remember that the set is unknown. */ + if (c2 == '[' && *lexptr == '.') + { + known_bracket_exp = false; + c2 = ']'; + } + if (c2 == ']') { /* In the case [x-], the - is an ordinary hyphen, @@ -1104,36 +1141,11 @@ parse_bracket_exp (void) work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2); } } + else if (using_simple_locale ()) + for (; c <= c2; c++) + setbit_case_fold_c (c, ccl); else - { - /* Defer to the system regex library about the meaning - of range expressions. */ - struct re_pattern_buffer re = { 0 }; - char const *compile_msg; -#if 199901 <= __STDC_VERSION__ - char pattern[] = { '[', '\\', c, '-', '\\', c2, ']' }; -#else - char pattern[] = { '[', '\\', 0, '-', '\\', 0, ']' }; - pattern[2] = c; - pattern[5] = c2; -#endif - re_set_syntax (syntax_bits | RE_BACKSLASH_ESCAPE_IN_LISTS); - compile_msg = re_compile_pattern (pattern, sizeof pattern, &re); - if (compile_msg) - dfaerror (compile_msg); - for (c = 0; c < NOTCHAR; c++) - { - char subject = c; - switch (re_match (&re, &subject, 1, 0, NULL)) - { - case 1: setbit (c, ccl); break; - case -1: break; - default: xalloc_die (); - } - } - regfree (&re); - re_set_syntax (syntax_bits); - } + known_bracket_exp = false; colon_warning_state |= 8; FETCH_WC (c1, wc1, _("unbalanced [")); @@ -1171,6 +1183,9 @@ parse_bracket_exp (void) if (colon_warning_state == 7) dfawarn (_("character class syntax is [[:space:]], not [:space:]")); + if (! known_bracket_exp) + return BACKREF; + if (MB_CUR_MAX > 1) { static charclass zeroclass; diff --git a/tests/Makefile.am b/tests/Makefile.am index 742a580..972ffc5 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -86,6 +86,7 @@ TESTS = \ pcre-w \ pcre-wx-backref \ pcre-z \ + posix-bracket \ prefix-of-multibyte \ r-dot \ repetition-overflow \ diff --git a/tests/posix-bracket b/tests/posix-bracket new file mode 100755 index 0000000..d9d1d84 --- /dev/null +++ b/tests/posix-bracket @@ -0,0 +1,33 @@ +#!/bin/sh +# Check various bracket expressions in the POSIX locale. + +# Copyright 2014 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +. "${srcdir=.}/init.sh"; path_prepend_ ../src +LC_ALL=C +export LC_ALL + +fail=0 + +echo a >in || framework_failure_ +for bracketed in '[.a.]' '[.a.]-a' 'a-[.a.]' '[.a.]-[.a.]' \ + '[=a=]' '[:alpha:]'; do + grep "[$bracketed]" in >out || fail=1 + compare in out || fail=1 + grep "[^$bracketed]" in >out && fail=1 + compare /dev/null out || fail=1 +done +Exit $fail -- 1.8.5.3