>From 8952431b790b409f4ef2ffdcb564475160548c50 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Wed, 9 Sep 2020 12:43:11 -0700 Subject: [PATCH] grep: fix -w bug in UTF-8 locales Problem reported by Mayo Fark (Bug#43225). * src/searchutils.c (wordchar_prev): In a UTF-8 locale, do not assume that an encoding-error byte cannot be part of a word constituent, as this assumption is incorrect for the last byte of a multibyte word constituent. * tests/word-delim-multibyte: Add a test for the bug. --- NEWS | 4 ++++ src/searchutils.c | 2 +- tests/word-delim-multibyte | 8 ++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index acd95dd..28c7835 100644 --- a/NEWS +++ b/NEWS @@ -11,6 +11,10 @@ GNU grep NEWS -*- outline -*- ** Bug fixes + In UTF-8 locales, grep -w no longer ignores a multibyte word + constituent just before what would otherwise be a word match. + [Bug#43225 introduced in grep 2.28] + A performance regression with many duplicate patterns has been fixed. [Bug#43040 introduced in grep 3.4] diff --git a/src/searchutils.c b/src/searchutils.c index 84c319c..c4bb802 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -195,7 +195,7 @@ wordchar_prev (char const *buf, char const *cur, char const *end) return 0; unsigned char b = *--cur; if (! 
localeinfo.multibyte - || (localeinfo.using_utf8 && localeinfo.sbclen[b] != -2)) + || (localeinfo.using_utf8 && localeinfo.sbclen[b] == 1)) return sbwordchar[b]; char const *p = buf; cur -= mb_goback (&p, NULL, cur, end); diff --git a/tests/word-delim-multibyte b/tests/word-delim-multibyte index 7d2c433..31190ad 100755 --- a/tests/word-delim-multibyte +++ b/tests/word-delim-multibyte @@ -34,4 +34,12 @@ for locale in C en_US.UTF-8; do compare /dev/null err || fail=1 done +# Bug#43225 +printf 'a \303\255cone b\n' >in +for flag in '' -i; do + returns_ 1 env LC_ALL=en_US.UTF-8 grep -w $flag cone in >out 2>err || fail=1 + compare /dev/null out || fail=1 + compare /dev/null err || fail=1 +done + Exit $fail -- 2.17.1