From cb1e339f576fa3a8431dd544edfc1d3b7792ddf2 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Wed, 26 Mar 2014 08:56:50 -0700 Subject: [PATCH] grep: perform the kwset-helping DFA match in narrower range When kwsexec gives us the offset of a potential match, we compute line begin/end and then run the DFA matcher to see if there really is a match on that line. When the beginning of the line, BEG, is not on a multibyte character boundary, advance BEG until it is on such a boundary, before running the DFA search. * src/dfasearch.c (EGexecute): As above. Add a comment. * tests/euc-mb: Add a test case that exercises this code. This addresses http://debbugs.gnu.org/17095. --- src/dfasearch.c | 8 +++++++- tests/euc-mb | 11 +++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/dfasearch.c b/src/dfasearch.c index 0b56960..d098a9b 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -236,6 +236,7 @@ EGexecute (char const *buf, size_t size, size_t *match_size, match = beg; while (beg > buf && beg[-1] != eol) --beg; + char const *dfa_start = beg; if (kwsm.index < kwset_exact_matches) { if (!MBS_SUPPORT) @@ -247,8 +248,13 @@ EGexecute (char const *buf, size_t size, size_t *match_size, || !is_mb_middle (&mb_start, match, buflim, kwsm.size[0])) goto success; + /* The matched line starts in the middle of a multibyte + character. Perform the DFA search starting from the + beginning of the next character. */ + dfa_start = mb_start; } - if (dfaexec (dfa, beg, (char *) end, 0, NULL, &backref) == NULL) + if (dfaexec (dfa, dfa_start, (char *) end, 0, NULL, + &backref) == NULL) continue; } else diff --git a/tests/euc-mb b/tests/euc-mb index c0af220..f44253f 100755 --- a/tests/euc-mb +++ b/tests/euc-mb @@ -30,7 +30,14 @@ fail=0 # Does EUC-JP work at all? make_input BABA |euc_grep AB && fail=1 -# Whole line rejected after matching in the middle of a multibyte char? 
-make_input BABAAB |euc_grep AB || fail=1 +# Here are two cases in which a KWSet search matches in the middle +# of a multibyte character. The first ensures that the DFA matcher +# finds the real match at the end of line. The second ensures that +# while the KWSet match found a false positive, the DFA matcher +# determines there is no match after all. +make_input BABAAB |euc_grep AB > out || fail=1 +make_input BABAAB > exp || framework_failure_ +compare out exp || fail=1 +make_input BABABA |euc_grep AB; test $? = 1 || fail=1 Exit $fail -- 1.9.0.258.g00eda23