From bfa9df03034ccfb65da9950cf1e1207faef1213c Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka
Date: Sun, 7 Dec 2014 20:16:41 +0900
Subject: [PATCH 2/2] dfa: remove word delimiter support for multibyte locales

DFA supports word delimiter expressions, but it does not behave
correctly for multibyte locales. Even if it were to be fixed, the
DFA matcher's performance would be no better than that of regex.
Thus, this change removes DFA support for word delimiter expressions
in multibyte locales.

* src/dfa.c (dfa_supported): Return false also when a pattern uses
any word delimiter expression in a multibyte locale.
---
 src/dfa.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/dfa.c b/src/dfa.c
index a28404b..d1e76e1 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3358,6 +3358,7 @@ skip_remains_mb (struct dfa *d, unsigned char const *p,
    Here is the list of features that make this DFA matcher punt:
     - [M-N]-range-in-MB-locale: regex is up to 25% faster on [a-z]
     - back-reference: (.)\1
+    - word-delimiter-in-MB-locale: \<, \>, \b
     */
 static inline char *
 dfaexec_main (struct dfa *d, char const *begin, char *end, int allow_nl,
@@ -3645,6 +3646,14 @@ dfa_supported (struct dfa const *d)
     {
       switch (d->tokens[i])
         {
+        case BEGWORD:
+        case ENDWORD:
+        case LIMWORD:
+        case NOTLIMWORD:
+          if (!d->multibyte)
+            continue;
+          /* fallthrough */
+
         case BACKREF:
         case MBCSET:
           return false;
-- 
2.3.7