From 82d4b2e55203de0fbabc547de9b95b2a6e5018d0 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Mon, 15 Dec 2014 23:40:17 +0900 Subject: [PATCH] dfa: improvement for checking of multibyte character boundary When found single bytes that cannot occur inside a multibyte character we can skip check for multibyte character boundary before the character. The improvement speeds up about 40% for input string which doesn't match even the first part of a pattern. * src/dfa.c (always_character_boundary): Add a new variable. It caches whether each byte can occur inside a multibyte character or not. (dfaalwayscb): Add a new function. (dfacomp): Use it. (skip_remains_mb): If an input character is single bytes that cannot occur inside a multibyte character, skip check for multibyte character boundary until there. --- src/dfa.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/src/dfa.c b/src/dfa.c index 806cb04..b65b7f2 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -451,6 +451,29 @@ struct dfa static void dfamust (struct dfa *dfa); static void regexp (void); +/* True if each byte can not occur inside a multibyte character */ +static bool always_character_boundary[NOTCHAR]; + +static void +dfaalwayscb (void) +{ + int i; + if (using_utf8 ()) + { + for (i = CHAR_MIN; i <= CHAR_MAX; ++i) + { + unsigned char uc = i; + always_character_boundary[uc] = !((uc & 0xc0) ^ 0x80); + } + } + else + { + unsigned char const ucs[] = { '\0', '\n', '\r', '.', '/' }; + for (i = 0; i < sizeof ucs / sizeof ucs[0]; ++i) + always_character_boundary[ucs[i]] = true; + } +} + static void dfambcache (struct dfa *d) { @@ -3273,12 +3296,18 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, Given DFA state d, use mbs_to_wchar to advance MBP until it reaches or exceeds P. If WCP is non-NULL, set *WCP to the final wide character processed, or if no wide character is processed, set it to WEOF. - Both P and MBP must be no larger than END. */ + Both P and MBP must be no larger than END. + + If next character is character boundary, it always reaches P. So if + WCP is NULL, can omit check step by step and immediately return P. */ static unsigned char const * skip_remains_mb (struct dfa *d, unsigned char const *p, unsigned char const *mbp, char const *end, wint_t *wcp) { wint_t wc = WEOF; + if (wcp == NULL && always_character_boundary[*p]) + return p; + while (mbp < p) mbp += mbs_to_wchar (&wc, (char const *) mbp, end - (char const *) mbp, d); @@ -3713,6 +3742,7 @@ void dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) { dfainit (d); + dfaalwayscb (); dfambcache (d); dfaparse (s, len, d); dfamust (d); -- 2.2.0