[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 5/6] grep: tweak mb_goback performance
From: Paul Eggert
Subject: [PATCH 5/6] grep: tweak mb_goback performance
Date: Tue, 24 Aug 2021 00:45:40 -0700
* src/searchutils.c (mb_goback): Set *MBCLEN only in
non-UTF-8 encodings, since that’s the only time it’s needed,
and this lets us see more clearly that the UTF-8 clen value
is not useful to the caller.
---
src/searchutils.c | 16 +++++++++++-----
1 file changed, 11 insertions(+), 5 deletions(-)
diff --git a/src/searchutils.c b/src/searchutils.c
index 03b4c59..f16dd84 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -93,24 +93,25 @@ mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
{
const char *p = *mb_start;
const char *p0 = p;
- size_t clen;
if (cur <= p)
return cur - p;
if (localeinfo.using_utf8)
{
+ /* UTF-8 permits scanning backward to the previous character.
+ Start by assuming CUR is at a character boundary. */
p = cur;
- clen = 1;
if ((*cur & 0xc0) == 0x80)
for (int i = 1; i <= 3; i++)
if ((cur[-i] & 0xc0) != 0x80)
{
mbstate_t mbs = { 0 };
- clen = mb_clen (cur - i, end - (cur - i), &mbs);
+ size_t clen = mb_clen (cur - i, end - (cur - i), &mbs);
if (i < clen && clen <= MB_LEN_MAX)
{
+ /* This multibyte character contains *CUR. */
p0 = cur - i;
p = p0 + clen;
}
@@ -119,7 +120,11 @@ mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
}
else
{
+ /* In non-UTF-8 encodings, to find character boundaries one must
+ in general scan forward from the start of the buffer. */
mbstate_t mbs = { 0 };
+ size_t clen;
+
do
{
clen = mb_clen (p, end - p, &mbs);
@@ -135,11 +140,12 @@ mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
p += clen;
}
while (p < cur);
+
+ if (mbclen)
+ *mbclen = clen;
}
*mb_start = p;
- if (mbclen)
- *mbclen = clen;
return p == cur ? 0 : cur - p0;
}
--
2.31.1