grep-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 5/6] grep: tweak mb_goback performance


From: Paul Eggert
Subject: [PATCH 5/6] grep: tweak mb_goback performance
Date: Tue, 24 Aug 2021 00:45:40 -0700

* src/searchutils.c (mb_goback): Set *MBCLEN only in
non-UTF-8 encodings, since that’s the only time it’s needed,
and this lets us see more clearly that the UTF-8 clen value
is not useful to the caller.
---
 src/searchutils.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/searchutils.c b/src/searchutils.c
index 03b4c59..f16dd84 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -93,24 +93,25 @@ mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
 {
   const char *p = *mb_start;
   const char *p0 = p;
-  size_t clen;
 
   if (cur <= p)
     return cur - p;
 
   if (localeinfo.using_utf8)
     {
+      /* UTF-8 permits scanning backward to the previous character.
+         Start by assuming CUR is at a character boundary.  */
       p = cur;
-      clen = 1;
 
       if ((*cur & 0xc0) == 0x80)
         for (int i = 1; i <= 3; i++)
           if ((cur[-i] & 0xc0) != 0x80)
             {
               mbstate_t mbs = { 0 };
-              clen = mb_clen (cur - i, end - (cur - i), &mbs);
+              size_t clen = mb_clen (cur - i, end - (cur - i), &mbs);
               if (i < clen && clen <= MB_LEN_MAX)
                 {
+                  /* This multibyte character contains *CUR.  */
                   p0 = cur - i;
                   p = p0 + clen;
                 }
@@ -119,7 +120,11 @@ mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
     }
   else
     {
+      /* In non-UTF-8 encodings, to find character boundaries one must
+         in general scan forward from the start of the buffer.  */
       mbstate_t mbs = { 0 };
+      size_t clen;
+
       do
         {
           clen = mb_clen (p, end - p, &mbs);
@@ -135,11 +140,12 @@ mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
           p += clen;
         }
       while (p < cur);
+
+      if (mbclen)
+        *mbclen = clen;
     }
 
   *mb_start = p;
-  if (mbclen)
-    *mbclen = clen;
   return p == cur ? 0 : cur - p0;
 }
 
-- 
2.31.1




reply via email to

[Prev in Thread] Current Thread [Next in Thread]