emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

master ed2def7 1/2: Improve string_char_and_length speed


From: Paul Eggert
Subject: master ed2def7 1/2: Improve string_char_and_length speed
Date: Sun, 26 Apr 2020 22:32:00 -0400 (EDT)

branch: master
commit ed2def7d5e423388ca75c6e10fd7b42e0c4789c7
Author: Paul Eggert <address@hidden>
Commit: Paul Eggert <address@hidden>

    Improve string_char_and_length speed
    
    This tweak improved the CPU time performance of
    ‘make compile-always’ by about 1.7% on my platform.
    * src/character.c (string_char): Remove; no longer used.
    * src/character.h (string_char_and_length): Redo so that it
    needn’t call string_char.  This helps the caller, which can now
    become a leaf function.
---
 src/character.c | 45 ---------------------------------------------
 src/character.h | 47 +++++++++++++++++++++++++++--------------------
 2 files changed, 27 insertions(+), 65 deletions(-)

diff --git a/src/character.c b/src/character.c
index edcec5f..4902e56 100644
--- a/src/character.c
+++ b/src/character.c
@@ -141,51 +141,6 @@ char_string (unsigned int c, unsigned char *p)
 }
 
 
-/* Return a character whose multibyte form is at P.  Set *LEN to the
-   byte length of the multibyte form.  */
-
-int
-string_char (const unsigned char *p, int *len)
-{
-  int c;
-  const unsigned char *saved_p = p;
-
-  if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
-    {
-      /* 1-, 2-, and 3-byte sequences can be handled by the macro.  */
-      c = string_char_advance (&p);
-    }
-  else if (! (*p & 0x08))
-    {
-      /* A 4-byte sequence of this form:
-        11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  */
-      c = ((((p)[0] & 0x7) << 18)
-          | (((p)[1] & 0x3F) << 12)
-          | (((p)[2] & 0x3F) << 6)
-          | ((p)[3] & 0x3F));
-      p += 4;
-    }
-  else
-    {
-      /* A 5-byte sequence of this form:
-
-        111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
-
-        Note that the top 4 `x's are always 0, so shifting p[1] can
-        never exceed the maximum valid character codepoint. */
-      c = (/* (((p)[0] & 0x3) << 24) ... always 0, so no need to shift. */
-          (((p)[1] & 0x3F) << 18)
-          | (((p)[2] & 0x3F) << 12)
-          | (((p)[3] & 0x3F) << 6)
-          | ((p)[4] & 0x3F));
-      p += 5;
-    }
-
-  *len = p - saved_p;
-  return c;
-}
-
-
 /* Translate character C by translation table TABLE.  If no translation is
    found in TABLE, return the untranslated character.  If TABLE is a list,
    elements are char tables.  In that case, recursively translate C by all the
diff --git a/src/character.h b/src/character.h
index 4887473..d4d7750 100644
--- a/src/character.h
+++ b/src/character.h
@@ -85,7 +85,6 @@ enum
 };
 
 extern int char_string (unsigned, unsigned char *);
-extern int string_char (const unsigned char *, int *);
 
 /* UTF-8 encodings.  Use \x escapes, so they are portable to pre-C11
    compilers and can be concatenated with ordinary string literals.  */
@@ -371,33 +370,41 @@ raw_prev_char_len (unsigned char const *p)
 INLINE int
 string_char_and_length (unsigned char const *p, int *length)
 {
-  int c, len;
+  int c = p[0];
+  if (! (c & 0x80))
+    {
+      *length = 1;
+      return c;
+    }
+  eassume (0xC0 <= c);
 
-  if (! (p[0] & 0x80))
+  int d = (c << 6) + p[1] - ((0xC0 << 6) + 0x80);
+  if (! (c & 0x20))
     {
-      len = 1;
-      c = p[0];
+      *length = 2;
+      return d + (c < 0xC2 ? 0x3FFF80 : 0);
     }
-  else if (! (p[0] & 0x20))
+
+  d = (d << 6) + p[2] - ((0x20 << 12) + 0x80);
+  if (! (c & 0x10))
     {
-      len = 2;
-      c = ((((p[0] & 0x1F) << 6)
-           | (p[1] & 0x3F))
-          + (p[0] < 0xC2 ? 0x3FFF80 : 0));
+      *length = 3;
+      eassume (MAX_2_BYTE_CHAR < d && d <= MAX_3_BYTE_CHAR);
+      return d;
     }
-  else if (! (p[0] & 0x10))
+
+  d = (d << 6) + p[3] - ((0x10 << 18) + 0x80);
+  if (! (c & 0x08))
     {
-      len = 3;
-      c = (((p[0] & 0x0F) << 12)
-          | ((p[1] & 0x3F) << 6)
-          | (p[2] & 0x3F));
+      *length = 4;
+      eassume (MAX_3_BYTE_CHAR < d && d <= MAX_4_BYTE_CHAR);
+      return d;
     }
-  else
-    c = string_char (p, &len);
 
-  eassume (0 < len && len <= MAX_MULTIBYTE_LENGTH);
-  *length = len;
-  return c;
+  d = (d << 6) + p[4] - ((0x08 << 24) + 0x80);
+  *length = 5;
+  eassume (MAX_4_BYTE_CHAR < d && d <= MAX_5_BYTE_CHAR);
+  return d;
 }
 
 /* Return the character code of character whose multibyte form is at P.  */



reply via email to

[Prev in Thread] Current Thread [Next in Thread]