[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: speeding up `wc -m`
From: |
Bruno Haible |
Subject: |
Re: speeding up `wc -m` |
Date: |
Sun, 24 Jun 2018 16:24:04 +0200 |
User-agent: |
KMail/5.1.3 (Linux/4.4.0-128-generic; KDE/5.18.0; x86_64; ; ) |
Pádraig Brady wrote:
> I'm going to apply my whar-single module to gnulib to tweak
> it so the main bottleneck of calling locale_charset repeatedly
> is removed from wcwidth() and mbrtowc(), in a simple manner,
> without the need for another API.
This patch has two problems:
1) It introduces a test suite failure: A gnulib testdir
./gnulib-tool --create-testdir --dir=... --single-configure mbrtowc wcwidth
wchar-single
fails the test-wcwidth test (test-wcwidth.c:59) on Mac OS X.
2) It introduces a multithread-safety bug: You added 'static' variables
also to the normal (non-singlethreaded) code path. Now, when you have
two threads which operate in different locales (via uselocale()) and
1. thread1 does encoding = locale_charset ();
2. thread2 does encoding = locale_charset ();
3. thread1 does utf8_charset = STREQ_OPT (encoding, ...);
you've got a bug.
To eliminate such kinds of bugs, it is better to structure the code as
follows:
- A function that returns that value that can be cached.
- A function that does the caching AND NOTHING ELSE. So that it can be
#ifdefed
out easily.
- Code that uses the caching function. Do not add complexity here!
2018-06-24 Bruno Haible <address@hidden>
wchar-single: Fix test failure in wcwidth tests.
* tests/test-wcwidth.c (main): If the wchar-single module is present,
skip the tests in the C locale.
diff --git a/tests/test-wcwidth.c b/tests/test-wcwidth.c
index f0eb7ab..4aa6ab7 100644
--- a/tests/test-wcwidth.c
+++ b/tests/test-wcwidth.c
@@ -35,10 +35,12 @@ main ()
{
wchar_t wc;
-#ifdef C_CTYPE_ASCII
+#if !GNULIB_WCHAR_SINGLE
+# ifdef C_CTYPE_ASCII
/* Test width of ASCII characters. */
for (wc = 0x20; wc < 0x7F; wc++)
ASSERT (wcwidth (wc) == 1);
+# endif
#endif
/* Switch to an UTF-8 locale. */
2018-06-24 Bruno Haible <address@hidden>
mbrtowc, wcwidth: Fix MT-safety bug (regression from 2018-06-23).
* lib/mbrtowc.c (enc_t): New enum type.
(locale_enc, locale_enc_cached): New functions.
(mbrtowc): Eliminate static variables. Use locale_enc_cached instead.
* lib/wcwidth.c (is_locale_utf8, is_locale_utf8_cached): New functions.
(wcwidth): Eliminate static variables. Use is_locale_utf8_cached
instead.
* m4/mbrtowc.m4 (gl_PREREQ_MBRTOWC): Require AC_C_INLINE.
* m4/wcwidth.m4 (gl_PREREQ_WCWIDTH): New macro.
* modules/wcwidth (configure.ac): Invoke it.
diff --git a/lib/mbrtowc.c b/lib/mbrtowc.c
index dc3597f..2f6df28 100644
--- a/lib/mbrtowc.c
+++ b/lib/mbrtowc.c
@@ -35,12 +35,60 @@
# include "streq.h"
# include "verify.h"
-#ifndef FALLTHROUGH
-# if __GNUC__ < 7
-# define FALLTHROUGH ((void) 0)
-# else
-# define FALLTHROUGH __attribute__ ((__fallthrough__))
+# ifndef FALLTHROUGH
+# if __GNUC__ < 7
+# define FALLTHROUGH ((void) 0)
+# else
+# define FALLTHROUGH __attribute__ ((__fallthrough__))
+# endif
# endif
+
+/* Returns a classification of special values of the encoding of the current
+ locale. */
+typedef enum {
+ enc_other, /* other */
+ enc_utf8, /* UTF-8 */
+ enc_eucjp, /* EUC-JP */
+ enc_94, /* EUC-KR, GB2312, BIG5 */
+ enc_euctw, /* EUC-TW */
+ enc_gb18030, /* GB18030 */
+ enc_sjis /* SJIS */
+} enc_t;
+static inline enc_t
+locale_enc (void)
+{
+ const char *encoding = locale_charset ();
+ if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
+ return enc_utf8;
+ if (STREQ_OPT (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
+ return enc_eucjp;
+ if (STREQ_OPT (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
+ || STREQ_OPT (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
+ || STREQ_OPT (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
+ return enc_94;
+ if (STREQ_OPT (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
+ return enc_euctw;
+ if (STREQ_OPT (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
+ return enc_gb18030;
+ if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
+ return enc_sjis;
+ return enc_other;
+}
+
+#if GNULIB_WCHAR_SINGLE
+/* When we know that the locale does not change, provide a speedup by
+ caching the value of locale_enc. */
+static int cached_locale_enc = -1;
+static inline enc_t
+locale_enc_cached (void)
+{
+ if (cached_locale_enc < 0)
+ cached_locale_enc = locale_enc ();
+ return cached_locale_enc;
+}
+#else
+/* By default, don't make assumptions, hence no caching. */
+# define locale_enc_cached locale_enc
#endif
verify (sizeof (mbstate_t) >= 4);
@@ -137,20 +185,9 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t
*ps)
if (m >= 4 || m >= MB_CUR_MAX)
goto invalid;
/* Here MB_CUR_MAX > 1 and 0 < m < 4. */
- {
- static int utf8_charset = -1;
- static const char *encoding;
-
-# if GNULIB_WCHAR_SINGLE
- if (utf8_charset == -1)
-# endif
- {
- encoding = locale_charset ();
- utf8_charset = STREQ_OPT (encoding,
- "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0,
0 ,0);
- }
-
- if (utf8_charset)
+ switch (locale_enc_cached ())
+ {
+ case enc_utf8: /* UTF-8 */
{
/* Cf. unistr/u8-mblen.c. */
unsigned char c = (unsigned char) p[0];
@@ -207,8 +244,7 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t
*ps)
/* As a reference for this code, you can use the GNU libiconv
implementation. Look for uses of the RET_TOOFEW macro. */
- if (STREQ_OPT (encoding,
- "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
+ case enc_eucjp: /* EUC-JP */
{
if (m == 1)
{
@@ -231,12 +267,8 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t
*ps)
}
goto invalid;
}
- if (STREQ_OPT (encoding,
- "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
- || STREQ_OPT (encoding,
- "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
- || STREQ_OPT (encoding,
- "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
+
+ case enc_94: /* EUC-KR, GB2312, BIG5 */
{
if (m == 1)
{
@@ -247,8 +279,8 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t
*ps)
}
goto invalid;
}
- if (STREQ_OPT (encoding,
- "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
+
+ case enc_euctw: /* EUC-TW */
{
if (m == 1)
{
@@ -266,8 +298,8 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t
*ps)
}
goto invalid;
}
- if (STREQ_OPT (encoding,
- "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
+
+ case enc_gb18030: /* GB18030 */
{
if (m == 1)
{
@@ -300,7 +332,8 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t
*ps)
}
goto invalid;
}
- if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
+
+ case enc_sjis: /* SJIS */
{
if (m == 1)
{
@@ -313,9 +346,10 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t
*ps)
goto invalid;
}
- /* An unknown multibyte encoding. */
- goto incomplete;
- }
+ default:
+ /* An unknown multibyte encoding. */
+ goto incomplete;
+ }
incomplete:
{
diff --git a/lib/wcwidth.c b/lib/wcwidth.c
index d03a50f..d33b6a9 100644
--- a/lib/wcwidth.c
+++ b/lib/wcwidth.c
@@ -26,28 +26,40 @@
#include "streq.h"
#include "uniwidth.h"
-int
-wcwidth (wchar_t wc)
-#undef wcwidth
+/* Returns 1 if the current locale is an UTF-8 locale, 0 otherwise. */
+static inline int
+is_locale_utf8 (void)
{
- static int utf8_charset = -1;
- static const char *encoding;
+ const char *encoding = locale_charset ();
+ return STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0);
+}
#if GNULIB_WCHAR_SINGLE
- if (utf8_charset == -1)
+/* When we know that the locale does not change, provide a speedup by
+ caching the value of is_locale_utf8. */
+static int cached_is_locale_utf8 = -1;
+static inline int
+is_locale_utf8_cached (void)
+{
+ if (cached_is_locale_utf8 < 0)
+ cached_is_locale_utf8 = is_locale_utf8 ();
+ return cached_is_locale_utf8;
+}
+#else
+/* By default, don't make assumptions, hence no caching. */
+# define is_locale_utf8_cached is_locale_utf8
#endif
- {
- encoding = locale_charset ();
- utf8_charset = STREQ_OPT (encoding,
- "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0);
- }
+int
+wcwidth (wchar_t wc)
+#undef wcwidth
+{
/* In UTF-8 locales, use a Unicode aware width function. */
- if (utf8_charset)
+ if (is_locale_utf8_cached ())
{
/* We assume that in a UTF-8 locale, a wide character is the same as a
Unicode character. */
- return uc_width (wc, encoding);
+ return uc_width (wc, "UTF-8");
}
else
{
diff --git a/m4/mbrtowc.m4 b/m4/mbrtowc.m4
index f789875..c706d04 100644
--- a/m4/mbrtowc.m4
+++ b/m4/mbrtowc.m4
@@ -1,4 +1,4 @@
-# mbrtowc.m4 serial 30 -*- coding: utf-8 -*-
+# mbrtowc.m4 serial 31 -*- coding: utf-8 -*-
dnl Copyright (C) 2001-2002, 2004-2005, 2008-2018 Free Software Foundation,
dnl Inc.
dnl This file is free software; the Free Software Foundation
@@ -634,6 +634,7 @@ AC_DEFUN([gl_MBRTOWC_C_LOCALE],
# Prerequisites of lib/mbrtowc.c.
AC_DEFUN([gl_PREREQ_MBRTOWC], [
+ AC_REQUIRE([AC_C_INLINE])
:
])
diff --git a/m4/wcwidth.m4 b/m4/wcwidth.m4
index 0605ce8..1cd489b 100644
--- a/m4/wcwidth.m4
+++ b/m4/wcwidth.m4
@@ -1,4 +1,4 @@
-# wcwidth.m4 serial 26
+# wcwidth.m4 serial 27
dnl Copyright (C) 2006-2018 Free Software Foundation, Inc.
dnl This file is free software; the Free Software Foundation
dnl gives unlimited permission to copy and/or distribute it,
@@ -115,3 +115,9 @@ changequote([,])dnl
dnl We don't substitute HAVE_WCWIDTH. We assume that if the system does not
dnl have the wcwidth function, then it does not declare it.
])
+
+# Prerequisites of lib/wcwidth.c.
+AC_DEFUN([gl_PREREQ_WCWIDTH], [
+ AC_REQUIRE([AC_C_INLINE])
+ :
+])
diff --git a/modules/wcwidth b/modules/wcwidth
index 5a27713..372c210 100644
--- a/modules/wcwidth
+++ b/modules/wcwidth
@@ -19,6 +19,7 @@ configure.ac:
gl_FUNC_WCWIDTH
if test $HAVE_WCWIDTH = 0 || test $REPLACE_WCWIDTH = 1; then
AC_LIBOBJ([wcwidth])
+ gl_PREREQ_WCWIDTH
fi
gl_WCHAR_MODULE_INDICATOR([wcwidth])