bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Fix mbscasestr test failure on native Windows with MSVC


From: Bruno Haible
Subject: Fix mbscasestr test failure on native Windows with MSVC
Date: Sun, 01 Sep 2024 13:11:10 +0200

On native Windows, in the UTF-8 locale, towlower and towupper don't even
support case mappings between ISO-8859-1 characters.

How to reproduce:
======================= foo.c =======================
#include <stdio.h>
#include <locale.h>
#include <wchar.h>
#include <wctype.h>
/* Reproducer: request the locale "French_France.65001", then print what
   towlower and towupper return for 0x00C9 (LATIN CAPITAL LETTER E WITH
   ACUTE) and 0x00E9 (LATIN SMALL LETTER E WITH ACUTE).  The command-line
   arguments are not used.  */
int main (int argc, char *argv[])
{
  const char *locale_name = setlocale (LC_ALL, "French_France.65001");
  wchar_t lower_of_capital;
  wchar_t lower_of_small;
  wchar_t upper_of_capital;
  wchar_t upper_of_small;

  /* Show the locale name only when setlocale did not return NULL.  */
  if (locale_name != NULL)
    printf ("-> %s\n", locale_name);

  /* Lowercase mappings.  Each result is stored as a wchar_t and then
     converted to unsigned int for the %04X conversion.  */
  lower_of_capital = towlower (0x00C9);
  printf ("towlower(0x00C9) = 0x%04X\n", (unsigned int) lower_of_capital);

  lower_of_small = towlower (0x00E9);
  printf ("towlower(0x00E9) = 0x%04X\n", (unsigned int) lower_of_small);

  /* Uppercase mappings, printed the same way.  */
  upper_of_capital = towupper (0x00C9);
  printf ("towupper(0x00C9) = 0x%04X\n", (unsigned int) upper_of_capital);

  upper_of_small = towupper (0x00E9);
  printf ("towupper(0x00E9) = 0x%04X\n", (unsigned int) upper_of_small);
}
=====================================================
Output:
-> French_France.utf8
towlower(0x00C9) = 0x00C9
towlower(0x00E9) = 0x00E9
towupper(0x00C9) = 0x00C9
towupper(0x00E9) = 0x00E9

Whereas in an 8-bit locale, it works as expected.
Output with "French_France.1252":
-> French_France.1252
towlower(0x00C9) = 0x00E9
towlower(0x00E9) = 0x00E9
towupper(0x00C9) = 0x00C9
towupper(0x00E9) = 0x00C9

This is the cause of a test failure that I see with MSVC:

FAIL: test-mbscasestr2.sh
=========================

C:\cygwin64\home\bruno\testdir-all-for-mingw\gltests\test-mbscasestr2.c:56: assertion 'result == input + 19' failed

This patch fixes it.


2024-09-01  Bruno Haible  <bruno@clisp.org>

        Fix mbscasestr test failure on native Windows with MSVC.
        * lib/c32to-impl.h (FUNC): On native Windows, ignore the system's
        towlower/towupper function entirely.
        * tests/test-c32tolower.c (main): On native Windows, reenable test that
        previously failed.
        * tests/test-c32toupper.c (main): Likewise. Disable two other tests on
        native Windows.
        * doc/posix-functions/towlower.texi: Mention bug in the native Windows
        UTF-8 locale.
        * doc/posix-functions/towupper.texi: Likewise.

diff --git a/doc/posix-functions/towlower.texi b/doc/posix-functions/towlower.texi
index 3ac7336ed0..500c8fff1c 100644
--- a/doc/posix-functions/towlower.texi
+++ b/doc/posix-functions/towlower.texi
@@ -27,6 +27,9 @@
 @code{c32tolower}, operates on 32-bit wide characters and therefore does not
 have this limitation.
 @item
+On native Windows, in an UTF-8 locale, this function does not even do
+the simple expected mappings, such as from 0x00C9 to 0x00E9.
+@item
 This function returns wrong values even for the ASCII characters
 in a zh_CN.GB18030 locale on some platforms:
 @c https://gnats.netbsd.org/cgi-bin/query-pr-single.pl?number=57339
diff --git a/doc/posix-functions/towupper.texi b/doc/posix-functions/towupper.texi
index 4ce05b946e..860b7ae438 100644
--- a/doc/posix-functions/towupper.texi
+++ b/doc/posix-functions/towupper.texi
@@ -27,6 +27,9 @@
 @code{c32toupper}, operates on 32-bit wide characters and therefore does not
 have this limitation.
 @item
+On native Windows, in an UTF-8 locale, this function does not even do
+the simple expected mappings, such as from 0x00E9 to 0x00C9.
+@item
 This function returns wrong values even for the ASCII characters
 in a zh_CN.GB18030 locale on some platforms:
 @c https://gnats.netbsd.org/cgi-bin/query-pr-single.pl?number=57339
diff --git a/lib/c32to-impl.h b/lib/c32to-impl.h
index 32039c612d..2299ab75ba 100644
--- a/lib/c32to-impl.h
+++ b/lib/c32to-impl.h
@@ -73,11 +73,22 @@ FUNC (wint_t wc)
   /* The wchar_t encoding is UTF-16.
      The char32_t encoding is UCS-4.  */
 
+# if defined _WIN32 && !defined __CYGWIN__
+  /* On native Windows, in the UTF-8 locale, towlower and towupper are
+     lacking (at least) the mappings for ISO-8859-1 characters, such as
+     0x00C9 <-> 0x00E9.  Since it is expensive to test whether the locale
+     encoding is UTF-8, ignore the system's WCHAR_FUNC altogether.  */
+  if (wc != WEOF)
+    return UCS_FUNC (wc);
+  else
+    return wc;
+# else
   if (wc == WEOF || wc == (wchar_t) wc)
     /* wc is in the range for the tow* functions.  */
     return WCHAR_FUNC (wc);
   else
     return UCS_FUNC (wc);
+# endif
 
 #else /* macOS, FreeBSD, NetBSD, OpenBSD, HP-UX, Solaris, Minix, Android */
   /* char32_t and wchar_t are equivalent.  */
diff --git a/tests/test-c32tolower.c b/tests/test-c32tolower.c
index eb956b5009..072338bde1 100644
--- a/tests/test-c32tolower.c
+++ b/tests/test-c32tolower.c
@@ -255,12 +255,10 @@ main (int argc, char *argv[])
           mb = for_character ("\302\265", 2);
           ASSERT (mb.nbytes == 2);
           ASSERT (memcmp (mb.buf, "\302\265", 2) == 0);
-        #if !(defined _WIN32 && !defined __CYGWIN__)
           /* U+00C9 LATIN CAPITAL LETTER E WITH ACUTE */
           mb = for_character ("\303\211", 2);
           ASSERT (mb.nbytes == 2);
           ASSERT (memcmp (mb.buf, "\303\251", 2) == 0);
-        #endif
           /* U+00DF LATIN SMALL LETTER SHARP S */
           mb = for_character ("\303\237", 2);
           ASSERT (mb.nbytes == 2);
diff --git a/tests/test-c32toupper.c b/tests/test-c32toupper.c
index 18c3ffddb5..eb9668afff 100644
--- a/tests/test-c32toupper.c
+++ b/tests/test-c32toupper.c
@@ -163,7 +163,7 @@ main (int argc, char *argv[])
           mb = for_character ("\262", 1);
           ASSERT (mb.nbytes == 1);
           ASSERT (memcmp (mb.buf, "\262", 1) == 0);
-        #if !(defined __GLIBC__ || (defined __APPLE__ && defined __MACH__) || defined __FreeBSD__ || defined __NetBSD__ || defined __sun || defined __CYGWIN__)
+        #if !(defined __GLIBC__ || (defined __APPLE__ && defined __MACH__) || defined __FreeBSD__ || defined __NetBSD__ || defined __sun || defined __CYGWIN__ || (defined _WIN32 && !defined __CYGWIN__))
           /* U+00B5 MICRO SIGN */
           mb = for_character ("\265", 1);
           ASSERT (mb.nbytes == 1);
@@ -259,7 +259,7 @@ main (int argc, char *argv[])
           mb = for_character ("\302\262", 2);
           ASSERT (mb.nbytes == 2);
           ASSERT (memcmp (mb.buf, "\302\262", 2) == 0);
-        #if !(defined __GLIBC__ || defined MUSL_LIBC || (defined __APPLE__ && defined __MACH__) || defined __FreeBSD__ || defined __DragonFly__ || defined __NetBSD__ || defined __OpenBSD__ || defined _AIX || defined __sun || defined __CYGWIN__ || defined __ANDROID__)
+        #if !(defined __GLIBC__ || defined MUSL_LIBC || (defined __APPLE__ && defined __MACH__) || defined __FreeBSD__ || defined __DragonFly__ || defined __NetBSD__ || defined __OpenBSD__ || defined _AIX || defined __sun || defined __CYGWIN__ || (defined _WIN32 && !defined __CYGWIN__) || defined __ANDROID__)
           /* U+00B5 MICRO SIGN */
           mb = for_character ("\302\265", 2);
           ASSERT (mb.nbytes == 2);
@@ -275,7 +275,6 @@ main (int argc, char *argv[])
           ASSERT (mb.nbytes == 2);
           ASSERT (memcmp (mb.buf, "\303\237", 2) == 0);
         #endif
-        #if !(defined _WIN32 && !defined __CYGWIN__)
           /* U+00E9 LATIN SMALL LETTER E WITH ACUTE */
           mb = for_character ("\303\251", 2);
           ASSERT (mb.nbytes == 2);
@@ -284,7 +283,6 @@ main (int argc, char *argv[])
           mb = for_character ("\303\277", 2);
           ASSERT (mb.nbytes == 2);
           ASSERT (memcmp (mb.buf, "\305\270", 2) == 0);
-        #endif
           /* U+0141 LATIN CAPITAL LETTER L WITH STROKE */
           mb = for_character ("\305\201", 2);
           ASSERT (mb.nbytes == 2);






reply via email to

[Prev in Thread] Current Thread [Next in Thread]