[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Fix mbscasestr test failure on native Windows with MSVC
From: |
Bruno Haible |
Subject: |
Fix mbscasestr test failure on native Windows with MSVC |
Date: |
Sun, 01 Sep 2024 13:11:10 +0200 |
On native Windows, in the UTF-8 locale, towlower and towupper don't even
support case mappings between ISO-8859-1 characters.
How to reproduce:
======================= foo.c =======================
#include <stdio.h>
#include <locale.h>
#include <wchar.h>
#include <wctype.h>
/* Reproducer: try to switch to the "French_France.65001" locale, report the
   locale name that setlocale returned (if any), then print the results of
   towlower and towupper for 0x00C9 and 0x00E9, one line per call.  */
int main (int argc, char *argv[])
{
  const char *locale_name = setlocale (LC_ALL, "French_France.65001");
  if (locale_name != NULL)
    printf ("-> %s\n", locale_name);

  /* Lowercase mapping of the upper- and lowercase code points.  */
  wchar_t lower_of_c9 = towlower (0x00C9);
  printf ("towlower(0x00C9) = 0x%04X\n", (unsigned int) lower_of_c9);

  wchar_t lower_of_e9 = towlower (0x00E9);
  printf ("towlower(0x00E9) = 0x%04X\n", (unsigned int) lower_of_e9);

  /* Uppercase mapping of the same two code points.  */
  wchar_t upper_of_c9 = towupper (0x00C9);
  printf ("towupper(0x00C9) = 0x%04X\n", (unsigned int) upper_of_c9);

  wchar_t upper_of_e9 = towupper (0x00E9);
  printf ("towupper(0x00E9) = 0x%04X\n", (unsigned int) upper_of_e9);
}
=====================================================
Output:
-> French_France.utf8
towlower(0x00C9) = 0x00C9
towlower(0x00E9) = 0x00E9
towupper(0x00C9) = 0x00C9
towupper(0x00E9) = 0x00E9
Whereas in an 8-bit locale, it works as expected.
Output with "French_France.1252":
-> French_France.1252
towlower(0x00C9) = 0x00E9
towlower(0x00E9) = 0x00E9
towupper(0x00C9) = 0x00C9
towupper(0x00E9) = 0x00C9
This is the cause of a test failure that I see with MSVC:
FAIL: test-mbscasestr2.sh
=========================
C:\cygwin64\home\bruno\testdir-all-for-mingw\gltests\test-mbscasestr2.c:56:
assertion 'result == input + 19' failed
This patch fixes it.
2024-09-01 Bruno Haible <bruno@clisp.org>
Fix mbscasestr test failure on native Windows with MSVC.
* lib/c32to-impl.h (FUNC): On native Windows, ignore the system's
towlower/towupper function entirely.
* tests/test-c32tolower.c (main): On native Windows, reenable test that
previously failed.
* tests/test-c32toupper.c (main): Likewise. Disable two other tests on
native Windows.
* doc/posix-functions/towlower.texi: Mention bug in the native Windows
UTF-8 locale.
* doc/posix-functions/towupper.texi: Likewise.
diff --git a/doc/posix-functions/towlower.texi b/doc/posix-functions/towlower.texi
index 3ac7336ed0..500c8fff1c 100644
--- a/doc/posix-functions/towlower.texi
+++ b/doc/posix-functions/towlower.texi
@@ -27,6 +27,9 @@
@code{c32tolower}, operates on 32-bit wide characters and therefore does not
have this limitation.
@item
+On native Windows, in an UTF-8 locale, this function does not even do
+the simple expected mappings, such as from 0x00C9 to 0x00E9.
+@item
This function returns wrong values even for the ASCII characters
in a zh_CN.GB18030 locale on some platforms:
@c https://gnats.netbsd.org/cgi-bin/query-pr-single.pl?number=57339
diff --git a/doc/posix-functions/towupper.texi b/doc/posix-functions/towupper.texi
index 4ce05b946e..860b7ae438 100644
--- a/doc/posix-functions/towupper.texi
+++ b/doc/posix-functions/towupper.texi
@@ -27,6 +27,9 @@
@code{c32toupper}, operates on 32-bit wide characters and therefore does not
have this limitation.
@item
+On native Windows, in an UTF-8 locale, this function does not even do
+the simple expected mappings, such as from 0x00E9 to 0x00C9.
+@item
This function returns wrong values even for the ASCII characters
in a zh_CN.GB18030 locale on some platforms:
@c https://gnats.netbsd.org/cgi-bin/query-pr-single.pl?number=57339
diff --git a/lib/c32to-impl.h b/lib/c32to-impl.h
index 32039c612d..2299ab75ba 100644
--- a/lib/c32to-impl.h
+++ b/lib/c32to-impl.h
@@ -73,11 +73,22 @@ FUNC (wint_t wc)
/* The wchar_t encoding is UTF-16.
The char32_t encoding is UCS-4. */
+# if defined _WIN32 && !defined __CYGWIN__
+ /* On native Windows, in the UTF-8 locale, towlower and towupper are
+ lacking (at least) the mappings for ISO-8859-1 characters, such as
+ 0x00C9 <-> 0x00E9. Since it is expensive to test whether the locale
+ encoding is UTF-8, ignore the system's WCHAR_FUNC altogether. */
+ if (wc != WEOF)
+ return UCS_FUNC (wc);
+ else
+ return wc;
+# else
if (wc == WEOF || wc == (wchar_t) wc)
/* wc is in the range for the tow* functions. */
return WCHAR_FUNC (wc);
else
return UCS_FUNC (wc);
+# endif
#else /* macOS, FreeBSD, NetBSD, OpenBSD, HP-UX, Solaris, Minix, Android */
/* char32_t and wchar_t are equivalent. */
diff --git a/tests/test-c32tolower.c b/tests/test-c32tolower.c
index eb956b5009..072338bde1 100644
--- a/tests/test-c32tolower.c
+++ b/tests/test-c32tolower.c
@@ -255,12 +255,10 @@ main (int argc, char *argv[])
mb = for_character ("\302\265", 2);
ASSERT (mb.nbytes == 2);
ASSERT (memcmp (mb.buf, "\302\265", 2) == 0);
- #if !(defined _WIN32 && !defined __CYGWIN__)
/* U+00C9 LATIN CAPITAL LETTER E WITH ACUTE */
mb = for_character ("\303\211", 2);
ASSERT (mb.nbytes == 2);
ASSERT (memcmp (mb.buf, "\303\251", 2) == 0);
- #endif
/* U+00DF LATIN SMALL LETTER SHARP S */
mb = for_character ("\303\237", 2);
ASSERT (mb.nbytes == 2);
diff --git a/tests/test-c32toupper.c b/tests/test-c32toupper.c
index 18c3ffddb5..eb9668afff 100644
--- a/tests/test-c32toupper.c
+++ b/tests/test-c32toupper.c
@@ -163,7 +163,7 @@ main (int argc, char *argv[])
mb = for_character ("\262", 1);
ASSERT (mb.nbytes == 1);
ASSERT (memcmp (mb.buf, "\262", 1) == 0);
- #if !(defined __GLIBC__ || (defined __APPLE__ && defined __MACH__) || defined __FreeBSD__ || defined __NetBSD__ || defined __sun || defined __CYGWIN__)
+ #if !(defined __GLIBC__ || (defined __APPLE__ && defined __MACH__) || defined __FreeBSD__ || defined __NetBSD__ || defined __sun || defined __CYGWIN__ || (defined _WIN32 && !defined __CYGWIN__))
/* U+00B5 MICRO SIGN */
mb = for_character ("\265", 1);
ASSERT (mb.nbytes == 1);
@@ -259,7 +259,7 @@ main (int argc, char *argv[])
mb = for_character ("\302\262", 2);
ASSERT (mb.nbytes == 2);
ASSERT (memcmp (mb.buf, "\302\262", 2) == 0);
- #if !(defined __GLIBC__ || defined MUSL_LIBC || (defined __APPLE__ && defined __MACH__) || defined __FreeBSD__ || defined __DragonFly__ || defined __NetBSD__ || defined __OpenBSD__ || defined _AIX || defined __sun || defined __CYGWIN__ || defined __ANDROID__)
+ #if !(defined __GLIBC__ || defined MUSL_LIBC || (defined __APPLE__ && defined __MACH__) || defined __FreeBSD__ || defined __DragonFly__ || defined __NetBSD__ || defined __OpenBSD__ || defined _AIX || defined __sun || defined __CYGWIN__ || (defined _WIN32 && !defined __CYGWIN__) || defined __ANDROID__)
/* U+00B5 MICRO SIGN */
mb = for_character ("\302\265", 2);
ASSERT (mb.nbytes == 2);
@@ -275,7 +275,6 @@ main (int argc, char *argv[])
ASSERT (mb.nbytes == 2);
ASSERT (memcmp (mb.buf, "\303\237", 2) == 0);
#endif
- #if !(defined _WIN32 && !defined __CYGWIN__)
/* U+00E9 LATIN SMALL LETTER E WITH ACUTE */
mb = for_character ("\303\251", 2);
ASSERT (mb.nbytes == 2);
@@ -284,7 +283,6 @@ main (int argc, char *argv[])
mb = for_character ("\303\277", 2);
ASSERT (mb.nbytes == 2);
ASSERT (memcmp (mb.buf, "\305\270", 2) == 0);
- #endif
/* U+0141 LATIN CAPITAL LETTER L WITH STROKE */
mb = for_character ("\305\201", 2);
ASSERT (mb.nbytes == 2);
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- Fix mbscasestr test failure on native Windows with MSVC,
Bruno Haible <=