[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[bug-libunistring] new u{8,16,32}-mb-prev-uc modules
From: |
Ben Pfaff |
Subject: |
[bug-libunistring] new u{8,16,32}-mb-prev-uc modules |
Date: |
Sat, 01 Jan 2011 15:03:39 -0800 |
User-agent: |
Gnus/5.13 (Gnus v5.13) Emacs/23.2 (gnu/linux) |
Hi Bruno. In November I sent you a module for backward iteration
in UTF-8 strings for review:
http://permalink.gmane.org/gmane.comp.gnu.libunistring.bugs/55
I didn't get a response. Maybe you were waiting for the
equivalent modules for UTF-16 and UTF-32 that I had promised, or
maybe you just missed it, since it was deep in a thread. Anyway,
here is the whole collection.
Comments?
Thanks,
Ben.
--8<--------------------------cut here-------------------------->8--
From: Ben Pfaff <address@hidden>
Date: Sat, 1 Jan 2011 14:51:16 -0800
Subject: [PATCH] unistr: New modules for backward iteration in string.
New module 'u8-mb-prev-uc'.
* lib/unistr.in.h (u8_mb_prev_uc): New declaration.
(u8_mb_prev_uc_aux): New declaration.
* lib/unistr/u8-mb-prev-uc.c: New file.
* lib/unistr/u8-mb-prev-uc-aux.c: New file.
* tests/unistr/test-u8-mb-prev-uc.c: New file.
* modules/unistr/u8-mb-prev-uc: New file.
* modules/unistr/u8-mb-prev-uc-tests: New file.
New module 'u16-mb-prev-uc'.
* lib/unistr.in.h (u16_mb_prev_uc): New declaration.
(u16_mb_prev_uc_aux): New declaration.
* lib/unistr/u16-mb-prev-uc.c: New file.
* lib/unistr/u16-mb-prev-uc-aux.c: New file.
* tests/unistr/test-u16-mb-prev-uc.c: New file.
* modules/unistr/u16-mb-prev-uc: New file.
* modules/unistr/u16-mb-prev-uc-tests: New file.
New module 'u32-mb-prev-uc'.
* lib/unistr.in.h (u32_mb_prev_uc): New declaration.
* lib/unistr/u32-mb-prev-uc.c: New file.
* tests/unistr/test-u32-mb-prev-uc.c: New file.
* modules/unistr/u32-mb-prev-uc: New file.
* modules/unistr/u32-mb-prev-uc-tests: New file.
---
ChangeLog | 27 ++++
lib/unistr.in.h | 73 +++++++++-
lib/unistr/u16-mb-prev-uc-aux.c | 52 +++++++
lib/unistr/u16-mb-prev-uc.c | 62 ++++++++
lib/unistr/u32-mb-prev-uc.c | 43 ++++++
lib/unistr/u8-mb-prev-uc-aux.c | 128 ++++++++++++++++
lib/unistr/u8-mb-prev-uc.c | 139 +++++++++++++++++
modules/unistr/u16-mb-prev-uc | 28 ++++
modules/unistr/u16-mb-prev-uc-tests | 12 ++
modules/unistr/u32-mb-prev-uc | 27 ++++
modules/unistr/u32-mb-prev-uc-tests | 12 ++
modules/unistr/u8-mb-prev-uc | 28 ++++
modules/unistr/u8-mb-prev-uc-tests | 14 ++
tests/unistr/test-u16-mb-prev-uc.c | 89 +++++++++++
tests/unistr/test-u32-mb-prev-uc.c | 89 +++++++++++
tests/unistr/test-u8-mb-prev-uc.c | 279 +++++++++++++++++++++++++++++++++++
16 files changed, 1101 insertions(+), 1 deletions(-)
create mode 100644 lib/unistr/u16-mb-prev-uc-aux.c
create mode 100644 lib/unistr/u16-mb-prev-uc.c
create mode 100644 lib/unistr/u32-mb-prev-uc.c
create mode 100644 lib/unistr/u8-mb-prev-uc-aux.c
create mode 100644 lib/unistr/u8-mb-prev-uc.c
create mode 100644 modules/unistr/u16-mb-prev-uc
create mode 100644 modules/unistr/u16-mb-prev-uc-tests
create mode 100644 modules/unistr/u32-mb-prev-uc
create mode 100644 modules/unistr/u32-mb-prev-uc-tests
create mode 100644 modules/unistr/u8-mb-prev-uc
create mode 100644 modules/unistr/u8-mb-prev-uc-tests
create mode 100644 tests/unistr/test-u16-mb-prev-uc.c
create mode 100644 tests/unistr/test-u32-mb-prev-uc.c
create mode 100644 tests/unistr/test-u8-mb-prev-uc.c
diff --git a/ChangeLog b/ChangeLog
index 7acf4b6..624fcf1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,30 @@
+2011-01-01 Ben Pfaff <address@hidden>
+
+ New module 'u8-mb-prev-uc'.
+ * lib/unistr.in.h (u8_mb_prev_uc): New declaration.
+ (u8_mb_prev_uc_aux): New declaration.
+ * lib/unistr/u8-mb-prev-uc.c: New file.
+ * lib/unistr/u8-mb-prev-uc-aux.c: New file.
+ * tests/unistr/test-u8-mb-prev-uc.c: New file.
+ * modules/unistr/u8-mb-prev-uc: New file.
+ * modules/unistr/u8-mb-prev-uc-tests: New file.
+
+ New module 'u16-mb-prev-uc'.
+ * lib/unistr.in.h (u16_mb_prev_uc): New declaration.
+ (u16_mb_prev_uc_aux): New declaration.
+ * lib/unistr/u16-mb-prev-uc.c: New file.
+ * lib/unistr/u16-mb-prev-uc-aux.c: New file.
+ * tests/unistr/test-u16-mb-prev-uc.c: New file.
+ * modules/unistr/u16-mb-prev-uc: New file.
+ * modules/unistr/u16-mb-prev-uc-tests: New file.
+
+ New module 'u32-mb-prev-uc'.
+ * lib/unistr.in.h (u32_mb_prev_uc): New declaration.
+ * lib/unistr/u32-mb-prev-uc.c: New file.
+ * tests/unistr/test-u32-mb-prev-uc.c: New file.
+ * modules/unistr/u32-mb-prev-uc: New file.
+ * modules/unistr/u32-mb-prev-uc-tests: New file.
+
2010-12-31 Ben Pfaff <address@hidden>
New module 'u8-grapheme-len'.
diff --git a/lib/unistr.in.h b/lib/unistr.in.h
index 2e7c618..b899172 100644
--- a/lib/unistr.in.h
+++ b/lib/unistr.in.h
@@ -1,5 +1,5 @@
/* Elementary Unicode string functions.
- Copyright (C) 2001-2002, 2005-2010 Free Software Foundation, Inc.
+ Copyright (C) 2001-2002, 2005-2011 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published
@@ -294,6 +294,77 @@ extern int
u32_mbtoucr (ucs4_t *puc, const uint32_t *s, size_t n);
#endif
+/* Return the length (number of units) of the last character in the N units
+ starting at S, putting its 'ucs4_t' representation in *PUC. Upon failure,
+ *PUC is set to 0xfffd and the number of units to skip (>= 1) is returned.
+ The number of available units, N, must be > 0. */
+
+#if GNULIB_UNISTR_U8_MB_PREV_UC || HAVE_LIBUNISTRING
+# if !HAVE_INLINE
+extern int
+ u8_mb_prev_uc (ucs4_t *puc, const uint8_t *s, size_t n);
+# else
+extern int
+ u8_mb_prev_uc_aux (ucs4_t *puc, const uint8_t *s, size_t n);
+static inline int
+u8_mb_prev_uc (ucs4_t *puc, const uint8_t *s, size_t n)
+{
+ uint8_t c = s[n - 1];
+
+ if (c < 0x80)
+ {
+ *puc = c;
+ return 1;
+ }
+ else
+ return u8_mb_prev_uc_aux (puc, s, n);
+}
+# endif
+#endif
+
+#if GNULIB_UNISTR_U16_MB_PREV_UC || HAVE_LIBUNISTRING
+# if !HAVE_INLINE
+extern int
+ u16_mb_prev_uc (ucs4_t *puc, const uint16_t *s, size_t n);
+# else
+extern int
+ u16_mb_prev_uc_aux (ucs4_t *puc, const uint16_t *s, size_t n);
+static inline int
+u16_mb_prev_uc (ucs4_t *puc, const uint16_t *s, size_t n)
+{
+ uint16_t c = s[n - 1];
+
+ if (c < 0xd800 || c >= 0xe000)
+ {
+ *puc = c;
+ return 1;
+ }
+ else
+ return u16_mb_prev_uc_aux (puc, s, n);
+}
+# endif
+#endif
+
+#if GNULIB_UNISTR_U32_MB_PREV_UC || HAVE_LIBUNISTRING
+# if !HAVE_INLINE
+extern int
+ u32_mb_prev_uc (ucs4_t *puc, const uint32_t *s, size_t n);
+# else
+static inline int
+u32_mb_prev_uc (ucs4_t *puc, const uint32_t *s, size_t n)
+{
+  uint32_t c = s[n - 1];        /* last unit; callers guarantee N > 0 */
+
+  if (c < 0xd800 || (c >= 0xe000 && c < 0x110000))
+    *puc = c;
+  else
+    /* surrogate or value beyond U+10FFFF: not a valid code point */
+    *puc = 0xfffd;
+  return 1;
+}
+# endif
+#endif
+
/* Put the multibyte character represented by UC in S, returning its
length. Return -1 upon failure, -2 if the number of available units, N,
is too small. The latter case cannot occur if N >= 6/2/1, respectively. */
diff --git a/lib/unistr/u16-mb-prev-uc-aux.c b/lib/unistr/u16-mb-prev-uc-aux.c
new file mode 100644
index 0000000..eeab787
--- /dev/null
+++ b/lib/unistr/u16-mb-prev-uc-aux.c
@@ -0,0 +1,52 @@
+/* Look at last character in UTF-16 string.
+ Copyright (C) 1999-2002, 2006-2007, 2009-2011 Free Software Foundation, Inc.
+ Written by Ben Pfaff <address@hidden>, 2011,
+ based on code by Bruno Haible <address@hidden>, 2001.
+
+ This program is free software: you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+/* Specification. */
+#include "unistr.h"
+
+#if defined IN_LIBUNISTRING || HAVE_INLINE
+
+int
+u16_mb_prev_uc_aux (ucs4_t *puc, const uint16_t *s, size_t n)
+{
+ uint16_t c = s[n - 1];
+
+ if (c >= 0xdc00)
+ {
+ if (n >= 2)
+ {
+ if (s[n - 2] >= 0xd800 && s[n - 2] < 0xdc00)
+ {
+ *puc = 0x10000 + ((s[n - 2] - 0xd800) << 10) + (c - 0xdc00);
+ return 2;
+ }
+ /* invalid multibyte character */
+ }
+ else
+ {
+ /* incomplete multibyte character */
+ }
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 1;
+}
+
+#endif
diff --git a/lib/unistr/u16-mb-prev-uc.c b/lib/unistr/u16-mb-prev-uc.c
new file mode 100644
index 0000000..3511666
--- /dev/null
+++ b/lib/unistr/u16-mb-prev-uc.c
@@ -0,0 +1,62 @@
+/* Look at last character in UTF-16 string.
+ Copyright (C) 1999-2002, 2006-2007, 2009-2011 Free Software Foundation, Inc.
+ Written by Ben Pfaff <address@hidden>, 2011,
+ based on code by Bruno Haible <address@hidden>, 2001.
+
+ This program is free software: you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#if defined IN_LIBUNISTRING
+/* Tell unistr.h to declare u16_mb_prev_uc as 'extern', not 'static inline'. */
+# include "unistring-notinline.h"
+#endif
+
+/* Specification. */
+#include "unistr.h"
+
+#if !HAVE_INLINE
+
+int
+u16_mb_prev_uc (ucs4_t *puc, const uint16_t *s, size_t n)
+{
+ uint16_t c = s[n - 1];
+
+ if (c < 0xd800 || c >= 0xe000)
+ {
+ *puc = c;
+ return 1;
+ }
+ if (c >= 0xdc00)
+ {
+ if (n >= 2)
+ {
+ if (s[n - 2] >= 0xd800 && s[n - 2] < 0xdc00)
+ {
+ *puc = 0x10000 + ((s[n - 2] - 0xd800) << 10) + (c - 0xdc00);
+ return 2;
+ }
+ /* invalid multibyte character */
+ }
+ else
+ {
+ /* incomplete multibyte character */
+ }
+ }
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 1;
+}
+
+#endif
diff --git a/lib/unistr/u32-mb-prev-uc.c b/lib/unistr/u32-mb-prev-uc.c
new file mode 100644
index 0000000..398827b
--- /dev/null
+++ b/lib/unistr/u32-mb-prev-uc.c
@@ -0,0 +1,43 @@
+/* Look at last character in UTF-32 string.
+ Copyright (C) 2002, 2006-2007, 2009-2011 Free Software Foundation, Inc.
+ Written by Bruno Haible <address@hidden>, 2002.
+
+ This program is free software: you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#if defined IN_LIBUNISTRING
+/* Tell unistr.h to declare u32_mb_prev_uc as 'extern', not 'static inline'. */
+# include "unistring-notinline.h"
+#endif
+
+/* Specification. */
+#include "unistr.h"
+
+#if !HAVE_INLINE
+
+int
+u32_mb_prev_uc (ucs4_t *puc, const uint32_t *s, size_t n)
+{
+ uint32_t c = s[n - 1];
+
+ if (c < 0xd800 || (c >= 0xe000 && c < 0x110000))
+ *puc = c;
+ else
+ /* invalid multibyte character */
+ *puc = 0xfffd;
+ return 1;
+}
+
+#endif
diff --git a/lib/unistr/u8-mb-prev-uc-aux.c b/lib/unistr/u8-mb-prev-uc-aux.c
new file mode 100644
index 0000000..296a583
--- /dev/null
+++ b/lib/unistr/u8-mb-prev-uc-aux.c
@@ -0,0 +1,128 @@
+/* Look at last character in UTF-8 string.
+ Copyright (C) 2001-2002, 2006-2007, 2009-2011 Free Software Foundation, Inc.
+ Written by Ben Pfaff <address@hidden>, 2010,
+ based on code by Bruno Haible <address@hidden>, 2001.
+
+ This program is free software: you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+/* Specification. */
+#include "unistr.h"
+
+#if defined IN_LIBUNISTRING || HAVE_INLINE
+
+int
+u8_mb_prev_uc_aux (ucs4_t *puc, const uint8_t *s, size_t n)
+{
+ uint8_t c_1 = s[n - 1];
+
+#if CONFIG_UNICODE_SAFETY
+ if (c_1 <= 0xbf)
+#endif
+ {
+ if (n >= 2)
+ {
+ uint8_t c_2 = s[n - 2];
+
+ if ((c_2 ^ 0x80) >= 0x40)
+ {
+#if CONFIG_UNICODE_SAFETY
+ if (c_2 >= 0xc2 && c_2 < 0xe0)
+#endif
+ {
+ *puc = ((unsigned int) (c_2 & 0x1f) << 6)
+ | (unsigned int) (c_1 ^ 0x80);
+ return 2;
+ }
+#if CONFIG_UNICODE_SAFETY
+ if (c_2 >= 0xe0 && c_2 < 0xf8)
+ {
+ /* incomplete multibyte character */
+ *puc = 0xfffd;
+ return 2;
+ }
+#endif
+ }
+ else if (n >= 3)
+ {
+ uint8_t c_3 = s[n - 3];
+
+ if ((c_3 ^ 0x80) >= 0x40)
+ {
+#if CONFIG_UNICODE_SAFETY
+ if ((c_3 == 0xe0 && c_2 >= 0xa0)
+ || (c_3 >= 0xe1 && c_3 < 0xed)
+ || (c_3 == 0xed && c_2 < 0xa0)
+ || (c_3 >= 0xee && c_3 < 0xf0))
+#endif
+ {
+ *puc = ((unsigned int) (c_3 & 0x0f) << 12)
+ | (unsigned int) ((c_2 ^ 0x80) << 6)
+ | (unsigned int) (c_1 ^ 0x80);
+ return 3;
+ }
+#if CONFIG_UNICODE_SAFETY
+ if (c_3 >= 0xe0 && c_3 < 0xf8)
+ {
+ /* 0xe0: overlong sequence.
+ 0xe1...0xec: not reached.
+ 0xed: UTF-16 surrogate.
+ 0xee...0xef: not reached.
+ 0xf0...0xf7: incomplete multibyte character. */
+ *puc = 0xfffd;
+ return 3;
+ }
+#endif
+ }
+ else if (n >= 4)
+ {
+ uint8_t c_4 = s[n - 4];
+
+ if ((c_4 ^ 0x80) >= 0x40)
+ {
+#if CONFIG_UNICODE_SAFETY
+ if ((c_4 == 0xf0 && c_3 >= 0x90)
+ || (c_4 >= 0xf1 && c_4 < 0xf4)
+ || (c_4 == 0xf4 && c_3 < 0x90))
+#endif
+ {
+ *puc = (unsigned int) ((c_4 & 0x07) << 18)
+ | (unsigned int) ((c_3 ^ 0x80) << 12)
+ | (unsigned int) ((c_2 ^ 0x80) << 6)
+ | (unsigned int) (c_1 ^ 0x80);
+ return 4;
+ }
+#if CONFIG_UNICODE_SAFETY
+ if (c_4 >= 0xf0 && c_4 < 0xf8)
+ {
+ /* 0xf0: overlong sequence.
+ 0xf1...0xf3: not reached.
+ 0xf4...0xf7: invalid code point above U+10FFFF */
+ *puc = 0xfffd;
+ return 4;
+ }
+#endif
+ }
+ }
+ }
+ }
+ }
+
+ /* invalid or incomplete multibyte character */
+ *puc = 0xfffd;
+ return 1;
+}
+
+#endif
diff --git a/lib/unistr/u8-mb-prev-uc.c b/lib/unistr/u8-mb-prev-uc.c
new file mode 100644
index 0000000..41eaf2b
--- /dev/null
+++ b/lib/unistr/u8-mb-prev-uc.c
@@ -0,0 +1,139 @@
+/* Look at last character in UTF-8 string.
+ Copyright (C) 2001-2002, 2006-2007, 2009-2011 Free Software Foundation, Inc.
+ Written by Ben Pfaff <address@hidden>, 2010,
+ based on code by Bruno Haible <address@hidden>, 2001.
+
+ This program is free software: you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#if defined IN_LIBUNISTRING
+/* Tell unistr.h to declare u8_mb_prev_uc as 'extern', not 'static inline'. */
+# include "unistring-notinline.h"
+#endif
+
+/* Specification. */
+#include "unistr.h"
+
+#if !HAVE_INLINE
+
+int
+u8_mb_prev_uc (ucs4_t *puc, const uint8_t *s, size_t n)
+{
+ uint8_t c_1 = s[n - 1];
+
+ if (c_1 < 0x80)
+ {
+ *puc = c_1;
+ return 1;
+ }
+
+#if CONFIG_UNICODE_SAFETY
+ if (c_1 <= 0xbf)
+#endif
+ {
+ if (n >= 2)
+ {
+ uint8_t c_2 = s[n - 2];
+
+ if ((c_2 ^ 0x80) >= 0x40)
+ {
+#if CONFIG_UNICODE_SAFETY
+ if (c_2 >= 0xc2 && c_2 < 0xe0)
+#endif
+ {
+ *puc = ((unsigned int) (c_2 & 0x1f) << 6)
+ | (unsigned int) (c_1 ^ 0x80);
+ return 2;
+ }
+#if CONFIG_UNICODE_SAFETY
+ if (c_2 >= 0xe0 && c_2 < 0xf8)
+ {
+ /* incomplete multibyte character */
+ *puc = 0xfffd;
+ return 2;
+ }
+#endif
+ }
+ else if (n >= 3)
+ {
+ uint8_t c_3 = s[n - 3];
+
+ if ((c_3 ^ 0x80) >= 0x40)
+ {
+#if CONFIG_UNICODE_SAFETY
+ if ((c_3 == 0xe0 && c_2 >= 0xa0)
+ || (c_3 >= 0xe1 && c_3 < 0xed)
+ || (c_3 == 0xed && c_2 < 0xa0)
+ || (c_3 >= 0xee && c_3 < 0xf0))
+#endif
+ {
+ *puc = ((unsigned int) (c_3 & 0x0f) << 12)
+ | (unsigned int) ((c_2 ^ 0x80) << 6)
+ | (unsigned int) (c_1 ^ 0x80);
+ return 3;
+ }
+#if CONFIG_UNICODE_SAFETY
+ if (c_3 >= 0xe0 && c_3 < 0xf8)
+ {
+ /* 0xe0: overlong sequence.
+ 0xe1...0xec: not reached.
+ 0xed: UTF-16 surrogate.
+ 0xee...0xef: not reached.
+ 0xf0...0xf7: incomplete multibyte character. */
+ *puc = 0xfffd;
+ return 3;
+ }
+#endif
+ }
+ else if (n >= 4)
+ {
+ uint8_t c_4 = s[n - 4];
+
+ if ((c_4 ^ 0x80) >= 0x40)
+ {
+#if CONFIG_UNICODE_SAFETY
+ if ((c_4 == 0xf0 && c_3 >= 0x90)
+ || (c_4 >= 0xf1 && c_4 < 0xf4)
+ || (c_4 == 0xf4 && c_3 < 0x90))
+#endif
+ {
+ *puc = (unsigned int) ((c_4 & 0x07) << 18)
+ | (unsigned int) ((c_3 ^ 0x80) << 12)
+ | (unsigned int) ((c_2 ^ 0x80) << 6)
+ | (unsigned int) (c_1 ^ 0x80);
+ return 4;
+ }
+#if CONFIG_UNICODE_SAFETY
+ if (c_4 >= 0xf0 && c_4 < 0xf8)
+ {
+ /* 0xf0: overlong sequence.
+ 0xf1...0xf3: not reached.
+ 0xf4...0xf7: invalid code point above U+10FFFF */
+ *puc = 0xfffd;
+ return 4;
+ }
+#endif
+ }
+ }
+ }
+ }
+ }
+
+ /* invalid or incomplete multibyte character */
+ *puc = 0xfffd;
+ return 1;
+}
+
+#endif
diff --git a/modules/unistr/u16-mb-prev-uc b/modules/unistr/u16-mb-prev-uc
new file mode 100644
index 0000000..508fc72
--- /dev/null
+++ b/modules/unistr/u16-mb-prev-uc
@@ -0,0 +1,28 @@
+Description:
+Look at last character in UTF-16 string.
+
+Files:
+lib/unistr/u16-mb-prev-uc.c
+lib/unistr/u16-mb-prev-uc-aux.c
+
+Depends-on:
+unistr/base
+
+configure.ac:
+gl_MODULE_INDICATOR([unistr/u16-mb-prev-uc])
+gl_LIBUNISTRING_MODULE([0.9.4], [unistr/u16-mb-prev-uc])
+
+Makefile.am:
+if LIBUNISTRING_COMPILE_UNISTR_U16_MB_PREV_UC
+lib_SOURCES += unistr/u16-mb-prev-uc.c unistr/u16-mb-prev-uc-aux.c
+endif
+
+Include:
+"unistr.h"
+
+License:
+LGPL
+
+Maintainer:
+Bruno Haible, Ben Pfaff
+
diff --git a/modules/unistr/u16-mb-prev-uc-tests
b/modules/unistr/u16-mb-prev-uc-tests
new file mode 100644
index 0000000..a9f504f
--- /dev/null
+++ b/modules/unistr/u16-mb-prev-uc-tests
@@ -0,0 +1,12 @@
+Files:
+tests/unistr/test-u16-mb-prev-uc.c
+
+Depends-on:
+
+configure.ac:
+
+Makefile.am:
+TESTS += test-u16-mb-prev-uc
+check_PROGRAMS += test-u16-mb-prev-uc
+test_u16_mb_prev_uc_SOURCES = unistr/test-u16-mb-prev-uc.c
+test_u16_mb_prev_uc_LDADD = $(LDADD) $(LIBUNISTRING)
diff --git a/modules/unistr/u32-mb-prev-uc b/modules/unistr/u32-mb-prev-uc
new file mode 100644
index 0000000..ad7974a
--- /dev/null
+++ b/modules/unistr/u32-mb-prev-uc
@@ -0,0 +1,27 @@
+Description:
+Look at last character in UTF-32 string.
+
+Files:
+lib/unistr/u32-mb-prev-uc.c
+
+Depends-on:
+unistr/base
+
+configure.ac:
+gl_MODULE_INDICATOR([unistr/u32-mb-prev-uc])
+gl_LIBUNISTRING_MODULE([0.9.4], [unistr/u32-mb-prev-uc])
+
+Makefile.am:
+if LIBUNISTRING_COMPILE_UNISTR_U32_MB_PREV_UC
+lib_SOURCES += unistr/u32-mb-prev-uc.c
+endif
+
+Include:
+"unistr.h"
+
+License:
+LGPL
+
+Maintainer:
+Bruno Haible, Ben Pfaff
+
diff --git a/modules/unistr/u32-mb-prev-uc-tests
b/modules/unistr/u32-mb-prev-uc-tests
new file mode 100644
index 0000000..e1e45c8
--- /dev/null
+++ b/modules/unistr/u32-mb-prev-uc-tests
@@ -0,0 +1,12 @@
+Files:
+tests/unistr/test-u32-mb-prev-uc.c
+
+Depends-on:
+
+configure.ac:
+
+Makefile.am:
+TESTS += test-u32-mb-prev-uc
+check_PROGRAMS += test-u32-mb-prev-uc
+test_u32_mb_prev_uc_SOURCES = unistr/test-u32-mb-prev-uc.c
+test_u32_mb_prev_uc_LDADD = $(LDADD) $(LIBUNISTRING)
diff --git a/modules/unistr/u8-mb-prev-uc b/modules/unistr/u8-mb-prev-uc
new file mode 100644
index 0000000..2a12805
--- /dev/null
+++ b/modules/unistr/u8-mb-prev-uc
@@ -0,0 +1,28 @@
+Description:
+Look at last character in UTF-8 string.
+
+Files:
+lib/unistr/u8-mb-prev-uc.c
+lib/unistr/u8-mb-prev-uc-aux.c
+
+Depends-on:
+unistr/base
+
+configure.ac:
+gl_MODULE_INDICATOR([unistr/u8-mb-prev-uc])
+gl_LIBUNISTRING_MODULE([0.9.4], [unistr/u8-mb-prev-uc])
+
+Makefile.am:
+if LIBUNISTRING_COMPILE_UNISTR_U8_MB_PREV_UC
+lib_SOURCES += unistr/u8-mb-prev-uc.c unistr/u8-mb-prev-uc-aux.c
+endif
+
+Include:
+"unistr.h"
+
+License:
+LGPL
+
+Maintainer:
+Bruno Haible, Ben Pfaff
+
diff --git a/modules/unistr/u8-mb-prev-uc-tests
b/modules/unistr/u8-mb-prev-uc-tests
new file mode 100644
index 0000000..66a593a
--- /dev/null
+++ b/modules/unistr/u8-mb-prev-uc-tests
@@ -0,0 +1,14 @@
+Files:
+tests/unistr/test-u8-mb-prev-uc.c
+tests/macros.h
+
+Depends-on:
+unistr/u8-mbtouc
+
+configure.ac:
+
+Makefile.am:
+TESTS += test-u8-mb-prev-uc
+check_PROGRAMS += test-u8-mb-prev-uc
+test_u8_mb_prev_uc_SOURCES = unistr/test-u8-mb-prev-uc.c
+test_u8_mb_prev_uc_LDADD = $(LDADD) $(LIBUNISTRING)
diff --git a/tests/unistr/test-u16-mb-prev-uc.c
b/tests/unistr/test-u16-mb-prev-uc.c
new file mode 100644
index 0000000..7f85e98
--- /dev/null
+++ b/tests/unistr/test-u16-mb-prev-uc.c
@@ -0,0 +1,89 @@
+/* Test of u16_mb_prev_uc() function.
+ Copyright (C) 2011 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+/* Written by Ben Pfaff, 2011. */
+
+#include <config.h>
+
+#include "unistr.h"
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+static void
+test_u16_mb_prev_uc (int expect_len, ucs4_t expect_uc, ...)
+{
+ uint16_t s[16];
+ va_list args;
+ size_t n;
+
+ ucs4_t uc;
+ int len;
+
+ va_start (args, expect_uc);
+ n = 0;
+ for (;;)
+ {
+ int unit = va_arg (args, int);
+ if (unit == -1)
+ break;
+ else if (n >= sizeof s / sizeof *s)
+ abort ();
+
+ s[n++] = unit;
+ }
+ va_end (args);
+
+ len = u16_mb_prev_uc (&uc, s, n);
+ if (len != expect_len || uc != expect_uc)
+ {
+ size_t i;
+
+ fprintf (stderr, "u16_mb_prev_uc returned length %d and U+%04x, "
+ "expected length %d and U+%04x:",
+ len, (unsigned int) uc,
+ expect_len, (unsigned int) expect_uc);
+ for (i = 0; i < n; i++)
+ fprintf (stderr, " %04x", s[i]);
+ putc ('\n', stderr);
+ fflush (stderr);
+ abort ();
+ }
+}
+
+int
+main (void)
+{
+ /* Valid single-unit sequences. */
+ test_u16_mb_prev_uc (1, 'a', 'a', -1);
+ test_u16_mb_prev_uc (1, 0x3042, 0x3042, -1);
+ test_u16_mb_prev_uc (1, 'b', 'a', 'b', -1);
+ test_u16_mb_prev_uc (1, 'x', 0x3042, 'x', -1);
+
+ /* Valid surrogate pairs. */
+ test_u16_mb_prev_uc (2, 0x1f610, 0xd83d, 0xde10, -1);
+ test_u16_mb_prev_uc (2, 0x1f610, 'x', 0xd83d, 0xde10, -1);
+
+ /* Invalid surrogate pairs. */
+ test_u16_mb_prev_uc (1, 0xfffd, 0xd800, -1);
+ test_u16_mb_prev_uc (1, 0xfffd, 'a', 0xd800, -1);
+ test_u16_mb_prev_uc (1, 0xfffd, 0xdeff, -1);
+ test_u16_mb_prev_uc (1, 0xfffd, 'b', 0xdeff, -1);
+
+ return 0;
+}
diff --git a/tests/unistr/test-u32-mb-prev-uc.c
b/tests/unistr/test-u32-mb-prev-uc.c
new file mode 100644
index 0000000..6666877
--- /dev/null
+++ b/tests/unistr/test-u32-mb-prev-uc.c
@@ -0,0 +1,89 @@
+/* Test of u32_mb_prev_uc() function.
+ Copyright (C) 2011 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+/* Written by Ben Pfaff, 2011. */
+
+#include <config.h>
+
+#include "unistr.h"
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+static void
+test_u32_mb_prev_uc (int expect_len, ucs4_t expect_uc, ...)
+{
+ uint32_t s[16];
+ va_list args;
+ size_t n;
+
+ ucs4_t uc;
+ int len;
+
+ va_start (args, expect_uc);
+ n = 0;
+ for (;;)
+ {
+ int unit = va_arg (args, int);
+ if (unit == -1)
+ break;
+ else if (n >= sizeof s / sizeof *s)
+ abort ();
+
+ s[n++] = unit;
+ }
+ va_end (args);
+
+ len = u32_mb_prev_uc (&uc, s, n);
+ if (len != expect_len || uc != expect_uc)
+ {
+ size_t i;
+
+ fprintf (stderr, "u32_mb_prev_uc returned length %d and U+%04x, "
+ "expected length %d and U+%04x:",
+ len, (unsigned int) uc,
+ expect_len, (unsigned int) expect_uc);
+ for (i = 0; i < n; i++)
+ fprintf (stderr, " %04x", s[i]);
+ putc ('\n', stderr);
+ fflush (stderr);
+ abort ();
+ }
+}
+
+int
+main (void)
+{
+ /* Valid. */
+ test_u32_mb_prev_uc (1, 'a', 'a', -1);
+ test_u32_mb_prev_uc (1, 0x3042, 0x3042, -1);
+ test_u32_mb_prev_uc (1, 'b', 'a', 'b', -1);
+ test_u32_mb_prev_uc (1, 'x', 0x3042, 'x', -1);
+
+ /* Surrogate pairs are invalid in UTF-32. */
+ test_u32_mb_prev_uc (1, 0xfffd, 0xd83d, 0xde10, -1);
+ test_u32_mb_prev_uc (1, 0xfffd, 'x', 0xd83d, 0xde10, -1);
+
+ /* Malformed surrogate pairs are doubly invalid in UTF-32. */
+ test_u32_mb_prev_uc (1, 0xfffd, 0xd800, -1);
+ test_u32_mb_prev_uc (1, 0xfffd, 'a', 0xd800, -1);
+ test_u32_mb_prev_uc (1, 0xfffd, 0xdeff, -1);
+ test_u32_mb_prev_uc (1, 0xfffd, 'b', 0xdeff, -1);
+
+ return 0;
+}
diff --git a/tests/unistr/test-u8-mb-prev-uc.c
b/tests/unistr/test-u8-mb-prev-uc.c
new file mode 100644
index 0000000..fd092ca
--- /dev/null
+++ b/tests/unistr/test-u8-mb-prev-uc.c
@@ -0,0 +1,279 @@
+/* Test of u8_mb_prev_uc() function.
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+/* Written by Ben Pfaff, 2010. */
+
+#include <config.h>
+
+#include "unistr.h"
+
+#include <assert.h>
+
+#include "macros.h"
+
+struct uc
+ {
+ /* UTF-8 representation. */
+ const uint8_t *s;
+ int n;
+
+ /* Code point. */
+ ucs4_t uc;
+ };
+
+/* Print the N code points and their representations in UC on stderr, preceded
+ by TITLE. */
+static void
+print_ucs (const char *title, const struct uc *uc, size_t n)
+{
+ fprintf (stderr, "%s:", title);
+ for (; n-- > 0; uc++)
+ {
+ size_t i;
+
+ fprintf (stderr, " <");
+ for (i = 0; i < uc->n; i++)
+ {
+ if (i > 0)
+ putc (' ', stderr);
+ fprintf (stderr, "%02x", (unsigned int) uc->s[i]);
+ }
+ fprintf (stderr, "> U+%04X", (unsigned int) uc->uc);
+ }
+ putc ('\n', stderr);
+}
+
+/* Reverses the order of the N elements of UC. */
+static void
+reverse_ucs (struct uc *uc, size_t n)
+{
+ size_t i;
+
+ for (i = 0; i < n / 2; i++)
+ {
+ size_t j = n - (i + 1);
+ struct uc tmp = uc[i];
+ uc[i] = uc[j];
+ uc[j] = tmp;
+ }
+}
+
+static bool
+equal_ucs (const struct uc *a, size_t n_a, const struct uc *b, size_t n_b)
+{
+ if (n_a != n_b)
+ return false;
+ for (; n_a-- > 0; a++, b++)
+ if (a->n != b->n || a->s != b->s || a->uc != b->uc)
+ return false;
+ return true;
+}
+
+/* Checks that the N units in S yield the same code points whether iterated
+ in the forward or reverse direction. */
+static void
+check_bidirectionally (const uint8_t *s, int n)
+{
+ struct uc ucf[16];
+ struct uc ucr[16];
+ int n_ucf, n_ucr;
+ int used;
+
+ assert (n <= SIZEOF (ucf));
+ assert (n <= SIZEOF (ucr));
+
+ /* Translate units to code points forward. */
+ used = 0;
+ n_ucf = 0;
+ while (used < n)
+ {
+ struct uc *uc = &ucf[n_ucf++];
+ uc->s = &s[used];
+ uc->n = u8_mbtouc (&uc->uc, uc->s, n - used);
+ ASSERT (uc->n >= 1);
+ ASSERT (uc->n <= n - used);
+ used += uc->n;
+ }
+
+ /* Translate units to code points backward. */
+ used = 0;
+ n_ucr = 0;
+ while (used < n)
+ {
+ struct uc *uc = &ucr[n_ucr++];
+ uc->n = u8_mb_prev_uc (&uc->uc, s, n - used);
+ ASSERT (uc->n >= 1);
+ ASSERT (uc->n <= n - used);
+ used += uc->n;
+ uc->s = &s[n - used];
+ }
+ reverse_ucs (ucr, n_ucr);
+
+ /* Check that the results were the same. */
+ if (!equal_ucs (ucf, n_ucf, ucr, n_ucr))
+ {
+ fprintf (stderr, "%s:%d: forward and reverse differ\n",
+ __FILE__, __LINE__);
+ print_ucs ("forward", ucf, n_ucf);
+ print_ucs ("reverse", ucr, n_ucr);
+ fflush (stderr);
+ abort ();
+ }
+}
+
+#if CONFIG_UNICODE_SAFETY
+static void
+do_exhaustive_test (const uint8_t *start, uint8_t *s, int n)
+{
+  /* Units tried at each of the N positions from S on; START is the string.  */
+  static const uint8_t units[] = {
+    /* The smallest value in each class.  (Any other member or members would
+       work as well).  */
+    0x00, 0x80, 0x90, 0xa0, 0xc0, 0xc2, 0xe0, 0xe1, 0xed, 0xee, 0xf0, 0xf1,
+    0xf4, 0xf5,
+
+    /* The UTF-8 units that make up U+FFFD, since that is such a special value
+       for these routines.  */
+    0xef, 0xbf, 0xbd
+  };
+  int i;
+
+  if (n == 0)           /* all units chosen (also covers the empty string) */
+    check_bidirectionally (start, s - start);
+  else
+    for (i = 0; i < SIZEOF (units); i++)
+      {
+        s[0] = units[i];
+        do_exhaustive_test (start, s + 1, n - 1);
+      }
+}
+
+/* This test exhaustively compares how u8_mbtouc() and u8_mb_prev_uc() treat
+ all UTF-8 well-formed and ill-formed sequences that are MAX_LENGTH units or
+ shorter. To do so in a reasonable amount of time, it uses a trick: many
+ UTF-8 unit values are in classes whose members are all treated the same way.
+ Thus, it is only necessary to test one member of each class. */
+static void
+exhaustive_test (int max_length)
+{
+ uint8_t s[16];
+ int length;
+
+ assert (max_length <= SIZEOF (s));
+ for (length = 0; length <= max_length; length++)
+ do_exhaustive_test (s, s, length);
+}
+#endif /* CONFIG_UNICODE_SAFETY */
+
+static void
+do_well_formed_test (const uint8_t *start, uint8_t *s, int n)
+{
+ if (n == 0)
+ {
+ check_bidirectionally (start, s - start);
+ return;
+ }
+
+ /* Test single-byte characters. */
+ s[0] = 0;
+ do_well_formed_test (start, s + 1, n - 1);
+
+ s[0] = 0x41;
+ do_well_formed_test (start, s + 1, n - 1);
+
+ /* Test 2-byte characters. */
+ if (n >= 2)
+ {
+ s[0] = 0xc2;
+ s[1] = 0xb0;
+ do_well_formed_test (start, s + 2, n - 2);
+ }
+
+ /* Test 3-byte characters. */
+ if (n >= 3)
+ {
+ s[0] = 0xe0;
+ s[1] = 0xa0;
+ s[2] = 0xa5;
+ do_well_formed_test (start, s + 3, n - 3);
+
+ s[0] = 0xe5;
+ s[1] = 0xbf;
+ s[2] = 0x81;
+ do_well_formed_test (start, s + 3, n - 3);
+
+ s[0] = 0xed;
+ s[1] = 0x9f;
+ s[2] = 0x99;
+ do_well_formed_test (start, s + 3, n - 3);
+ }
+
+ /* Test 4-byte characters. */
+ if (n >= 4)
+ {
+ s[0] = 0xf0;
+ s[1] = 0x90;
+ s[2] = 0xbb;
+ s[3] = 0x80;
+ do_well_formed_test (start, s + 4, n - 4);
+
+ s[0] = 0xf2;
+ s[1] = 0x80;
+ s[2] = 0xbf;
+ s[3] = 0x80;
+ do_well_formed_test (start, s + 4, n - 4);
+
+ s[0] = 0xf4;
+ s[1] = 0x8f;
+ s[2] = 0x80;
+ s[3] = 0xbf;
+ do_well_formed_test (start, s + 4, n - 4);
+ }
+}
+
+/* Checks iteration through all possible sets of UTF-8 sequence lengths with
+ no more than MAX_LENGTH units. */
+static void
+well_formed_test (int max_length)
+{
+ uint8_t s[16];
+ int length;
+
+ assert (max_length <= SIZEOF (s));
+ for (length = 0; length <= max_length; length++)
+ do_well_formed_test (s, s, length);
+}
+
+int
+main (void)
+{
+#if CONFIG_UNICODE_SAFETY
+ /* This only passes if Unicode safety was compiled in, because most of the
+ sequences that it tests are ill-formed UTF-8.
+
+ Runtime increases exponentially with the argument: 4 runs in a fraction
+ of a second, 5 in a few seconds, 6 in half a minute. */
+ exhaustive_test (5);
+#endif
+
+ /* This only tests well-formed characters so it should always pass.
+
+ Runtime increases exponentially but much more slowly than with
+ exhaustive_test(). */
+ well_formed_test (10);
+
+ return 0;
+}
--
1.7.1
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [bug-libunistring] new u{8,16,32}-mb-prev-uc modules,
Ben Pfaff <=