From eb8f72e7d238ee3351411b903898075c2787fc07 Mon Sep 17 00:00:00 2001 From: Julian Graham Date: Sat, 2 Jan 2010 00:06:23 -0500 Subject: [PATCH] Support for Unicode string normalization functions * libguile/strings.c, libguile/strings.h (normalize_str, scm_string_normalize_nfc, scm_string_normalize_nfd, scm_string_normalize_nfkc, scm_string_normalize_nfkd): New functions. * test-suite/tests/strings.test: Unit tests for `string-normalize-nfc', `string-normalize-nfd', `string-normalize-nfkc', and `string-normalize-nfkd'. * doc/ref/api-data.texi (String Comparison): Documentation for normalization functions. --- doc/ref/api-data.texi | 64 ++++++++++++++++++++++++++++++++++++ libguile/strings.c | 73 +++++++++++++++++++++++++++++++++++++++++ libguile/strings.h | 5 +++ test-suite/tests/strings.test | 40 ++++++++++++++++++++++ 4 files changed, 182 insertions(+), 0 deletions(-) diff --git a/doc/ref/api-data.texi b/doc/ref/api-data.texi index b959ab9..9cc8ea5 100755 --- a/doc/ref/api-data.texi +++ b/doc/ref/api-data.texi @@ -3252,6 +3252,70 @@ Compute a hash value for @var{S}. the optional argument @var{bound} is a non-ne Compute a hash value for @var{S}. the optional argument @var{bound} is a non-negative exact integer specifying the range of the hash function. A positive value restricts the return value to the range [0,bound). @end deffn +Because the same visual appearance of an abstract Unicode character can +be obtained via multiple sequences of Unicode characters, even the +case-insensitive string comparison functions described above may return +@code{#f} when presented with strings containing different +representations of the same character. For example, the Unicode +character ``LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE'' can be +represented with a single character (U+1E69) or by the character ``LATIN +SMALL LETTER S'' (U+0073) followed by the combining marks ``COMBINING +DOT BELOW'' (U+0323) and ``COMBINING DOT ABOVE'' (U+0307).
+ +For this reason, it is often desirable to ensure that the strings +to be compared are using a mutually consistent representation for every +character. The Unicode standard defines two methods of normalizing the +contents of strings: decomposition, which breaks composite characters +into a set of constituent characters with an ordering defined by the +Unicode Standard; and composition, which performs the converse. + +There are two decomposition operations. ``Canonical decomposition'' +produces character sequences that share the same visual appearance as +the original characters, while ``compatibility decomposition'' produces +ones whose visual appearances may differ from the originals but which +represent the same abstract character. + +These operations are encapsulated in the following set of normalization +forms: + +@table @dfn +@item NFD +Characters are decomposed to their canonical forms. + +@item NFKD +Characters are decomposed to their compatibility forms. + +@item NFC +Characters are decomposed to their canonical forms, then composed. + +@item NFKC +Characters are decomposed to their compatibility forms, then composed. + +@end table + +The functions below put their arguments into one of the forms described +above. + +@deffn {Scheme Procedure} string-normalize-nfd s +@deffnx {C Function} scm_string_normalize_nfd (s) +Return the @code{NFD} normalized form of @var{s}. +@end deffn + +@deffn {Scheme Procedure} string-normalize-nfkd s +@deffnx {C Function} scm_string_normalize_nfkd (s) +Return the @code{NFKD} normalized form of @var{s}. +@end deffn + +@deffn {Scheme Procedure} string-normalize-nfc s +@deffnx {C Function} scm_string_normalize_nfc (s) +Return the @code{NFC} normalized form of @var{s}.
+@end deffn + +@deffn {Scheme Procedure} string-normalize-nfkc s +@deffnx {C Function} scm_string_normalize_nfkc (s) +Return the @code{NFKC} normalized form of @var{s}. +@end deffn + @node String Searching @subsubsection String Searching diff --git a/libguile/strings.c b/libguile/strings.c index 711da9c..84df48a 100644 --- a/libguile/strings.c +++ b/libguile/strings.c @@ -25,6 +25,7 @@ #include #include #include +#include <uninorm.h> #include #include @@ -1736,6 +1737,78 @@ scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len) return len; } +/* This function is a partial clone of SCM_STRING_TO_U32_BUF from + libguile/i18n.c. It would be useful to have this factored out into a more + convenient location, but its use of alloca makes that tricky to do. */ + +static SCM +normalize_str (SCM string, uninorm_t form) +{ + SCM ret; + scm_t_uint32 *w_str; + scm_t_wchar *cbuf; + size_t rlen, len = scm_i_string_length (string); + + if (scm_i_is_narrow_string (string)) + { + size_t i; + const char *buf = scm_i_string_chars (string); + + w_str = alloca (sizeof (scm_t_wchar) * (len + 1)); + + for (i = 0; i < len; i ++) + w_str[i] = (unsigned char) buf[i]; + w_str[len] = 0; + } + else w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string); + w_str = u32_normalize (form, w_str, len, NULL, &rlen); + + ret = scm_i_make_wide_string (rlen, &cbuf); + u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen); + free (w_str); + return ret; +} + +SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0, + (SCM string), + "Returns the NFC normalized form of @var{string}.") +#define FUNC_NAME s_scm_string_normalize_nfc +{ + SCM_VALIDATE_STRING (1, string); + return normalize_str (string, UNINORM_NFC); +} +#undef FUNC_NAME + +SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0, + (SCM string), + "Returns the NFD normalized form of @var{string}.") +#define FUNC_NAME s_scm_string_normalize_nfd +{ + SCM_VALIDATE_STRING (1, string); + return
normalize_str (string, UNINORM_NFD); +} +#undef FUNC_NAME + +SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0, + (SCM string), + "Returns the NFKC normalized form of @var{string}.") +#define FUNC_NAME s_scm_string_normalize_nfkc +{ + SCM_VALIDATE_STRING (1, string); + return normalize_str (string, UNINORM_NFKC); +} +#undef FUNC_NAME + +SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0, + (SCM string), + "Returns the NFKD normalized form of @var{string}.") +#define FUNC_NAME s_scm_string_normalize_nfkd +{ + SCM_VALIDATE_STRING (1, string); + return normalize_str (string, UNINORM_NFKD); +} +#undef FUNC_NAME + /* converts C scm_array of strings to SCM scm_list of strings. */ /* If argc < 0, a null terminated scm_array is assumed. */ SCM diff --git a/libguile/strings.h b/libguile/strings.h index edff0f8..6eafafa 100644 --- a/libguile/strings.h +++ b/libguile/strings.h @@ -142,6 +142,11 @@ SCM_INTERNAL char *scm_to_stringn (SCM str, size_t *lenp, SCM_INTERNAL scm_t_uint8 *scm_i_to_utf8_string (SCM str); SCM_API size_t scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len); +SCM_API SCM scm_string_normalize_nfd (SCM str); +SCM_API SCM scm_string_normalize_nfkd (SCM str); +SCM_API SCM scm_string_normalize_nfc (SCM str); +SCM_API SCM scm_string_normalize_nfkc (SCM str); + SCM_API SCM scm_makfromstrs (int argc, char **argv); diff --git a/test-suite/tests/strings.test b/test-suite/tests/strings.test index 013c1a8..984178d 100644 --- a/test-suite/tests/strings.test +++ b/test-suite/tests/strings.test @@ -386,6 +386,46 @@ (string-ci>=? (string-ints 0) (string-ints 255))))) ;; +;; Unicode string normalization forms +;; + +;; +;; string-normalize-nfd +;; + +(with-test-prefix "string-normalize-nfd" + + (pass-if "canonical decomposition is equal?" + (equal? 
(string-normalize-nfd "\xe9") "\x65\u0301"))) + +;; +;; string-normalize-nfkd +;; + +(with-test-prefix "string-normalize-nfkd" + + (pass-if "compatibility decomposition is equal?" + (equal? (string-normalize-nfkd "\u1e9b\u0323") "s\u0323\u0307"))) + +;; +;; string-normalize-nfc +;; + +(with-test-prefix "string-normalize-nfc" + + (pass-if "canonical composition is equal?" + (equal? (string-normalize-nfc "\x65\u0301") "\xe9"))) + +;; +;; string-normalize-nfkc +;; + +(with-test-prefix "string-normalize-nfkc" + + (pass-if "compatibility composition is equal?" + (equal? (string-normalize-nfkc "\u1e9b\u0323") "\u1e69"))) + +;; ;; string-ref ;; -- 1.6.3.3