From 82052a10da40a6d814ab30a944859c83bb6fe68e Mon Sep 17 00:00:00 2001 From: Julian Graham Date: Mon, 21 Dec 2009 08:46:49 -0500 Subject: [PATCH] Improved support for Unicode title case in Guile's string and character APIs. * doc/ref/api-data.texi (Characters): Documentation for `char-title-case?' and `char-titlecase'. * doc/ref/api-i18n.texi (Character Case Mapping): Documentation for `char-locale-titlecase' and `string-locale-titlecase'. * libguile/chars.c, libguile/chars.h (scm_char_title_case_p, scm_char_titlecase, scm_c_titlecase): New functions. * libguile/i18n.c, libguile/i18n.h (chr_to_case, scm_char_locale_titlecase, str_to_case, scm_string_locale_titlecase): New functions. * libguile/i18n.c (scm_char_locale_downcase, scm_char_locale_upcase, scm_string_locale_downcase, scm_string_locale_upcase): Refactor to share code via chr_to_case and str_to_case, as appropriate. * module/ice-9/i18n.scm (char-locale-title-case, string-locale-titlecase): New functions. * libguile/srfi-13.c (string_titlecase_x): Use uc_totitle instead of uc_toupper. * test-suite/tests/chars.test: Tests for `char-title-case?' and `char-titlecase'. * test-suite/tests/i18n.test: Tests for `char-locale-titlecase' and `string-locale-titlecase'. * test-suite/tests/srfi-13.test: Tests for `string-titlecase'. --- doc/ref/api-data.texi | 13 +++ doc/ref/api-i18n.texi | 12 ++ libguile/chars.c | 24 ++++ libguile/chars.h | 3 + libguile/i18n.c | 238 ++++++++++++++++++++++++----------------- libguile/i18n.h | 2 + libguile/srfi-13.c | 2 +- module/ice-9/i18n.scm | 4 +- test-suite/tests/chars.test | 13 ++- test-suite/tests/i18n.test | 13 +++ test-suite/tests/srfi-13.test | 19 ++++ 11 files changed, 242 insertions(+), 101 deletions(-) diff --git a/doc/ref/api-data.texi b/doc/ref/api-data.texi index 81f44de..e5bda97 100755 --- a/doc/ref/api-data.texi +++ b/doc/ref/api-data.texi @@ -1869,6 +1869,12 @@ Return @code{#t} iff @var{chr} is uppercase, else @code{#f}. Return @code{#t} iff @var{chr} is lowercase, else @code{#f}. @end deffn address@hidden char-title-case? address@hidden {Scheme Procedure} char-title-case? chr address@hidden {C Function} scm_char_title_case_p (chr) +Return @code{#t} iff @var{chr} is titlecase, else @code{#f}. address@hidden deffn + @deffn {Scheme Procedure} char-is-both? chr @deffnx {C Function} scm_char_is_both_p (chr) Return @code{#t} iff @var{chr} is either uppercase or lowercase, else @@ -1901,6 +1907,13 @@ Return the uppercase character version of @var{chr}. Return the lowercase character version of @var{chr}. @end deffn address@hidden char-titlecase address@hidden {Scheme Procedure} char-titlecase chr address@hidden {C Function} scm_char_titlecase (chr) +Return the titlecase character version of @var{chr} if one exists; +otherwise return the uppercase version. address@hidden deffn + @node Character Sets @subsection Character Sets diff --git a/doc/ref/api-i18n.texi b/doc/ref/api-i18n.texi index ee76544..b82a3a2 100644 --- a/doc/ref/api-i18n.texi +++ b/doc/ref/api-i18n.texi @@ -197,6 +197,12 @@ Return the uppercase character that corresponds to @var{chr} according to either @var{locale} or the current locale. @end deffn address@hidden {Scheme Procedure} char-locale-titlecase chr [locale] address@hidden {C Function} scm_char_locale_titlecase (chr, locale) +Return the titlecase character that corresponds to @var{chr} according +to either @var{locale} or the current locale. address@hidden deffn + @deffn {Scheme Procedure} string-locale-upcase str [locale] @deffnx {C Function} scm_string_locale_upcase (str, locale) Return a new string that is the uppercase version of @var{str} @@ -209,6 +215,12 @@ Return a new string that is the down-case version of @var{str} according to either @var{locale} or the current locale. @end deffn address@hidden {Scheme Procedure} string-locale-titlecase str [locale] address@hidden {C Function} scm_string_locale_titlecase (str, locale) +Return a new string that is the titlecase version of @var{str} +according to either @var{locale} or the current locale. address@hidden deffn + Note that in the current implementation Guile has no notion of multibyte characters and in a multibyte locale characters may not be converted correctly. diff --git a/libguile/chars.c b/libguile/chars.c index 68e6dc1..245e289 100644 --- a/libguile/chars.c +++ b/libguile/chars.c @@ -391,6 +391,14 @@ SCM_DEFINE (scm_char_lower_case_p, "char-lower-case?", 1, 0, 0, } #undef FUNC_NAME +SCM_DEFINE (scm_char_title_case_p, "char-title-case?", 1, 0, 0, + (SCM chr), + "Return @code{#t} iff @var{chr} is titlecase, else @code{#f}.\n") +#define FUNC_NAME s_scm_char_title_case_p +{ + return scm_char_set_contains_p (scm_char_set_title_case, chr); +} +#undef FUNC_NAME SCM_DEFINE (scm_char_is_both_p, "char-is-both?", 1, 0, 0, (SCM chr), @@ -458,6 +466,16 @@ SCM_DEFINE (scm_char_downcase, "char-downcase", 1, 0, 0, } #undef FUNC_NAME +SCM_DEFINE (scm_char_titlecase, "char-titlecase", 1, 0, 0, + (SCM chr), + "Return the titlecase character version of @var{chr}.") +#define FUNC_NAME s_scm_char_titlecase +{ + SCM_VALIDATE_CHAR (1, chr); + return SCM_MAKE_CHAR (scm_c_titlecase (SCM_CHAR(chr))); +} +#undef FUNC_NAME + @@ -480,6 +498,12 @@ scm_c_downcase (scm_t_wchar c) return uc_tolower ((int) c); } +scm_t_wchar +scm_c_titlecase (scm_t_wchar c) +{ + return uc_totitle ((int) c); +} + /* There are a few sets of character names: R5RS, Guile diff --git a/libguile/chars.h b/libguile/chars.h index 04eb9f0..f4f2390 100644 --- a/libguile/chars.h +++ b/libguile/chars.h @@ -75,13 +75,16 @@ SCM_API SCM scm_char_numeric_p (SCM chr); SCM_API SCM scm_char_whitespace_p (SCM chr); SCM_API SCM scm_char_upper_case_p (SCM chr); SCM_API SCM scm_char_lower_case_p (SCM chr); +SCM_API SCM scm_char_title_case_p (SCM chr); SCM_API SCM scm_char_is_both_p (SCM chr); SCM_API SCM scm_char_to_integer (SCM chr); SCM_API SCM scm_integer_to_char (SCM n); SCM_API SCM scm_char_upcase (SCM chr); SCM_API SCM scm_char_downcase (SCM chr); +SCM_API SCM scm_char_titlecase (SCM chr); SCM_API scm_t_wchar scm_c_upcase (scm_t_wchar c); SCM_API scm_t_wchar scm_c_downcase (scm_t_wchar c); +SCM_API scm_t_wchar scm_c_titlecase (scm_t_wchar c); SCM_INTERNAL const char *scm_i_charname (SCM chr); SCM_INTERNAL SCM scm_i_charname_to_char (const char *charname, size_t charname_len); diff --git a/libguile/i18n.c b/libguile/i18n.c index 3a6cb06..b689caf 100644 --- a/libguile/i18n.c +++ b/libguile/i18n.c @@ -1098,22 +1098,17 @@ u32_locale_tocase (const scm_t_uint32 *c_s1, size_t len, } - -SCM_DEFINE (scm_char_locale_downcase, "char-locale-downcase", 1, 1, 0, - (SCM chr, SCM locale), - "Return the lowercase character that corresponds to @var{chr} " - "according to either @var{locale} or the current locale.") -#define FUNC_NAME s_scm_char_locale_downcase +static SCM +chr_to_case (SCM chr, scm_t_locale c_locale, + scm_t_uint32 *(*func) (const scm_t_uint32 *, size_t, const char *, + uninorm_t, scm_t_uint32 *, size_t *), + int *err) { int ret; - scm_t_locale c_locale; scm_t_wchar *buf; - scm_t_uint32 *downbuf; - size_t downlen; - SCM str, downchar; - - SCM_VALIDATE_CHAR (1, chr); - SCM_VALIDATE_OPTIONAL_LOCALE_COPY (2, locale, c_locale); + scm_t_uint32 *convbuf; + size_t convlen; + SCM str, convchar; str = scm_i_make_wide_string (1, &buf); buf[0] = SCM_CHAR (chr); @@ -1121,26 +1116,49 @@ SCM_DEFINE (scm_char_locale_downcase, "char-locale-downcase", 1, 1, 0, if (c_locale != NULL) RUN_IN_LOCALE_SECTION (c_locale, ret = u32_locale_tocase ((scm_t_uint32 *) buf, 1, - &downbuf, - &downlen, u32_tolower)); + &convbuf, + &convlen, func)); else ret = - u32_locale_tocase ((scm_t_uint32 *) buf, 1, &downbuf, - &downlen, u32_tolower); + u32_locale_tocase ((scm_t_uint32 *) buf, 1, &convbuf, + &convlen, func); if (SCM_UNLIKELY (ret != 0)) { - errno = ret; - scm_syserror (FUNC_NAME); + *err = ret; + return NULL; } - if (downlen == 1) - downchar = SCM_MAKE_CHAR ((scm_t_wchar) downbuf[0]); + if (convlen == 1) + convchar = SCM_MAKE_CHAR ((scm_t_wchar) convbuf[0]); else - downchar = chr; - free (downbuf); + convchar = chr; + free (convbuf); + + return convchar; +} + +SCM_DEFINE (scm_char_locale_downcase, "char-locale-downcase", 1, 1, 0, + (SCM chr, SCM locale), + "Return the lowercase character that corresponds to @var{chr} " + "according to either @var{locale} or the current locale.") +#define FUNC_NAME s_scm_char_locale_downcase +{ + scm_t_locale c_locale; + SCM ret; + int err = 0; + + SCM_VALIDATE_CHAR (1, chr); + SCM_VALIDATE_OPTIONAL_LOCALE_COPY (2, locale, c_locale); - return downchar; + ret = chr_to_case (chr, c_locale, u32_tolower, &err); + + if (err != 0) + { + errno = err; + scm_syserror (FUNC_NAME); + } + return ret; } #undef FUNC_NAME @@ -1150,59 +1168,60 @@ SCM_DEFINE (scm_char_locale_upcase, "char-locale-upcase", 1, 1, 0, "according to either @var{locale} or the current locale.") #define FUNC_NAME s_scm_char_locale_upcase { - int ret; scm_t_locale c_locale; - scm_t_wchar *buf; - scm_t_uint32 *upbuf; - size_t uplen; - SCM str, upchar; + SCM ret; + int err = 0; SCM_VALIDATE_CHAR (1, chr); SCM_VALIDATE_OPTIONAL_LOCALE_COPY (2, locale, c_locale); - str = scm_i_make_wide_string (1, &buf); - buf[0] = SCM_CHAR (chr); + ret = chr_to_case (chr, c_locale, u32_toupper, &err); - if (c_locale != NULL) - RUN_IN_LOCALE_SECTION (c_locale, ret = - u32_locale_tocase ((scm_t_uint32 *) buf, 1, - &upbuf, - &uplen, u32_toupper)); - else - ret = - u32_locale_tocase ((scm_t_uint32 *) buf, 1, &upbuf, - &uplen, u32_toupper); + if (err != 0) + { + errno = err; + scm_syserror (FUNC_NAME); + } + return ret; +} +#undef FUNC_NAME - if (SCM_UNLIKELY (ret != 0)) +SCM_DEFINE (scm_char_locale_titlecase, "char-locale-titlecase", 1, 1, 0, + (SCM chr, SCM locale), + "Return the titlecase character that corresponds to @var{chr} " + "according to either @var{locale} or the current locale.") +#define FUNC_NAME s_scm_char_locale_titlecase +{ + scm_t_locale c_locale; + SCM ret; + int err = 0; + + SCM_VALIDATE_CHAR (1, chr); + SCM_VALIDATE_OPTIONAL_LOCALE_COPY (2, locale, c_locale); + + ret = chr_to_case (chr, c_locale, u32_totitle, &err); + + if (err != 0) { - errno = ret; + errno = err; scm_syserror (FUNC_NAME); } - if (uplen == 1) - upchar = SCM_MAKE_CHAR ((scm_t_wchar) upbuf[0]); - else - upchar = chr; - free (upbuf); - return upchar; + return ret; } #undef FUNC_NAME -SCM_DEFINE (scm_string_locale_upcase, "string-locale-upcase", 1, 1, 0, - (SCM str, SCM locale), - "Return a new string that is the uppercase version of " - "@var{str} according to either @var{locale} or the current " - "locale.") -#define FUNC_NAME s_scm_string_locale_upcase +static SCM +str_to_case (SCM str, scm_t_locale c_locale, + scm_t_uint32 *(*func) (const scm_t_uint32 *, size_t, const char *, + uninorm_t, scm_t_uint32 *, size_t *), + int *err) { scm_t_wchar *c_str, *c_buf; - scm_t_uint32 *c_upstr; - size_t len, uplen; + scm_t_uint32 *c_convstr; + size_t len, convlen; int ret; - scm_t_locale c_locale; - SCM upstr; + SCM convstr; - SCM_VALIDATE_STRING (1, str); - SCM_VALIDATE_OPTIONAL_LOCALE_COPY (2, locale, c_locale); len = scm_i_string_length (str); if (len == 0) return scm_nullstr; @@ -1211,28 +1230,52 @@ SCM_DEFINE (scm_string_locale_upcase, "string-locale-upcase", 1, 1, 0, if (c_locale) RUN_IN_LOCALE_SECTION (c_locale, ret = u32_locale_tocase ((scm_t_uint32 *) c_str, len, - &c_upstr, - &uplen, u32_toupper)); + &c_convstr, + &convlen, func)); else ret = u32_locale_tocase ((scm_t_uint32 *) c_str, len, - &c_upstr, &uplen, u32_toupper); + &c_convstr, &convlen, func); scm_remember_upto_here (str); if (SCM_UNLIKELY (ret != 0)) { - errno = ret; - scm_syserror (FUNC_NAME); + *err = ret; + return NULL; } - upstr = scm_i_make_wide_string (uplen, &c_buf); - memcpy (c_buf, c_upstr, uplen * sizeof (scm_t_wchar)); - free (c_upstr); + convstr = scm_i_make_wide_string (convlen, &c_buf); + memcpy (c_buf, c_convstr, convlen * sizeof (scm_t_wchar)); + free (c_convstr); - scm_i_try_narrow_string (upstr); + scm_i_try_narrow_string (convstr); - return upstr; + return convstr; +} + +SCM_DEFINE (scm_string_locale_upcase, "string-locale-upcase", 1, 1, 0, + (SCM str, SCM locale), + "Return a new string that is the uppercase version of " + "@var{str} according to either @var{locale} or the current " + "locale.") +#define FUNC_NAME s_scm_string_locale_upcase +{ + scm_t_locale c_locale; + SCM ret; + int err = 0; + + SCM_VALIDATE_STRING (1, str); + SCM_VALIDATE_OPTIONAL_LOCALE_COPY (2, locale, c_locale); + + ret = str_to_case (str, c_locale, u32_toupper, &err); + + if (err != 0) + { + errno = err; + scm_syserror (FUNC_NAME); + } + return ret; } #undef FUNC_NAME @@ -1243,45 +1286,46 @@ SCM_DEFINE (scm_string_locale_downcase, "string-locale-downcase", 1, 1, 0, "locale.") #define FUNC_NAME s_scm_string_locale_downcase { - scm_t_wchar *c_str, *c_buf; - scm_t_uint32 *c_downstr; - size_t len, downlen; - int ret; scm_t_locale c_locale; - SCM downstr; + SCM ret; + int err = 0; SCM_VALIDATE_STRING (1, str); SCM_VALIDATE_OPTIONAL_LOCALE_COPY (2, locale, c_locale); - len = scm_i_string_length (str); - if (len == 0) - return scm_nullstr; - SCM_STRING_TO_U32_BUF (str, c_str); - if (c_locale) - RUN_IN_LOCALE_SECTION (c_locale, ret = - u32_locale_tocase ((scm_t_uint32 *) c_str, len, - &c_downstr, - &downlen, u32_tolower)); - else - ret = - u32_locale_tocase ((scm_t_uint32 *) c_str, len, - &c_downstr, &downlen, u32_tolower); + ret = str_to_case (str, c_locale, u32_tolower, &err); - scm_remember_upto_here (str); - - if (SCM_UNLIKELY (ret != 0)) + if (err != 0) { - errno = ret; + errno = err; scm_syserror (FUNC_NAME); } + return ret; +} +#undef FUNC_NAME + +SCM_DEFINE (scm_string_locale_titlecase, "string-locale-titlecase", 1, 1, 0, + (SCM str, SCM locale), + "Return a new string that is the title-case version of " + "@var{str} according to either @var{locale} or the current " + "locale.") +#define FUNC_NAME s_scm_string_locale_titlecase +{ + scm_t_locale c_locale; + SCM ret; + int err = 0; - downstr = scm_i_make_wide_string (downlen, &c_buf); - memcpy (c_buf, c_downstr, downlen * sizeof (scm_t_wchar)); - free (c_downstr); + SCM_VALIDATE_STRING (1, str); + SCM_VALIDATE_OPTIONAL_LOCALE_COPY (2, locale, c_locale); - scm_i_try_narrow_string (downstr); + ret = str_to_case (str, c_locale, u32_totitle, &err); - return downstr; + if (err != 0) + { + errno = err; + scm_syserror (FUNC_NAME); + } + return ret; } #undef FUNC_NAME diff --git a/libguile/i18n.h b/libguile/i18n.h index 16045eb..c2792ac 100644 --- a/libguile/i18n.h +++ b/libguile/i18n.h @@ -38,8 +38,10 @@ SCM_API SCM scm_char_locale_ci_gt (SCM c1, SCM c2, SCM locale); SCM_API SCM scm_char_locale_ci_eq (SCM c1, SCM c2, SCM locale); SCM_API SCM scm_char_locale_upcase (SCM chr, SCM locale); SCM_API SCM scm_char_locale_downcase (SCM chr, SCM locale); +SCM_API SCM scm_char_locale_titlecase (SCM chr, SCM locale); SCM_API SCM scm_string_locale_upcase (SCM chr, SCM locale); SCM_API SCM scm_string_locale_downcase (SCM chr, SCM locale); +SCM_API SCM scm_string_locale_titlecase (SCM chr, SCM locale); SCM_API SCM scm_locale_string_to_integer (SCM str, SCM base, SCM locale); SCM_API SCM scm_locale_string_to_inexact (SCM str, SCM locale); diff --git a/libguile/srfi-13.c b/libguile/srfi-13.c index cf2abfc..c4e8571 100644 --- a/libguile/srfi-13.c +++ b/libguile/srfi-13.c @@ -2198,7 +2198,7 @@ string_titlecase_x (SCM str, size_t start, size_t end) { if (!in_word) { - scm_i_string_set_x (str, i, uc_toupper (SCM_CHAR (ch))); + scm_i_string_set_x (str, i, uc_totitle (SCM_CHAR (ch))); in_word = 1; } else diff --git a/module/ice-9/i18n.scm b/module/ice-9/i18n.scm index e63ec74..52d7cb4 100644 --- a/module/ice-9/i18n.scm +++ b/module/ice-9/i18n.scm @@ -42,8 +42,8 @@ char-locale-ci? char-locale-ci=? ;; character mapping - char-locale-downcase char-locale-upcase - string-locale-downcase string-locale-upcase + char-locale-downcase char-locale-upcase char-locale-titlecase + string-locale-downcase string-locale-upcase string-locale-titlecase ;; reading numbers locale-string->integer locale-string->inexact diff --git a/test-suite/tests/chars.test b/test-suite/tests/chars.test index 67e72a6..0194903 100644 --- a/test-suite/tests/chars.test +++ b/test-suite/tests/chars.test @@ -204,6 +204,13 @@ (not (char-lower-case? #\1)) (not (char-lower-case? #\+)))) + (pass-if "char-title-case?" + (and (char-title-case? #\762) + (not (char-title-case? #\A)) + (not (char-title-case? #\a)) + (not (char-title-case? #\1)) + (not (char-title-case? #\+)))) + (pass-if "char-is-both? works" (and (not (char-is-both? #\?)) @@ -245,7 +252,11 @@ (eqv? (char-upcase #\a) #\A)) (pass-if "char-downcase" - (eqv? (char-downcase #\A) #\a))) + (eqv? (char-downcase #\A) #\a)) + + (pass-if "char-titlecase" + (and (eqv? (char-titlecase #\a) #\A) + (eqv? (char-titlecase #\763) #\762)))) (with-test-prefix "charnames" diff --git a/test-suite/tests/i18n.test b/test-suite/tests/i18n.test index 89924b6..b2431dd 100644 --- a/test-suite/tests/i18n.test +++ b/test-suite/tests/i18n.test @@ -202,6 +202,14 @@ (and (eq? #\Z (char-locale-upcase #\z)) (eq? #\Z (char-locale-upcase #\z (make-locale LC_ALL "C"))))) + (pass-if "char-locale-titlecase" + (and (eq? #\T (char-locale-titlecase #\t)) + (eq? #\T (char-locale-titlecase #\t (make-locale LC_ALL "C"))))) + + (pass-if "char-locale-titlecase-Dz" + (and (eq? #\762 (char-locale-titlecase #\763)) + (eq? #\762 (char-locale-titlecase #\763 (make-locale LC_ALL "C"))))) + (pass-if "char-locale-upcase Turkish" (under-turkish-utf8-locale-or-unresolved (lambda () @@ -229,6 +237,11 @@ (and (string=? "Z" (string-locale-upcase "z")) (string=? "Z" (string-locale-upcase "z" (make-locale LC_ALL "C"))))) + (pass-if "string-locale-titlecase" + (and (string=? "Tt" (string-locale-titlecase "tt")) + (string=? "Tt" (string-locale-titlecase + "tt" (make-locale LC_ALL "C"))))) + (pass-if "string-locale-upcase Turkish" (under-turkish-utf8-locale-or-unresolved (lambda () diff --git a/test-suite/tests/srfi-13.test b/test-suite/tests/srfi-13.test index 0d2ff59..6864287 100644 --- a/test-suite/tests/srfi-13.test +++ b/test-suite/tests/srfi-13.test @@ -148,6 +148,25 @@ (string-any char-upper-case? "abCDE" 1 4)))) ;;; +;;; string-titlecase +;;; + +(with-test-prefix "string-titlecase" + + (pass-if "all-lower" + (string=? "Foo" (string-titlecase "foo"))) + + (pass-if "all-upper" + (string=? "Foo" (string-titlecase "FOO"))) + + (pass-if "two-words" + (string=? "Hello, World!" (string-titlecase "hello, world!"))) + + (pass-if "titlecase-characters" + (string=? (list->string '(#\762)) + (string-titlecase (list->string '(#\763)))))) + +;;; ;;; string-append/shared ;;; -- 1.6.3.3