/* unicode.c - functions to convert unicode characters */ /* Copyright (C) 2010 Free Software Foundation, Inc. This file is part of GNU Bash, the Bourne Again SHell. Bash is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Bash is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Bash. If not, see . */ #include #if defined (HANDLE_MULTIBYTE) #include #include #include #ifdef HAVE_UNISTD_H #include #endif #include #if HAVE_ICONV # include #endif #include #include "bashintl.h" #if !defined (STREQ) # define STREQ(a, b) ((a)[0] == (b)[0] && strcmp ((a), (b)) == 0) #endif /* !STREQ */ #if defined (HAVE_LOCALE_CHARSET) extern const char *locale_charset __P((void)); #else extern char *get_locale_var __P((char *)); #endif const char *charset; static int utf8locale = 0; #if defined (HAVE_ICONV) static iconv_t localconv; #endif #ifndef HAVE_LOCALE_CHARSET static char charset_buffer[40]={0}; static char * stub_charset () { char *locale, *s, *t; locale = get_locale_var ("LC_CTYPE"); if (locale == 0 || *locale == 0) { strcpy(charset_buffer, "ASCII"); } else { s = strrchr (locale, '.'); if (s) { t = strchr (s, '@'); if (t) *t = 0; strcpy(charset_buffer, s); } else { strcpy(charset_buffer, locale); } /* free(locale) If we can Modify the buffer surely we need to free it?*/ } return charset_buffer; } #endif int utf32tobig5 (s, c) char *s; unsigned long c; { int l; if (c <= 0x7F) { s[0] = (char)c; l = 1; } else if ((c >= 0x8000) && (c <= 0xFFFF)) { s[0] = (char)(c>>8); s[1] = (char)(c &0xFF); l = 2; } else { /* Error Invalid UTF-8 */ l = 0; } s[l] = '\0'; return l; } int utf32toutf8 (s, c) char *s; unsigned long c; { int l; if (c <= 0x7F) { s[0] = (char)c; l = 1; } else if (c <= 0x7FF) { s[0] = (c >> 6) | 0xc0; /* 110x xxxx */ s[1] = (c & 0x3f) | 0x80; /* 10xx xxxx */ l = 2; } else if (c <= 0xFFFF) { s[0] = (c >> 12) | 0xe0; /* 1110 xxxx */ s[1] = ((c >> 6) & 0x3f) | 0x80; /* 10xx xxxx */ s[2] = (c & 0x3f) | 0x80; /* 10xx xxxx */ l = 3; } else if (c <= 0x1FFFFF) { s[0] = (c >> 18) | 0xf0; /* 1111 0xxx */ s[1] = ((c >> 12) & 0x3f) | 0x80; /* 10xx xxxx */ s[2] = ((c >> 6) & 0x3f) | 0x80; /* 10xx xxxx */ s[3] = ( c & 0x3f) | 0x80; /* 10xx xxxx */ l = 4; } else if (c <= 0x3FFFFFF) { s[0] = (c >> 24) | 0xf8; /* 1111 10xx */ s[1] = ((c >> 18) & 0x3f) | 0x80; /* 10xx xxxx */ s[2] = ((c >> 12) & 0x3f) | 0x80; /* 10xx xxxx */ s[3] = ((c >> 6) & 0x3f) | 0x80; /* 10xx xxxx */ s[4] = ( c & 0x3f) | 0x80; /* 10xx xxxx */ l = 5; } else if (c <= 0x7FFFFFFF) { s[0] = (c >> 30) | 0xfc; /* 1111 110x */ s[1] = ((c >> 24) & 0x3f) | 0x80; /* 10xx xxxx */ s[2] = ((c >> 18) & 0x3f) | 0x80; /* 10xx xxxx */ s[3] = ((c >> 12) & 0x3f) | 0x80; /* 10xx xxxx */ s[4] = ((c >> 6) & 0x3f) | 0x80; /* 10xx xxxx */ s[5] = ( c & 0x3f) | 0x80; /* 10xx xxxx */ l = 6; } else { /* Error Invalid UTF-8 */ l = 0; } s[l] = '\0'; return l; } int utf32toutf16 (s, c) unsigned short *s; unsigned long c; { int l=0; if (c < 0xD800) { // Valid character directly convertible to 16 bits s[0] = (unsigned short)(c&0xFFFF); l=1; } else if ( (c >= 0x0000E000) && (c <= 0x0010FFFF) ) { // Character will be converted to 2 UTF-16 elements c -= 0x0010000; s[0] = (unsigned short)((c >> 10) + 0xD800); /* 1101 10XX XXXX XXXX */ s[1] = (unsigned short)((c & 0x3FFUL) + 0xDC00); /* 1101 11XX XXXX XXXX */ l=2; } s[l] = 0; return l; } int utf32towchar (ws, c) wchar_t *ws; unsigned long c; { int l=0; if ( sizeof (wchar_t) == 4) { ws[0]=c; l=1; } else if ( sizeof (wchar_t) == 2) { l=utf32toutf16(ws, c); } ws[l] = 0; return l; } /* convert a single unicode-32 character into a multibyte string and put the result in S, which must be large enough (at least MB_LEN_MAX bytes) */ int utf32tomb (s, c) char *s; unsigned long c; { size_t n=0; wchar_t wstr[3]; #if HAVE_ICONV char utf8buf[25], *optr; size_t obytesleft, sn; ICONV_CONST char *iptr; #endif #if HAVE_NL_LANGINFO char *codeset; #endif if ( n == 0 ) { /* * Encode Method 1 * UTF 0x00 -> 0x7f = ASCII Just copy */ if ( c <= 0x7f ) { s[0]=(char)c; n=1; } } #if __STDC_ISO_10646__ if ( n == 0 ) { /* * Encode Method 2 * Use wcstombs */ if( utf32towchar(wstr, c) ) n = wcstombs (s, wstr, MB_LEN_MAX); if(n == -1) /* Error Encoding so let another method try */ n=0; } #endif #if HAVE_NL_LANGINFO if ( n == 0 ) { /* * Encode Method 3 * Targets UTF-8 cool just encode. */ codeset = nl_langinfo (CODESET); if (STREQ (codeset, "UTF-8")) n = utf32toutf8 (s, c); } #endif #if HAVE_ICONV if ( n == 0 ) { /* * Encode Method 4 * Lets try iconv. */ # if HAVE_LOCALE_CHARSET charset = locale_charset (); /* XXX - fix later */ # else charset = stub_charset (); # endif /* this is mostly from coreutils-8.5/lib/unicodeio.c */ if( STREQ (charset, "UTF-8")) n = utf32toutf8 (s, c); else { localconv = iconv_open (charset, "UTF-8"); if (localconv != (iconv_t)-1) { sn = utf32toutf8 (utf8buf, c); optr = s; obytesleft = MB_LEN_MAX; iptr = utf8buf; iconv (localconv, NULL, NULL, NULL, NULL); /* Reset iconv internal state */ if (iconv (localconv, &iptr, &sn, &optr, &obytesleft) == (size_t)-1) n=0; else n=(optr - s); } } } #endif if ( n == 0 ) { /* * Error Encoding */ #if MB_LEN_MAX > 13 n=sprintf(s, "", c); /* s buffer only 24 characters long */ #endif builtin_warning (_("U+%08lx unsupported in destination charset \"%s\" "), c, charset); } s[n]=0; return n; } #endif /* HANDLE_MULTIBYTE */