/* unicode.c - functions to convert unicode characters */
/* Copyright (C) 2010 Free Software Foundation, Inc.
This file is part of GNU Bash, the Bourne Again SHell.
Bash is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Bash is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Bash. If not, see <http://www.gnu.org/licenses/>.
*/
#include
#if defined (HANDLE_MULTIBYTE)
#include
#include
#include
#ifdef HAVE_UNISTD_H
#include
#endif
#include
#if HAVE_ICONV
# include
#endif
#include
#include "bashintl.h"
/* String equality test: compares the first characters before falling back
   to strcmp.  Both arguments are evaluated more than once, so they must
   not have side effects. */
#if !defined (STREQ)
# define STREQ(a, b) ((a)[0] == (b)[0] && strcmp ((a), (b)) == 0)
#endif /* !STREQ */
/* Source of the locale's character set name: locale_charset when the
   build provides it, otherwise stub_charset derives one from the value
   returned by get_locale_var ("LC_CTYPE"). */
#if defined (HAVE_LOCALE_CHARSET)
extern const char *locale_charset __P((void));
#else
extern char *get_locale_var __P((char *));
#endif
/* Name of the destination character set.  Assigned in utf32tomb's iconv
   path and passed to the warning printed when encoding fails. */
const char *charset;
/* NOTE(review): not referenced by any function visible in this file. */
static int utf8locale = 0;
#if defined (HAVE_ICONV)
/* Conversion descriptor that utf32tomb obtains from iconv_open
   (charset, "UTF-8"). */
static iconv_t localconv;
#endif
#ifndef HAVE_LOCALE_CHARSET
/* Backing store for the name returned by stub_charset. */
static char charset_buffer[40] = {0};

/* Derive a character set name from the LC_CTYPE locale value when
   locale_charset is not available.

   Returns a pointer to the static CHARSET_BUFFER, overwritten by each call:
     - "ASCII" when LC_CTYPE is unset or empty;
     - the text after the last '.' in the locale value, with any
       "@modifier" suffix removed (e.g. "en_US.UTF-8@euro" -> "UTF-8");
     - the whole locale value when it contains no '.'.

   The name is truncated to fit CHARSET_BUFFER.  The string returned by
   get_locale_var is only read, never modified or freed here. */
static char *
stub_charset ()
{
  char *locale, *name, *modifier;

  locale = get_locale_var ("LC_CTYPE");
  if (locale == 0 || *locale == 0)
    {
      strcpy (charset_buffer, "ASCII");
      return charset_buffer;
    }

  name = strrchr (locale, '.');
  if (name == 0)
    {
      /* No codeset component: hand back the locale name itself. */
      strncpy (charset_buffer, locale, sizeof (charset_buffer) - 1);
      charset_buffer[sizeof (charset_buffer) - 1] = '\0';
      return charset_buffer;
    }

  /* Skip the '.' so the result is "UTF-8", not ".UTF-8". */
  strncpy (charset_buffer, name + 1, sizeof (charset_buffer) - 1);
  charset_buffer[sizeof (charset_buffer) - 1] = '\0';

  /* Strip the modifier from our private copy rather than from the
     caller-visible locale string. */
  modifier = strchr (charset_buffer, '@');
  if (modifier)
    *modifier = '\0';

  return charset_buffer;
}
#endif
/* Store the value C into S as a one- or two-byte sequence and return the
   number of bytes stored; S is always NUL-terminated.

     C <= 0x7F            one byte, the value itself
     0x8000 <= C <= 0xFFFF  two bytes, high byte first
     anything else        nothing stored, returns 0

   No Unicode-to-Big5 mapping table is consulted: a 16-bit value is simply
   split into its two bytes.  S needs room for at least 3 bytes. */
int
utf32tobig5 (s, c)
     char *s;
     unsigned long c;
{
  int nbytes;

  nbytes = 0;
  if (c <= 0x7F)
    s[nbytes++] = (char)c;
  else if (c >= 0x8000 && c <= 0xFFFF)
    {
      s[nbytes++] = (char)(c >> 8);
      s[nbytes++] = (char)(c & 0xFF);
    }
  /* Out-of-range values fall through with nbytes still 0. */

  s[nbytes] = '\0';
  return nbytes;
}
/* Encode the value C as UTF-8 into S and return the number of bytes
   produced (1 to 6); S is always NUL-terminated, so it needs room for at
   least 7 bytes.

   The original (pre-RFC 3629) scheme is used: values up to 0x7FFFFFFF are
   encoded, in up to six bytes, and neither surrogates nor values above
   0x10FFFF are rejected.  A value above 0x7FFFFFFF yields an empty string
   and a return value of 0. */
int
utf32toutf8 (s, c)
     char *s;
     unsigned long c;
{
  /* Marker bits of the first byte, indexed by sequence length. */
  static const unsigned char lead[7] =
    { 0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
  int len, i;

  if (c <= 0x7FUL)
    len = 1;
  else if (c <= 0x7FFUL)
    len = 2;
  else if (c <= 0xFFFFUL)
    len = 3;
  else if (c <= 0x1FFFFFUL)
    len = 4;
  else if (c <= 0x3FFFFFFUL)
    len = 5;
  else if (c <= 0x7FFFFFFFUL)
    len = 6;
  else
    len = 0;			/* not encodable */

  /* Fill continuation bytes (10xx xxxx) from the end, six bits at a time;
     whatever is left of C afterwards belongs in the lead byte. */
  for (i = len - 1; i > 0; i--)
    {
      s[i] = (char)((c & 0x3f) | 0x80);
      c >>= 6;
    }
  if (len > 0)
    s[0] = (char)(c | lead[len]);

  s[len] = '\0';
  return len;
}
/* Encode the code point C as UTF-16 into S and return the number of 16-bit
   units produced; S is always terminated with a 0 unit, so it needs room
   for at least 3 elements.

     C < 0xD800, or 0xE000 <= C <= 0xFFFF   one unit, the value itself
     0x10000 <= C <= 0x10FFFF               two units (surrogate pair)
     surrogates 0xD800..0xDFFF, C > 0x10FFFF  nothing stored, returns 0 */
int
utf32toutf16 (s, c)
     unsigned short *s;
     unsigned long c;
{
  int l;

  l = 0;
  if (c < 0xD800UL || (c >= 0xE000UL && c <= 0xFFFFUL))
    {
      /* Basic Multilingual Plane, outside the surrogate range: the code
	 point is its own UTF-16 encoding. */
      s[0] = (unsigned short)(c & 0xFFFFUL);
      l = 1;
    }
  else if (c >= 0x10000UL && c <= 0x10FFFFUL)
    {
      /* Supplementary plane: split the 20-bit offset from 0x10000 into a
	 high and a low surrogate. */
      c -= 0x10000UL;
      s[0] = (unsigned short)((c >> 10) + 0xD800);	/* 1101 10XX XXXX XXXX */
      s[1] = (unsigned short)((c & 0x3FFUL) + 0xDC00);	/* 1101 11XX XXXX XXXX */
      l = 2;
    }

  s[l] = 0;
  return l;
}
/* Store the code point C into WS as wide characters, terminate WS with a
   0 element, and return the number of elements stored before it.

   With a 4-byte wchar_t, C is stored unchanged as a single element.  With
   a 2-byte wchar_t the work is delegated to utf32toutf16.  For any other
   wchar_t size nothing is stored and 0 is returned. */
int
utf32towchar (ws, c)
     wchar_t *ws;
     unsigned long c;
{
  int count;

  switch (sizeof (wchar_t))
    {
    case 4:
      ws[0] = c;
      count = 1;
      break;
    case 2:
      count = utf32toutf16 (ws, c);
      break;
    default:
      count = 0;
      break;
    }

  ws[count] = 0;
  return count;
}
/* Convert the single UTF-32 code point C into a multibyte string in the
   locale's encoding, store it NUL-terminated in S, and return the number
   of bytes stored before the terminator.

   S must be large enough: the conversions below may produce up to
   MB_LEN_MAX bytes and the terminator is written after them, so callers
   should provide at least MB_LEN_MAX + 1 bytes.

   The encoders are tried in order until one produces output:
     1. C <= 0x7f is copied as a single byte;
     2. wcstombs, when wchar_t holds ISO 10646 values;
     3. direct UTF-8 encoding, when nl_langinfo (CODESET) is "UTF-8";
     4. iconv from UTF-8 to the locale's character set.
   If all fail, a "<U+xxxxxxxx>" placeholder is stored (when MB_LEN_MAX
   leaves room for it) and a warning is issued. */
int
utf32tomb (s, c)
     char *s;
     unsigned long c;
{
  size_t n;
  wchar_t wstr[3];
  const char *csname;
#if HAVE_ICONV
  char utf8buf[25], *optr;
  size_t obytesleft, sn;
  ICONV_CONST char *iptr;
#endif
#if HAVE_NL_LANGINFO
  char *codeset;
#endif

  n = 0;

  /* Encode Method 1: 0x00 -> 0x7f is ASCII, just copy. */
  if (c <= 0x7f)
    {
      s[0] = (char)c;
      n = 1;
    }

#if __STDC_ISO_10646__
  /* Encode Method 2: use wcstombs. */
  if (n == 0)
    {
      if (utf32towchar (wstr, c))
	n = wcstombs (s, wstr, MB_LEN_MAX);
      if (n == (size_t)-1)
	n = 0;		/* encoding error, let another method try */
    }
#endif

#if HAVE_NL_LANGINFO
  /* Encode Method 3: the target is UTF-8, so encode directly. */
  if (n == 0)
    {
      codeset = nl_langinfo (CODESET);
      if (STREQ (codeset, "UTF-8"))
	n = utf32toutf8 (s, c);
    }
#endif

#if HAVE_ICONV
  /* Encode Method 4: let iconv convert from UTF-8. */
  if (n == 0)
    {
# if HAVE_LOCALE_CHARSET
      charset = locale_charset ();	/* XXX - fix later */
# else
      charset = stub_charset ();
# endif
      /* this is mostly from coreutils-8.5/lib/unicodeio.c */
      if (STREQ (charset, "UTF-8"))
	n = utf32toutf8 (s, c);
      else
	{
	  localconv = iconv_open (charset, "UTF-8");
	  if (localconv != (iconv_t)-1)
	    {
	      sn = utf32toutf8 (utf8buf, c);
	      optr = s;
	      obytesleft = MB_LEN_MAX;
	      iptr = utf8buf;
	      iconv (localconv, NULL, NULL, NULL, NULL);	/* reset iconv internal state */
	      if (iconv (localconv, &iptr, &sn, &optr, &obytesleft) == (size_t)-1)
		n = 0;
	      else
		n = (optr - s);
	      /* A descriptor is opened on every call, so release it here
		 instead of leaking one per conversion. */
	      iconv_close (localconv);
	      localconv = (iconv_t)-1;
	    }
	}
    }
#endif

  if (n == 0)
    {
      /* Error encoding: nothing could represent C. */
#if MB_LEN_MAX > 13
      /* Placeholder is 12 bytes plus the terminator; C is masked to 32
	 bits so a wide unsigned long cannot print more than 8 digits. */
      n = sprintf (s, "<U+%08lx>", c & 0xFFFFFFFFUL);
#endif
      /* charset is only assigned by the iconv path above; never hand a
	 null pointer to %s. */
      csname = charset ? charset : "";
      builtin_warning (_("U+%08lx unsupported in destination charset \"%s\" "), c, csname);
    }

  s[n] = 0;
  return n;
}
#endif /* HANDLE_MULTIBYTE */