/* unicode.c - functions to convert unicode characters */

/* Copyright (C) 2010 Free Software Foundation, Inc.

   This file is part of GNU Bash, the Bourne Again SHell.

   Bash is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   Bash is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with Bash.  If not, see <http://www.gnu.org/licenses/>.
*/

#include <config.h>

#if defined (HANDLE_MULTIBYTE)

#include <stdc.h>
#include <wchar.h>
#include <bashansi.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <limits.h>

#if HAVE_ICONV
#  include <iconv.h>
#endif

#include <xmalloc.h>

#include "bashintl.h"

#if !defined (STREQ)
#  define STREQ(a, b) ((a)[0] == (b)[0] && strcmp ((a), (b)) == 0)
#endif /* !STREQ */

#if defined (HAVE_LOCALE_CHARSET)
extern const char *locale_charset __P((void));
#else
extern char *get_locale_var __P((char *));
#endif

const char *charset;
static int utf8locale = 0;
#if defined (HAVE_ICONV)
static iconv_t localconv;
#endif

#ifndef HAVE_LOCALE_CHARSET
/* Holds the charset name computed by stub_charset(); used only when
   HAVE_LOCALE_CHARSET is not defined. */
static char charset_buffer[40]={0};

/* Derive a charset name from the value of LC_CTYPE as reported by
   get_locale_var().

   - If the value is null or empty, the result is "ASCII".
   - If the value contains a '.', the result is the text following the last
     '.', with any trailing "@modifier" removed (e.g. "en_US.UTF-8@euro"
     yields "UTF-8").
   - Otherwise the whole value is used unchanged.

   The result is truncated to fit charset_buffer.  The string returned by
   get_locale_var() is never modified.  Returns a pointer to the static
   charset_buffer, which is overwritten by the next call. */
static char *
stub_charset ()
{
  char *locale, *dot, *modifier;

  locale = get_locale_var ("LC_CTYPE");
  if (locale == 0 || *locale == 0)
    {
      strcpy (charset_buffer, "ASCII");
      return charset_buffer;
    }

  dot = strrchr (locale, '.');

  /* Start copying after the '.', so the result is "UTF-8" and not ".UTF-8";
     the latter would never compare equal to a real charset name. */
  strncpy (charset_buffer, dot ? dot + 1 : locale, sizeof (charset_buffer) - 1);
  charset_buffer[sizeof (charset_buffer) - 1] = '\0';

  if (dot)
    {
      /* Strip the modifier from our private copy rather than from the
         caller-visible locale string. */
      modifier = strchr (charset_buffer, '@');
      if (modifier)
        *modifier = '\0';
    }

  return charset_buffer;
}
#endif


/* Store the value C in S as a one- or two-byte sequence followed by a NUL.

   Values up to 0x7F are stored as a single byte.  Values in the range
   0x8000-0xFFFF are stored as two bytes, high byte first.  No mapping table
   is consulted: the bytes are taken directly from C.  Any other value
   leaves S as the empty string.

   S must have room for at least three bytes.  Returns the number of bytes
   stored, not counting the terminating NUL (0 on failure). */
int
utf32tobig5 (s, c)
     char *s;
     unsigned long c;
{
  int nbytes;

  nbytes = 0;
  if (c <= 0x7F)
    s[nbytes++] = (char)c;
  else if (c >= 0x8000 && c <= 0xFFFF)
    {
      s[nbytes++] = (char)(c >> 8);	/* high byte */
      s[nbytes++] = (char)(c & 0xFF);	/* low byte */
    }
  /* Anything else cannot be represented; S stays empty. */

  s[nbytes] = '\0';
  return nbytes;
}
/* Encode the UTF-32 value C as UTF-8 in S and NUL-terminate it.

   The original 31-bit UTF-8 scheme is used, so sequences of up to six
   bytes are produced for values up to 0x7FFFFFFF.  Larger values leave S
   as the empty string.

   S must have room for at least seven bytes.  Returns the number of bytes
   stored, not counting the terminating NUL (0 on failure). */
int
utf32toutf8 (s, c)
     char *s;
     unsigned long c;
{
  /* Marker bits for the first byte, indexed by total sequence length. */
  static const unsigned char lead[] =
    { 0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
  int len, i, shift;

  /* Work out how many bytes the encoding needs. */
  if (c <= 0x7F)
    len = 1;
  else if (c <= 0x7FF)
    len = 2;
  else if (c <= 0xFFFF)
    len = 3;
  else if (c <= 0x1FFFFF)
    len = 4;
  else if (c <= 0x3FFFFFF)
    len = 5;
  else if (c <= 0x7FFFFFFF)
    len = 6;
  else
    len = 0;			/* not encodable */

  if (len > 0)
    {
      /* First byte: length marker plus the most significant payload bits. */
      shift = 6 * (len - 1);
      s[0] = (c >> shift) | lead[len];

      /* Continuation bytes: 10xx xxxx, six payload bits each. */
      for (i = 1; i < len; i++)
        {
          shift -= 6;
          s[i] = ((c >> shift) & 0x3f) | 0x80;
        }
    }

  s[len] = '\0';
  return len;
}
/* Encode the UTF-32 value C as UTF-16 in S and terminate it with a zero
   code unit.

   - Values below 0xD800 and values in 0xE000-0xFFFF are stored as a single
     code unit.
   - Values in 0x10000-0x10FFFF are stored as a surrogate pair.
   - Surrogate code points (0xD800-0xDFFF) and values above 0x10FFFF cannot
     be encoded and leave S as the empty string.

   S must have room for at least three code units.  Returns the number of
   code units stored, not counting the terminator (0 on failure). */
int
utf32toutf16 (s, c)
     unsigned short *s;
     unsigned long c;
{
  int l=0;

  if (c < 0xD800 || (c >= 0xE000 && c <= 0xFFFF))
    {
      /* Basic Multilingual Plane: fits in one 16-bit unit as is. */
      s[0] = (unsigned short)(c & 0xFFFF);
      l = 1;
    }
  else if (c >= 0x10000 && c <= 0x10FFFF)
    {
      /* Supplementary planes: split the 20-bit offset into two halves.
         The range check above guarantees the subtraction cannot wrap. */
      c -= 0x10000;
      s[0] = (unsigned short)((c >> 10)     + 0xD800); /* 1101 10XX XXXX XXXX */
      s[1] = (unsigned short)((c & 0x3FFUL) + 0xDC00); /* 1101 11XX XXXX XXXX */
      l = 2;
    }

  s[l] = 0;
  return l;
}
/* Store the UTF-32 value C in WS as a zero-terminated wide-character
   string.

   When wchar_t is four bytes wide, C is stored directly as a single
   element, with no range checking.  When wchar_t is two bytes wide, the
   conversion is delegated to utf32toutf16().  For any other width WS is
   left as the empty string.

   WS must have room for at least three elements.  Returns the number of
   elements stored, not counting the terminator. */
int
utf32towchar (ws, c)
     wchar_t *ws;
     unsigned long c;
{
  int nunits;

  switch (sizeof (wchar_t))
    {
    case 4:
      ws[0] = c;
      nunits = 1;
      break;
    case 2:
      nunits = utf32toutf16 (ws, c);
      break;
    default:
      nunits = 0;
      break;
    }

  ws[nunits] = 0;
  return nunits;
}

/* Convert a single UTF-32 character C into a multibyte string in the
   current locale's encoding and put the NUL-terminated result in S.

   The following methods are tried in order until one produces output:
     1. C <= 0x7f is copied as a single byte.
     2. If wchar_t holds ISO 10646 values, utf32towchar() + wcstombs().
     3. If nl_langinfo (CODESET) reports "UTF-8", utf32toutf8().
     4. If iconv is available, utf32toutf8() directly when the locale
        charset is "UTF-8", otherwise UTF-8 converted with iconv.
   If every method fails, a warning is issued and, when MB_LEN_MAX leaves
   room for it, S receives the placeholder "<U+xxxxxxxx>".

   S must be large enough: the conversions may store up to MB_LEN_MAX
   bytes and the terminator is written after them, so callers should
   provide at least MB_LEN_MAX + 1 bytes.

   Returns the number of bytes stored, not counting the terminating NUL. */
int
utf32tomb (s, c)
     char *s;
     unsigned long c;
{
  size_t n=0;
#if __STDC_ISO_10646__
  wchar_t wstr[3];
#endif
#if HAVE_ICONV
  char utf8buf[25], *optr;
  size_t obytesleft, sn;
  ICONV_CONST char *iptr;
#endif
#if HAVE_NL_LANGINFO
  char *codeset;
#endif

  /*
   *  Encode Method 1
   *  UTF 0x00 -> 0x7f = ASCII Just copy
   */
  if ( c <= 0x7f )
    {
      s[0]=(char)c;
      n=1;
    }

#if __STDC_ISO_10646__
  if ( n == 0 )
    {
     /*
      *  Encode Method 2
      *  Use wcstombs
      */
      if( utf32towchar(wstr, c) )
	  n = wcstombs (s, wstr, MB_LEN_MAX);
      if(n == (size_t)-1)
	/* Error Encoding so let another method try */
	n=0;
    }
#endif

#if HAVE_NL_LANGINFO
  if ( n == 0 )
    {
     /*
      *  Encode Method 3
      *  Target is UTF-8, so just encode directly.
      */
      codeset = nl_langinfo (CODESET);
      if (STREQ (codeset, "UTF-8"))
	  n = utf32toutf8 (s, c);
    }
#endif

#if HAVE_ICONV
  if ( n == 0 )
    {
     /*
      *  Encode Method 4
      *  Let's try iconv.
      */
#  if HAVE_LOCALE_CHARSET
      charset = locale_charset ();	/* XXX - fix later */
#  else
      charset = stub_charset ();
#  endif
      /* this is mostly from coreutils-8.5/lib/unicodeio.c */
      if( STREQ (charset, "UTF-8"))
	  n = utf32toutf8 (s, c);
      else
	{
	  localconv = iconv_open (charset, "UTF-8");
	  if (localconv != (iconv_t)-1)
	    {
	      sn = utf32toutf8 (utf8buf, c);

	      optr = s;
	      obytesleft = MB_LEN_MAX;
	      iptr = utf8buf;

	      iconv (localconv, NULL, NULL, NULL, NULL); /* Reset iconv internal state */
	      if (iconv (localconv, &iptr, &sn, &optr, &obytesleft) == (size_t)-1)
		n=0;
	      else
		n=(optr - s);

	      /* A descriptor is opened on every call, so release it here;
		 otherwise each conversion leaks one. */
	      iconv_close (localconv);
	    }
	}
    }
#endif

  if ( n == 0 )
    {
      /*
       *  Error Encoding
       */
#if  MB_LEN_MAX > 13
      /* "<U+%08lx>" needs exactly 13 bytes (including the NUL) only while
	 C fits in eight hex digits; a wider value would print more digits
	 and overrun the space this guard accounts for. */
      if (c <= 0xFFFFFFFFUL)
	n=sprintf(s, "<U+%08lx>", c);
#endif
      /* charset is only assigned by the iconv method above, so it may
	 still be null here; never hand a null pointer to %s. */
      builtin_warning (_("U+%08lx unsupported in destination charset \"%s\" "), c, charset ? charset : "unknown");
    }
  s[n]=0;
  return n;
}

#endif /* HANDLE_MULTIBYTE */