bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [bug-gnulib] iconv made easy


From: Bruno Haible
Subject: Re: [bug-gnulib] iconv made easy
Date: Mon, 13 Dec 2004 12:33:57 +0100
User-agent: KMail/1.5

Simon Josefsson wrote:
> What do you think about this module?

I like the idea, because although powerful, iconv() is hard to use the
right way.

Your function returns a memory block that is often way too large and thus
wastes memory.

In many applications, I guess, the (to_codeset, from_codeset) pair will be
the same for many strings. Therefore I think it's worth providing a function
that takes an iconv_t and doesn't need to iconv_open/iconv_close.

> I know the function doesn't handle embedded ASCII #0

iconv() handles NUL bytes correctly; you don't need to handle them specially.

> I'm thinking of another API, 'iconv_lz', that would take
> zero-terminated strings in the locale's code set, and convert them to
> a specified code set.  But that would need nl_langinfo(CODESET) so it
> wouldn't be thread safe

You have a strange notion of "thread safe". nl_langinfo(CODESET) can be
called in multiple threads simultaneously without locking. It's only
setlocale() in other threads that can disturb nl_langinfo(CODESET).
Therefore IMO it's setlocale() which is not MT-safe.

> +  p = (ICONV_CONST char *) str;
> + ...
> +  err = iconv (cd, (ICONV_CONST char **) &p, &inbytes_remaining,

One of these two occurrences of ICONV_CONST is unnecessary.


Appended you will find two alternative implementations: one taken from
gettext (and tested for 4 years), and another one, taken from my unfinished
libunistring - more powerful but untested.

You will notice that there are two approaches to converting a string:
a) allocate an initial buffer and extend it as needed, stopping and
   restarting iconv() each time a realloc is needed,
b) call iconv() once to determine the length and then once again for
   filling the result string.
I never measured which one is more efficient; probably it will also
depend on the iconv implementation (glibc is noticeably faster for
large strings than for small ones) and on the initial buffer size in
case a).

Which API do you find worth pursuing?

Bruno


========================== gettext's .h file ===========================

#if HAVE_ICONV
#include <iconv.h>
#endif

#if HAVE_ICONV
/* Converts the STRING through the conversion descriptor CD.  */
extern char *convert_string (iconv_t cd, const char *string);
#endif

/* Converts the NUL terminated STRING from encoding FROM_CODE to encoding
   TO_CODE.
   Returns a copy of STRING if FROM_CODE and TO_CODE are the same, the
   converted string otherwise, or NULL if no conversion descriptor could be
   opened for this pair of encodings.  */
extern char *iconv_string (const char *string, const char *from_code, const 
char *to_code);

========================== gettext's .c file ===========================

#if HAVE_ICONV

/* Converts an entire string from one encoding to another, using iconv.
   CD is the conversion descriptor to use.
   The input is the byte range from START (inclusive) to END (exclusive).
   *RESULTP is resized with xrealloc to exactly the size of the converted
   output and receives that output; *LENGTHP receives its size in bytes.
   The conversion is done in two passes over the input: the first one only
   counts the output bytes, the second one fills the result.
   Return value: 0 if successful, otherwise -1 and errno set.  */
static int
iconv_string (iconv_t cd, const char *start, const char *end,
              char **resultp, size_t *lengthp)
{
#define tmpbufsize 4096
  size_t length;
  char *result;

  /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
# if defined _LIBICONV_VERSION \
    || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
  /* Set to the initial state.  */
  iconv (cd, NULL, NULL, NULL, NULL);
# endif

  /* Determine the length we need.  */
  {
    size_t count = 0;
    /* Scratch output area; its contents are discarded, only the number of
       bytes written into it is accumulated in COUNT.  */
    char tmpbuf[tmpbufsize];
    const char *inptr = start;
    size_t insize = end - start;

    while (insize > 0)
      {
        /* Restart at the beginning of the scratch area on every round.  */
        char *outptr = tmpbuf;
        size_t outsize = tmpbufsize;
        size_t res = iconv (cd,
                            (ICONV_CONST char **) &inptr, &insize,
                            &outptr, &outsize);

        if (res == (size_t)(-1))
          {
            /* E2BIG: the scratch area is full; count what was produced and
               go on with the rest of the input.  */
            if (errno == E2BIG)
              ;
            /* EINVAL: the remaining input is not converted; stop here.  */
            else if (errno == EINVAL)
              break;
            else
              return -1;
          }
# if !defined _LIBICONV_VERSION && (defined sgi || defined __sgi)
        /* Irix iconv() inserts a NUL byte if it cannot convert.  */
        else if (res > 0)
          return -1;
# endif
        count += outptr - tmpbuf;
      }
    /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  */
# if defined _LIBICONV_VERSION \
    || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
    {
      /* Also count the bytes emitted by the final iconv call that has no
         input.  */
      char *outptr = tmpbuf;
      size_t outsize = tmpbufsize;
      size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);

      if (res == (size_t)(-1))
        return -1;
      count += outptr - tmpbuf;
    }
# endif
    length = count;
  }

  *lengthp = length;
  *resultp = result = xrealloc (*resultp, length);
  if (length == 0)
    return 0;

  /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
# if defined _LIBICONV_VERSION \
    || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
  /* Return to the initial state.  */
  iconv (cd, NULL, NULL, NULL, NULL);
# endif

  /* Do the conversion for real.  */
  {
    const char *inptr = start;
    size_t insize = end - start;
    char *outptr = result;
    size_t outsize = length;

    while (insize > 0)
      {
        size_t res = iconv (cd,
                            (ICONV_CONST char **) &inptr, &insize,
                            &outptr, &outsize);

        if (res == (size_t)(-1))
          {
            /* Unlike in the counting pass, E2BIG is treated as a failure
               here: the result buffer was sized by the first pass.  */
            if (errno == EINVAL)
              break;
            else
              return -1;
          }
# if !defined _LIBICONV_VERSION && (defined sgi || defined __sgi)
        /* Irix iconv() inserts a NUL byte if it cannot convert.  */
        else if (res > 0)
          return -1;
# endif
      }
    /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  */
# if defined _LIBICONV_VERSION \
    || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
    {
      size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);

      if (res == (size_t)(-1))
        return -1;
    }
# endif
    /* Both passes must have produced the same number of bytes.  */
    if (outsize != 0)
      abort ();
  }

  return 0;
#undef tmpbufsize
}

/* Converts the NUL terminated STRING through the conversion descriptor CD.
   The terminating NUL is converted along with the string.  Returns the
   converted string; if the conversion fails, or if the output does not
   contain exactly one NUL byte at its very end, a "conversion failure"
   error is reported through error() with EXIT_FAILURE.  */
char *
convert_string (iconv_t cd, const char *string)
{
  /* One past the terminating NUL of the input.  */
  const char *string_end = string + strlen (string) + 1;
  char *converted = NULL;
  size_t converted_len;
  int status =
    iconv_string (cd, string, string_end, &converted, &converted_len);

  /* Accept the output only if it is a proper C string filling the whole
     converted region.  */
  if (status == 0
      && converted_len > 0
      && converted[converted_len - 1] == '\0'
      && strlen (converted) == converted_len - 1)
    return converted;

  error (EXIT_FAILURE, 0, _("conversion failure"));
  /* NOTREACHED */
  return NULL;
}

/* Converts the NUL terminated STRING from encoding FROM_CODE to encoding
   TO_CODE.
   Returns a freshly allocated copy of STRING if both encodings are the
   same.  Otherwise returns the converted string, or NULL if no conversion
   between the two encodings is available (iconv_open failed, or the program
   was built without iconv).  A failure during the conversion itself is
   handled by convert_string.  */
char *
iconv_string (const char *string, const char *from_code, const char *to_code)
{
  /* If the two encodings are the same, nothing to do.  Compare the names
     themselves, not only the pointers: callers may pass equal names that
     live in different buffers.  */
  if (from_code == to_code || strcmp (from_code, to_code) == 0)
    return xstrdup (string);
  else
    {
#if HAVE_ICONV
      iconv_t cd;
      char *result;

      /* Avoid glibc-2.1 bug with EUC-KR.  */
# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
      if (strcmp (from_code, "EUC-KR") == 0)
        cd = (iconv_t)(-1);
      else
# endif
      cd = iconv_open (to_code, from_code);
      if (cd == (iconv_t)(-1))
        return NULL;
      result = convert_string (cd, string);
      iconv_close (cd);
      return result;
#else
      /* Without iconv, no conversion between different encodings is
         possible.  */
      return NULL;
#endif
    }
}

========================== libunistring's .h file ===========================
/* Conversions between Unicode and legacy encodings.
   Copyright (C) 2002 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify it
   under the terms of the GNU Library General Public License as published
   by the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
   USA.  */

#ifndef _UNICONV_H
#define _UNICONV_H

#include "unitypes.h"

/* Get size_t.  */
#include <stddef.h>


#ifdef __cplusplus
extern "C" {
#endif

/* Handling of unconvertible characters.
   The conversion functions take one of these values as their HANDLER
   argument; it selects what happens when a character of the input cannot
   be converted.  */
enum uniconv_ilseq_handler
{
  uniconv_error,                /* return and set errno = EILSEQ */
  uniconv_question_mark,        /* use one '?' per unconvertible character */
  uniconv_escape_sequence       /* use escape sequence \uxxxx or \Uxxxxxxxx;
                                   a '?' is used where the character's
                                   Unicode value cannot be determined */
};

/* Converts an entire string, possibly including NUL bytes, from one encoding
   to another.
   Converts a memory region given in encoding FROMCODE to a new memory
   region in encoding TOCODE. FROMCODE and TOCODE are as for iconv_open(3).
   HANDLER selects the treatment of unconvertible characters.
   The input is in the memory region between start (inclusive) and end
   (exclusive).  If resultp is not NULL, the output string is stored in
   *resultp; malloc/realloc is used to allocate the result.  If lengthp is
   not NULL, the number of bytes of the output is stored in *lengthp.  For
   an empty input, *resultp is set to NULL and *lengthp to 0.  If offsets is
   not NULL, it should point to an array of end-start integers; this array
   is filled with offsets into the result, i.e. the character starting at
   start[i] corresponds to the character starting at (*resultp)[offsets[i]],
   and other offsets are set to (size_t)(-1).
   Return value: 0 if successful, otherwise -1 and errno set.
   Particular errno values: EINVAL, EILSEQ, ENOMEM.  */
extern int
       uniconv_string PARAMS ((const char *tocode, const char *fromcode,
                               enum uniconv_ilseq_handler handler,
                               const char *start, const char *end,
                               size_t *offsets,
                               char **resultp, size_t *lengthp));

/* Converts an entire string, possibly including NUL bytes, from encoding
   FROMCODE to UTF-8.
   The arguments are as for uniconv_string, with UTF-8 as the target
   encoding and the result expressed in uint8_t units.  If FROMCODE is
   UTF-8, the input is only checked for validity and copied.
   Return value: 0 if successful, otherwise -1 and errno set.
   Particular errno values: EINVAL, EILSEQ, ENOMEM.  */
extern int
       u8_conv_from_encoding PARAMS ((const char *fromcode,
                                      enum uniconv_ilseq_handler handler,
                                      const char *start, const char *end,
                                      size_t *offsets,
                                      uint8_t **resultp, size_t *lengthp));

/* Converts a NUL terminated string from a given encoding.
   The result is malloc allocated, or NULL (with errno set) in case of error.
   Particular errno values: EILSEQ, ENOMEM.  */
extern uint8_t *
       u8_strconv_from_encoding PARAMS ((const char *string,
                                         const char *fromcode,
                                         enum uniconv_ilseq_handler handler));

/* Converts a NUL terminated string to a given encoding.
   The result is malloc allocated, or NULL (with errno set) in case of error.
   Particular errno values: EILSEQ, ENOMEM.  */
extern char *
       u8_strconv_to_encoding PARAMS ((const uint8_t *string,
                                       const char *tocode,
                                       enum uniconv_ilseq_handler handler));

/* Return the encoding of the current locale.  */
extern const char *
       locale_charset PARAMS ((void));

/* Converts a NUL terminated string from the locale encoding.
   Unconvertible characters are replaced with '?'.
   The result is malloc allocated, or NULL (with errno set) in case of error.
   Particular errno values: ENOMEM.  */
extern uint8_t *
       u8_conv_from_locale PARAMS ((const char *string));

/* Converts a NUL terminated string to the locale encoding.
   Unconvertible characters are replaced with '?'.
   The result is malloc allocated, or NULL (with errno set) in case of error.
   Particular errno values: ENOMEM.  */
extern char *
       u8_conv_to_locale PARAMS ((const uint8_t *string));

/* In all of the above, FROMCODE can also be one of the following values:
      "autodetect_utf8"         supports ISO-8859-1 and UTF-8
      "autodetect_jp"           supports EUC-JP, ISO-2022-JP-2 and SHIFT_JIS
      "autodetect_kr"           supports EUC-KR and ISO-2022-KR
   More names can be defined for autodetection.  */

/* Registers an encoding name for autodetection.
   TRY_IN_ORDER is a NULL terminated list of encodings to be tried; it must
   contain at least one encoding.  NAME and TRY_IN_ORDER are copied, so the
   caller may release them afterwards.
   Returns 0 upon success, or -1 (with errno set) in case of error.
   Particular errno values: EINVAL, ENOMEM.  */
extern int
       uniconv_register_autodetect PARAMS ((const char *name,
                                            const char * const *try_in_order));

#ifdef __cplusplus
}
#endif

#endif /* _UNICONV_H */

========================== libunistring's .c file ===========================
/* Conversions between Unicode and legacy encodings.
   Copyright (C) 2002 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify it
   under the terms of the GNU Library General Public License as published
   by the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
   USA.  */

/* Written by Bruno Haible <address@hidden>.  */

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include "liballoca.h"

/* Specification.  */
#include "uniconv.h"

#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#if HAVE_ICONV
# include <iconv.h>
#endif

#include "strcaseeq.h"
#include "unistr.h"

/* For those losing systems which don't have 'alloca' we have to add
   some additional code emulating it.  */
#ifdef HAVE_ALLOCA
# define freea(p) /* nothing */
#else
# define alloca(n) malloc (n)
# define freea(p) free (p)
#endif

#define SIZEOF(a) (sizeof(a)/sizeof(a[0]))


/* Autodetection list.  */

/* One autodetection alias: a pseudo encoding name that stands for a list
   of real encodings to be tried one after the other.  The aliases form a
   singly linked list.  */
struct autodetect_alias
{
  /* Next alias in the list, or NULL for the last one.  */
  struct autodetect_alias *next;
  /* Alias name; compared with strcmp against the FROMCODE argument.  */
  const char *name;
  /* NULL terminated list of encoding names, in the order to try them.  */
  const char * const *encodings_to_try;
};

/* Encodings tried for the alias "autodetect_utf8".  */
static const char * const autodetect_utf8_try[] =
{
  /* Try UTF-8 first. There are very few ISO-8859-1 inputs that would
     be valid UTF-8, but many UTF-8 inputs are valid ISO-8859-1.  */
  "UTF-8", "ISO-8859-1",
  NULL
};
/* Encodings tried for the alias "autodetect_jp".  */
static const char * const autodetect_jp_try[] =
{
  /* Try 7-bit encoding first. If the input contains bytes >= 0x80,
     it will fail.
     Try EUC-JP next. Short SHIFT_JIS inputs may come out wrong. This
     is unavoidable. People will condemn SHIFT_JIS.
     If we tried SHIFT_JIS first, then some short EUC-JP inputs would
     come out wrong, and people would condemn EUC-JP and Unix, which
     would not be good.
     Finally try SHIFT_JIS.  */
  "ISO-2022-JP-2", "EUC-JP", "SHIFT_JIS",
  NULL
};
/* Encodings tried for the alias "autodetect_kr".  */
static const char * const autodetect_kr_try[] =
{
  /* Try 7-bit encoding first. If the input contains bytes >= 0x80,
     it will fail.
     Finally try EUC-KR.  */
  "ISO-2022-KR", "EUC-KR",
  NULL
};

/* The built-in aliases.  The array elements are already chained together
   through their 'next' members, so the array doubles as the initial
   linked list.  */
static struct autodetect_alias autodetect_predefined[] =
{
  { &autodetect_predefined[1], "autodetect_utf8", autodetect_utf8_try },
  { &autodetect_predefined[2], "autodetect_jp",   autodetect_jp_try },
  { NULL,                      "autodetect_kr",   autodetect_kr_try }
};

/* Head of the list of all aliases, built-in and registered.  */
static struct autodetect_alias *autodetect_list = &autodetect_predefined[0];
/* Address of the 'next' member of the last list element, i.e. the place
   where uniconv_register_autodetect hooks in a new alias.  */
static struct autodetect_alias **autodetect_list_end =
  &autodetect_predefined[SIZEOF(autodetect_predefined)-1].next;

/* Registers NAME as an autodetection alias for the NULL terminated list of
   encodings TRY_IN_ORDER, appending it to the end of the alias list.
   NAME and all strings of TRY_IN_ORDER are copied into a single malloc'ed
   block, laid out as: the alias structure, the pointer array (including
   its NULL terminator), the name, then the encoding names.
   Returns 0 upon success.  Returns -1 with errno = EINVAL if TRY_IN_ORDER
   is empty, or with errno = ENOMEM if the block cannot be allocated.  */
int
uniconv_register_autodetect (name, try_in_order)
     const char *name;
     const char * const *try_in_order;
{
  size_t name_size;
  size_t count;
  size_t total;
  size_t k;
  char *block;
  char *cursor;
  struct autodetect_alias *entry;
  const char **list_copy;

  /* An alias without any encoding to try is rejected.  */
  if (try_in_order[0] == NULL)
    {
      errno = EINVAL;
      return -1;
    }

  /* Compute the size of the block: NAME and TRY_IN_ORDER are deep-copied
     because the caller's copies may have dynamic extent.  The lone
     sizeof (char *) accounts for the NULL terminator of the array.  */
  name_size = strlen (name) + 1;
  total = sizeof (struct autodetect_alias) + name_size + sizeof (char *);
  count = 0;
  while (try_in_order[count] != NULL)
    {
      total += sizeof (char *) + strlen (try_in_order[count]) + 1;
      count++;
    }

  block = (char *) malloc (total);
  if (block == NULL)
    {
      errno = ENOMEM;
      return -1;
    }

  /* Carve the block up.  The pointer array directly follows the structure;
     the character data comes last.  */
  entry = (struct autodetect_alias *) block;
  list_copy = (const char **) (block + sizeof (struct autodetect_alias));
  cursor = (char *) (list_copy + (count + 1));

  memcpy (cursor, name, name_size);
  entry->name = cursor;
  cursor += name_size;

  for (k = 0; k < count; k++)
    {
      size_t size = strlen (try_in_order[k]) + 1;

      memcpy (cursor, try_in_order[k], size);
      list_copy[k] = (const char *) cursor;
      cursor += size;
    }
  list_copy[count] = NULL;

  entry->encodings_to_try = list_copy;
  entry->next = NULL;

  /* Append to the list.  FIXME: Not multithread-safe.  */
  *autodetect_list_end = entry;
  autodetect_list_end = &entry->next;
  return 0;
}


int
uniconv_string (tocode, fromcode, handler, start, end, offsets, resultp, 
lengthp)
     const char *tocode;
     const char *fromcode;
     enum uniconv_ilseq_handler handler;
     const char *start;
     const char *end;
     size_t *offsets;
     char **resultp;
     size_t *lengthp;
{
  iconv_t cd;

  if (end == start)
    {
      /* Nothing to convert.  */
      if (resultp != NULL)
        *resultp = NULL;
      if (lengthp != NULL)
        *lengthp = 0;
      return 0;
    }

  /* Avoid glibc-2.1 bug with EUC-KR.  */
#if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined 
_LIBICONV_VERSION
  if (STRCASEEQ (fromcode, "EUC-KR"))
    cd = (iconv_t)(-1);
  else
#endif
  {
    /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5,
       we want to use transliteration.  */
#if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || 
_LIBICONV_VERSION >= 0x0105
    if (handler != uniconv_error)
      {
        size_t len = strlen (tocode);
        char *tocode_suffixed = (char *) alloca (len + 10 + 1);
        memcpy (tocode_suffixed, tocode, len);
        memcpy (tocode_suffixed + len, "//TRANSLIT", 10 + 1);

        cd = iconv_open (tocode_suffixed, fromcode);

        freea (tocode_suffixed);
      }
    else
#endif
    cd = iconv_open (tocode, fromcode);
  }
  if (cd != (iconv_t)(-1))
    {
      /* Use a temporary buffer, so that for small strings, a single malloc()
         call will be sufficient.  */
      union { unsigned int align; char buf[4096]; } tmp;
#     define tmpbuf tmp.buf

      char *result = tmpbuf;
      size_t allocated = sizeof (tmpbuf);
      size_t length = 0;
      const char *inptr = start;
      size_t insize = end - start;
      size_t i;

      if (offsets != NULL)
        for (i = 0; i < insize; i++)
          offsets[i] = (size_t)(-1);

      while (insize > 0)
        {
          char *outptr = result + length;
          size_t outsize = allocated - length;
          size_t res;
          bool grow;

          if (offsets != NULL)
            {
              size_t try;

              offsets[inptr - start] = length;

              res = (size_t)(-1);
              for (try = 1; try <= insize; try++)
                {
                  res = iconv (cd,
                               (ICONV_CONST char **) &inptr, &try,
                               &outptr, &outsize);
                  if (!(res == (size_t)(-1) && errno == EINVAL))
                    break;
                }
              insize = end - inptr;
            }
          else
            res = iconv (cd,
                         (ICONV_CONST char **) &inptr, &insize,
                         &outptr, &outsize);

          length = outptr - result;
          grow = (length > allocated / 2);
          if (res == (size_t)(-1))
            {
              if (errno == E2BIG)
                grow = true;
              else if (errno == EINVAL)
                break;
              else if (errno == EILSEQ && handler != uniconv_error)
                {
                  /* Error handling can produce up to 10 bytes of output.  */
                  if (length + 10 > allocated)
                    {
                      char *memory;

                      allocated = 2 * allocated;
                      if (length + 10 > allocated)
                        abort ();
                      if (result == tmpbuf)
                        memory = (char *) malloc (allocated);
                      else
                        memory = (char *) realloc (result, allocated);
                      if (memory == NULL)
                        {
                          iconv_close (cd);
                          if (result != tmpbuf)
                            free (result);
                          errno = ENOMEM;
                          return -1;
                        }
                      if (result == tmpbuf)
                        memcpy (memory, tmpbuf, length);
                      result = memory;
                      grow = false;
                    }

                  /* Error handling.  */
                  if (strcmp (tocode, "UTF-8") != 0)
                    {
                      /* Convert the next character to Unicode.
                         1. In order to know the number of bytes it takes.
                         2. In order to produce the right \uxxxx sequence.  */
                      iconv_t cd2 = iconv_open ("UTF-8", fromcode);

                      if (cd2 != (iconv_t)(-1))
                        {
                          char scratchbuf[64];
                          const char *ip = inptr;
                          size_t in;
                          char *scratchptr = &scratchbuf[0];
                          size_t scratchsize = sizeof (scratchbuf);

                          for (in = 1; in <= insize; in++)
                            {
                              res = iconv (cd2,
                                           (ICONV_CONST char **) &ip, &in,
                                           &scratchptr, &scratchsize);
                              if (!(res == (size_t)(-1) && errno == EINVAL))
                                break;
                            }
                          if (res == (size_t)(-1))
                            {
                              if (errno != EILSEQ)
                                abort ();
                              ip += in;
                              insize -= ip - inptr;
                              inptr = ip;
                              result[length++] = '?';
                            }
                          else
                            {
                              inptr = ip;
                              if (handler == uniconv_escape_sequence)
                                {
                                  ucs4_t uc;

                                  if (u8_mbtouc_safe (&uc, scratchbuf,
                                                      scratchptr - scratchbuf) 
>= 0)
                                    {
                                      static char hex[16] = "0123456789ABCDEF";
                                      result[length++] = '\\';
                                      if (uc < 0x10000)
                                        {
                                          result[length++] = 'u';
                                        }
                                      else
                                        {
                                          result[length++] = 'U';
                                          result[length++] = hex[(uc>>28) & 15];
                                          result[length++] = hex[(uc>>24) & 15];
                                          result[length++] = hex[(uc>>20) & 15];
                                          result[length++] = hex[(uc>>16) & 15];
                                        }
                                      result[length++] = hex[(uc>>12) & 15];
                                      result[length++] = hex[(uc>>8) & 15];
                                      result[length++] = hex[(uc>>4) & 15];
                                      result[length++] = hex[uc & 15];
                                    }
                                  else
                                    /* cd2 produced invalid UTF-8.  */
                                    result[length++] = '?';
                                }
                              else
                                /* handler == uniconv_question_mark.  */
                                result[length++] = '?';
                            }
                          iconv_close (cd2);
                        }
                      else
                        {
                          /* Assume the next character is a single byte.  */
                          inptr++;
                          result[length++] = '?';
                        }
                    }
                  else
                    {
                      /* Already converting to UTF-8.  */
                      char scratchbuf[64];
                      const char *ip = inptr;
                      size_t in;
                      char *scratchptr = &scratchbuf[0];
                      size_t scratchsize = sizeof (scratchbuf);

                      for (in = 1; in <= insize; in++)
                        {
                          res = iconv (cd,
                                       (ICONV_CONST char **) &ip, &in,
                                       &scratchptr, &scratchsize);
                          if (!(res == (size_t)(-1) && errno == EINVAL))
                            break;
                        }
                      ip += in;
                      insize -= ip - inptr;
                      inptr = ip;
                      result[length++] = '?';
                    }
                }
              else
                {
                  int saved_errno = errno;
                  iconv_close (cd);
                  if (result != tmpbuf)
                    free (result);
                  errno = saved_errno;
                  return -1;
                }
            }
          if (insize == 0)
            break;
          if (grow)
            {
              char *memory;

              allocated = 2 * allocated;
              if (result == tmpbuf)
                memory = (char *) malloc (allocated);
              else
                memory = (char *) realloc (result, allocated);
              if (memory == NULL)
                {
                  iconv_close (cd);
                  if (result != tmpbuf)
                    free (result);
                  errno = ENOMEM;
                  return -1;
                }
              if (result == tmpbuf)
                memcpy (memory, tmpbuf, length);
              result = memory;
            }
        }
      /* Now get the conversion state back to the initial state.
         But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
#if defined _LIBICONV_VERSION || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 
<= 1) || defined __sun)
      for (;;)
        {
          char *outptr = result + length;
          size_t outsize = allocated - length;
          size_t res;

          res = iconv (cd, NULL, NULL, &outptr, &outsize);
          length = outptr - result;
          if (res == (size_t)(-1))
            {
              if (errno == E2BIG)
                {
                  char *memory;

                  allocated = 2 * allocated;
                  if (result == tmpbuf)
                    memory = (char *) malloc (allocated);
                  else
                    memory = (char *) realloc (result, allocated);
                  if (memory == NULL)
                    {
                      iconv_close (cd);
                      if (result != tmpbuf)
                        free (result);
                      errno = ENOMEM;
                      return -1;
                    }
                  if (result == tmpbuf)
                    memcpy (memory, tmpbuf, length);
                  result = memory;
                }
              else
                {
                  int saved_errno = errno;
                  iconv_close (cd);
                  if (result != tmpbuf)
                    free (result);
                  errno = saved_errno;
                  return -1;
                }
            }
          else
            break;
        }
#endif
      iconv_close (cd);

      /* Now the final memory allocation.  */
      if (resultp != NULL)
        {
          if (result == tmpbuf)
            {
              char *memory;

              memory = (char *) malloc (length);
              if (memory != NULL)
                {
                  memcpy (memory, tmpbuf, length);
                  result = memory;
                }
              else
                {
                  errno = ENOMEM;
                  return -1;
                }
            }
          else if (length < allocated)
            {
              /* Shrink the allocated memory if possible.  */
              char *memory;

              memory = (char *) realloc (result, length);
              if (memory != NULL)
                result = memory;
            }
          *resultp = result;
        }
      else
        {
          if (result != tmpbuf)
            free (result);
        }
      if (lengthp != NULL)
        *lengthp = length;
      return 0;
#     undef tmpbuf
    }
  else
    {
      if (errno == EINVAL)
        {
          struct autodetect_alias *alias;

          /* Unsupported fromcode or tocode. Check whether the caller
             requested autodetection.  */
          for (alias = autodetect_list; alias != NULL; alias = alias->next)
            if (strcmp (fromcode, alias->name) == 0)
              {
                int ret;
                const char * const *encodings = alias->encodings_to_try;

                do
                  {
                    ret = uniconv_string (tocode, *encodings, handler,
                                          start, end, offsets,
                                          resultp, lengthp);
                    if (!(ret < 0 && errno == EILSEQ))
                      return ret;
                    encodings++;
                  }
                while (*encodings != NULL);

                /* Return the last call's result.  */
                return ret;
              }

          /* It wasn't an autodetection name.  */
        }
      return -1;
    }
}

/* Converts the memory region between START (inclusive) and END (exclusive)
   from encoding FROMCODE to UTF-8.
   HANDLER, OFFSETS, RESULTP and LENGTHP are as for uniconv_string.
   If FROMCODE is UTF-8, iconv is bypassed: the input is checked with
   u8_check and copied, and every input position that starts a character
   gets its own index as offset.
   Return value: 0 if successful, otherwise -1 and errno set
   (EILSEQ for invalid UTF-8 input, ENOMEM, or as set by uniconv_string).  */
int
u8_conv_from_encoding (fromcode, handler, start, end, offsets, resultp, lengthp)
     const char *fromcode;
     enum uniconv_ilseq_handler handler;
     const char *start;
     const char *end;
     size_t *offsets;
     uint8_t **resultp;
     size_t *lengthp;
{
  if (STRCASEEQ (fromcode, "UTF-8", 'U','T','F','-','8',0,0,0,0))
    {
      /* Conversion from UTF-8 to UTF-8.  No need to go through iconv().  */
      const uint8_t *input = (const uint8_t *) start;
      size_t length = end - start;

      if (u8_check (input, length))
        {
          errno = EILSEQ;
          return -1;
        }

      if (offsets != NULL)
        {
          size_t i;

          for (i = 0; i < length; )
            {
              int count = u8_mblen (input + i, length - i);
              /* After the u8_check above, a failure cannot happen.  */
              if (count < 0)
                abort ();
              /* u8_mblen is documented to return 0 for the NUL character;
                 an embedded NUL is a valid character of one unit.  */
              if (count == 0)
                count = 1;
              offsets[i] = i;
              i++;
              /* The trailing units of a multi-unit character do not start
                 a character.  */
              while (--count > 0)
                offsets[i++] = (size_t)(-1);
            }
        }

      /* Memory allocation.  */
      if (resultp != NULL)
        {
          if (length > 0)
            {
              uint8_t *memory;

              memory = (uint8_t *) malloc (length);
              if (memory == NULL)
                {
                  errno = ENOMEM;
                  return -1;
                }
              memcpy ((char *) memory, start, length);
              *resultp = memory;
            }
          else
            /* Empty input yields a NULL result, like in uniconv_string.  */
            *resultp = NULL;
        }
      if (lengthp != NULL)
        *lengthp = length;
      return 0;
    }
  else
    return uniconv_string ("UTF-8", fromcode, handler, start, end, offsets,
                           (char **) resultp, lengthp);
}

/* Converts the NUL terminated STRING from encoding FROMCODE to UTF-8,
   treating unconvertible characters according to HANDLER.
   The terminating NUL is converted together with the string.
   Returns the malloc'ed result, or NULL with errno set.  errno is EILSEQ
   if the converted output is not terminated by exactly one NUL unit.  */
uint8_t *
u8_strconv_from_encoding (string, fromcode, handler)
     const char *string;
     const char *fromcode;
     enum uniconv_ilseq_handler handler;
{
  /* One past the terminating NUL of the input.  */
  const char *string_end = string + strlen (string) + 1;
  uint8_t *converted;
  size_t nunits;
  int status;

  status = u8_conv_from_encoding (fromcode, handler,
                                  string, string_end, NULL,
                                  &converted, &nunits);
  if (status < 0)
    return NULL;

  /* The output is acceptable only if its single NUL unit is the last one.  */
  if (nunits > 0
      && converted[nunits - 1] == 0
      && u8_strlen (converted) == nunits - 1)
    return converted;

  free (converted);
  errno = EILSEQ;
  return NULL;
}

/* Converts the NUL terminated UTF-8 STRING to encoding TOCODE, treating
   unconvertible characters according to HANDLER.
   The terminating NUL is converted together with the string.
   If TOCODE is UTF-8, the string is merely copied (and, when CONFIG_SAFETY
   is enabled, checked with u8_check first).
   Returns the malloc'ed result, or NULL with errno set.  errno is EILSEQ
   if the converted output is not terminated by exactly one NUL byte.  */
char *
u8_strconv_to_encoding (string, tocode, handler)
     const uint8_t *string;
     const char *tocode;
     enum uniconv_ilseq_handler handler;
{
  char *result;
  size_t length;

  if (STRCASEEQ (tocode, "UTF-8", 'U','T','F','-','8',0,0,0,0))
    {
      /* Conversion from UTF-8 to UTF-8.  No need to go through iconv().  */
      length = u8_strlen (string) + 1;
#if CONFIG_SAFETY
      if (u8_check (string, length))
        {
          errno = EILSEQ;
          return NULL;
        }
#endif
      result = (char *) malloc (length);
      if (result == NULL)
        {
          errno = ENOMEM;
          return NULL;
        }
      memcpy (result, (const char *) string, length);
      return result;
    }
  else
    {
      /* uniconv_string works on 'char' regions; view the UTF-8 units as
         bytes explicitly instead of relying on an implicit conversion
         between incompatible pointer types.  */
      const char *start = (const char *) string;
      const char *end = start + u8_strlen (string) + 1;

      if (uniconv_string (tocode, "UTF-8", handler,
                          start, end, NULL,
                          &result, &length) < 0)
        return NULL;
      /* Verify the result has exactly one NUL byte, at the end.  */
      if (!(length > 0 && result[length-1] == '\0'
            && strlen (result) == length-1))
        {
          free (result);
          errno = EILSEQ;
          return NULL;
        }
      return result;
    }
}

/* Converts the NUL terminated STRING from the encoding reported by
   locale_charset () to UTF-8.  Unconvertible characters are handled with
   uniconv_question_mark.  The result is that of u8_strconv_from_encoding.  */
uint8_t *
u8_conv_from_locale (string)
     const char *string;
{
  return u8_strconv_from_encoding (string, locale_charset (),
                                   uniconv_question_mark);
}

/* Converts the NUL terminated UTF-8 STRING to the encoding reported by
   locale_charset ().  Unconvertible characters are handled with
   uniconv_question_mark.  The result is that of u8_strconv_to_encoding.  */
char *
u8_conv_to_locale (string)
     const uint8_t *string;
{
  return u8_strconv_to_encoding (string, locale_charset (),
                                 uniconv_question_mark);
}






reply via email to

[Prev in Thread] Current Thread [Next in Thread]