[Date Prev][Date Next] [Chronological] [Thread] [Top]

String conversions UTF8 <-> ISO-8859-1



The OpenLDAP client supports some kinds of string conversion but lacks
support for conversions between UTF-8 and ISO-8859-1 (UTF8 <-> ISO-8859-1).
Below you will find code taken from the internet and adapted to the
OpenLDAP interface.
I hope you will add this to the OpenLDAP client.

Patrick Dreyer



ldap_utf8.h
===========

/*
 * ISO-8859-1 MultiByte Char / UTF-8 Conversion Routines
 */

/*
 * UTF-8 string to ISO-8859-1 MultiByte string.
 *
 * mbstr   - destination buffer; when it is NULL or count is 0 nothing is
 *           written and only the converted length is computed
 * utf8str - null-terminated UTF-8 input string
 * count   - maximum number of bytes that may be written to mbstr
 *
 * Characters that cannot be represented in ISO-8859-1 are converted to '?'.
 * The terminating null character is only stored if it fits within count.
 * Returns the number of converted bytes, excluding the null terminator.
 */
LDAP_F(int) ldap_x_utf8s_to_iso_8859_1s LDAP_P((
	char *mbstr, LDAP_CONST char *utf8str, size_t count));

/*
 * ISO-8859-1 MultiByte string to UTF-8 string.
 *
 * utf8str - destination buffer; when it is NULL or count is 0 nothing is
 *           written and only the converted length is computed
 * mbstr   - null-terminated ISO-8859-1 input string
 * count   - maximum number of bytes that may be written to utf8str
 *
 * Bytes with the highest bit set are converted to two UTF-8 bytes, all other
 * bytes are copied unchanged.  A character is never written partially, and
 * the terminating null character is only stored if it fits within count.
 * Returns the number of converted bytes, excluding the null terminator.
 */
LDAP_F(int) ldap_x_iso_8859_1s_to_utf8s LDAP_P((
	char *utf8str, LDAP_CONST char *mbstr, size_t count));


utf8-8-conv.c (top of file)
===========================

/*
 * Map from the most-significant 6 bits of the first byte to the total number
 * of bytes in a UTF-8 character.
 *
 * Index with (byte >> 2) & 0x3F.  An entry of 0 marks a byte that cannot
 * start a character (0x80..0xBF, i.e. a continuation byte).
 */
static const char UTF8_2_ISO_8859_1_len[] =
{
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00..0x3F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40..0x7F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80..0xBF, erroneous */
  2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6  /* 0xC0..0xFF */
};

/*
 * Mask selecting the payload bits of the first byte of a UTF-8 character,
 * indexed by the character length taken from UTF8_2_ISO_8859_1_len.
 * Entry 0 is the mask used for erroneous first bytes (length 0).
 */
static const char UTF8_2_ISO_8859_1_mask[] =
{
  0x3F, /* 0: erroneous first byte */
  0x7F, /* 1 byte:  0xxxxxxx */
  0x1F, /* 2 bytes: 110xxxxx */
  0x0F, /* 3 bytes: 1110xxxx */
  0x07, /* 4 bytes: 11110xxx */
  0x03, /* 5 bytes: 111110xx */
  0x01  /* 6 bytes: 1111110x */
};


utf8-8-conv.c (end of file)
===========================


/*-----------------------------------------------------------------------------
   Convert a UTF-8 string to a ISO-8859-1 MultiByte string.

   mbstr    destination buffer; when it is NULL or 'count' is 0 nothing is
            written and only the length of the converted string is computed
   utf8str  null-terminated UTF-8 input string
   count    capacity of 'mbstr' in bytes

   No more than 'count' bytes will be written to the output buffer; the
   terminating null character is only stored if it still fits.
   Every input character yields exactly one output byte.  Characters that
   can't be represented in ISO-8859-1 and malformed input (stray continuation
   bytes, truncated or overlong sequences) are converted to '?'.

   Return the size of the converted string in bytes, excl null terminator.
*/
int
ldap_x_utf8s_to_iso_8859_1s( char *mbstr, const char *utf8str, size_t count )
{
  /* read the input as unsigned bytes so that the shifts and masks below do
     not depend on the signedness of plain char */
  const unsigned char *src = (const unsigned char *)utf8str;
  int res = 0;

  while (*src != '\0')
  {
    int           seqlen = UTF8_2_ISO_8859_1_len[(*src >> 2) & 0x3F];
    unsigned long u      = *src & UTF8_2_ISO_8859_1_mask[seqlen];
    int           left   = seqlen;

    /* erroneous first byte: skip it together with up to 4 continuation
       bytes following it, the whole run counts as one bad character */
    if (left == 0)
      left = 5;

    for (++src; --left > 0 && *src != '\0'; ++src)
    {
      /* be sure this is not an unexpected start of a new character */
      if ((*src & 0xC0) != 0x80)
        break;

      u = (u << 6) | (*src & 0x3F);
    }

    if (mbstr != NULL && count != 0)
    {
      int representable;

      /* be sure there is enough space left in the destination buffer */
      if ((size_t)res >= count)
        return res;

      /* Only a complete sequence ('left' ran down to 0) can be mapped, and
         only if it is plain ASCII (1 byte) or the shortest-form 2 byte
         encoding of U+0080..U+00FF.  Longer sequences either encode a
         character above U+00FF or are overlong. */
      representable = (left == 0) &&
                      (seqlen == 1 ||
                       (seqlen == 2 && u >= 0x80 && u <= 0xFF));

      /* add the mapped character to the destination string, or '?' if
         the character can't be represented in ISO-8859-1 */
      *mbstr++ = (representable ? (char)u : '?');
    }
    ++res;
  }

  /* add the terminating null character */
  if (mbstr != NULL && count != 0)
  {
    /* be sure there is enough space left in the destination buffer */
    if ((size_t)res >= count)
      return res;
    *mbstr = '\0';
  }

  return res;
} /* ldap_x_utf8s_to_iso_8859_1s */


/*-----------------------------------------------------------------------------
   Convert a ISO-8859-1 MultiByte string to a UTF-8 string.

   utf8str  destination buffer; when it is NULL or 'count' is 0 nothing is
            written and only the length of the converted string is computed
   mbstr    null-terminated ISO-8859-1 input string
   count    capacity of 'utf8str' in bytes

   No more than 'count' bytes will be written to the output buffer.  A two
   byte character is never written partially, and the terminating null
   character is only stored if it still fits.

   Return the size of the converted string in bytes, excl null terminator.
*/
int
ldap_x_iso_8859_1s_to_utf8s(char *utf8str, const char *mbstr, size_t count)
{
  /* read the input as unsigned bytes so that the shifts and masks below do
     not depend on the signedness of plain char */
  const unsigned char *src = (const unsigned char *)mbstr;
  const int writing = (utf8str != NULL && count != 0);
  int res = 0;

  /* loop until we reach the end of the mb string */
  for (; *src != '\0'; ++src)
  {
    if (*src < 0x80)
    {
      /* the character needs no mapping if the highest bit is not set */
      if (writing)
      {
        /* be sure there is enough space left in the destination buffer */
        if ((size_t)res >= count)
          return res;

        *utf8str++ = (char)*src;
      }
      ++res;
    }
    else
    {
      /* otherwise the character maps to the two bytes 110000xx 10xxxxxx */
      if (writing)
      {
        /* be sure there is enough space left for both bytes */
        if ((size_t)res + 1 >= count)
          return res;

        *utf8str++ = (char)(0xC0 | (*src >> 6));
        *utf8str++ = (char)(0x80 | (*src & 0x3F));
      }
      res += 2;
    }
  }

  /* add the terminating null character */
  if (writing)
  {
    /* be sure there is enough space left in the destination buffer */
    if ((size_t)res >= count)
      return res;
    *utf8str = '\0';
  }

  return res;
} /* ldap_x_iso_8859_1s_to_utf8s */