/*
 * Copyright (C) 2001 Peter Harris <peter.harris@hummingbird.com>
 * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/*
 * Convert a string between UTF-8 and the locale's charset.
 */

#include <stdlib.h>
#include <string.h>

#include "utf8.h"


#ifdef _WIN32

/* Thanks to Peter Harris <peter.harris@hummingbird.com> for this win32
 * code.
 */

#include <stdio.h>
#include <windows.h>

/*
 * Read one code point from a NUL-terminated UTF-16 string and advance *pos
 * past it.  A high surrogate immediately followed by a low surrogate is
 * combined into a single supplementary code point; an unpaired surrogate is
 * returned unchanged.  Returns 0 when the terminator is reached.
 */
static unsigned long utf16_next_code_point(const wchar_t *unicode, size_t *pos)
{
    unsigned long c = (unsigned short)unicode[(*pos)++];

    if(c >= 0xD800 && c <= 0xDBFF) {
        /* c is non-zero, so the following unit is still inside the string. */
        unsigned long low = (unsigned short)unicode[*pos];

        if(low >= 0xDC00 && low <= 0xDFFF) {
            (*pos)++;
            c = 0x10000UL + ((c - 0xD800) << 10) + (low - 0xDC00);
        }
    }
    return c;
}

/*
 * Convert a NUL-terminated UTF-16 string to a newly allocated,
 * NUL-terminated UTF-8 string.
 *
 * Surrogate pairs are encoded as one 4-byte sequence; an unpaired surrogate
 * is encoded as a 3-byte sequence.
 *
 * Returns NULL if memory cannot be allocated.  The caller owns the result
 * and releases it with free().
 */
static unsigned char *make_utf8_string(const wchar_t *unicode)
{
    size_t size = 0, index = 0, out_index = 0;
    unsigned char *out;
    unsigned long c;

    /* first calculate the size of the target string */
    while((c = utf16_next_code_point(unicode, &index)) != 0) {
        if(c < 0x80) {
            size += 1;
        } else if(c < 0x800) {
            size += 2;
        } else if(c < 0x10000UL) {
            size += 3;
        } else {
            size += 4;
        }
    }

    out = malloc(size + 1);
    if (out == NULL)
        return NULL;

    /* second pass: emit the bytes */
    index = 0;
    while((c = utf16_next_code_point(unicode, &index)) != 0) {
        if(c < 0x80) {
            out[out_index++] = (unsigned char)c;
        } else if(c < 0x800) {
            out[out_index++] = (unsigned char)(0xc0 | (c >> 6));
            out[out_index++] = (unsigned char)(0x80 | (c & 0x3f));
        } else if(c < 0x10000UL) {
            out[out_index++] = (unsigned char)(0xe0 | (c >> 12));
            out[out_index++] = (unsigned char)(0x80 | ((c >> 6) & 0x3f));
            out[out_index++] = (unsigned char)(0x80 | (c & 0x3f));
        } else {
            out[out_index++] = (unsigned char)(0xf0 | (c >> 18));
            out[out_index++] = (unsigned char)(0x80 | ((c >> 12) & 0x3f));
            out[out_index++] = (unsigned char)(0x80 | ((c >> 6) & 0x3f));
            out[out_index++] = (unsigned char)(0x80 | (c & 0x3f));
        }
    }
    out[out_index] = 0x00;

    return out;
}

/*
 * Length in bytes (1, 2 or 3) of the UTF-8 sequence starting at s, judged
 * from its lead byte only.  Returns 0 at the string terminator, and also
 * when the terminator cuts the sequence short, so that callers never step
 * past the end of the string.
 */
static size_t utf8_sequence_length(const unsigned char *s)
{
    size_t len, i;

    if(s[0] == 0)
        return 0;

    if((s[0] & 0x80) == 0)
        len = 1;
    else if((s[0] & 0xe0) == 0xe0)
        len = 3;
    else
        len = 2;

    /* Each byte is inspected only after the previous one proved non-zero. */
    for(i = 1; i < len; i++)
        if(s[i] == 0)
            return 0;

    return len;
}

/*
 * Convert a NUL-terminated UTF-8 string to a newly allocated,
 * NUL-terminated wide string.
 *
 * Only 1-, 2- and 3-byte sequences are understood; continuation bytes are
 * not validated.  A sequence truncated by the terminator ends the
 * conversion and the incomplete character is dropped.
 *
 * Returns NULL if memory cannot be allocated.  The caller owns the result
 * and releases it with free().
 */
static wchar_t *make_unicode_string(const unsigned char *utf8)
{
    size_t size = 0, index, out_index = 0, len;
    wchar_t *out;

    /* first calculate the size of the target string */
    for(index = 0; (len = utf8_sequence_length(utf8 + index)) != 0; index += len)
        size += 1;

    out = malloc((size + 1) * sizeof(wchar_t));
    if (out == NULL)
        return NULL;

    for(index = 0; (len = utf8_sequence_length(utf8 + index)) != 0; index += len) {
        const unsigned char *p = utf8 + index;

        if(len == 1) {
            out[out_index++] = p[0];
        } else if(len == 3) {
            out[out_index++] = (wchar_t)(((p[0] & 0x1F) << 12) |
                                         ((p[1] & 0x3F) << 6) |
                                         (p[2] & 0x3F));
        } else {
            out[out_index++] = (wchar_t)(((p[0] & 0x3F) << 6) |
                                         (p[1] & 0x3F));
        }
    }
    out[out_index] = 0;

    return out;
}

/*
 * Convert a string from the system ANSI code page (CP_ACP) to UTF-8.
 *
 * from: NUL-terminated input string.
 * to:   receives a newly allocated, NUL-terminated UTF-8 string that the
 *       caller releases with free().
 *
 * Returns 0 on success and -1 on failure (a diagnostic is written to
 * stderr).  *to is only meaningful when 0 is returned.
 */
int utf8_encode(const char *from, char **to)
{
	wchar_t *unicode;
	int wchars, err;
	int fromlen = (int)strlen(from);

	/* MultiByteToWideChar() fails when asked to convert zero bytes, so an
	 * empty input is answered directly with an empty output.
	 */
	if(fromlen == 0)
	{
		*to = calloc(1, sizeof(char));
		if(*to == NULL)
		{
			fprintf(stderr, "Out of memory processing string to UTF8\n");
			return -1;
		}
		return 0;
	}

	/* First call only measures the number of wide characters needed. */
	wchars = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from,
			fromlen, NULL, 0);

	if(wchars == 0)
	{
		fprintf(stderr, "Unicode translation error %d\n", GetLastError());
		return -1;
	}

	/* One extra, zeroed element provides the terminator. */
	unicode = calloc(wchars + 1, sizeof(wchar_t));
	if(unicode == NULL)
	{
		fprintf(stderr, "Out of memory processing string to UTF8\n");
		return -1;
	}

	err = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from,
			fromlen, unicode, wchars);
	if(err != wchars)
	{
		free(unicode);
		fprintf(stderr, "Unicode translation error %d\n", GetLastError());
		return -1;
	}

	/* On NT-based windows systems, we could use WideCharToMultiByte(), but
	 * MS doesn't actually have a consistent API across win32.
	 */
	*to = (char *)make_utf8_string(unicode);
	free(unicode);

	/* make_utf8_string() returns NULL when it cannot allocate the result. */
	if(*to == NULL)
	{
		fprintf(stderr, "Out of memory processing string to UTF8\n");
		return -1;
	}

	return 0;
}

/*
 * Convert a UTF-8 string to the console code page (GetConsoleCP()).
 *
 * from: NUL-terminated UTF-8 input.
 * to:   receives a newly allocated, NUL-terminated string that the caller
 *       releases with free().
 *
 * Returns 0 on success and -1 on failure (a diagnostic is written to
 * stderr).
 */
int utf8_decode(const char *from, char **to)
{
    wchar_t *wide;
    int needed, written;

    /* On NT-based windows systems, we could use MultiByteToWideChar(CP_UTF8), but
     * MS doesn't actually have a consistent API across win32.
     */
    wide = make_unicode_string((const unsigned char *)from);
    if(!wide)
    {
        fprintf(stderr, "Out of memory processing string from UTF8 to UNICODE16\n");
        return -1;
    }

    /* Measuring call: length -1 means the terminator is converted as well. */
    needed = WideCharToMultiByte(GetConsoleCP(), WC_COMPOSITECHECK, wide,
            -1, NULL, 0, NULL, NULL);
    if(needed == 0)
    {
        fprintf(stderr, "Unicode translation error %d\n", GetLastError());
        goto fail;
    }

    *to = calloc(needed + 1, sizeof(unsigned char));
    if(!*to)
    {
        fprintf(stderr, "Out of memory processing string to local charset\n");
        goto fail;
    }

    written = WideCharToMultiByte(GetConsoleCP(), WC_COMPOSITECHECK, wide,
            -1, *to, needed, NULL, NULL);
    if(written != needed)
    {
        /* Report first: the cleanup calls may disturb the last-error value. */
        fprintf(stderr, "Unicode translation error %d\n", GetLastError());
        free(*to);
        *to = NULL;
        goto fail;
    }

    free(wide);
    return 0;

fail:
    free(wide);
    return -1;
}

#else /* End win32. Rest is for real operating systems */

/*
 * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/*
 * See the corresponding header file for a description of the functions
 * that this file provides.
 *
 * This was first written for Ogg Vorbis but could be of general use.
 *
 * The only deliberate assumption about data sizes is that a short has
 * at least 16 bits, but this code has only been tested on systems with
 * 8-bit char, 16-bit short and 32-bit int.
 */


#include <stdlib.h>

#include "charmaps.h"

/*
 * Compare two strings, ignoring the case of the ASCII letters a-z only
 * (independent of the current locale).
 *
 * Returns zero when the strings are equal under that folding, a negative
 * value when s1 sorts before s2 and a positive value when it sorts after;
 * the ordering is taken from the upper-cased bytes.
 */
static int ascii_strcasecmp(const char *s1, const char *s2)
{
  int c1, c2;

  for (;; s1++, s2++) {
    c1 = (unsigned char)*s1;
    if ('a' <= c1 && c1 <= 'z')
      c1 += 'A' - 'a';
    c2 = (unsigned char)*s2;
    if ('a' <= c2 && c2 <= 'z')
      c2 += 'A' - 'a';
    /* Stop at the first difference, or when both strings end together. */
    if (c1 != c2 || !c1)
      return c1 - c2;
  }
}

/*
 * UTF-8 equivalents of the C library's wctomb() and mbtowc().
 */

/*
 * Decode one UTF-8 sequence from s, looking at no more than n bytes.
 *
 * Returns the number of bytes consumed (1 to 6), 0 if s is null, n is zero
 * or the character is NUL, and -1 if the sequence is invalid, over-long or
 * does not fit in n bytes.  The decoded value is stored in *pwc when pwc is
 * not null and the result is not -1.
 */
int utf8_mbtowc(int *pwc, const char *s, size_t n)
{
  const unsigned char *bytes = (const unsigned char *)s;
  unsigned char lead;
  int len, value, pos;

  if (s == 0 || n == 0)
    return 0;

  lead = bytes[0];
  if (lead < 0x80) {
    if (pwc)
      *pwc = lead;
    return lead != 0;
  }

  /* 0x80-0xc1 cannot start a sequence; 0xfe and 0xff never occur. */
  if (lead < 0xc2 || lead >= 0xfe)
    return -1;

  if (lead < 0xe0)
    len = 2;
  else if (lead < 0xf0)
    len = 3;
  else if (lead < 0xf8)
    len = 4;
  else if (lead < 0xfc)
    len = 5;
  else
    len = 6;

  if (n < (size_t)len)
    return -1;

  /* The lead byte carries the top (7 - len) payload bits. */
  value = lead & ((1 << (7 - len)) - 1);
  for (pos = 1; pos < len; pos++) {
    if ((bytes[pos] & 0xc0) != 0x80)
      return -1;
    value = (value << 6) | (bytes[pos] & 0x3f);
  }

  /* Reject values that a shorter sequence could have encoded.  For two
   * bytes this never triggers because the lead byte is at least 0xc2. */
  if (value < (1 << (5 * len - 4)))
    return -1;

  if (pwc)
    *pwc = value;
  return len;
}

/*
 * Encode the character wc1 as UTF-8 into s.
 *
 * Values up to 0x7fffffff are accepted and take 1 to 6 bytes, so s must
 * have room for 6 bytes.  No terminator is written.
 *
 * Returns the number of bytes written, 0 if s is null, or -1 if wc1 is
 * negative (not representable).
 */
int utf8_wctomb(char *s, int wc1)
{
  unsigned int wc = wc1;
  int len, i;

  if (!s)
    return 0;

  /* Unsigned constants: the original 1 << 31 overflowed a signed int. */
  if (wc < 0x80u) {
    *s = (char)wc;
    return 1;
  }
  else if (wc < 0x800u)
    len = 2;
  else if (wc < 0x10000u)
    len = 3;
  else if (wc < 0x200000u)
    len = 4;
  else if (wc < 0x4000000u)
    len = 5;
  else if (wc < 0x80000000u)
    len = 6;
  else
    return -1;

  /* Fill the continuation bytes from the last one backwards. */
  for (i = len - 1; i > 0; i--) {
    s[i] = (char)(0x80 | (wc & 0x3f));
    wc >>= 6;
  }
  /* The lead byte has `len` leading one bits: 0xc0, 0xe0, 0xf0, 0xf8, 0xfc. */
  s[0] = (char)(((0xff00u >> len) & 0xff) | wc);

  return len;
}

/*
 * The charset "object" and methods.
 */

/*
 * A character set: a pair of conversion callbacks plus the private data
 * that is handed back to them on every call.
 */
struct charset {
  /* Upper bound on the bytes one character occupies in this charset. */
  int max;
  /* Decode one character from s (at most n bytes) into *pwc. */
  int (*mbtowc)(void *table, int *pwc, const char *s, size_t n);
  /* Encode the character wc into s. */
  int (*wctomb)(void *table, char *s, int wc);
  /* Passed as the first argument of both callbacks. */
  void *map;
};

/* Decode one character with the charset's own decoder. */
int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n)
{
  void *table = charset->map;

  return charset->mbtowc(table, pwc, s, n);
}

/* Encode one character with the charset's own encoder. */
int charset_wctomb(struct charset *charset, char *s, int wc)
{
  void *table = charset->map;

  return charset->wctomb(table, s, wc);
}

/* Report the charset's `max` field. */
int charset_max(struct charset *charset)
{
  int limit = charset->max;

  return limit;
}

/*
 * Implementation of UTF-8.
 */

/* charset decoder callback for UTF-8; the table argument is not used. */
static int mbtowc_utf8(void *map, int *pwc, const char *s, size_t n)
{
  int consumed;

  (void)map;
  consumed = utf8_mbtowc(pwc, s, n);
  return consumed;
}

/* charset encoder callback for UTF-8; the table argument is not used. */
static int wctomb_utf8(void *map, char *s, int wc)
{
  int written;

  (void)map;
  written = utf8_wctomb(s, wc);
  return written;
}

/*
 * Implementation of US-ASCII.
 * Probably on most architectures this compiles to less than 256 bytes
 * of code, so we can save space by not having a table for this one.
 */

/*
 * Decode one US-ASCII character.  Returns 1 for a non-NUL character, 0 for
 * NUL (or when s is null or n is zero) and -1 for a byte above 0x7f.
 * The table argument is not used.
 */
static int mbtowc_ascii(void *map, int *pwc, const char *s, size_t n)
{
  unsigned char byte;

  (void)map;
  if (s == 0 || n == 0)
    return 0;

  byte = (unsigned char)*s;
  if (byte > 0x7f)
    return -1;

  if (pwc != 0)
    *pwc = byte;
  return byte != 0;
}

/*
 * Encode one character as US-ASCII.  Returns 1 after storing the byte,
 * 0 when s is null and -1 when wc is outside 0..0x7f.
 * The table argument is not used.
 */
static int wctomb_ascii(void *map, char *s, int wc)
{
  (void)map;
  if (s == 0)
    return 0;
  if (wc < 0 || wc > 0x7f)
    return -1;

  s[0] = (char)wc;
  return 1;
}

/*
 * Implementation of ISO-8859-1.
 * Probably on most architectures this compiles to less than 256 bytes
 * of code, so we can save space by not having a table for this one.
 */

/*
 * Decode one ISO-8859-1 character: the byte value is the character value.
 * Returns 1 for a non-NUL character, 0 for NUL (or when s is null or n is
 * zero) and -1 for a value above 0xff, which can only arise where a char
 * is wider than 8 bits.  The table argument is not used.
 */
static int mbtowc_iso1(void *map, int *pwc, const char *s, size_t n)
{
  int value;

  (void)map;
  if (s == 0 || n == 0)
    return 0;

  value = (unsigned char)*s;
  if ((value & ~0xff) != 0)
    return -1;

  if (pwc != 0)
    *pwc = value;
  return value != 0;
}

/*
 * Encode one character as ISO-8859-1.  Returns 1 after storing the byte,
 * 0 when s is null and -1 when wc is outside 0..0xff.
 * The table argument is not used.
 */
static int wctomb_iso1(void *map, char *s, int wc)
{
  (void)map;
  if (s == 0)
    return 0;
  if (wc < 0 || wc > 0xff)
    return -1;

  s[0] = wc;
  return 1;
}

// Implementation of any 8-bit charset.

/* Private data of a table-driven 8-bit charset. */
struct map {
  /* Indexed by byte value; an entry of 0xffff marks an unmapped byte. */
  const unsigned short *from;
  /* Reverse lookup structure; null until one has been built. */
  struct inverse_map *to;
};

/*
 * Decode one byte through the charset's table.  Returns 1 for a non-NUL
 * character, 0 for NUL (or when s is null or n is zero) and -1 when the
 * byte has no mapping.
 */
static int mbtowc_8bit(void *map1, int *pwc, const char *s, size_t n)
{
  const struct map *charmap = map1;
  unsigned short mapped;

  if (s == 0 || n == 0)
    return 0;

  mapped = charmap->from[(unsigned char)s[0]];
  if (mapped == 0xffff)
    return -1;

  if (pwc != 0)
    *pwc = (int)mapped;
  return mapped != 0;
}

/*
 * Reverse lookup for an 8-bit charset: a 256-bucket hash table whose
 * chains are threaded through the byte values themselves.
 */
struct inverse_map {
  /* Head of each bucket's chain, as a byte value. */
  unsigned char first[256];
  /* Successor of each byte value within its chain; 0 ends the chain. */
  unsigned char next[256];
};

/* Bucket number of a character value: its low eight bits. */
#define HASH(i) ((i) & 0xff)

/*
 * Build the reverse lookup for the 256-entry table `from`, skipping
 * entries equal to 0xffff.  Returns a malloc()ed structure, or null if the
 * allocation fails.
 */
static struct inverse_map *make_inverse_map(const unsigned short *from)
{
  struct inverse_map *inv;
  char occupied[256];
  int code, bucket, spare;

  inv = malloc(sizeof *inv);
  if (inv == 0)
    return 0;

  memset(inv->first, 0, sizeof inv->first);
  memset(inv->next, 0, sizeof inv->next);
  memset(occupied, 0, sizeof occupied);

  /* Insert from 255 down to 0, so that byte 0 can only ever be the head of
   * a chain and 0 stays usable as the end-of-chain marker. */
  for (code = 256; code-- > 0; ) {
    if (from[code] == 0xffff)
      continue;
    bucket = HASH(from[code]);
    inv->next[code] = inv->first[bucket];
    inv->first[bucket] = code;
    occupied[bucket] = 1;
  }

  /* Point the empty buckets at an empty list: the first byte value that
   * has no successor. */
  spare = 0;
  while (spare < 256 && inv->next[spare])
    spare++;
  if (spare < 256) {
    for (bucket = 0; bucket < 256; bucket++) {
      if (!occupied[bucket])
        inv->first[bucket] = spare;
    }
  }

  return inv;
}

/*
 * Encode one character in a table-driven 8-bit charset.
 *
 * map1 points to the charset's struct map.  The reverse lookup is built on
 * first use and cached in map->to; if it cannot be allocated the table is
 * searched linearly instead.
 *
 * Returns 1 after storing the byte in *s, 0 when s is null, and -1 when
 * the character has no representation in the charset.
 */
int wctomb_8bit(void *map1, char *s, int wc1)
{
  struct map *map = map1;
  unsigned short wc;
  int i;

  if (!s)
    return 0;

  if (wc1 & ~0xffff)
    return -1;

  /* 0xffff is the table's marker for an unmapped byte, so it must never be
   * treated as a match for a real character. */
  if (wc1 == 0xffff)
    return -1;
  wc = (unsigned short)wc1;

  if (1) /* Change 1 to 0 to test the case where malloc fails. */
    if (!map->to)
      map->to = make_inverse_map(map->from);

  if (map->to) {
    /* Use the inverse map: walk the chain of the character's bucket. */
    i = map->to->first[HASH(wc)];
    for (;;) {
      if (map->from[i] == wc) {
	*s = i;
	return 1;
      }
      /* A successor of 0 marks the end of the chain. */
      if (!(i = map->to->next[i]))
	break;
    }
  }
  else {
    /* We don't have an inverse map, so do a linear search. */
    for (i = 0; i < 256; i++)
      if (map->from[i] == wc) {
	*s = i;
	return 1;
      }
  }

  return -1;
}

struct charset charset_utf8 = {
  6,
  &mbtowc_utf8,
  &wctomb_utf8,
  0
};

struct charset charset_iso1 = {
  1,
  &mbtowc_iso1,
  &wctomb_iso1,
  0
};

struct charset charset_ascii = {
  1,
  &mbtowc_ascii,
  &wctomb_ascii,
  0
};

struct charset *charset_find(const char *code)
{
  int i;

  /* Find good (MIME) name. */
  for (i = 0; names[i].bad; i++)
    if (!ascii_strcasecmp(code, names[i].bad)) {
      code = names[i].good;
      break;
    }

  /* Recognise some charsets for which we avoid using a table. */
  if (!ascii_strcasecmp(code, "UTF-8"))
    return &charset_utf8;
  if (!ascii_strcasecmp(code, "US-ASCII"))
    return &charset_ascii;
  if (!ascii_strcasecmp(code, "ISO-8859-1"))
    return &charset_iso1;

  /* Look for a mapping for a simple 8-bit encoding. */
  for (i = 0; maps[i].name; i++)
    if (!ascii_strcasecmp(code, maps[i].name)) {
      if (!maps[i].charset) {
	maps[i].charset = (struct charset *)malloc(sizeof(struct charset));
	if (maps[i].charset) {
	  struct map *map = (struct map *)malloc(sizeof(struct map));
	  if (!map) {
	    free(maps[i].charset);
	    maps[i].charset = 0;
	  }
	  else {
	    maps[i].charset->max = 1;
	    maps[i].charset->mbtowc = &mbtowc_8bit;
	    maps[i].charset->wctomb = &wctomb_8bit;
	    maps[i].charset->map = map;
	    map->from = maps[i].map;
	    map->to = 0; /* inverse mapping is created when required */
	  }
	}
      }
      return maps[i].charset;
    }

  return 0;
}

/*
 * Convert fromlen bytes at `from` from the charset named fromcode to the
 * charset named tocode.
 *
 * to:    if not null, receives a malloc()ed, NUL-terminated result that the
 *        caller releases with free(); if null the result is discarded.
 * tolen: if not null, receives the result length without the terminator.
 *
 * Returns
 *    0  everything was converted exactly,
 *    1  some characters had no equivalent in the target charset and were
 *       replaced by '?' (or dropped if '?' cannot be encoded either),
 *    2  the input contained invalid sequences, replaced by '#',
 *   -1  one of the charset names is unknown,
 *   -2  out of memory (including an input too large to size a buffer for).
 */
int charset_convert(const char *fromcode, const char *tocode,
		    const char *from, size_t fromlen,
		    char **to, size_t *tolen)
{
  int ret = 0;
  struct charset *charset1, *charset2;
  char *tobuf, *p, *newbuf;
  int i, j, wc = 0;
  size_t max;

  charset1 = charset_find(fromcode);
  charset2 = charset_find(tocode);
  if (!charset1 || !charset2 )
    return -1;

  /* Every input byte yields at most `max` output bytes.  Refuse sizes whose
   * worst case (plus the terminator) does not fit in a size_t rather than
   * allocating a buffer that is too small. */
  max = (size_t)charset2->max;
  if (max && fromlen > ((size_t)-1 - 1) / max)
    return -2;

  tobuf = malloc(fromlen * max + 1);
  if (!tobuf)
    return -2;

  for (p = tobuf; fromlen; from += i, fromlen -= i, p += j) {
    i = charset_mbtowc(charset1, &wc, from, fromlen);
    if (!i)
      i = 1;		/* a NUL character still occupies one byte */
    else if (i == -1) {
      /* Invalid input: skip one byte and substitute '#'. */
      i  = 1;
      wc = '#';
      ret = 2;
    }
    j = charset_wctomb(charset2, p, wc);
    if (j == -1) {
      /* Not representable in the target charset: substitute '?'. */
      if (!ret)
	ret = 1;
      j = charset_wctomb(charset2, p, '?');
      if (j == -1)
	j = 0;
    }
  }

  if (tolen)
    *tolen = p - tobuf;
  *p++ = '\0';
  if (to) {
    /* Trim the buffer; keep the original block if trimming fails. */
    newbuf = realloc(tobuf, p - tobuf);
    *to = newbuf ? newbuf : tobuf;
  }
  else
    free(tobuf);

  return ret;
}

//***********************************************************************
/* utf8.c, continued */

/* Private copy of the local charset name, or null if none has been set. */
static char *current_charset = 0;

/*
 * Select the local charset used by utf8_encode() and utf8_decode().
 *
 * charset: name of the charset; if null, the CHARSET environment variable
 *          is consulted instead.  The string is copied.
 *
 * When the resulting name is missing or empty, or the copy cannot be
 * allocated, the stored name is cleared.
 */
void convert_set_charset(char *charset)
{
  char *copy = 0;
  size_t size;

  if (!charset)
    charset = getenv("CHARSET");

  /* Copy with malloc/memcpy: strdup() is POSIX rather than ISO C and is
   * not declared by <string.h> in strict ISO C mode. */
  if (charset && *charset) {
    size = strlen(charset) + 1;
    copy = malloc(size);
    if (copy)
      memcpy(copy, charset, size);
  }

  free(current_charset);
  current_charset = copy;
}

/*
 * Convert a buffer between two named charsets.  This simply forwards to
 * charset_convert() and hands back its result unchanged.
 */
static int convert_buffer(const char *fromcode, const char *tocode,
			  const char *from, size_t fromlen,
			  char **to, size_t *tolen)
{
  return charset_convert(fromcode, tocode, from, fromlen, to, tolen);
}

/*
 * Convert the NUL-terminated string `from` between two named charsets and
 * store a malloc()ed result in *to.
 *
 * If convert_buffer() reports -1, the input is copied instead and every
 * byte above 0x7f is overwritten with `replace`; 3 is returned in that case.
 * Otherwise the result of convert_buffer() is returned, except that -2 is
 * reported as -1.  -1 is also returned when the fallback copy cannot be
 * allocated.
 */
static int convert_string(const char *fromcode, const char *tocode,
			  const char *from, char **to, char replace)
{
  size_t length = strlen(from);
  int status = convert_buffer(fromcode, tocode, from, length, to, 0);
  char *copy, *p;

  if (status == -2)
    return -1;
  if (status != -1)
    return status;

  /* Fallback: keep the ASCII bytes and mask everything else. */
  copy = malloc(length + 1);
  if (copy == 0)
    return -1;
  memcpy(copy, from, length + 1);

  for (p = copy; *p != '\0'; p++) {
    if ((unsigned char)*p > 0x7f)
      *p = replace;
  }

  *to = copy;
  return 3;
}

/*
 * Convert `from` from the local charset to UTF-8, storing a malloc()ed
 * string in *to.  The local charset is the one chosen through
 * convert_set_charset(), with ISO-8859-1 as the fallback.  Bytes that
 * cannot be interpreted are replaced by '#'.  Returns the result of
 * convert_string().
 */
int utf8_encode(const char *from, char **to)
{
  const char *local;

  /* Give the CHARSET environment variable a chance if nothing is set. */
  if (current_charset == 0)
    convert_set_charset(0);

  local = current_charset;
  if (local == 0)
    local = "ISO-8859-1";

  return convert_string(local, "UTF-8", from, to, '#');
}

/*
 * Convert `from` from UTF-8 to the local charset, storing a malloc()ed
 * string in *to.  The local charset is the one chosen through
 * convert_set_charset(), with ISO-8859-1 as the fallback.  Characters that
 * cannot be represented are replaced by '?'.
 *
 * An empty input yields an empty string and the return value 1.
 * Returns -1 if memory cannot be allocated; otherwise the result of
 * convert_string().
 */
int utf8_decode(const char *from, char **to)
{
  char *charset;

  if(*from == 0) {
      *to = malloc(1);
      /* Do not write through a failed allocation. */
      if (*to == 0)
          return -1;
      **to = 0;
      return 1;
  }

  if (!current_charset)
    convert_set_charset(0);
  charset = current_charset ? current_charset : "ISO-8859-1";
  return convert_string("UTF-8", charset, from, to, '?');
}

#endif // !WIN32
