/**
 * Copyright 2002 Peppercon AG
 * Author: Thomas Breitfeld <thomas@peppercon.de>
 *
 * Description: character encoding conversion routines
 */

#include <string.h>
#include <errno.h>
#include <malloc.h>
#include <pp/utf.h>

/*
 * Evaluates to non-zero when bit 7 of c is set, i.e. when c lies outside
 * the 7-bit ASCII range.
 * NOTE(review): not referenced anywhere in the visible part of this file.
 */
#define CHAR_NEEDS_UTF_CONV(c) ((c) & 0x80)

/* Per-character conversion helpers; their definitions follow further down. */
static inline int _char_latin_to_utf8(char * utf8_buf, const char * latin_buf);
static inline int _char_utf8_to_latin(char * latin_buf, const char * utf8_buf);

/*
 * Converts a pure ASCII or any 8 bit Latin code string to its UTF-8
 * counterpart.
 *
 * str        - NUL terminated input string
 * new_length - if not NULL, receives the number of bytes written to the
 *              result, not counting the terminating NUL
 *
 * Returns a buffer allocated with malloc which the caller has to release
 * with free. Returns NULL with errno set to EINVAL if str is NULL or if
 * the worst case result size (2 * strlen(str) + 1) is larger than 0xffff.
 * Returns NULL as well if malloc fails; errno is not touched by this
 * function in that case.
 */
char *
pp_latin_to_utf8(const char * str, size_t * new_length)
{
    const char * src;
    char * utf8_buf;
    size_t capacity;
    size_t written = 0;

    if (str == NULL) {
        errno = EINVAL;
        return NULL;
    }

    /* worst case: every input byte expands to two bytes, plus the NUL */
    capacity = strlen(str) * 2 + 1;
    if (capacity > 0xffff) {
        errno = EINVAL;
        return NULL;
    }

    utf8_buf = malloc(capacity);
    if (utf8_buf == NULL) {
        return NULL;
    }

    /* the helper reports how many output bytes each input byte produced */
    for (src = str; *src != '\0'; src++) {
        written += _char_latin_to_utf8(utf8_buf + written, src);
    }
    utf8_buf[written] = '\0';

    if (new_length != NULL) {
        *new_length = written;
    }
    return utf8_buf;
}

/*
 * Converts a UTF-8 string to an ASCII/Latin code string.
 *
 * str        - NUL terminated input string
 * new_length - if not NULL, receives the number of bytes written to the
 *              result, not counting the terminating NUL
 *
 * Every UTF-8 sequence is handed to _char_utf8_to_latin, which produces
 * exactly one output byte per sequence.
 *
 * Returns a buffer allocated with malloc which the caller has to release
 * with free. Returns NULL with errno set to EINVAL if str is NULL, and
 * NULL as well if malloc fails.
 */
char *
pp_utf8_to_latin(const char * str, size_t * new_length)
{
    size_t out_len = 0;
    char * latin_buf;

    if (!str) { errno = EINVAL; return NULL; }

    /*
     * Each loop iteration below consumes at least one input byte and emits
     * exactly one output byte, so strlen(str) + 1 bytes are always enough.
     */
    if ((latin_buf = malloc(strlen(str) + 1)) == NULL) return NULL;

    while (*str) {
        int consumed = _char_utf8_to_latin(&latin_buf[out_len++], str);

        /*
         * Advance byte by byte and stop at the terminator: a multi-byte
         * sequence that is cut off at the end of the string must not move
         * the read pointer beyond the NUL.
         */
        while (consumed-- > 0 && *str) {
            str++;
        }
    }
    latin_buf[out_len] = '\0';

    if (new_length) *new_length = out_len;
    return latin_buf;
}

/*
 * Converts a single 8 bit character to its UTF-8 form.
 *
 * utf8_buf  - destination, must have room for two bytes
 * latin_buf - points to the character to convert
 *
 * Returns how many bytes were written to utf8_buf (1 or 2).
 */
static inline int
_char_latin_to_utf8(char* utf8_buf, const char *latin_buf)
{
    /*
     * Work on an unsigned value: where plain char is signed, shifting a
     * byte >= 0x80 right would drag the sign bit along and corrupt the
     * lead byte.
     */
    const unsigned char c = (unsigned char)*latin_buf;

    if (c & 0x80) {
        utf8_buf[0] = (char)(0xc0 | (c >> 6));   /* 110000xx */
        utf8_buf[1] = (char)(0x80 | (c & 0x3f)); /* 10xxxxxx */
        return 2;
    }

    *utf8_buf = (char)c;
    return 1;
}

/*
 * Converts one UTF-8 encoded character to its 8 bit character if possible.
 *
 * latin_buf - destination, receives exactly one byte
 * utf8_buf  - points to the first byte of the sequence inside a NUL
 *             terminated string
 *
 * Bytes below 0xc0 are copied unchanged. Two-byte sequences with a lead
 * byte up to 0xc3 (code points up to 0xff) are decoded. Everything else
 * (3 and 4 byte encodings, two-byte encodings above 0xff, sequences whose
 * continuation bytes are missing) is replaced with a '?'.
 *
 * Returns how many bytes were read from utf8_buf (1 to 4). Only
 * continuation bytes that are really present are counted, so the result
 * never reaches beyond the string terminator.
 */
static inline int
_char_utf8_to_latin(char * latin_buf, const char * utf8_buf)
{
    const unsigned char lead = (unsigned char)utf8_buf[0];
    int expected;
    int used;

    if (lead < 0xc0) {
        *latin_buf = (char)lead;
        return 1;
    }

    /* sequence length announced by the lead byte */
    if (lead >= 0xf0) {
        expected = 4;
    } else if (lead >= 0xe0) {
        expected = 3;
    } else {
        expected = 2;
    }

    /*
     * Count the continuation bytes (10xxxxxx) that follow. The NUL
     * terminator is not a continuation byte, so this stops in front of it.
     */
    for (used = 1; used < expected; used++) {
        if (((unsigned char)utf8_buf[used] & 0xc0) != 0x80) {
            break;
        }
    }

    if (expected == 2 && used == 2 && lead <= 0xc3) {
        /* code point fits into 8 bits (lead bytes 0xc0/0xc1 still decode) */
        *latin_buf = (char)(((lead & 0x3f) << 6)
                            | ((unsigned char)utf8_buf[1] & 0x3f));
    } else {
        *latin_buf = '?';
    }
    return used;
}

