NAME ^

src/encodings/utf8.c - UTF-8 encoding

DESCRIPTION ^

UTF-8 (http://www.utf-8.com/).

Functions ^

*/

#include "parrot/parrot.h" #include "../unicode.h" #include "utf8.h"

/* HEADER: src/encodings/utf8.h */

/* Expands to a call raising an UNIMPLEMENTED internal exception; used as the
 * body of the encoding operations in this file that have no UTF-8
 * implementation yet. */
#define UNIMPL internal_exception(UNIMPLEMENTED, "unimpl utf8")

/*
 * Skip table with one entry per possible byte value (0x00 .. 0xFF), laid out
 * as sixteen rows of sixteen entries.  The row labels are the ones the table
 * has always carried.
 * NOTE(review): presumably this is what the UTF8SKIP macro (defined
 * elsewhere) indexes by a sequence's first byte -- confirm.
 */
const char Parrot_utf8skip[256] = {
    /* 0x00 - 0x7F: ascii */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    /* 0x80 - 0xBF: bogus */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    /* 0xC0 - 0xDF: scripts */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xE0 - 0xEF: cjk etc. */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xF0 - 0xFF: cjk etc. */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6
};

#if 0 typedef unsigned char utf8_t; #endif

/* Forward declaration: iter_init() is defined near the end of this file but
 * is called earlier by get_codepoints(), get_codepoints_inplace() and
 * codepoints(). */
static void iter_init(Interp *, const STRING *src, String_iter *iter);

/*

FUNCDOC: utf8_characters

Returns the number of characters in the byte_len bytes from *ptr.

*/

static UINTVAL
utf8_characters(const utf8_t *ptr, UINTVAL byte_len)
{
    const utf8_t * const limit = ptr + byte_len;
    const utf8_t *cursor;
    UINTVAL count = 0;

    /* Each hop of UTF8SKIP bytes is counted as one character. */
    for (cursor = ptr; cursor < limit; cursor += UTF8SKIP(cursor))
        count++;

    /* Landing beyond the end means the final hop ran past byte_len. */
    if (cursor > limit)
        internal_exception(MALFORMED_UTF8, "Unaligned end in UTF-8 string\n");

    return count;
}

/*

FUNCDOC: utf8_decode

Returns the integer for the UTF-8 character found at *ptr.

*/

static UINTVAL
utf8_decode(const utf8_t *ptr)
{
    UINTVAL codepoint = *ptr;
    UINTVAL width, k;

    /* Not a start byte: only an invariant byte is acceptable as-is. */
    if (!UTF8_IS_START(codepoint)) {
        if (!UNICODE_IS_INVARIANT(codepoint))
            internal_exception(MALFORMED_UTF8, "Malformed UTF-8 string\n");
        return codepoint;
    }

    /* Multi-byte sequence: keep the payload bits of the start byte ... */
    width      = UTF8SKIP(ptr);
    codepoint &= UTF8_START_MASK(width);

    /* ... then fold in each of the width - 1 bytes that follow it. */
    for (k = 1; k < width; k++) {
        if (!UTF8_IS_CONTINUATION(ptr[k]))
            internal_exception(MALFORMED_UTF8, "Malformed UTF-8 string\n");
        codepoint = UTF8_ACCUMULATE(codepoint, ptr[k]);
    }

    if (UNICODE_IS_SURROGATE(codepoint))
        internal_exception(MALFORMED_UTF8, "Surrogate in UTF-8 string\n");

    return codepoint;
}

/*

FUNCDOC: utf8_encode

Writes the UTF-8 encoding of integer c at ptr and returns a pointer just past the last byte written.

*/

static void *
utf8_encode(void *ptr, UINTVAL c)
{
    utf8_t * const out   = (utf8_t *)ptr;
    const UINTVAL  width = UNISKIP(c);
    utf8_t        *tail;

    /* Code points above 0x10FFFF and surrogates are rejected. */
    if (c > 0x10FFFF || UNICODE_IS_SURROGATE(c)) {
        internal_exception(INVALID_CHARACTER,
                           "Invalid character for UTF-8 encoding\n");
    }

    /* Fill the trailing bytes from last to first, consuming the low bits
     * of c as we go. */
    for (tail = out + width - 1; tail > out; tail--) {
        *tail = (utf8_t)((c & UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_MARK);
        c >>= UTF8_ACCUMULATION_SHIFT;
    }

    /* Whatever is left of c goes into the first byte, under the start mark. */
    *tail = (utf8_t)((c & UTF8_START_MASK(width)) | UTF8_START_MARK(width));

    return out + width;
}

/*

FUNCDOC: utf8_skip_forward

Moves ptr n characters forward.

*/

static const void *
utf8_skip_forward(const void *ptr, UINTVAL n)
{
    const utf8_t *cursor = (const utf8_t *)ptr;
    UINTVAL       left;

    /* One UTF8SKIP-sized hop per character; no end-of-buffer check is made
     * here. */
    for (left = n; left > 0; left--)
        cursor += UTF8SKIP(cursor);

    return cursor;
}

/*

FUNCDOC: utf8_skip_backward

Moves ptr n characters back.

*/

static const void *
utf8_skip_backward(const void *ptr, UINTVAL n)
{
    const utf8_t *cursor = (const utf8_t *)ptr;
    UINTVAL       left;

    for (left = n; left > 0; left--) {
        /* Step back at least one byte, then keep going while we are still
         * on a continuation byte.  No start-of-buffer check is made here. */
        do {
            cursor--;
        } while (UTF8_IS_CONTINUATION(*cursor));
    }

    return cursor;
}

/*

Iterator Functions ^

FUNCDOC: utf8_decode_and_advance

The UTF-8 implementation of the string iterator's get_and_advance function.

FUNCDOC: utf8_encode_and_advance

The UTF-8 implementation of the string iterator's set_and_advance function.

*/

static UINTVAL
utf8_decode_and_advance(Interp *interp, String_iter *i)
{
    const utf8_t *lead =
        (utf8_t *)((char *)i->str->strstart + i->bytepos);
    UINTVAL codepoint = *lead;

    if (UTF8_IS_START(codepoint)) {
        const UINTVAL width = UTF8SKIP(lead);
        UINTVAL       k;

        codepoint &= UTF8_START_MASK(width);

        /* The byte position moves past the whole sequence before the
         * trailing bytes are validated. */
        i->bytepos += width;

        for (k = 1; k < width; k++) {
            if (!UTF8_IS_CONTINUATION(lead[k])) {
                internal_exception(MALFORMED_UTF8, "Malformed UTF-8 string\n");
            }
            codepoint = UTF8_ACCUMULATE(codepoint, lead[k]);
        }

        if (UNICODE_IS_SURROGATE(codepoint)) {
            internal_exception(MALFORMED_UTF8, "Surrogate in UTF-8 string\n");
        }
    }
    else if (UNICODE_IS_INVARIANT(codepoint)) {
        /* Single-byte character. */
        i->bytepos++;
    }
    else {
        internal_exception(MALFORMED_UTF8, "Malformed UTF-8 string\n");
    }

    i->charpos++;
    return codepoint;
}

static void
utf8_encode_and_advance(Interp *interp, String_iter *i, UINTVAL c)
{
    const STRING *s = i->str;
    unsigned char * const from = (unsigned char *)s->strstart + i->bytepos;
    unsigned char * const to   = (unsigned char *)utf8_encode(from, c);

    /* Advance by however many bytes the encoder produced. */
    i->bytepos += (to - from);

    /* XXX possible buffer overrun exception? */
    /* Note that this check only runs after the bytes have been written. */
    assert(i->bytepos <= PObj_buflen(s));

    i->charpos++;
}

/*

FUNCDOC: utf8_set_position

The UTF-8 implementation of the string iterator's set_position function.

*/

/*
 * Positions iterator i at character index pos, updating both i->charpos and
 * i->bytepos.
 *
 * When pos is at or after the iterator's current character position, the
 * scan resumes from the current byte position instead of rescanning from
 * the start of the string.  This relies on i->charpos and i->bytepos
 * describing the same place in the string, which is how iter_init() and the
 * *_and_advance functions in this file maintain them.
 *
 * XXX Should use quickest direction: a backward seek still rescans from the
 * start of the string.
 */
static void
utf8_set_position(Interp *interp, String_iter *i, UINTVAL pos)
{
    const utf8_t *u8ptr   = (const utf8_t *)i->str->strstart;
    UINTVAL       to_skip = pos;

    if (i->charpos <= pos) {
        /* Pick up where the iterator already is. */
        u8ptr   += i->bytepos;
        to_skip -= i->charpos;
    }

    while (to_skip-- > 0) {
        u8ptr += UTF8SKIP(u8ptr);
    }

    i->charpos = pos;
    i->bytepos = (const char *)u8ptr - (const char *)i->str->strstart;
}

/*
 * Converts src to UTF-8.
 *
 * dest == NULL selects in-place mode: src itself is rewritten and returned.
 * Otherwise the converted bytes are placed in dest, which is returned.
 * A src that is already UTF-8 is returned as-is in in-place mode, or as a
 * string_copy() otherwise.
 *
 * The returned string always has its charset, encoding and strlen set.  For
 * an empty src the function returns the result string right after that;
 * previously it returned dest, which is NULL in in-place mode.
 * NOTE(review): on that empty-string path bufused is left untouched --
 * confirm a reused dest cannot carry a stale value.
 */
static STRING *
to_encoding(Interp *interp, STRING *src, STRING *dest)
{
    STRING *result;
    String_iter src_iter;
    UINTVAL offs, c, dest_len, dest_pos, src_len;
    int in_place = dest == NULL;
    unsigned char *new_pos, *pos, *p;

    if (src->encoding == Parrot_utf8_encoding_ptr)
        return in_place ? src : string_copy(interp, src);

    src_len = src->strlen;
    result  = in_place ? src : dest;

    /* init iter before possibly changing encoding */
    ENCODING_ITER_INIT(interp, src, &src_iter);
    result->charset  = Parrot_unicode_charset_ptr;
    result->encoding = Parrot_utf8_encoding_ptr;
    result->strlen   = src_len;

    /* Nothing to convert: hand back the string we just re-tagged. */
    if (!src_len)
        return result;

    if (in_place) {
        /* need intermediate memory */
        p = (unsigned char *)mem_sys_allocate(src_len);
    }
    else {
        Parrot_reallocate_string(interp, dest, src_len);
        p = (unsigned char *)dest->strstart;
    }

    /*
     * In in-place mode result aliases src, so src->charset has already been
     * overwritten above and this byte-copy branch can only be taken when a
     * separate dest was supplied.
     */
    if (src->charset == Parrot_ascii_charset_ptr) {
        for (dest_len = 0; dest_len < src_len; ++dest_len) {
            p[dest_len] = ((unsigned char*)src->strstart)[dest_len];
        }
        result->bufused = dest_len;
    }
    else {
        dest_len = src_len;
        dest_pos = 0;
        for (offs = 0; offs < src_len; ++offs) {
            c = src_iter.get_and_advance(interp, &src_iter);

            /* Keep at least 6 free bytes ahead of the write position. */
            if (dest_len - dest_pos < 6) {
                UINTVAL need = (UINTVAL)((src_len - offs) * 1.5);
                if (need < 16)
                    need = 16;
                dest_len += need;
                if (in_place)
                    p = (unsigned char *)mem_sys_realloc(p, dest_len);
                else {
                    /* Record how much is in use before growing dest. */
                    result->bufused = dest_pos;
                    Parrot_reallocate_string(interp, dest, dest_len);
                    p = (unsigned char *)dest->strstart;
                }
            }

            pos = p + dest_pos;
            new_pos = (unsigned char *)utf8_encode(pos, c);
            dest_pos += (new_pos - pos);
        }
        result->bufused = dest_pos;
    }

    if (in_place) {
        /* Move the converted bytes from the scratch buffer back into src. */
        Parrot_reallocate_string(interp, src, src->bufused);
        memcpy(src->strstart, p, src->bufused);
        mem_sys_free(p);
    }
    return result;
}

/* Returns the code point at character index offset of src. */
static UINTVAL
get_codepoint(Interp *interp, const STRING *src, UINTVAL offset)
{
    /* Walk offset characters in from the start of the buffer ... */
    const void * const where = utf8_skip_forward(src->strstart, offset);

    /* ... and decode the sequence found there. */
    return utf8_decode((const utf8_t *)where);
}

/*
 * Encodes codepoint over the bytes found at character index offset of src.
 * NOTE(review): nothing here checks that the new encoding has the same byte
 * width as the character it overwrites -- confirm callers guarantee that.
 */
static void
set_codepoint(Interp *interp, STRING *src, UINTVAL offset, UINTVAL codepoint)
{
    const void *target = utf8_skip_forward(src->strstart, offset);
    DECL_CONST_CAST;

    /* utf8_skip_forward() hands back a const pointer; strip the const so
     * the encoder can write through it. */
    utf8_encode(const_cast(target), codepoint);
}

/*
 * Returns the byte at byte index offset of src.  An offset at or beyond
 * src->bufused yields 0 without raising an exception (the exception that
 * once reported this case is disabled).
 */
static UINTVAL
get_byte(Interp *interp, const STRING *src, UINTVAL offset)
{
    const unsigned char * const raw = (unsigned char *)src->strstart;

    return offset < src->bufused ? raw[offset] : 0;
}

/*
 * Stores the low byte of `byte` at byte index offset of src.  An offset at
 * or beyond src->bufused raises an internal exception first.
 */
static void
set_byte(Interp *interp, const STRING *src, UINTVAL offset, UINTVAL byte)
{
    if (offset >= src->bufused) {
        internal_exception(0, "set_byte past the end of the buffer");
    }

    ((unsigned char *)src->strstart)[offset] = (unsigned char)byte;
}

/*
 * Returns a string, obtained from Parrot_make_COW_reference(), that is
 * narrowed to the count characters of src starting at character index
 * offset.
 */
static STRING *
get_codepoints(Interp *interp, STRING *src, UINTVAL offset, UINTVAL count)
{
    String_iter walker;
    UINTVAL     first_byte;
    STRING     *slice = Parrot_make_COW_reference(interp, src);

    iter_init(interp, src, &walker);

    /* Byte offset of the first requested character. */
    walker.set_position(interp, &walker, offset);
    first_byte      = walker.bytepos;
    slice->strstart = (char *)slice->strstart + first_byte;

    /* Byte offset just past the last requested character. */
    walker.set_position(interp, &walker, offset + count);
    slice->bufused = walker.bytepos - first_byte;

    slice->strlen  = count;
    slice->hashval = 0;

    return slice;
}

/*
 * Returns a string, obtained from Parrot_make_COW_reference(), that is
 * narrowed to count bytes of src starting at byte index offset.
 * NOTE(review): strlen is set to the byte count, i.e. every byte is counted
 * as one character -- confirm this is intended for multi-byte content.
 */
static STRING *
get_bytes(Interp *interp, STRING *src, UINTVAL offset, UINTVAL count)
{
    STRING *slice = Parrot_make_COW_reference(interp, src);

    slice->encoding = src->encoding;    /* XXX */
    slice->charset  = src->charset;

    slice->strstart = (char *)slice->strstart + offset;
    slice->bufused  = count;
    slice->strlen   = count;
    slice->hashval  = 0;

    return slice;
}

/*
 * Like get_codepoints(), but fills in the caller-supplied return_string
 * (via Parrot_reuse_COW_reference()) instead of creating a new one, and
 * returns it.
 */
static STRING *
get_codepoints_inplace(Interp *interp, STRING *src, UINTVAL offset,
                       UINTVAL count, STRING *return_string)
{
    String_iter walker;
    UINTVAL     first_byte;

    Parrot_reuse_COW_reference(interp, src, return_string);
    iter_init(interp, src, &walker);

    /* Byte offset of the first requested character. */
    walker.set_position(interp, &walker, offset);
    first_byte = walker.bytepos;
    return_string->strstart = (char *)return_string->strstart + first_byte;

    /* Byte offset just past the last requested character. */
    walker.set_position(interp, &walker, offset + count);
    return_string->bufused = walker.bytepos - first_byte;

    return_string->strlen  = count;
    return_string->hashval = 0;

    return return_string;
}

/* Not implemented for UTF-8: raises via UNIMPL, then returns NULL. */
static STRING *
get_bytes_inplace(Interp *interp, STRING *src, UINTVAL offset,
                  UINTVAL count, STRING *return_string)
{
    UNIMPL;
    return NULL;
}

/* Not implemented for UTF-8: raises via UNIMPL. */
static void
set_codepoints(Interp *interp, STRING *src, UINTVAL offset,
               UINTVAL count, STRING *new_codepoints)
{
    UNIMPL;
}

/* Not implemented for UTF-8: raises via UNIMPL. */
static void
set_bytes(Interp *interp, STRING *src, UINTVAL offset,
          UINTVAL count, STRING *new_bytes)
{
    UNIMPL;
}

/*
 * Unconditionally makes the string be in this encoding, if that's valid.
 * Not implemented for UTF-8: raises via UNIMPL.
 */
static void
become_encoding(Interp *interp, STRING *src)
{
    UNIMPL;
}

/*
 * Returns the number of characters in src.
 *
 * This is used to initially calculate src->strlen, therefore the whole
 * string must be scanned.
 */
static UINTVAL
codepoints(Interp *interp, STRING *src)
{
    String_iter walker;

    iter_init(interp, src, &walker);

    /* Decode one character at a time until every used byte is consumed. */
    while (walker.bytepos < src->bufused) {
        walker.get_and_advance(interp, &walker);
    }

    return walker.charpos;
}

/* Returns the number of bytes in use by src (its bufused field). */
static UINTVAL
bytes(Interp *interp, STRING *src)
{
    return src->bufused;
}

/*
 * Prepares iter to walk src from its first character using the UTF-8
 * iterator functions defined in this file.
 */
static void
iter_init(Interp *interp, const STRING *src, String_iter *iter)
{
    /* Hook up the UTF-8 implementations of the iterator operations. */
    iter->get_and_advance = utf8_decode_and_advance;
    iter->set_and_advance = utf8_encode_and_advance;
    iter->set_position    = utf8_set_position;

    /* Start at the very beginning of src. */
    iter->str     = src;
    iter->charpos = 0;
    iter->bytepos = 0;
}

/*
 * Creates the UTF-8 encoding: obtains a new ENCODING from
 * Parrot_new_encoding(), fills it from the static template below, registers
 * it under the name "utf8" and returns it.
 */
ENCODING *
Parrot_encoding_utf8_init(Interp *interp)
{
    /* Template holding the name, limits and function table of the encoding. */
    static const ENCODING utf8_template = {
        "utf8",
        4, /* Max bytes per codepoint 0 .. 0x10ffff */
        to_encoding,
        get_codepoint,
        set_codepoint,
        get_byte,
        set_byte,
        get_codepoints,
        get_codepoints_inplace,
        get_bytes,
        get_bytes_inplace,
        set_codepoints,
        set_bytes,
        become_encoding,
        codepoints,
        bytes,
        iter_init
    };
    ENCODING * const utf8_enc = Parrot_new_encoding(interp);

    memcpy(utf8_enc, &utf8_template, sizeof (ENCODING));
    Parrot_register_encoding(interp, "utf8", utf8_enc);

    return utf8_enc;
}

/*

SEE ALSO ^

src/encodings/fixed_8.c, src/string.c, include/parrot/string.h, docs/string.pod.

*/

/*
 * Local variables:
 * c-file-style: "parrot"
 * End:
 * vim: expandtab shiftwidth=4:
 */


parrot