NAME ^

src/encodings/utf8.c - UTF-8 encoding

DESCRIPTION ^

UTF-8 (http://www.utf-8.com/).

Functions ^

static UINTVAL utf8_characters(PARROT_INTERP, NOTNULL(const utf8_t *ptr), UINTVAL byte_len)

Returns the number of characters in the byte_len bytes from *ptr.

static UINTVAL utf8_decode(PARROT_INTERP, NOTNULL(const utf8_t *ptr))

Returns the integer for the UTF-8 character found at *ptr.

static void *utf8_encode(PARROT_INTERP, NOTNULL(void *ptr), UINTVAL c)

Returns the UTF-8 encoding of integer c.

static const void *utf8_skip_forward(NOTNULL(const void *ptr), UINTVAL n)

Moves ptr n characters forward.

static const void *utf8_skip_backward(NOTNULL(const void *ptr), UINTVAL n)

Moves ptr n characters back.

Iterator Functions ^

static UINTVAL utf8_decode_and_advance(PARROT_INTERP, NOTNULL(String_iter *i))

The UTF-8 implementation of the string iterator's get_and_advance function.

*/

/*
 * Decodes the UTF-8 character located at the iterator's current byte
 * offset and moves the iterator past it.
 *
 * i - string iterator; the byte at i->str->strstart + i->bytepos is read,
 *     and i->bytepos and i->charpos are updated.
 *
 * Returns the decoded value. For a start byte, the masked start bits are
 * combined with each following byte via UTF8_ACCUMULATE; an invariant
 * byte is returned unchanged.
 *
 * Raises MALFORMED_UTF8 through real_exception() when a following byte is
 * not a continuation byte, when the decoded value is a surrogate, or when
 * the first byte is neither a start byte nor invariant.
 *
 * NOTE(review): nothing here compares the sequence length against the end
 * of the string buffer -- presumably callers guarantee a complete sequence;
 * confirm against the callers.
 */
static UINTVAL
utf8_decode_and_advance(PARROT_INTERP, NOTNULL(String_iter *i))
{
    const utf8_t *cursor =
        (const utf8_t *)((const char *)i->str->strstart + i->bytepos);
    UINTVAL codepoint = *cursor;

    if (UTF8_IS_START(codepoint)) {
        /* Multi-byte sequence: total length is looked up from the start byte. */
        const UINTVAL seq_len = UTF8SKIP(cursor);
        UINTVAL remaining = seq_len - 1;

        codepoint &= UTF8_START_MASK(seq_len);

        /* The byte position is moved before the trailing bytes are checked. */
        i->bytepos += seq_len;

        while (remaining != 0) {
            cursor++;
            if (!UTF8_IS_CONTINUATION(*cursor)) {
                real_exception(interp, NULL, MALFORMED_UTF8, "Malformed UTF-8 string\n");
            }
            codepoint = UTF8_ACCUMULATE(codepoint, *cursor);
            remaining--;
        }

        if (UNICODE_IS_SURROGATE(codepoint)) {
            real_exception(interp, NULL, MALFORMED_UTF8, "Surrogate in UTF-8 string\n");
        }
    }
    else if (UNICODE_IS_INVARIANT(codepoint)) {
        /* Single-byte character: step over exactly one byte. */
        i->bytepos++;
    }
    else {
        /* Neither a start byte nor an invariant byte. */
        real_exception(interp, NULL, MALFORMED_UTF8, "Malformed UTF-8 string\n");
    }

    i->charpos++;
    return codepoint;
}
/*

static void utf8_encode_and_advance(PARROT_INTERP, NOTNULL(String_iter *i), UINTVAL c)

The UTF-8 implementation of the string iterator's set_and_advance function.

static void utf8_set_position(SHIM_INTERP, NOTNULL(String_iter *i), UINTVAL pos)

The UTF-8 implementation of the string iterator's set_position function.

static STRING *to_encoding(PARROT_INTERP, NOTNULL(STRING *src), NULLOK(STRING *dest))

TODO: Not yet documented!!!

static UINTVAL get_codepoint(PARROT_INTERP, NOTNULL(const STRING *src), UINTVAL offset)

TODO: Not yet documented!!!

static void set_codepoint(PARROT_INTERP, NOTNULL(STRING *src), UINTVAL offset, UINTVAL codepoint)

TODO: Not yet documented!!!

static UINTVAL get_byte(SHIM_INTERP, NOTNULL(const STRING *src), UINTVAL offset)

TODO: Not yet documented!!!

static void set_byte(PARROT_INTERP, NOTNULL(const STRING *src), UINTVAL offset, UINTVAL byte)

TODO: Not yet documented!!!

PARROT_CANNOT_RETURN_NULL static STRING *get_codepoints(PARROT_INTERP, NOTNULL(STRING *src), UINTVAL offset, UINTVAL count)

TODO: Not yet documented!!!

PARROT_CANNOT_RETURN_NULL static STRING *get_bytes(PARROT_INTERP, NOTNULL(STRING *src), UINTVAL offset, UINTVAL count)

TODO: Not yet documented!!!

PARROT_CANNOT_RETURN_NULL static STRING *get_codepoints_inplace(PARROT_INTERP, NOTNULL(STRING *src), UINTVAL offset, UINTVAL count, NOTNULL(STRING *return_string))

TODO: Not yet documented!!!

static STRING *get_bytes_inplace(PARROT_INTERP, SHIM(STRING *src), UINTVAL offset, UINTVAL count, SHIM(STRING *return_string))

TODO: Not yet documented!!!

static void set_codepoints(PARROT_INTERP, SHIM(STRING *src), UINTVAL offset, UINTVAL count, SHIM(STRING *new_codepoints))

TODO: Not yet documented!!!

static void set_bytes(PARROT_INTERP, SHIM(STRING *src), UINTVAL offset, UINTVAL count, SHIM(STRING *new_bytes))

TODO: Not yet documented!!!

static void become_encoding(PARROT_INTERP, SHIM(STRING *src))

Unconditionally makes the string use this encoding, if that is valid.

static UINTVAL codepoints(PARROT_INTERP, NOTNULL(STRING *src))

TODO: Not yet documented!!!

PARROT_PURE_FUNCTION static UINTVAL bytes(SHIM_INTERP, NOTNULL(STRING *src))

TODO: Not yet documented!!!

static void iter_init(SHIM_INTERP, NOTNULL(const STRING *src), NOTNULL(String_iter *iter))

TODO: Not yet documented!!!

ENCODING *Parrot_encoding_utf8_init(PARROT_INTERP)

TODO: Not yet documented!!!

SEE ALSO ^

src/encodings/fixed_8.c, src/string.c, include/parrot/string.h, docs/string.pod.


parrot