NAME ^

src/encodings/utf8.c - UTF-8 encoding

DESCRIPTION ^

UTF-8 (http://www.utf-8.com/).

Functions ^

static UINTVAL utf8_characters

Returns the number of characters in the byte_len bytes from *ptr.

static UINTVAL utf8_decode

Returns the integer for the UTF-8 character found at *ptr.

static void *utf8_encode

Returns the UTF-8 encoding of integer c.

static const void *utf8_skip_forward

Moves ptr n characters forward.

static const void *utf8_skip_backward

Moves ptr n characters back.

Iterator Functions ^

static UINTVAL utf8_decode_and_advance

The UTF-8 implementation of the string iterator's get_and_advance function.

*/

/*
 * Decodes the UTF-8 character located at the iterator's current byte
 * position (i->str->strstart + i->bytepos), moves the iterator past it and
 * returns the decoded value.
 *
 * On success i->bytepos grows by the number of bytes consumed and
 * i->charpos grows by one.
 *
 * EXCEPTION_MALFORMED_UTF8 is thrown when:
 *   - the first byte is neither a start byte nor an invariant byte,
 *   - a byte following a start byte is not a continuation byte,
 *   - the decoded value is a surrogate.
 *
 * Note that for multi-byte sequences i->bytepos is advanced by the full
 * sequence length *before* the continuation bytes are validated.
 */
static UINTVAL
utf8_decode_and_advance(PARROT_INTERP, ARGMOD(String_iter *i))
{
    const utf8_t *cursor =
        (const utf8_t *)((const char *)i->str->strstart + i->bytepos);
    UINTVAL codepoint = *cursor;

    if (!UTF8_IS_START(codepoint)) {
        /* Not a lead byte: only an invariant single byte is acceptable. */
        if (UNICODE_IS_INVARIANT(codepoint)) {
            i->bytepos++;
        }
        else {
            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                "Malformed UTF-8 string\n");
        }
    }
    else {
        /* Total sequence length, as reported by the lead byte. */
        const UINTVAL seq_len = UTF8SKIP(cursor);
        UINTVAL       remaining;

        /* Keep only the payload bits of the lead byte. */
        codepoint &= UTF8_START_MASK(seq_len);
        i->bytepos += seq_len;

        /* Fold in each of the seq_len - 1 trailing bytes. */
        for (remaining = seq_len - 1; remaining != 0; remaining--) {
            ++cursor;

            if (!UTF8_IS_CONTINUATION(*cursor)) {
                Parrot_ex_throw_from_c_args(interp, NULL,
                    EXCEPTION_MALFORMED_UTF8, "Malformed UTF-8 string\n");
            }

            codepoint = UTF8_ACCUMULATE(codepoint, *cursor);
        }

        if (UNICODE_IS_SURROGATE(codepoint)) {
            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                "Surrogate in UTF-8 string\n");
        }
    }

    i->charpos++;
    return codepoint;
}
/*

static void utf8_encode_and_advance

The UTF-8 implementation of the string iterator's set_and_advance function.

static void utf8_set_position

The UTF-8 implementation of the string iterator's set_position function.

static STRING *to_encoding

Converts the string src to this particular encoding. If dest is provided, it will contain the result. Otherwise this function operates in place.

static UINTVAL get_codepoint

Returns the codepoint in string src at position offset.

static void set_codepoint

Sets, in string src at position offset, the codepoint codepoint.

static UINTVAL get_byte

Returns the byte in string src at position offset.

static void set_byte

Sets, in string src at position offset, the byte byte.

static STRING *get_codepoints

Returns the codepoints in string src at position offset and length count.

static STRING *get_bytes

Returns the bytes in string src at position offset and length count.

static STRING *get_codepoints_inplace

Gets count codepoints from string src, starting at position offset, and returns them in return_string.

static STRING *get_bytes_inplace

Gets count bytes from string src, starting at position offset, and returns them in return_string.

static void set_codepoints

Replaces count codepoints in string src, starting at position offset, with the contents of string new_codepoints.

static void set_bytes

Replaces count bytes in string src, starting at position offset, with the contents of string new_bytes.

static void become_encoding

Unconditionally makes the string be in this encoding, if that's valid.

static UINTVAL codepoints

Returns the number of codepoints in string src.

static UINTVAL bytes

Returns the number of bytes in string src.

static void iter_init

Initializes for string src the string iterator iter.

ENCODING *Parrot_encoding_utf8_init

Initializes the UTF-8 encoding.

SEE ALSO ^

src/encodings/fixed_8.c, src/string.c, include/parrot/string.h, docs/string.pod.


parrot