/*
parrotcode: UTF-8 encoding
Contents | C

src/encodings/utf8.c - UTF-8 encoding

UTF-8 (http://www.utf-8.com/).
*/
#include "parrot/parrot.h" #include "../unicode.h" #include "utf8.h"
/* HEADER: src/encodings/utf8.h */
/* Raises an UNIMPLEMENTED internal exception with the message "unimpl utf8";
 * used in this file as the body of encoding hooks that are not written yet. */
#define UNIMPL internal_exception(UNIMPLEMENTED, "unimpl utf8")
/*
 * Sequence length, in bytes, indexed by the first byte of a UTF-8 sequence.
 * Each row below covers sixteen consecutive byte values.  Bytes that cannot
 * start a sequence (0x80..0xBF) are given length 1, so a scan driven by this
 * table always moves forward.
 */
const char Parrot_utf8skip[256] = {
    /* 0x00 - 0x0F  ascii    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 - 0x1F  ascii    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F  ascii    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 - 0x3F  ascii    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 - 0x4F  ascii    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5F  ascii    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 - 0x6F  ascii    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7F  ascii    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0x8F  bogus    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x90 - 0x9F  bogus    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xA0 - 0xAF  bogus    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xB0 - 0xBF  bogus    */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xC0 - 0xCF  scripts  */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 - 0xDF  scripts  */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF  cjk etc. */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF  cjk etc. */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6
};
/* Disabled local definition of the byte type utf8_t; the type is used
 * throughout this file, so it is presumably supplied by an included header
 * (TODO confirm). */
#if 0 typedef unsigned char utf8_t; #endif
/* Forward declaration: iter_init is called by get_codepoints() and friends
 * before its definition further down in this file. */
static void iter_init(Interp *, const STRING *src, String_iter *iter);
/*
FUNCDOC: utf8_characters
Returns the number of characters in the byte_len bytes from *ptr.
*/
static UINTVAL
utf8_characters(const utf8_t *ptr, UINTVAL byte_len)
{
    /* One past the last byte that belongs to the buffer. */
    const utf8_t * const limit  = ptr + byte_len;
    const utf8_t        *cursor = ptr;
    UINTVAL              count  = 0;

    /* Hop from sequence start to sequence start; each hop is one character. */
    for (; cursor < limit; cursor += UTF8SKIP(cursor)) {
        count++;
    }

    /* Landing beyond the limit means the final sequence claimed more bytes
     * than the buffer holds. */
    if (cursor > limit) {
        internal_exception(MALFORMED_UTF8, "Unaligned end in UTF-8 string\n");
    }

    return count;
}
/*
FUNCDOC: utf8_decode
Returns the integer for the UTF-8 character found at *ptr.
*/
static UINTVAL
utf8_decode(const utf8_t *ptr)
{
    const utf8_t *cursor = ptr;
    UINTVAL       value  = *cursor;

    if (UTF8_IS_START(value)) {
        /* Multi-byte sequence: the first byte tells how long it is. */
        const UINTVAL seq_len = UTF8SKIP(cursor);
        UINTVAL       pending;

        /* Keep only the payload bits of the first byte. */
        value &= UTF8_START_MASK(seq_len);

        /* Fold in the seq_len - 1 bytes that follow the first one. */
        for (pending = seq_len; pending > 1; pending--) {
            ++cursor;
            if (!UTF8_IS_CONTINUATION(*cursor)) {
                internal_exception(MALFORMED_UTF8, "Malformed UTF-8 string\n");
            }
            value = UTF8_ACCUMULATE(value, *cursor);
        }

        if (UNICODE_IS_SURROGATE(value)) {
            internal_exception(MALFORMED_UTF8, "Surrogate in UTF-8 string\n");
        }
    }
    else if (!UNICODE_IS_INVARIANT(value)) {
        /* Neither a start byte nor an invariant single byte. */
        internal_exception(MALFORMED_UTF8, "Malformed UTF-8 string\n");
    }

    return value;
}
/*
FUNCDOC: utf8_encode
Writes the UTF-8 encoding of integer c at ptr and returns a pointer just past the encoded bytes.
*/
static void *
utf8_encode(void *ptr, UINTVAL c)
{
    utf8_t * const first = (utf8_t *)ptr;
    const UINTVAL  width = UNISKIP(c);        /* bytes the sequence occupies */
    utf8_t        *tail  = first + width - 1; /* last byte of the sequence */

    /* Refuse values above 0x10FFFF and surrogates before writing anything. */
    if (c > 0x10FFFF || UNICODE_IS_SURROGATE(c)) {
        internal_exception(INVALID_CHARACTER,
                "Invalid character for UTF-8 encoding\n");
    }

    /* Emit the trailing bytes from last to first, consuming the low bits of
     * c on each step. */
    for (; tail > first; tail--) {
        *tail = (utf8_t)((c & UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_MARK);
        c >>= UTF8_ACCUMULATION_SHIFT;
    }

    /* tail now addresses the first byte; whatever is left of c goes there. */
    *tail = (utf8_t)((c & UTF8_START_MASK(width)) | UTF8_START_MARK(width));

    return first + width;
}
/*
FUNCDOC: utf8_skip_forward
Moves ptr n characters forward.
*/
static const void *
utf8_skip_forward(const void *ptr, UINTVAL n)
{
    const utf8_t *cursor = (const utf8_t *)ptr;
    UINTVAL       remaining;

    /* Each step jumps over one whole sequence, using the length looked up
     * from the byte under the cursor. */
    for (remaining = n; remaining > 0; remaining--) {
        cursor += UTF8SKIP(cursor);
    }

    return cursor;
}
/*
FUNCDOC: utf8_skip_backward
Moves ptr n characters back.
*/
static const void *
utf8_skip_backward(const void *ptr, UINTVAL n)
{
    const utf8_t *cursor = (const utf8_t *)ptr;
    UINTVAL       remaining;

    for (remaining = n; remaining > 0; remaining--) {
        /* Step back at least one byte, then keep stepping while the byte
         * under the cursor is a continuation byte.  No lower bound is
         * checked here. */
        do {
            cursor--;
        } while (UTF8_IS_CONTINUATION(*cursor));
    }

    return cursor;
}
/*
FUNCDOC: utf8_decode_and_advance
The UTF-8 implementation of the string iterator's get_and_advance
function.
FUNCDOC: utf8_encode_and_advance
The UTF-8 implementation of the string iterator's set_and_advance
function.
*/
static UINTVAL
utf8_decode_and_advance(Interp *interp, String_iter *i)
{
    /* Byte the iterator currently points at. */
    const utf8_t *cursor = (utf8_t *)((char *)i->str->strstart + i->bytepos);
    UINTVAL       value  = *cursor;

    if (UTF8_IS_START(value)) {
        const UINTVAL seq_len  = UTF8SKIP(cursor);
        UINTVAL       trailing = seq_len - 1;

        value &= UTF8_START_MASK(seq_len);

        /* The byte position is advanced before the trailing bytes are
         * validated. */
        i->bytepos += seq_len;

        while (trailing != 0) {
            ++cursor;
            if (!UTF8_IS_CONTINUATION(*cursor)) {
                internal_exception(MALFORMED_UTF8, "Malformed UTF-8 string\n");
            }
            value = UTF8_ACCUMULATE(value, *cursor);
            trailing--;
        }

        if (UNICODE_IS_SURROGATE(value)) {
            internal_exception(MALFORMED_UTF8, "Surrogate in UTF-8 string\n");
        }
    }
    else if (!UNICODE_IS_INVARIANT(value)) {
        internal_exception(MALFORMED_UTF8, "Malformed UTF-8 string\n");
    }
    else {
        /* Single-byte character. */
        i->bytepos++;
    }

    i->charpos++;
    return value;
}
static void
utf8_encode_and_advance(Interp *interp, String_iter *i, UINTVAL c)
{
    const STRING  * const str   = i->str;
    unsigned char * const begin = (unsigned char *)str->strstart + i->bytepos;
    /* utf8_encode writes the sequence at begin and returns the address just
     * past it. */
    unsigned char * const end   = (unsigned char *)utf8_encode(begin, c);

    i->bytepos += (end - begin);

    /* XXX possible buffer overrun exception? */
    assert(i->bytepos <= PObj_buflen(str));

    i->charpos++;
}
/*
FUNCDOC: utf8_set_position
The UTF-8 implementation of the string iterator's set_position
function.
*/
/*
 * Positions iterator i on character index pos and records the matching byte
 * offset in i->bytepos.
 *
 * When pos is at or beyond the iterator's current character position, the
 * scan resumes from the current byte position instead of from the start of
 * the string.  This matters for get_codepoints() and
 * get_codepoints_inplace(), which seek to offset and then to offset + count
 * on the same iterator.  A target behind the current position still rescans
 * from the beginning.
 *
 * Assumes i->charpos and i->bytepos describe the same location, as they do
 * after iter_init() or after any advance/seek made through this file's
 * iterator functions.
 */
static void
utf8_set_position(Interp *interp, String_iter *i, UINTVAL pos)
{
    const utf8_t *u8ptr     = (utf8_t *)i->str->strstart;
    UINTVAL       remaining = pos;

    if (i->charpos <= pos) {
        /* Resume from where the iterator already is. */
        remaining -= i->charpos;
        u8ptr     += i->bytepos;
    }

    i->charpos = pos;

    while (remaining-- > 0) {
        u8ptr += UTF8SKIP(u8ptr);
    }

    i->bytepos = (const char *)u8ptr - (const char *)i->str->strstart;
}
/*
 * Converts src to the UTF-8 encoding.
 *
 * dest == NULL requests an in-place conversion: src itself is re-tagged and
 * its buffer replaced, and src is returned.  Otherwise the converted bytes
 * are written to dest and dest is returned.
 *
 * If src is already UTF-8, src itself (in place) or a copy of src made with
 * string_copy() is returned, and dest is left untouched.
 *
 * For an empty src the result string is re-tagged and returned.  The
 * original returned dest here, which is NULL for an in-place conversion even
 * though every other path returns a usable string.
 */
static STRING *
to_encoding(Interp *interp, STRING *src, STRING *dest)
{
    STRING *result;
    String_iter src_iter;
    UINTVAL offs, c, dest_len, dest_pos, src_len;
    int in_place = dest == NULL;
    unsigned char *new_pos, *pos, *p;

    if (src->encoding == Parrot_utf8_encoding_ptr)
        return in_place ? src : string_copy(interp, src);

    src_len = src->strlen;
    if (in_place) {
        result = src;
    }
    else {
        result = dest;
    }

    /* init iter before possibly changing encoding: in place, result is src,
     * and the lines below overwrite src->encoding */
    ENCODING_ITER_INIT(interp, src, &src_iter);
    result->charset  = Parrot_unicode_charset_ptr;
    result->encoding = Parrot_utf8_encoding_ptr;
    result->strlen   = src_len;

    /* nothing to transcode; result is src in place, dest otherwise */
    if (!src->strlen)
        return result;

    if (in_place) {
        /* need intermediate memory */
        p = (unsigned char *)mem_sys_allocate(src_len);
    }
    else {
        Parrot_reallocate_string(interp, dest, src_len);
        p = (unsigned char *)dest->strstart;
    }

    if (src->charset == Parrot_ascii_charset_ptr) {
        /* ASCII charset: bytes are copied through unchanged */
        for (dest_len = 0; dest_len < src_len; ++dest_len) {
            p[dest_len] = ((unsigned char*)src->strstart)[dest_len];
        }
        result->bufused = dest_len;
    }
    else {
        dest_len = src_len;
        dest_pos = 0;
        for (offs = 0; offs < src_len; ++offs) {
            c = src_iter.get_and_advance(interp, &src_iter);

            /* keep at least 6 free bytes before encoding (6 is the largest
             * sequence length listed in Parrot_utf8skip); grow by 1.5 bytes
             * per remaining character, but never by fewer than 16 */
            if (dest_len - dest_pos < 6) {
                UINTVAL need = (UINTVAL)((src->strlen - offs) * 1.5);
                if (need < 16)
                    need = 16;
                dest_len += need;
                if (in_place)
                    p = (unsigned char *)mem_sys_realloc(p, dest_len);
                else {
                    /* record the bytes written so far before reallocating;
                     * presumably the reallocation preserves bufused bytes
                     * (TODO confirm) */
                    result->bufused = dest_pos;
                    Parrot_reallocate_string(interp, dest, dest_len);
                    p = (unsigned char *)dest->strstart;
                }
            }

            pos = p + dest_pos;
            new_pos = (unsigned char *)utf8_encode(pos, c);
            dest_pos += (new_pos - pos);
        }
        result->bufused = dest_pos;
    }

    if (in_place) {
        /* move the converted bytes from the scratch buffer into src */
        Parrot_reallocate_string(interp, src, src->bufused);
        memcpy(src->strstart, p, src->bufused);
        mem_sys_free(p);
    }
    return result;
}
/* Returns the code point of the character at character index offset in src. */
static UINTVAL
get_codepoint(Interp *interp, const STRING *src, UINTVAL offset)
{
    /* Walk offset characters from the start of the buffer, then decode the
     * sequence found there. */
    const void * const where = utf8_skip_forward(src->strstart, offset);

    return utf8_decode((const utf8_t *)where);
}
/*
 * Encodes codepoint at character index offset in src.  The new sequence is
 * written over whatever bytes are there; nothing is resized or shifted here.
 */
static void
set_codepoint(Interp *interp, STRING *src, UINTVAL offset, UINTVAL codepoint)
{
    const void *target;
    void *writable;
    DECL_CONST_CAST;

    target = utf8_skip_forward(src->strstart, offset);

    /* utf8_skip_forward hands back a const pointer; drop the qualifier so
     * the encoder can write through it. */
    writable = const_cast(target);
    utf8_encode(writable, codepoint);
}
/*
 * Returns the byte at byte index offset of src.  An offset at or beyond
 * src->bufused yields 0 without raising an exception.
 */
static UINTVAL
get_byte(Interp *interp, const STRING *src, UINTVAL offset)
{
    unsigned char * const raw = (unsigned char *)src->strstart;

    if (offset < src->bufused) {
        return raw[offset];
    }

    return 0;
}
/*
 * Stores the low byte of byte at byte index offset of src.  Raises an
 * internal exception when offset is at or beyond src->bufused.
 */
static void
set_byte(Interp *interp, const STRING *src, UINTVAL offset, UINTVAL byte)
{
    unsigned char *raw;

    if (offset >= src->bufused) {
        internal_exception(0, "set_byte past the end of the buffer");
    }

    raw = (unsigned char *)src->strstart;
    raw[offset] = (unsigned char)byte;
}
/*
 * Returns a new string header, obtained from Parrot_make_COW_reference(),
 * that covers count characters of src starting at character index offset.
 */
static STRING *
get_codepoints(Interp *interp, STRING *src, UINTVAL offset, UINTVAL count)
{
    STRING * const slice = Parrot_make_COW_reference(interp, src);
    String_iter    it;
    UINTVAL        first_byte;

    iter_init(interp, src, &it);

    /* Byte offset of the first requested character. */
    it.set_position(interp, &it, offset);
    first_byte = it.bytepos;
    slice->strstart = (char *)slice->strstart + first_byte;

    /* Byte offset just past the last requested character. */
    it.set_position(interp, &it, offset + count);
    slice->bufused = it.bytepos - first_byte;

    slice->strlen  = count;
    slice->hashval = 0;

    return slice;
}
/*
 * Returns a new string header, obtained from Parrot_make_COW_reference(),
 * that covers count bytes of src starting at byte index offset.  strlen is
 * set to the byte count, not to a character count.
 */
static STRING *
get_bytes(Interp *interp, STRING *src, UINTVAL offset, UINTVAL count)
{
    STRING * const slice = Parrot_make_COW_reference(interp, src);

    slice->encoding = src->encoding;    /* XXX */
    slice->charset  = src->charset;

    slice->strstart = (char *)slice->strstart + offset;
    slice->bufused  = count;
    slice->strlen   = count;
    slice->hashval  = 0;

    return slice;
}
/*
 * Same selection as get_codepoints(), but the caller-supplied header
 * return_string is reused through Parrot_reuse_COW_reference() instead of a
 * new one being created.  Returns return_string.
 */
static STRING *
get_codepoints_inplace(Interp *interp, STRING *src, UINTVAL offset,
        UINTVAL count, STRING *return_string)
{
    String_iter it;
    UINTVAL     first_byte;

    Parrot_reuse_COW_reference(interp, src, return_string);
    iter_init(interp, src, &it);

    /* Byte offset of the first requested character. */
    it.set_position(interp, &it, offset);
    first_byte = it.bytepos;
    return_string->strstart = (char *)return_string->strstart + first_byte;

    /* Byte offset just past the last requested character. */
    it.set_position(interp, &it, offset + count);
    return_string->bufused = it.bytepos - first_byte;

    return_string->strlen  = count;
    return_string->hashval = 0;

    return return_string;
}
/* Not implemented for UTF-8: raises the UNIMPL exception. */
static STRING *
get_bytes_inplace(Interp *interp, STRING *src, UINTVAL offset,
        UINTVAL count, STRING *return_string)
{
    UNIMPL;

    /* Keeps the function well-formed should the exception call return. */
    return NULL;
}
/* Not implemented for UTF-8: raises the UNIMPL exception. */
static void
set_codepoints(Interp *interp, STRING *src, UINTVAL offset,
        UINTVAL count, STRING *new_codepoints)
{
    UNIMPL;
}
/* Not implemented for UTF-8: raises the UNIMPL exception. */
static void
set_bytes(Interp *interp, STRING *src, UINTVAL offset,
        UINTVAL count, STRING *new_bytes)
{
    UNIMPL;
}
/*
 * Intended to make the string be in this encoding unconditionally, if that
 * is valid.  Not implemented for UTF-8: raises the UNIMPL exception.
 */
static void
become_encoding(Interp *interp, STRING *src)
{
    UNIMPL;
}
/*
 * Returns the number of characters in src.  This is what initially computes
 * src->strlen, so the stored length cannot be consulted: the whole buffer is
 * decoded from the first byte up to src->bufused.
 */
static UINTVAL
codepoints(Interp *interp, STRING *src)
{
    String_iter it;

    iter_init(interp, src, &it);

    for (;;) {
        if (it.bytepos >= src->bufused) {
            break;
        }
        it.get_and_advance(interp, &it);
    }

    return it.charpos;
}
/* Returns the number of bytes in use in src's buffer (src->bufused). */
static UINTVAL
bytes(Interp *interp, STRING *src)
{
    const UINTVAL used = src->bufused;

    return used;
}
/*
 * Prepares iter to walk src as UTF-8: the iterator starts at character 0 /
 * byte 0 and its three function slots are pointed at the UTF-8 routines
 * defined in this file.
 */
static void
iter_init(Interp *interp, const STRING *src, String_iter *iter)
{
    iter->str      = src;
    iter->charpos  = 0;
    iter->bytepos  = 0;

    iter->get_and_advance = utf8_decode_and_advance;
    iter->set_and_advance = utf8_encode_and_advance;
    iter->set_position    = utf8_set_position;
}
/*
 * Creates the UTF-8 encoding object, fills it from a static template that
 * lists this file's implementations, registers it under the name "utf8",
 * and returns it.
 */
ENCODING *
Parrot_encoding_utf8_init(Interp *interp)
{
    /* Template copied into every encoding object this function creates. */
    static const ENCODING utf8_template = {
        "utf8",
        4, /* Max bytes per codepoint 0 .. 0x10ffff */
        to_encoding,
        get_codepoint,
        set_codepoint,
        get_byte,
        set_byte,
        get_codepoints,
        get_codepoints_inplace,
        get_bytes,
        get_bytes_inplace,
        set_codepoints,
        set_bytes,
        become_encoding,
        codepoints,
        bytes,
        iter_init
    };
    ENCODING * const encoding = Parrot_new_encoding(interp);

    memcpy(encoding, &utf8_template, sizeof (ENCODING));
    Parrot_register_encoding(interp, "utf8", encoding);

    return encoding;
}
/*
See also: src/encodings/fixed_8.c, src/string.c, include/parrot/string.h, docs/string.pod.
*/
/* * Local variables: * c-file-style: "parrot" * End: * vim: expandtab shiftwidth=4: */
|