NAME ^

src/string_primitives.c - String Primitives

DESCRIPTION ^

This file collects together all the functions that call into the ICU API.

Functions ^

*/

/* HEADERIZER HFILE: include/parrot/string_primitives.h */

#include "parrot/parrot.h" #if PARROT_HAS_ICU # include <unicode/ucnv.h> # include <unicode/utypes.h> # include <unicode/uchar.h> # include <unicode/ustring.h> #else # include <ctype.h> #endif #include <assert.h>

/*

FUNCDOC: string_set_data_directory

Set the directory where ICU finds its data files (encodings, locales, etc.).

*/

PARROT_API void string_set_data_directory(PARROT_INTERP, const char *dir) { #if PARROT_HAS_ICU u_setDataDirectory(dir);

    /* Since u_setDataDirectory doesn't have a result code, we'll spot
       check that everything is okay by making sure that '9' had decimal
       value 9. Using 57 rather than '9' so that the encoding of this
       source code file isn't an issue.... (Don't want to get bitten by
       EBCDIC.) */

    if (!u_isdigit(57) || (u_charDigitValue(57) != 9)) {
            real_exception(interp, NULL, ICU_ERROR,
                "string_set_data_directory: ICU data files not found"
                "(apparently) for directory [%s]", dir);
    }
#else
    UNUSED(dir);

    real_exception(interp, NULL, ICU_ERROR,
        "string_set_data_directory: parrot compiled without ICU support");
#endif
}

/*

Creates a Parrot string from an "external" buffer, converting from any supported encoding into Parrot string's internal format.

*/

PARROT_API void string_fill_from_buffer(PARROT_INTERP, NOTNULL(const void *buffer), UINTVAL len, const char *encoding_name, NULLOK(STRING *s)) { #if PARROT_HAS_ICU UErrorCode icuError = U_ZERO_ERROR; UConverter *conv = NULL; UChar *target = NULL; UChar *target_limit = NULL; const char *source = NULL; const char *source_limit = NULL;

    assert(buffer);
    assert(encoding_name);

    if (s && !len) {
        /* XXX: I _guess_ this is always an empty string--is that right? */
        s->bufused = 0;
        s->strlen = 0;
        return;
    }

    /* big guess--allocate same space for string as buffer needed.
       may be able to make a more educated guess based on the encoding. */
    Parrot_allocate_string(interp, s, len);

    conv = ucnv_open(encoding_name, &icuError);

    if (!conv || icuError != U_ZERO_ERROR) {
        /* unknown encoding??? */
        real_exception(interp, NULL, ICU_ERROR,
                "string_fill_from_buffer: ICU error from ucnv_open()");

    }

    target = (UChar *)s->strstart;
    /* buflen may be larger than what we asked for,
     * so take advantage of the space
     */
    target_limit = (UChar *)((char *)PObj_bufstart(s) + PObj_buflen(s) - 1);
    source = buffer;
    source_limit = source + len;

    ucnv_toUnicode(conv, &target, target_limit, &source,
            source_limit, NULL, TRUE, &icuError);

    while (icuError == U_BUFFER_OVERFLOW_ERROR) {
        const size_t consumed_length = (char *)target - (char *)(s->strstart);

        /* double size, at least */
        Parrot_reallocate_string(interp, s, 2 * PObj_buflen(s));

        target = (UChar *)((char *)s->strstart + consumed_length);
        target_limit = (UChar *)((char *)PObj_bufstart(s) + PObj_buflen(s) - 1);

        icuError = U_ZERO_ERROR;
        ucnv_toUnicode(conv, &target, target_limit, &source,
                source_limit, NULL, TRUE, &icuError);
    }

    ucnv_close(conv);

    if (icuError != U_ZERO_ERROR) {
        /* handle error */
        real_exception(interp, NULL, ICU_ERROR,
                "string_fill_from_buffer: ICU error from ucnv_toUnicode()");
    }

    real_exception(interp, NULL, UNIMPLEMENTED, "Can't do unicode yet");

    /* temporary; need to promote to rep 4 if has non-BMP characters*/
    s->bufused = (char *)target - (char *)s->strstart;
    string_compute_strlen(interp, s);
#else
    UNUSED(interp);
    UNUSED(buffer);
    UNUSED(len);
    UNUSED(encoding_name);
    UNUSED(s);

    real_exception(interp, NULL, ICU_ERROR,
        "string_fill_from_buffer: parrot compiled without ICU support");
#endif
}

/* Unescape a single character. We assume that we're at the start of a sequence, right after the \ */ PARROT_API Parrot_UInt4 string_unescape_one(PARROT_INTERP, NOTNULL(UINTVAL *offset), STRING *string) { UINTVAL workchar = 0; UINTVAL charcount = 0; const UINTVAL len = string_length(interp, string); /* Well, not right now */ UINTVAL codepoint = CHARSET_GET_BYTE(interp, string, *offset); ++*offset; switch (codepoint) { case 'x': codepoint = CHARSET_GET_BYTE(interp, string, *offset); if (codepoint >= '0' && codepoint <= '9') { workchar = codepoint - '0'; } else if (codepoint >= 'a' && codepoint <= 'f') { workchar = codepoint - 'a' + 10; } else if (codepoint >= 'A' && codepoint <= 'F') { workchar = codepoint - 'A' + 10; } else if (codepoint == '{') { int i; ++*offset; workchar = 0; for (i = 0; i < 8 && *offset < len; ++i, ++*offset) { codepoint = CHARSET_GET_BYTE(interp, string, *offset); if (codepoint == '}') { ++*offset; return workchar; } workchar *= 16; if (codepoint >= '0' && codepoint <= '9') { workchar += codepoint - '0'; } else if (codepoint >= 'a' && codepoint <= 'f') { workchar += codepoint - 'a' + 10; } else if (codepoint >= 'A' && codepoint <= 'F') { workchar += codepoint - 'A' + 10; } else { real_exception(interp, NULL, UNIMPLEMENTED, "Illegal escape sequence inside {}"); } } if (*offset == len) real_exception(interp, NULL, UNIMPLEMENTED, "Illegal escape sequence no '}'"); } else { real_exception(interp, NULL, UNIMPLEMENTED, "Illegal escape sequence in"); } ++*offset; if (*offset < len) { workchar *= 16; codepoint = CHARSET_GET_BYTE(interp, string, *offset); if (codepoint >= '0' && codepoint <= '9') { workchar += codepoint - '0'; } else if (codepoint >= 'a' && codepoint <= 'f') { workchar += codepoint - 'a' + 10; } else if (codepoint >= 'A' && codepoint <= 'F') { workchar += codepoint - 'A' + 10; } else { return workchar; } } else { return workchar; } ++*offset; return workchar; case 'c': codepoint = CHARSET_GET_BYTE(interp, string, *offset); if (codepoint >= 'A' && codepoint <= 'Z') { workchar = codepoint - 'A' + 1; } else { real_exception(interp, NULL, UNIMPLEMENTED, "Illegal escape sequence"); } ++*offset; return workchar; case 'u': workchar = 0; for (charcount = 0; charcount < 4; charcount++) { if (*offset < len) { workchar *= 16; codepoint = CHARSET_GET_BYTE(interp, string, *offset); if (codepoint >= '0' && codepoint <= '9') { workchar += codepoint - '0'; } else if (codepoint >= 'a' && codepoint <= 'f') { workchar += codepoint - 'a' + 10; } else if (codepoint >= 'A' && codepoint <= 'F') { workchar += codepoint - 'A' + 10; } else { real_exception(interp, NULL, UNIMPLEMENTED, "Illegal escape sequence in uxxx escape"); } } else { real_exception(interp, NULL, UNIMPLEMENTED, "Illegal escape sequence in uxxx escape - too short"); } ++*offset; } return workchar; case 'U': workchar = 0; for (charcount = 0; charcount < 8; charcount++) { if (*offset < len) { workchar *= 16; codepoint = CHARSET_GET_BYTE(interp, string, *offset); if (codepoint >= '0' && codepoint <= '9') { workchar += codepoint - '0'; } else if (codepoint >= 'a' && codepoint <= 'f') { workchar += codepoint - 'a' + 10; } else if (codepoint >= 'A' && codepoint <= 'F') { workchar += codepoint - 'A' + 10; } else { real_exception(interp, NULL, UNIMPLEMENTED, "Illegal escape sequence in Uxxx escape"); } } else { real_exception(interp, NULL, UNIMPLEMENTED, "Illegal escape sequence in uxxx escape - too short"); } ++*offset; } return workchar; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': workchar = codepoint - '0'; if (*offset < len) { workchar *= 8; codepoint = CHARSET_GET_BYTE(interp, string, *offset); if (codepoint >= '0' && codepoint <= '7') { workchar += codepoint - '0'; } else { return workchar; } } else { return workchar; } ++*offset; if (*offset < len) { workchar *= 8; codepoint = CHARSET_GET_BYTE(interp, string, *offset); if (codepoint >= '0' && codepoint <= '7') { workchar += codepoint - '0'; } else { return workchar; } } else { return workchar; } ++*offset; return workchar; case 'a': return 7; /* bell */ case 'b': return 8; /* bs */ case 't': return 9; case 'n': return 10; case 'v': return 11; case 'f': return 12; case 'r': return 13; case 'e': return 27; case 92: /* \ */ return 92; case '"': return '"'; }

    return codepoint;  /* any not special return the char */
}

/*

Character Property Functions ^

FUNCDOC: Parrot_char_digit_value

Returns the decimal digit value of the specified character if it is a decimal digit character. If not, then -1 is returned.

Note that as currently written, Parrot_char_digit_value() can correctly return the decimal digit value of characters for which Parrot_char_is_digit() returns false.

*/

PARROT_API UINTVAL Parrot_char_digit_value(SHIM_INTERP, UINTVAL character) { #if PARROT_HAS_ICU return u_charDigitValue(character); #else if ((character >= 0x30) || (character <= 0x39)) return character - 0x30; return -1; #endif }

/*

SEE ALSO ^

include/parrot/string_primitives.h

include/parrot/string.h

src/string.c

*/

/* * Local variables: * c-file-style: "parrot" * End: * vim: expandtab shiftwidth=4: */


parrot