diff options
| author | Zac Pierson <zacpiersonhehe@gmail.com> | 2017-03-21 14:16:42 -0500 |
|---|---|---|
| committer | Zac Pierson <zacpiersonhehe@gmail.com> | 2017-03-21 14:16:42 -0500 |
| commit | c7bb861d3ca19f1a81043b8ad1e014ad10aa225f (patch) | |
| tree | 56719cc445ebe06b4d2e486e2da3c17c57e216c2 /src/utf8proc/utf8proc.c | |
| parent | d890731716ea96fd69d7543c7d7225702cbd7268 (diff) | |
| parent | 188b290dd50664aa8a89955ac2ab7dbebf7a653d (diff) | |
Merge https://github.com/gingerBill/Odin
"Fixed" a proc overload bug. Still needs a *real* fix.
Diffstat (limited to 'src/utf8proc/utf8proc.c')
| -rw-r--r-- | src/utf8proc/utf8proc.c | 110 |
1 files changed, 66 insertions, 44 deletions
diff --git a/src/utf8proc/utf8proc.c b/src/utf8proc/utf8proc.c index 7a6c984ac..c14bbe13f 100644 --- a/src/utf8proc/utf8proc.c +++ b/src/utf8proc/utf8proc.c @@ -166,24 +166,24 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, ut if (uc < 0x00) { return 0; } else if (uc < 0x80) { - dst[0] = uc; + dst[0] = (utf8proc_uint8_t) uc; return 1; } else if (uc < 0x800) { - dst[0] = 0xC0 + (uc >> 6); - dst[1] = 0x80 + (uc & 0x3F); + dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6)); + dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); return 2; // Note: we allow encoding 0xd800-0xdfff here, so as not to change // the API, however, these are actually invalid in UTF-8 } else if (uc < 0x10000) { - dst[0] = 0xE0 + (uc >> 12); - dst[1] = 0x80 + ((uc >> 6) & 0x3F); - dst[2] = 0x80 + (uc & 0x3F); + dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12)); + dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); + dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); return 3; } else if (uc < 0x110000) { - dst[0] = 0xF0 + (uc >> 18); - dst[1] = 0x80 + ((uc >> 12) & 0x3F); - dst[2] = 0x80 + ((uc >> 6) & 0x3F); - dst[3] = 0x80 + (uc & 0x3F); + dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18)); + dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F)); + dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); + dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); return 4; } else return 0; } @@ -193,28 +193,28 @@ static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t if (uc < 0x00) { return 0; } else if (uc < 0x80) { - dst[0] = uc; + dst[0] = (utf8proc_uint8_t)uc; return 1; } else if (uc < 0x800) { - dst[0] = 0xC0 + (uc >> 6); - dst[1] = 0x80 + (uc & 0x3F); + dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6)); + dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); return 2; } else if (uc == 0xFFFF) { - dst[0] = 0xFF; + dst[0] = (utf8proc_uint8_t)0xFF; return 1; } else if (uc == 0xFFFE) { - dst[0] = 0xFE; + dst[0] = (utf8proc_uint8_t)0xFE; return 1; } else if (uc < 0x10000) { - dst[0] = 0xE0 + (uc >> 12); - dst[1] = 0x80 + ((uc >> 6) & 0x3F); - dst[2] = 0x80 + (uc & 0x3F); + dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12)); + dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); + dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); return 3; } else if (uc < 0x110000) { - dst[0] = 0xF0 + (uc >> 18); - dst[1] = 0x80 + ((uc >> 12) & 0x3F); - dst[2] = 0x80 + ((uc >> 6) & 0x3F); - dst[3] = 0x80 + (uc & 0x3F); + dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18)); + dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F)); + dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); + dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); return 4; } else return 0; } @@ -383,7 +383,7 @@ UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) { } UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) { - return (utf8proc_category_t)utf8proc_get_property(c)->category; + return utf8proc_get_property(c)->category; } UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) { @@ -391,11 +391,9 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) { return s[utf8proc_category(c)]; } - - #define utf8proc_decompose_lump(replacement_uc) \ - return utf8proc_decompose_char((utf8proc_int32_t)(replacement_uc), dst, bufsize, \ - (utf8proc_option_t)((utf8proc_int32_t)options & ~UTF8PROC_LUMP), last_boundclass) + return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ + options & ~UTF8PROC_LUMP, last_boundclass) UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) { const utf8proc_property_t *property; @@ -458,12 +456,12 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, category == UTF8PROC_CATEGORY_ME) return 0; } if (options & UTF8PROC_CASEFOLD) { - if ((utf8proc_int16_t)property->casefold_seqindex != UINT16_MAX) { + if (property->casefold_seqindex != UINT16_MAX) { return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass); } } if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { - if ((utf8proc_int16_t)property->decomp_seqindex != UINT16_MAX && + if (property->decomp_seqindex != UINT16_MAX && (!property->decomp_type || (options & UTF8PROC_COMPAT))) { return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass); } @@ -486,6 +484,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options ) { + return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL); +} + +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom( + const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, + utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options, + utf8proc_custom_func custom_func, void *custom_data +) { /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */ utf8proc_ssize_t wpos = 0; if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE)) @@ -511,6 +517,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc); if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; } + if (custom_func != NULL) { + uc = custom_func(uc, custom_data); /* user-specified custom mapping */ + } decomp_result = utf8proc_decompose_char( uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options, &boundclass @@ -545,9 +554,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( return wpos; } -UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) { - /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored - ASSERT: 'buffer' has one spare byte of free space at the end! */ +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) { + /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */ if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) { utf8proc_ssize_t rpos; utf8proc_ssize_t wpos = 0; @@ -621,7 +629,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, starter_property = unsafe_get_property(*starter); } if (starter_property->comb_index < 0x8000 && - (utf8proc_int16_t)current_property->comb_index != UINT16_MAX && + current_property->comb_index != UINT16_MAX && current_property->comb_index >= 0x8000) { int sidx = starter_property->comb_index; int idx = (current_property->comb_index & 0x3FFF) - utf8proc_combinations[sidx]; @@ -655,6 +663,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, } length = wpos; } + return length; +} + +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) { + /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored + ASSERT: 'buffer' has one spare byte of free space at the end! */ + length = utf8proc_normalize_utf32(buffer, length, options); + if (length < 0) return length; { utf8proc_ssize_t rpos, wpos = 0; utf8proc_int32_t uc; @@ -677,14 +693,21 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options ) { + return utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL); +} + +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom( + const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options, + utf8proc_custom_func custom_func, void *custom_data +) { utf8proc_int32_t *buffer; utf8proc_ssize_t result; *dstptr = NULL; - result = utf8proc_decompose(str, strlen, NULL, 0, options); + result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data); if (result < 0) return result; buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1); if (!buffer) return UTF8PROC_ERROR_NOMEM; - result = utf8proc_decompose(str, strlen, buffer, result, options); + result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data); if (result < 0) { free(buffer); return result; @@ -705,29 +728,28 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) { utf8proc_uint8_t *retval; - utf8proc_map(str, 0, &retval, (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE | - UTF8PROC_DECOMPOSE)); + utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | + UTF8PROC_DECOMPOSE); return retval; } UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) { utf8proc_uint8_t *retval; - utf8proc_map(str, 0, &retval, (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE | - UTF8PROC_COMPOSE)); + utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | + UTF8PROC_COMPOSE); return retval; } UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) { utf8proc_uint8_t *retval; - utf8proc_map(str, 0, &retval, (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE | - UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT)); + utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | + UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT); return retval; } UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) { utf8proc_uint8_t *retval; - utf8proc_map(str, 0, &retval, (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE | - UTF8PROC_COMPOSE | UTF8PROC_COMPAT)); + utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | + UTF8PROC_COMPOSE | UTF8PROC_COMPAT); return retval; } - |