diff options
| author | Feoramund <161657516+Feoramund@users.noreply.github.com> | 2024-06-29 18:03:32 -0400 |
|---|---|---|
| committer | Feoramund <161657516+Feoramund@users.noreply.github.com> | 2024-06-29 18:42:56 -0400 |
| commit | 8b305a4c67b0ddab5c1133c6e0efce85c98c46c5 (patch) | |
| tree | d355d6cfb391b8c0ffc8256c6820bbcbfd1256bf /src/ucg/ucg.c | |
| parent | d4e2fa037797c026a1b7803f1514a87792af8888 (diff) | |
Add `UCG` library to Odin compiler
Diffstat (limited to 'src/ucg/ucg.c')
| -rw-r--r-- | src/ucg/ucg.c | 686 |
1 files changed, 686 insertions, 0 deletions
diff --git a/src/ucg/ucg.c b/src/ucg/ucg.c new file mode 100644 index 000000000..c3e270e1a --- /dev/null +++ b/src/ucg/ucg.c @@ -0,0 +1,686 @@ +/* + * SPDX-FileCopyrightText: (c) 2024 Feoramund + * SPDX-License-Identifier: BSD-3-Clause + */ + + +// +// NOTE(Feoramund): This is my UCG library, adapted for use within the Odin compiler. +// Most of the comments have been let alone and may not strictly apply anymore. +// +// 1. The UCG allocator interface was replaced by gbAllocator. +// 2. The UCG UTF-8 decoder was replaced with the one already in the compiler. +// 3. Non-essential code was stripped. +// 4. Some types were changed for compatibility. +// + + +/* This is the data that is allocated when an allocator is passed to + * ucg_decode_grapheme_clusters. */ +typedef struct { + i32 byte_index; + i32 rune_index; + i32 width; +} ucg_grapheme; + + +/* #include "ucg.h" */ +#include "ucg_tables.h" + +#define UCG_TABLE_LEN(t) (sizeof(t) / sizeof(int32_t)) + +#define ZERO_WIDTH_SPACE 0x200B +#define ZERO_WIDTH_NON_JOINER 0x200C +#define ZERO_WIDTH_JOINER 0x200D +#define WORD_JOINER 0x2060 + +int ucg_binary_search(int32_t value, const int32_t* table, int length, int stride) { + GB_ASSERT(table != NULL); + GB_ASSERT(length > 0); + GB_ASSERT(stride > 0); + + int n = length; + int t = 0; + for (/**/; n > 1; /**/) { + int m = n / 2; + int p = t + m * stride; + if (value >= table[p]) { + t = p; + n = n - m; + } else { + n = m; + } + } + if (n != 0 && value >= table[t]) { + return t; + } + return -1; +} + +// +// The procedures below are accurate as of Unicode 15.1.0. +// + +bool ucg_is_control(int32_t r) { + if (r <= 0x1F || (0x7F <= r && r <= 0x9F)) { + return true; + } + return false; +} + +// Emoji_Modifier +bool ucg_is_emoji_modifier(int32_t r) { + return 0x1F3FB <= r && r <= 0x1F3FF; +} + +// Regional_Indicator +bool ucg_is_regional_indicator(int32_t r) { + return 0x1F1E6 <= r && r <= 0x1F1FF; +} + +// General_Category=Enclosing_Mark +bool ucg_is_enclosing_mark(int32_t r) { + switch (r) { + case 0x0488: + case 0x0489: + case 0x1ABE: + return true; + } + + if (0x20DD <= r && r <= 0x20E0) { return true; } + if (0x20E2 <= r && r <= 0x20E4) { return true; } + if (0xA670 <= r && r <= 0xA672) { return true; } + + return false; +} + +// Prepended_Concatenation_Mark +bool ucg_is_prepended_concatenation_mark(int32_t r) { + switch (r) { + case 0x006DD: + case 0x0070F: + case 0x008E2: + case 0x110BD: + case 0x110CD: + return true; + } + + if (0x00600 <= r && r <= 0x00605) { return true; } + if (0x00890 <= r && r <= 0x00891) { return true; } + + return false; +} + +// General_Category=Spacing_Mark +bool ucg_is_spacing_mark(int32_t r) { + intptr_t p = ucg_binary_search(r, ucg_spacing_mark_ranges, UCG_TABLE_LEN(ucg_spacing_mark_ranges)/2, 2); + if (p >= 0 && ucg_spacing_mark_ranges[p] <= r && r <= ucg_spacing_mark_ranges[p+1]) { + return true; + } + return false; +} + +// General_Category=Nonspacing_Mark +bool ucg_is_nonspacing_mark(int32_t r) { + intptr_t p = ucg_binary_search(r, ucg_nonspacing_mark_ranges, UCG_TABLE_LEN(ucg_nonspacing_mark_ranges)/2, 2); + if (p >= 0 && ucg_nonspacing_mark_ranges[p] <= r && r <= ucg_nonspacing_mark_ranges[p+1]) { + return true; + } + return false; +} + +// Extended_Pictographic +bool ucg_is_emoji_extended_pictographic(int32_t r) { + intptr_t p = ucg_binary_search(r, ucg_emoji_extended_pictographic_ranges, UCG_TABLE_LEN(ucg_emoji_extended_pictographic_ranges)/2, 2); + if (p >= 0 && ucg_emoji_extended_pictographic_ranges[p] <= r && r <= ucg_emoji_extended_pictographic_ranges[p+1]) { + return true; + } + return false; +} + +// Grapheme_Extend +bool ucg_is_grapheme_extend(int32_t r) { + intptr_t p = ucg_binary_search(r, ucg_grapheme_extend_ranges, UCG_TABLE_LEN(ucg_grapheme_extend_ranges)/2, 2); + if (p >= 0 && ucg_grapheme_extend_ranges[p] <= r && r <= ucg_grapheme_extend_ranges[p+1]) { + return true; + } + return false; +} + + +// Hangul_Syllable_Type=Leading_Jamo +bool ucg_is_hangul_syllable_leading(int32_t r) { + return (0x1100 <= r && r <= 0x115F) || (0xA960 <= r && r <= 0xA97C); +} + +// Hangul_Syllable_Type=Vowel_Jamo +bool ucg_is_hangul_syllable_vowel(int32_t r) { + return (0x1160 <= r && r <= 0x11A7) || (0xD7B0 <= r && r <= 0xD7C6); +} + +// Hangul_Syllable_Type=Trailing_Jamo +bool ucg_is_hangul_syllable_trailing(int32_t r) { + return (0x11A8 <= r && r <= 0x11FF) || (0xD7CB <= r && r <= 0xD7FB); +} + +// Hangul_Syllable_Type=LV_Syllable +bool ucg_is_hangul_syllable_lv(int32_t r) { + intptr_t p = ucg_binary_search(r, ucg_hangul_syllable_lv_singlets, UCG_TABLE_LEN(ucg_hangul_syllable_lv_singlets), 1); + if (p >= 0 && r == ucg_hangul_syllable_lv_singlets[p]) { + return true; + } + return false; +} + +// Hangul_Syllable_Type=LVT_Syllable +bool ucg_is_hangul_syllable_lvt(int32_t r) { + intptr_t p = ucg_binary_search(r, ucg_hangul_syllable_lvt_ranges, UCG_TABLE_LEN(ucg_hangul_syllable_lvt_ranges)/2, 2); + if (p >= 0 && ucg_hangul_syllable_lvt_ranges[p] <= r && r <= ucg_hangul_syllable_lvt_ranges[p+1]) { + return true; + } + return false; +} + + +// Indic_Syllabic_Category=Consonant_Preceding_Repha +bool ucg_is_indic_consonant_preceding_repha(int32_t r) { + switch (r) { + case 0x00D4E: + case 0x11941: + case 0x11D46: + case 0x11F02: + return true; + } + return false; +} + +// Indic_Syllabic_Category=Consonant_Prefixed +bool ucg_is_indic_consonant_prefixed(int32_t r) { + switch (r) { + case 0x1193F: + case 0x11A3A: + return true; + } + + if (0x111C2 <= r && r <= 0x111C3) { return true; } + if (0x11A84 <= r && r <= 0x11A89) { return true; } + + return false; +} + +// Indic_Conjunct_Break=Linker +bool ucg_is_indic_conjunct_break_linker(int32_t r) { + switch (r) { + case 0x094D: + case 0x09CD: + case 0x0ACD: + case 0x0B4D: + case 0x0C4D: + case 0x0D4D: + return true; + } + return false; +} + +// Indic_Conjunct_Break=Consonant +bool ucg_is_indic_conjunct_break_consonant(int32_t r) { + intptr_t p = ucg_binary_search(r, ucg_indic_conjunct_break_consonant_ranges, UCG_TABLE_LEN(ucg_indic_conjunct_break_consonant_ranges)/2, 2); + if (p >= 0 && ucg_indic_conjunct_break_consonant_ranges[p] <= r && r <= ucg_indic_conjunct_break_consonant_ranges[p+1]) { + return true; + } + return false; +} + +// Indic_Conjunct_Break=Extend +bool ucg_is_indic_conjunct_break_extend(int32_t r) { + intptr_t p = ucg_binary_search(r, ucg_indic_conjunct_break_extend_ranges, UCG_TABLE_LEN(ucg_indic_conjunct_break_extend_ranges)/2, 2); + if (p >= 0 && ucg_indic_conjunct_break_extend_ranges[p] <= r && r <= ucg_indic_conjunct_break_extend_ranges[p+1]) { + return true; + } + return false; +} + + +/* +``` +Indic_Syllabic_Category = Consonant_Preceding_Repha, or +Indic_Syllabic_Category = Consonant_Prefixed, or +Prepended_Concatenation_Mark = Yes +``` +*/ +bool ucg_is_gcb_prepend_class(int32_t r) { + return ucg_is_indic_consonant_preceding_repha(r) || ucg_is_indic_consonant_prefixed(r) || ucg_is_prepended_concatenation_mark(r); +} + +/* +``` +Grapheme_Extend = Yes, or +Emoji_Modifier = Yes + +This includes: +General_Category = Nonspacing_Mark +General_Category = Enclosing_Mark +U+200C ZERO WIDTH NON-JOINER + +plus a few General_Category = Spacing_Mark needed for canonical equivalence. +``` +*/ +bool ucg_is_gcb_extend_class(int32_t r) { + return ucg_is_grapheme_extend(r) || ucg_is_emoji_modifier(r); +} + +// Return values: +// +// - 2 if East_Asian_Width=F or W, or +// - 0 if non-printable / zero-width, or +// - 1 in all other cases. +// +int ucg_normalized_east_asian_width(int32_t r) { + if (ucg_is_control(r)) { + return 0; + } else if (r <= 0x10FF) { + // Easy early out for low runes. + return 1; + } + + switch (r) { + // This is a different interpretation of the BOM which occurs in the middle of text. + case 0xFEFF: /* ZERO_WIDTH_NO_BREAK_SPACE */ + case ZERO_WIDTH_SPACE: + case ZERO_WIDTH_NON_JOINER: + case ZERO_WIDTH_JOINER: + case WORD_JOINER: + return 0; + } + + intptr_t p = ucg_binary_search(r, ucg_normalized_east_asian_width_ranges, UCG_TABLE_LEN(ucg_normalized_east_asian_width_ranges)/3, 3); + if (p >= 0 && ucg_normalized_east_asian_width_ranges[p] <= r && r <= ucg_normalized_east_asian_width_ranges[p+1]) { + return (int)ucg_normalized_east_asian_width_ranges[p+2]; + } + return 1; +} + +// +// End of Unicode 15.1.0 block. +// + +enum grapheme_cluster_sequence { + None, + Indic, + Emoji, + Regional, +}; + +typedef struct { + ucg_grapheme* graphemes; + i32 rune_count; + i32 grapheme_count; + i32 width; + + int32_t last_rune; + bool last_rune_breaks_forward; + + i32 last_width; + i32 last_grapheme_count; + + bool bypass_next_rune; + + int regional_indicator_counter; + + enum grapheme_cluster_sequence current_sequence; + bool continue_sequence; +} ucg_decoder_state; + + +void _ucg_decode_grapheme_clusters_deferred_step( + gbAllocator allocator, + ucg_decoder_state* state, + i32 byte_index, + int32_t this_rune +) { + // "Break at the start and end of text, unless the text is empty." + // + // GB1: sot ÷ Any + // GB2: Any ÷ eot + if (state->rune_count == 0 && state->grapheme_count == 0) { + state->grapheme_count += 1; + } + + if (state->grapheme_count > state->last_grapheme_count) { + state->width += ucg_normalized_east_asian_width(this_rune); + + /* if (allocator != NULL) { */ + state->graphemes = (ucg_grapheme*)gb_resize(allocator, + state->graphemes, + sizeof(ucg_grapheme) * (state->grapheme_count), + sizeof(ucg_grapheme) * (1 + state->grapheme_count)); + + ucg_grapheme append = { + byte_index, + state->rune_count, + state->width - state->last_width, + }; + + state->graphemes[state->grapheme_count - 1] = append; + /* } */ + + state->last_grapheme_count = state->grapheme_count; + state->last_width = state->width; + } + + state->last_rune = this_rune; + state->rune_count += 1; + + if (!state->continue_sequence) { + state->current_sequence = None; + state->regional_indicator_counter = 0; + } + state->continue_sequence = false; +} + +int ucg_decode_grapheme_clusters( + gbAllocator allocator, + const uint8_t* str, + int str_len, + + ucg_grapheme** out_graphemes, + i32* out_rune_count, + i32* out_grapheme_count, + i32* out_width +) { + // The following procedure implements text segmentation by breaking on + // Grapheme Cluster Boundaries[1], using the values[2] and rules[3] from + // the Unicode® Standard Annex #29, entitled: + // + // UNICODE TEXT SEGMENTATION + // + // Version: Unicode 15.1.0 + // Date: 2023-08-16 + // Revision: 43 + // + // This procedure is conformant[4] to UAX29-C1-1, otherwise known as the + // extended, non-legacy ruleset. + // + // Please see the references for more information. + // + // + // [1]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries + // [2]: https://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table + // [3]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules + // [4]: https://www.unicode.org/reports/tr29/#Conformance + + // Additionally, this procedure takes into account Standard Annex #11, + // in order to estimate how visually wide the string will appear on a + // monospaced display. This can only ever be a rough guess, as this tends + // to be an implementation detail relating to which fonts are being used, + // how codepoints are interpreted and drawn, if codepoint sequences are + // interpreted correctly, and et cetera. + // + // For example, a program may not properly interpret an emoji modifier + // sequence and print the component glyphs instead of one whole glyph. + // + // See here for more information: https://www.unicode.org/reports/tr11/ + // + // NOTE: There is no explicit mention of what to do with zero-width spaces + // as far as grapheme cluster segmentation goes, therefore this + // implementation may count and return graphemes with a `width` of zero. + // + // Treat them as any other space. + + ucg_decoder_state state = {0}; + +#define UCG_DEFERRED_DECODE_STEP() (_ucg_decode_grapheme_clusters_deferred_step(allocator, &state, byte_index, this_rune)) + + for (i32 byte_index = 0, bytes_advanced = 0; byte_index < str_len; byte_index += bytes_advanced) { + int32_t this_rune = GB_RUNE_INVALID; + bytes_advanced = (i32)(utf8_decode(str+byte_index, str_len-byte_index, &this_rune)); + if (this_rune == GB_RUNE_INVALID || bytes_advanced == 0) { + // There was a Unicode parsing error; bail out. + if (out_graphemes != NULL) { *out_graphemes = state.graphemes; } + if (out_rune_count != NULL) { *out_rune_count = state.rune_count; } + if (out_grapheme_count != NULL) { *out_grapheme_count = state.grapheme_count; } + if (out_width != NULL) { *out_width = state.width; } + + // Return an error code. + return -1; + } + + // "Do not break between a CR and LF. Otherwise, break before and after controls." + // + // GB3: CR × LF + // GB4: (Control | CR | LF) ÷ + // GB5: ÷ (Control | CR | LF) + if (this_rune == '\n' && state.last_rune == '\r') { + state.last_rune_breaks_forward = false; + state.bypass_next_rune = false; + UCG_DEFERRED_DECODE_STEP(); continue; + } + + if (ucg_is_control(this_rune)) { + state.grapheme_count += 1; + state.last_rune_breaks_forward = true; + state.bypass_next_rune = true; + UCG_DEFERRED_DECODE_STEP(); continue; + } + + // (This check is for rules that work forwards, instead of backwards.) + if (state.bypass_next_rune) { + if (state.last_rune_breaks_forward) { + state.grapheme_count += 1; + state.last_rune_breaks_forward = false; + } + + state.bypass_next_rune = false; + UCG_DEFERRED_DECODE_STEP(); continue; + } + + // (Optimization 1: Prevent low runes from proceeding further.) + // + // * 0xA9 and 0xAE are in the Extended_Pictographic range, + // which is checked later in GB11. + if (this_rune != 0xA9 && this_rune != 0xAE && this_rune <= 0x2FF) { + state.grapheme_count += 1; + UCG_DEFERRED_DECODE_STEP(); continue; + } + + // (Optimization 2: Check if the rune is in the Hangul space before getting specific.) + if (0x1100 <= this_rune && this_rune <= 0xD7FB) { + // "Do not break Hangul syllable sequences." + // + // GB6: L × (L | V | LV | LVT) + // GB7: (LV | V) × (V | T) + // GB8: (LVT | T) × T + if (ucg_is_hangul_syllable_leading(this_rune) || + ucg_is_hangul_syllable_lv(this_rune) || + ucg_is_hangul_syllable_lvt(this_rune)) + { + if (!ucg_is_hangul_syllable_leading(state.last_rune)) { + state.grapheme_count += 1; + } + UCG_DEFERRED_DECODE_STEP(); continue; + } + + if (ucg_is_hangul_syllable_vowel(this_rune)) { + if (ucg_is_hangul_syllable_leading(state.last_rune) || + ucg_is_hangul_syllable_vowel(state.last_rune) || + ucg_is_hangul_syllable_lv(state.last_rune)) + { + UCG_DEFERRED_DECODE_STEP(); continue; + } + state.grapheme_count += 1; + UCG_DEFERRED_DECODE_STEP(); continue; + } + + if (ucg_is_hangul_syllable_trailing(this_rune)) { + if (ucg_is_hangul_syllable_trailing(state.last_rune) || + ucg_is_hangul_syllable_lvt(state.last_rune) || + ucg_is_hangul_syllable_lv(state.last_rune) || + ucg_is_hangul_syllable_vowel(state.last_rune)) + { + UCG_DEFERRED_DECODE_STEP(); continue; + } + state.grapheme_count += 1; + UCG_DEFERRED_DECODE_STEP(); continue; + } + } + + // "Do not break before extending characters or ZWJ." + // + // GB9: × (Extend | ZWJ) + if (this_rune == ZERO_WIDTH_JOINER) { + state.continue_sequence = true; + UCG_DEFERRED_DECODE_STEP(); continue; + } + + if (ucg_is_gcb_extend_class(this_rune)) { + // (Support for GB9c.) + if (state.current_sequence == Indic) { + if (ucg_is_indic_conjunct_break_extend(this_rune) && ( + ucg_is_indic_conjunct_break_linker(state.last_rune) || + ucg_is_indic_conjunct_break_consonant(state.last_rune) )) + { + state.continue_sequence = true; + UCG_DEFERRED_DECODE_STEP(); continue; + } + + if (ucg_is_indic_conjunct_break_linker(this_rune) && ( + ucg_is_indic_conjunct_break_linker(state.last_rune) || + ucg_is_indic_conjunct_break_extend(state.last_rune) || + ucg_is_indic_conjunct_break_consonant(state.last_rune) )) + { + state.continue_sequence = true; + UCG_DEFERRED_DECODE_STEP(); continue; + } + + UCG_DEFERRED_DECODE_STEP(); continue; + } + + // (Support for GB11.) + if (state.current_sequence == Emoji && ( + ucg_is_gcb_extend_class(state.last_rune) || + ucg_is_emoji_extended_pictographic(state.last_rune) )) + { + state.continue_sequence = true; + } + + UCG_DEFERRED_DECODE_STEP(); continue; + } + + // _The GB9a and GB9b rules only apply to extended grapheme clusters:_ + // "Do not break before SpacingMarks, or after Prepend characters." + // + // GB9a: × SpacingMark + // GB9b: Prepend × + if (ucg_is_spacing_mark(this_rune)) { + UCG_DEFERRED_DECODE_STEP(); continue; + } + + if (ucg_is_gcb_prepend_class(this_rune)) { + state.grapheme_count += 1; + state.bypass_next_rune = true; + UCG_DEFERRED_DECODE_STEP(); continue; + } + + // _The GB9c rule only applies to extended grapheme clusters:_ + // "Do not break within certain combinations with Indic_Conjunct_Break (InCB)=Linker." + // + // GB9c: \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]* × \p{InCB=Consonant} + if (ucg_is_indic_conjunct_break_consonant(this_rune)) { + if (state.current_sequence == Indic) { + if (state.last_rune == ZERO_WIDTH_JOINER || + ucg_is_indic_conjunct_break_linker(state.last_rune)) + { + state.continue_sequence = true; + } else { + state.grapheme_count += 1; + } + } else { + state.grapheme_count += 1; + state.current_sequence = Indic; + state.continue_sequence = true; + } + UCG_DEFERRED_DECODE_STEP(); continue; + } + + if (ucg_is_indic_conjunct_break_extend(this_rune)) { + if (state.current_sequence == Indic) { + if (ucg_is_indic_conjunct_break_consonant(state.last_rune) || + ucg_is_indic_conjunct_break_linker(state.last_rune)) + { + state.continue_sequence = true; + } else { + state.grapheme_count += 1; + } + } + UCG_DEFERRED_DECODE_STEP(); continue; + } + + if (ucg_is_indic_conjunct_break_linker(this_rune)) { + if (state.current_sequence == Indic) { + if (ucg_is_indic_conjunct_break_extend(state.last_rune) || + ucg_is_indic_conjunct_break_linker(state.last_rune)) + { + state.continue_sequence = true; + } else { + state.grapheme_count += 1; + } + } + UCG_DEFERRED_DECODE_STEP(); continue; + } + + // + // (Curiously, there is no GB10.) + // + + // "Do not break within emoji modifier sequences or emoji zwj sequences." + // + // GB11: \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic} + if (ucg_is_emoji_extended_pictographic(this_rune)) { + if (state.current_sequence != Emoji || state.last_rune != ZERO_WIDTH_JOINER) { + state.grapheme_count += 1; + } + state.current_sequence = Emoji; + state.continue_sequence = true; + UCG_DEFERRED_DECODE_STEP(); continue; + } + + // "Do not break within emoji flag sequences. + // That is, do not break between regional indicator (RI) symbols + // if there is an odd number of RI characters before the break point." + // + // GB12: sot (RI RI)* RI × RI + // GB13: [^RI] (RI RI)* RI × RI + if (ucg_is_regional_indicator(this_rune)) { + if ((state.regional_indicator_counter & 1) == 0) { + state.grapheme_count += 1; + } + + state.current_sequence = Regional; + state.continue_sequence = true; + state.regional_indicator_counter += 1; + + UCG_DEFERRED_DECODE_STEP(); continue; + } + + // "Otherwise, break everywhere." + // + // GB999: Any ÷ Any + state.grapheme_count += 1; + UCG_DEFERRED_DECODE_STEP(); + } + +#undef UCG_DEFERRED_DECODE_STEP + + if (out_graphemes != NULL) { *out_graphemes = state.graphemes; } + if (out_rune_count != NULL) { *out_rune_count = state.rune_count; } + if (out_grapheme_count != NULL) { *out_grapheme_count = state.grapheme_count; } + if (out_width != NULL) { *out_width = state.width; } + + return 0; +} + +#undef UCG_TABLE_LEN +#undef ZERO_WIDTH_SPACE +#undef ZERO_WIDTH_NON_JOINER +#undef ZERO_WIDTH_JOINER +#undef WORD_JOINER |