diff options
| author | gingerBill <gingerBill@users.noreply.github.com> | 2025-10-13 15:28:23 +0100 |
|---|---|---|
| committer | gingerBill <gingerBill@users.noreply.github.com> | 2025-10-13 15:28:23 +0100 |
| commit | 2bc7344f27cd8480e97af7677e066c6af9ad097a (patch) | |
| tree | 3befd11237bda910976d5e87d735ba7b40d55e2c /core/unicode | |
| parent | 0d7efeab5f2ebd8716912a6731ee7d5d6bae89b6 (diff) | |
Add `Grapheme_Iterator`
Diffstat (limited to 'core/unicode')
| -rw-r--r-- | core/unicode/utf8/grapheme.odin | 320 |
1 files changed, 148 insertions, 172 deletions
diff --git a/core/unicode/utf8/grapheme.odin b/core/unicode/utf8/grapheme.odin index 50d1789ab..b8e51688c 100644 --- a/core/unicode/utf8/grapheme.odin +++ b/core/unicode/utf8/grapheme.odin @@ -23,9 +23,40 @@ normalized_east_asian_width :: unicode.normalized_east_asian_width Grapheme :: struct { byte_index: int, rune_index: int, - width: int, + width: int, } + +Grapheme_Cluster_Sequence :: enum { + None, + Indic, + Emoji, + Regional, +} + +Grapheme_Iterator :: struct { + str: string, + curr_offset: int, + + grapheme_count: int, // The number of graphemes in the string + rune_count: int, // The number of runes in the string + width: int, // The widrth of the string in number of monospace cells + + last_rune: rune, + last_rune_breaks_forward: bool, + + last_width: int, + last_grapheme_count: int, + + bypass_next_rune: bool, + + regional_indicator_counter: int, + + current_sequence: Grapheme_Cluster_Sequence, + continue_sequence: bool, +} + + /* Count the individual graphemes in a UTF-8 string. @@ -39,7 +70,9 @@ Returns: */ @(require_results) grapheme_count :: proc(str: string) -> (graphemes, runes, width: int) { - _, graphemes, runes, width = decode_grapheme_clusters(str, false) + it := decode_grapheme_iterator_make(str) + for _, _ in decode_grapheme_iterate(&it) {/**/} + graphemes, runes, width = it.grapheme_count, it.rune_count, it.width return } @@ -70,155 +103,98 @@ decode_grapheme_clusters :: proc( rune_count: int, width: int, ) { - // The following procedure implements text segmentation by breaking on - // Grapheme Cluster Boundaries[1], using the values[2] and rules[3] from - // the Unicode® Standard Annex #29, entitled: - // - // UNICODE TEXT SEGMENTATION - // - // Version: Unicode 15.1.0 - // Date: 2023-08-16 - // Revision: 43 - // - // This procedure is conformant[4] to UAX29-C1-1, otherwise known as the - // extended, non-legacy ruleset. - // - // Please see the references below for more information. - // - // - // NOTE(Feoramund): This procedure has not been highly optimized. - // A couple opportunities were taken to bypass repeated checking when a - // rune is outside of certain codepoint ranges, but little else has been - // done. Standard switches, conditionals, and binary search are used to - // see if a rune fits into a certain category. - // - // I did find that only one prior rune of state was necessary to build an - // algorithm that successfully passes all 4,835 test cases provided with - // this implementation from the Unicode organization's website. - // - // My initial implementation tracked explicit breaks and counted them once - // the string iteration had terminated. I've found this current - // implementation to be far simpler and need no allocations (unless the - // caller wants position data). - // - // Most rules work backwards instead of forwards which has helped keep this - // simple, despite its length and verbosity. - // - // - // The implementation has been left verbose and in the order described by - // the specification, to enable better readability and future upkeep. - // - // Some possible optimizations might include: - // - // - saving the type of `last_rune` instead of the exact rune. - // - reordering rules. - // - combining tables. - // - // - // [1]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries - // [2]: https://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table - // [3]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules - // [4]: https://www.unicode.org/reports/tr29/#Conformance - - // Additionally, this procedure now takes into account Standard Annex #11, - // in order to estimate how visually wide the string will appear on a - // monospaced display. This can only ever be a rough guess, as this tends - // to be an implementation detail relating to which fonts are being used, - // how codepoints are interpreted and drawn, if codepoint sequences are - // interpreted correctly, and et cetera. - // - // For example, a program may not properly interpret an emoji modifier - // sequence and print the component glyphs instead of one whole glyph. - // - // See here for more information: https://www.unicode.org/reports/tr11/ - // - // NOTE: There is no explicit mention of what to do with zero-width spaces - // as far as grapheme cluster segmentation goes, therefore this - // implementation may count and return graphemes with a `width` of zero. - // - // Treat them as any other space. - - Grapheme_Cluster_Sequence :: enum { - None, - Indic, - Emoji, - Regional, - } - context.allocator = allocator - last_rune: rune - last_rune_breaks_forward: bool + it := decode_grapheme_iterator_make(str) + for _, grapheme in decode_grapheme_iterate(&it) { + append(&graphemes, grapheme) + } - last_width: int - last_grapheme_count: int + grapheme_count = it.grapheme_count + rune_count = it.rune_count + width = it.width + return +} - bypass_next_rune: bool +@(require_results) +decode_grapheme_iterator_make :: proc(str: string) -> (it: Grapheme_Iterator) { + it.str = str + return +} - regional_indicator_counter: int +@(require_results) +decode_grapheme_iterate :: proc(it: ^Grapheme_Iterator) -> (text: string, grapheme: Grapheme, ok: bool) { + for it.curr_offset < len(it.str) { + if ok { + return + } - current_sequence: Grapheme_Cluster_Sequence - continue_sequence: bool + str := it.str[it.curr_offset:] + this_rune, this_rune_width := decode_rune(str) + byte_index := it.curr_offset + it.curr_offset += this_rune_width - for this_rune, byte_index in str { defer { // "Break at the start and end of text, unless the text is empty." // // GB1: sot ÷ Any // GB2: Any ÷ eot - if rune_count == 0 && grapheme_count == 0 { - grapheme_count += 1 + if it.rune_count == 0 && it.grapheme_count == 0 { + it.grapheme_count += 1 } - if grapheme_count > last_grapheme_count { - width += normalized_east_asian_width(this_rune) - if track_graphemes { - append(&graphemes, Grapheme{ - byte_index, - rune_count, - width - last_width, - }) + if it.grapheme_count > it.last_grapheme_count { + it.width += normalized_east_asian_width(this_rune) + grapheme = Grapheme{ + byte_index, + it.rune_count, + it.width - it.last_width, } - last_grapheme_count = grapheme_count - last_width = width + text = it.str[byte_index:][:grapheme.width] + ok = true + + + it.last_grapheme_count = it.grapheme_count + it.last_width = it.width } - last_rune = this_rune - rune_count += 1 + it.last_rune = this_rune + it.rune_count += 1 - if !continue_sequence { - current_sequence = .None - regional_indicator_counter = 0 + if !it.continue_sequence { + it.current_sequence = .None + it.regional_indicator_counter = 0 } - continue_sequence = false + it.continue_sequence = false } + // "Do not break between a CR and LF. Otherwise, break before and after controls." // // GB3: CR × LF // GB4: (Control | CR | LF) ÷ // GB5: ÷ (Control | CR | LF) - if this_rune == '\n' && last_rune == '\r' { - last_rune_breaks_forward = false - bypass_next_rune = false + if this_rune == '\n' && it.last_rune == '\r' { + it.last_rune_breaks_forward = false + it.bypass_next_rune = false continue } if is_control(this_rune) { - grapheme_count += 1 - last_rune_breaks_forward = true - bypass_next_rune = true + it.grapheme_count += 1 + it.last_rune_breaks_forward = true + it.bypass_next_rune = true continue } // (This check is for rules that work forwards, instead of backwards.) - if bypass_next_rune { - if last_rune_breaks_forward { - grapheme_count += 1 - last_rune_breaks_forward = false + if it.bypass_next_rune { + if it.last_rune_breaks_forward { + it.grapheme_count += 1 + it.last_rune_breaks_forward = false } - bypass_next_rune = false + it.bypass_next_rune = false continue } @@ -227,7 +203,7 @@ decode_grapheme_clusters :: proc( // * 0xA9 and 0xAE are in the Extended_Pictographic range, // which is checked later in GB11. if this_rune != 0xA9 && this_rune != 0xAE && this_rune <= 0x2FF { - grapheme_count += 1 + it.grapheme_count += 1 continue } @@ -241,30 +217,30 @@ decode_grapheme_clusters :: proc( if is_hangul_syllable_leading(this_rune) || is_hangul_syllable_lv(this_rune) || is_hangul_syllable_lvt(this_rune) { - if !is_hangul_syllable_leading(last_rune) { - grapheme_count += 1 + if !is_hangul_syllable_leading(it.last_rune) { + it.grapheme_count += 1 } continue } if is_hangul_syllable_vowel(this_rune) { - if is_hangul_syllable_leading(last_rune) || - is_hangul_syllable_vowel(last_rune) || - is_hangul_syllable_lv(last_rune) { + if is_hangul_syllable_leading(it.last_rune) || + is_hangul_syllable_vowel(it.last_rune) || + is_hangul_syllable_lv(it.last_rune) { continue } - grapheme_count += 1 + it.grapheme_count += 1 continue } if is_hangul_syllable_trailing(this_rune) { - if is_hangul_syllable_trailing(last_rune) || - is_hangul_syllable_lvt(last_rune) || - is_hangul_syllable_lv(last_rune) || - is_hangul_syllable_vowel(last_rune) { + if is_hangul_syllable_trailing(it.last_rune) || + is_hangul_syllable_lvt(it.last_rune) || + is_hangul_syllable_lv(it.last_rune) || + is_hangul_syllable_vowel(it.last_rune) { continue } - grapheme_count += 1 + it.grapheme_count += 1 continue } } @@ -273,25 +249,25 @@ decode_grapheme_clusters :: proc( // // GB9: × (Extend | ZWJ) if this_rune == ZERO_WIDTH_JOINER { - continue_sequence = true + it.continue_sequence = true continue } if is_gcb_extend_class(this_rune) { // (Support for GB9c.) - if current_sequence == .Indic { + if it.current_sequence == .Indic { if is_indic_conjunct_break_extend(this_rune) && ( - is_indic_conjunct_break_linker(last_rune) || - is_indic_conjunct_break_consonant(last_rune) ) { - continue_sequence = true + is_indic_conjunct_break_linker(it.last_rune) || + is_indic_conjunct_break_consonant(it.last_rune) ) { + it.continue_sequence = true continue } - if is_indic_conjunct_break_linker(this_rune) && ( - is_indic_conjunct_break_linker(last_rune) || - is_indic_conjunct_break_extend(last_rune) || - is_indic_conjunct_break_consonant(last_rune) ) { - continue_sequence = true + if is_indic_conjunct_break_linker(this_rune) && ( + is_indic_conjunct_break_linker(it.last_rune) || + is_indic_conjunct_break_extend(it.last_rune) || + is_indic_conjunct_break_consonant(it.last_rune) ) { + it.continue_sequence = true continue } @@ -299,10 +275,10 @@ decode_grapheme_clusters :: proc( } // (Support for GB11.) - if current_sequence == .Emoji && ( - is_gcb_extend_class(last_rune) || - is_emoji_extended_pictographic(last_rune) ) { - continue_sequence = true + if it.current_sequence == .Emoji && ( + is_gcb_extend_class(it.last_rune) || + is_emoji_extended_pictographic(it.last_rune) ) { + it.continue_sequence = true } continue @@ -318,8 +294,8 @@ decode_grapheme_clusters :: proc( } if is_gcb_prepend_class(this_rune) { - grapheme_count += 1 - bypass_next_rune = true + it.grapheme_count += 1 + it.bypass_next_rune = true continue } @@ -328,40 +304,40 @@ decode_grapheme_clusters :: proc( // // GB9c: \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]* × \p{InCB=Consonant} if is_indic_conjunct_break_consonant(this_rune) { - if current_sequence == .Indic { - if last_rune == ZERO_WIDTH_JOINER || - is_indic_conjunct_break_linker(last_rune) { - continue_sequence = true + if it.current_sequence == .Indic { + if it.last_rune == ZERO_WIDTH_JOINER || + is_indic_conjunct_break_linker(it.last_rune) { + it.continue_sequence = true } else { - grapheme_count += 1 + it.grapheme_count += 1 } } else { - grapheme_count += 1 - current_sequence = .Indic - continue_sequence = true + it.grapheme_count += 1 + it.current_sequence = .Indic + it.continue_sequence = true } continue } if is_indic_conjunct_break_extend(this_rune) { - if current_sequence == .Indic { - if is_indic_conjunct_break_consonant(last_rune) || - is_indic_conjunct_break_linker(last_rune) { - continue_sequence = true + if it.current_sequence == .Indic { + if is_indic_conjunct_break_consonant(it.last_rune) || + is_indic_conjunct_break_linker(it.last_rune) { + it.continue_sequence = true } else { - grapheme_count += 1 + it.grapheme_count += 1 } } continue } if is_indic_conjunct_break_linker(this_rune) { - if current_sequence == .Indic { - if is_indic_conjunct_break_extend(last_rune) || - is_indic_conjunct_break_linker(last_rune) { - continue_sequence = true + if it.current_sequence == .Indic { + if is_indic_conjunct_break_extend(it.last_rune) || + is_indic_conjunct_break_linker(it.last_rune) { + it.continue_sequence = true } else { - grapheme_count += 1 + it.grapheme_count += 1 } } continue @@ -375,11 +351,11 @@ decode_grapheme_clusters :: proc( // // GB11: \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic} if is_emoji_extended_pictographic(this_rune) { - if current_sequence != .Emoji || last_rune != ZERO_WIDTH_JOINER { - grapheme_count += 1 + if it.current_sequence != .Emoji || it.last_rune != ZERO_WIDTH_JOINER { + it.grapheme_count += 1 } - current_sequence = .Emoji - continue_sequence = true + it.current_sequence = .Emoji + it.continue_sequence = true continue } @@ -390,13 +366,13 @@ decode_grapheme_clusters :: proc( // GB12: sot (RI RI)* RI × RI // GB13: [^RI] (RI RI)* RI × RI if is_regional_indicator(this_rune) { - if regional_indicator_counter & 1 == 0 { - grapheme_count += 1 + if it.regional_indicator_counter & 1 == 0 { + it.grapheme_count += 1 } - current_sequence = .Regional - continue_sequence = true - regional_indicator_counter += 1 + it.current_sequence = .Regional + it.continue_sequence = true + it.regional_indicator_counter += 1 continue } @@ -404,8 +380,8 @@ decode_grapheme_clusters :: proc( // "Otherwise, break everywhere." // // GB999: Any ÷ Any - grapheme_count += 1 + it.grapheme_count += 1 } return -} +}
\ No newline at end of file |