aboutsummaryrefslogtreecommitdiff
path: root/core/unicode
diff options
context:
space:
mode:
authorgingerBill <gingerBill@users.noreply.github.com>2025-10-13 15:28:23 +0100
committergingerBill <gingerBill@users.noreply.github.com>2025-10-13 15:28:23 +0100
commit2bc7344f27cd8480e97af7677e066c6af9ad097a (patch)
tree3befd11237bda910976d5e87d735ba7b40d55e2c /core/unicode
parent0d7efeab5f2ebd8716912a6731ee7d5d6bae89b6 (diff)
Add `Grapheme_Iterator`
Diffstat (limited to 'core/unicode')
-rw-r--r--core/unicode/utf8/grapheme.odin320
1 files changed, 148 insertions, 172 deletions
diff --git a/core/unicode/utf8/grapheme.odin b/core/unicode/utf8/grapheme.odin
index 50d1789ab..b8e51688c 100644
--- a/core/unicode/utf8/grapheme.odin
+++ b/core/unicode/utf8/grapheme.odin
@@ -23,9 +23,40 @@ normalized_east_asian_width :: unicode.normalized_east_asian_width
Grapheme :: struct {
byte_index: int,
rune_index: int,
- width: int,
+ width: int,
}
+
+Grapheme_Cluster_Sequence :: enum {
+ None,
+ Indic,
+ Emoji,
+ Regional,
+}
+
+Grapheme_Iterator :: struct {
+ str: string,
+ curr_offset: int,
+
+ grapheme_count: int, // The number of graphemes in the string
+ rune_count: int, // The number of runes in the string
+ width: int, // The widrth of the string in number of monospace cells
+
+ last_rune: rune,
+ last_rune_breaks_forward: bool,
+
+ last_width: int,
+ last_grapheme_count: int,
+
+ bypass_next_rune: bool,
+
+ regional_indicator_counter: int,
+
+ current_sequence: Grapheme_Cluster_Sequence,
+ continue_sequence: bool,
+}
+
+
/*
Count the individual graphemes in a UTF-8 string.
@@ -39,7 +70,9 @@ Returns:
*/
@(require_results)
grapheme_count :: proc(str: string) -> (graphemes, runes, width: int) {
- _, graphemes, runes, width = decode_grapheme_clusters(str, false)
+ it := decode_grapheme_iterator_make(str)
+ for _, _ in decode_grapheme_iterate(&it) {/**/}
+ graphemes, runes, width = it.grapheme_count, it.rune_count, it.width
return
}
@@ -70,155 +103,98 @@ decode_grapheme_clusters :: proc(
rune_count: int,
width: int,
) {
- // The following procedure implements text segmentation by breaking on
- // Grapheme Cluster Boundaries[1], using the values[2] and rules[3] from
- // the Unicode® Standard Annex #29, entitled:
- //
- // UNICODE TEXT SEGMENTATION
- //
- // Version: Unicode 15.1.0
- // Date: 2023-08-16
- // Revision: 43
- //
- // This procedure is conformant[4] to UAX29-C1-1, otherwise known as the
- // extended, non-legacy ruleset.
- //
- // Please see the references below for more information.
- //
- //
- // NOTE(Feoramund): This procedure has not been highly optimized.
- // A couple opportunities were taken to bypass repeated checking when a
- // rune is outside of certain codepoint ranges, but little else has been
- // done. Standard switches, conditionals, and binary search are used to
- // see if a rune fits into a certain category.
- //
- // I did find that only one prior rune of state was necessary to build an
- // algorithm that successfully passes all 4,835 test cases provided with
- // this implementation from the Unicode organization's website.
- //
- // My initial implementation tracked explicit breaks and counted them once
- // the string iteration had terminated. I've found this current
- // implementation to be far simpler and need no allocations (unless the
- // caller wants position data).
- //
- // Most rules work backwards instead of forwards which has helped keep this
- // simple, despite its length and verbosity.
- //
- //
- // The implementation has been left verbose and in the order described by
- // the specification, to enable better readability and future upkeep.
- //
- // Some possible optimizations might include:
- //
- // - saving the type of `last_rune` instead of the exact rune.
- // - reordering rules.
- // - combining tables.
- //
- //
- // [1]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
- // [2]: https://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table
- // [3]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
- // [4]: https://www.unicode.org/reports/tr29/#Conformance
-
- // Additionally, this procedure now takes into account Standard Annex #11,
- // in order to estimate how visually wide the string will appear on a
- // monospaced display. This can only ever be a rough guess, as this tends
- // to be an implementation detail relating to which fonts are being used,
- // how codepoints are interpreted and drawn, if codepoint sequences are
- // interpreted correctly, and et cetera.
- //
- // For example, a program may not properly interpret an emoji modifier
- // sequence and print the component glyphs instead of one whole glyph.
- //
- // See here for more information: https://www.unicode.org/reports/tr11/
- //
- // NOTE: There is no explicit mention of what to do with zero-width spaces
- // as far as grapheme cluster segmentation goes, therefore this
- // implementation may count and return graphemes with a `width` of zero.
- //
- // Treat them as any other space.
-
- Grapheme_Cluster_Sequence :: enum {
- None,
- Indic,
- Emoji,
- Regional,
- }
-
context.allocator = allocator
- last_rune: rune
- last_rune_breaks_forward: bool
+ it := decode_grapheme_iterator_make(str)
+ for _, grapheme in decode_grapheme_iterate(&it) {
+ append(&graphemes, grapheme)
+ }
- last_width: int
- last_grapheme_count: int
+ grapheme_count = it.grapheme_count
+ rune_count = it.rune_count
+ width = it.width
+ return
+}
- bypass_next_rune: bool
+@(require_results)
+decode_grapheme_iterator_make :: proc(str: string) -> (it: Grapheme_Iterator) {
+ it.str = str
+ return
+}
- regional_indicator_counter: int
+@(require_results)
+decode_grapheme_iterate :: proc(it: ^Grapheme_Iterator) -> (text: string, grapheme: Grapheme, ok: bool) {
+ for it.curr_offset < len(it.str) {
+ if ok {
+ return
+ }
- current_sequence: Grapheme_Cluster_Sequence
- continue_sequence: bool
+ str := it.str[it.curr_offset:]
+ this_rune, this_rune_width := decode_rune(str)
+ byte_index := it.curr_offset
+ it.curr_offset += this_rune_width
- for this_rune, byte_index in str {
defer {
// "Break at the start and end of text, unless the text is empty."
//
// GB1: sot ÷ Any
// GB2: Any ÷ eot
- if rune_count == 0 && grapheme_count == 0 {
- grapheme_count += 1
+ if it.rune_count == 0 && it.grapheme_count == 0 {
+ it.grapheme_count += 1
}
- if grapheme_count > last_grapheme_count {
- width += normalized_east_asian_width(this_rune)
- if track_graphemes {
- append(&graphemes, Grapheme{
- byte_index,
- rune_count,
- width - last_width,
- })
+ if it.grapheme_count > it.last_grapheme_count {
+ it.width += normalized_east_asian_width(this_rune)
+ grapheme = Grapheme{
+ byte_index,
+ it.rune_count,
+ it.width - it.last_width,
}
- last_grapheme_count = grapheme_count
- last_width = width
+ text = it.str[byte_index:][:grapheme.width]
+ ok = true
+
+
+ it.last_grapheme_count = it.grapheme_count
+ it.last_width = it.width
}
- last_rune = this_rune
- rune_count += 1
+ it.last_rune = this_rune
+ it.rune_count += 1
- if !continue_sequence {
- current_sequence = .None
- regional_indicator_counter = 0
+ if !it.continue_sequence {
+ it.current_sequence = .None
+ it.regional_indicator_counter = 0
}
- continue_sequence = false
+ it.continue_sequence = false
}
+
// "Do not break between a CR and LF. Otherwise, break before and after controls."
//
// GB3: CR × LF
// GB4: (Control | CR | LF) ÷
// GB5: ÷ (Control | CR | LF)
- if this_rune == '\n' && last_rune == '\r' {
- last_rune_breaks_forward = false
- bypass_next_rune = false
+ if this_rune == '\n' && it.last_rune == '\r' {
+ it.last_rune_breaks_forward = false
+ it.bypass_next_rune = false
continue
}
if is_control(this_rune) {
- grapheme_count += 1
- last_rune_breaks_forward = true
- bypass_next_rune = true
+ it.grapheme_count += 1
+ it.last_rune_breaks_forward = true
+ it.bypass_next_rune = true
continue
}
// (This check is for rules that work forwards, instead of backwards.)
- if bypass_next_rune {
- if last_rune_breaks_forward {
- grapheme_count += 1
- last_rune_breaks_forward = false
+ if it.bypass_next_rune {
+ if it.last_rune_breaks_forward {
+ it.grapheme_count += 1
+ it.last_rune_breaks_forward = false
}
- bypass_next_rune = false
+ it.bypass_next_rune = false
continue
}
@@ -227,7 +203,7 @@ decode_grapheme_clusters :: proc(
// * 0xA9 and 0xAE are in the Extended_Pictographic range,
// which is checked later in GB11.
if this_rune != 0xA9 && this_rune != 0xAE && this_rune <= 0x2FF {
- grapheme_count += 1
+ it.grapheme_count += 1
continue
}
@@ -241,30 +217,30 @@ decode_grapheme_clusters :: proc(
if is_hangul_syllable_leading(this_rune) ||
is_hangul_syllable_lv(this_rune) ||
is_hangul_syllable_lvt(this_rune) {
- if !is_hangul_syllable_leading(last_rune) {
- grapheme_count += 1
+ if !is_hangul_syllable_leading(it.last_rune) {
+ it.grapheme_count += 1
}
continue
}
if is_hangul_syllable_vowel(this_rune) {
- if is_hangul_syllable_leading(last_rune) ||
- is_hangul_syllable_vowel(last_rune) ||
- is_hangul_syllable_lv(last_rune) {
+ if is_hangul_syllable_leading(it.last_rune) ||
+ is_hangul_syllable_vowel(it.last_rune) ||
+ is_hangul_syllable_lv(it.last_rune) {
continue
}
- grapheme_count += 1
+ it.grapheme_count += 1
continue
}
if is_hangul_syllable_trailing(this_rune) {
- if is_hangul_syllable_trailing(last_rune) ||
- is_hangul_syllable_lvt(last_rune) ||
- is_hangul_syllable_lv(last_rune) ||
- is_hangul_syllable_vowel(last_rune) {
+ if is_hangul_syllable_trailing(it.last_rune) ||
+ is_hangul_syllable_lvt(it.last_rune) ||
+ is_hangul_syllable_lv(it.last_rune) ||
+ is_hangul_syllable_vowel(it.last_rune) {
continue
}
- grapheme_count += 1
+ it.grapheme_count += 1
continue
}
}
@@ -273,25 +249,25 @@ decode_grapheme_clusters :: proc(
//
// GB9: × (Extend | ZWJ)
if this_rune == ZERO_WIDTH_JOINER {
- continue_sequence = true
+ it.continue_sequence = true
continue
}
if is_gcb_extend_class(this_rune) {
// (Support for GB9c.)
- if current_sequence == .Indic {
+ if it.current_sequence == .Indic {
if is_indic_conjunct_break_extend(this_rune) && (
- is_indic_conjunct_break_linker(last_rune) ||
- is_indic_conjunct_break_consonant(last_rune) ) {
- continue_sequence = true
+ is_indic_conjunct_break_linker(it.last_rune) ||
+ is_indic_conjunct_break_consonant(it.last_rune) ) {
+ it.continue_sequence = true
continue
}
- if is_indic_conjunct_break_linker(this_rune) && (
- is_indic_conjunct_break_linker(last_rune) ||
- is_indic_conjunct_break_extend(last_rune) ||
- is_indic_conjunct_break_consonant(last_rune) ) {
- continue_sequence = true
+ if is_indic_conjunct_break_linker(this_rune) && (
+ is_indic_conjunct_break_linker(it.last_rune) ||
+ is_indic_conjunct_break_extend(it.last_rune) ||
+ is_indic_conjunct_break_consonant(it.last_rune) ) {
+ it.continue_sequence = true
continue
}
@@ -299,10 +275,10 @@ decode_grapheme_clusters :: proc(
}
// (Support for GB11.)
- if current_sequence == .Emoji && (
- is_gcb_extend_class(last_rune) ||
- is_emoji_extended_pictographic(last_rune) ) {
- continue_sequence = true
+ if it.current_sequence == .Emoji && (
+ is_gcb_extend_class(it.last_rune) ||
+ is_emoji_extended_pictographic(it.last_rune) ) {
+ it.continue_sequence = true
}
continue
@@ -318,8 +294,8 @@ decode_grapheme_clusters :: proc(
}
if is_gcb_prepend_class(this_rune) {
- grapheme_count += 1
- bypass_next_rune = true
+ it.grapheme_count += 1
+ it.bypass_next_rune = true
continue
}
@@ -328,40 +304,40 @@ decode_grapheme_clusters :: proc(
//
// GB9c: \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]* × \p{InCB=Consonant}
if is_indic_conjunct_break_consonant(this_rune) {
- if current_sequence == .Indic {
- if last_rune == ZERO_WIDTH_JOINER ||
- is_indic_conjunct_break_linker(last_rune) {
- continue_sequence = true
+ if it.current_sequence == .Indic {
+ if it.last_rune == ZERO_WIDTH_JOINER ||
+ is_indic_conjunct_break_linker(it.last_rune) {
+ it.continue_sequence = true
} else {
- grapheme_count += 1
+ it.grapheme_count += 1
}
} else {
- grapheme_count += 1
- current_sequence = .Indic
- continue_sequence = true
+ it.grapheme_count += 1
+ it.current_sequence = .Indic
+ it.continue_sequence = true
}
continue
}
if is_indic_conjunct_break_extend(this_rune) {
- if current_sequence == .Indic {
- if is_indic_conjunct_break_consonant(last_rune) ||
- is_indic_conjunct_break_linker(last_rune) {
- continue_sequence = true
+ if it.current_sequence == .Indic {
+ if is_indic_conjunct_break_consonant(it.last_rune) ||
+ is_indic_conjunct_break_linker(it.last_rune) {
+ it.continue_sequence = true
} else {
- grapheme_count += 1
+ it.grapheme_count += 1
}
}
continue
}
if is_indic_conjunct_break_linker(this_rune) {
- if current_sequence == .Indic {
- if is_indic_conjunct_break_extend(last_rune) ||
- is_indic_conjunct_break_linker(last_rune) {
- continue_sequence = true
+ if it.current_sequence == .Indic {
+ if is_indic_conjunct_break_extend(it.last_rune) ||
+ is_indic_conjunct_break_linker(it.last_rune) {
+ it.continue_sequence = true
} else {
- grapheme_count += 1
+ it.grapheme_count += 1
}
}
continue
@@ -375,11 +351,11 @@ decode_grapheme_clusters :: proc(
//
// GB11: \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
if is_emoji_extended_pictographic(this_rune) {
- if current_sequence != .Emoji || last_rune != ZERO_WIDTH_JOINER {
- grapheme_count += 1
+ if it.current_sequence != .Emoji || it.last_rune != ZERO_WIDTH_JOINER {
+ it.grapheme_count += 1
}
- current_sequence = .Emoji
- continue_sequence = true
+ it.current_sequence = .Emoji
+ it.continue_sequence = true
continue
}
@@ -390,13 +366,13 @@ decode_grapheme_clusters :: proc(
// GB12: sot (RI RI)* RI × RI
// GB13: [^RI] (RI RI)* RI × RI
if is_regional_indicator(this_rune) {
- if regional_indicator_counter & 1 == 0 {
- grapheme_count += 1
+ if it.regional_indicator_counter & 1 == 0 {
+ it.grapheme_count += 1
}
- current_sequence = .Regional
- continue_sequence = true
- regional_indicator_counter += 1
+ it.current_sequence = .Regional
+ it.continue_sequence = true
+ it.regional_indicator_counter += 1
continue
}
@@ -404,8 +380,8 @@ decode_grapheme_clusters :: proc(
// "Otherwise, break everywhere."
//
// GB999: Any ÷ Any
- grapheme_count += 1
+ it.grapheme_count += 1
}
return
-}
+} \ No newline at end of file