aboutsummaryrefslogtreecommitdiff
path: root/core/unicode
diff options
context:
space:
mode:
authorgingerBill <bill@gingerbill.org>2020-09-26 23:16:18 +0100
committergingerBill <bill@gingerbill.org>2020-09-26 23:16:18 +0100
commitb9076b0d5b5b2e84926eb85e81bc611597faf06c (patch)
treeff3631375e487e1de7e990a05033b352713bb91c /core/unicode
parentc43b8ef387389a02749e2c99f7a80c0cc15f5e03 (diff)
Add `package unicode/utf8/utf8string` for efficient utf8 codepoint indexing to strings
Diffstat (limited to 'core/unicode')
-rw-r--r--core/unicode/utf8/utf8string/string.odin155
1 files changed, 155 insertions, 0 deletions
diff --git a/core/unicode/utf8/utf8string/string.odin b/core/unicode/utf8/utf8string/string.odin
new file mode 100644
index 000000000..a34e754cf
--- /dev/null
+++ b/core/unicode/utf8/utf8string/string.odin
@@ -0,0 +1,155 @@
+package utf8string
+
+import "core:unicode/utf8"
+import "core:runtime"
+import "builtin"
+
+String :: struct {
+ contents: string,
+ rune_count: int,
+
+ // cached information
+ non_ascii: int, // index to non-ascii code points
+ width: int, // 0 if ascii
+ byte_pos: int,
+ rune_pos: int,
+}
+
+@(private)
+_len :: builtin.len; // helper procedure
+
+init :: proc(s: ^String, contents: string) -> ^String {
+ s.contents = contents;
+ s.byte_pos = 0;
+ s.rune_pos = 0;
+
+ for i in 0..<_len(contents) {
+ if contents[i] >= utf8.RUNE_SELF {
+ s.rune_count = utf8.rune_count_in_string(contents);
+ _, s.width = utf8.decode_rune_in_string(contents);
+ s.non_ascii = i;
+ return s;
+ }
+ }
+
+ s.rune_count = _len(contents);
+ s.width = 0;
+ s.non_ascii = _len(contents);
+ return s;
+}
+
+to_string :: proc(s: ^String) -> string {
+ return s.contents;
+}
+
+len :: proc(s: ^String) -> int {
+ return s.rune_count;
+}
+
+
+is_ascii :: proc(s: ^String) -> bool {
+ return s.width == 0;
+}
+
+at :: proc(s: ^String, i: int, loc := #caller_location) -> (r: rune) {
+ runtime.bounds_check_error_loc(loc, i, s.rune_count);
+
+ if i < s.non_ascii {
+ return rune(s.contents[i]);
+ }
+
+ switch i {
+ case 0:
+ r, s.width = utf8.decode_rune_in_string(s.contents);
+ s.rune_pos = 0;
+ s.byte_pos = 0;
+ return;
+
+ case s.rune_count-1:
+ r, s.width = utf8.decode_rune_in_string(s.contents);
+ s.rune_pos = i;
+ s.byte_pos = _len(s.contents) - s.width;
+ return;
+
+ case s.rune_pos-1:
+ r, s.width = utf8.decode_rune_in_string(s.contents[0:s.byte_pos]);
+ s.rune_pos = i;
+ s.byte_pos -= s.width;
+ return;
+
+ case s.rune_pos+1:
+ s.rune_pos = i;
+ s.byte_pos += s.width;
+ fallthrough;
+ case s.rune_pos:
+ r, s.width = utf8.decode_rune_in_string(s.contents[s.byte_pos:]);
+ return;
+ }
+
+ // Linear scan
+ scan_forward := true;
+ if i < s.rune_pos {
+ if i < (s.rune_pos-s.non_ascii)/2 {
+ s.byte_pos, s.rune_pos = s.non_ascii, s.non_ascii;
+ } else {
+ scan_forward = false;
+ }
+ } else if i-s.rune_pos < (s.rune_count-s.rune_pos)/2 {
+ // scan_forward = true;
+ } else {
+ s.byte_pos, s.rune_pos = _len(s.contents), s.rune_count;
+ scan_forward = false;
+ }
+
+ if scan_forward {
+ for {
+ r, s.width = utf8.decode_rune_in_string(s.contents[s.byte_pos:]);
+ if s.rune_pos == i {
+ return;
+ }
+ s.rune_pos += 1;
+ s.byte_pos += s.width;
+
+ }
+ } else {
+ for {
+ r, s.width = utf8.decode_last_rune_in_string(s.contents[:s.byte_pos]);
+ s.rune_pos -= 1;
+ s.byte_pos -= s.width;
+ if s.rune_pos == i {
+ return;
+ }
+ }
+ }
+}
+
+slice :: proc(s: ^String, i, j: int, loc := #caller_location) -> string {
+ runtime.slice_expr_error_lo_hi_loc(loc, i, j, s.rune_count);
+
+ if j < s.non_ascii {
+ return s.contents[i:j];
+ }
+
+ if i == j {
+ return "";
+ }
+
+ lo, hi: int;
+ if i < s.non_ascii {
+ lo = i;
+ } else if i == s.rune_count {
+ lo = _len(s.contents);
+ } else {
+ at(s, i, loc);
+ lo = s.byte_pos;
+ }
+
+ if j == s.rune_count {
+ hi = _len(s.contents);
+ } else {
+ at(s, j, loc);
+ hi = s.byte_pos;
+ }
+
+ return s.contents[lo:hi];
+}