Add `package unicode/utf8/utf8string` for efficient UTF-8 code point indexing into strings

author: gingerBill <bill@gingerbill.org> 2020-09-26 23:16:18 +0100
committer: gingerBill <bill@gingerbill.org> 2020-09-26 23:16:18 +0100
commit: b9076b0d5b5b2e84926eb85e81bc611597faf06c (patch)
tree: ff3631375e487e1de7e990a05033b352713bb91c /core/unicode
parent: c43b8ef387389a02749e2c99f7a80c0cc15f5e03 (diff)
1 files changed, 155 insertions, 0 deletions
diff --git a/core/unicode/utf8/utf8string/string.odin b/core/unicode/utf8/utf8string/string.odin
new file mode 100644
index 000000000..a34e754cf
--- /dev/null
+++ b/core/unicode/utf8/utf8string/string.odin
@@ -0,0 +1,155 @@
+package utf8string
+
+import "core:unicode/utf8"
+import "core:runtime"
+import "builtin"
+
+// String wraps a string together with the bookkeeping that `at` and `slice`
+// use to address it by rune index instead of by byte index.
+// Populate it with `init` before calling any other procedure in this package.
+String :: struct {
+	contents:   string, // the wrapped text; returned unchanged by to_string
+	rune_count: int, // number of runes in contents, computed by init
+
+	// cached information
+	non_ascii:  int, // index of the first byte >= utf8.RUNE_SELF; len(contents) when there is none
+	width:      int, // byte width of the rune most recently decoded; 0 if ascii
+	byte_pos:   int, // byte offset of the cursor that `at` moves through contents
+	rune_pos:   int, // rune index corresponding to byte_pos
+}
+
+// Alias for the builtin `len`. This package declares its own `len` procedure
+// (the rune count of a String), so the builtin is reached through this name.
+@(private)
+_len :: builtin.len; // helper procedure
+
+// init points `s` at `contents`, rewinds its cursor to the start and fills in
+// the cached fields. It returns `s` so that calls can be chained.
+//
+// The string is scanned for its first byte >= utf8.RUNE_SELF:
+//   - none found: the text is pure ASCII (the empty string included), so the
+//     rune count equals the byte count and width is left at 0;
+//   - found at index k: non_ascii = k, the runes are counted with
+//     utf8.rune_count_in_string, and width is the width of the first rune.
+init :: proc(s: ^String, contents: string) -> ^String {
+	n := _len(contents);
+
+	// Locate the first byte that cannot be a single-byte rune.
+	first_wide := n;
+	for idx in 0..<n {
+		if contents[idx] >= utf8.RUNE_SELF {
+			first_wide = idx;
+			break;
+		}
+	}
+
+	s.contents  = contents;
+	s.byte_pos  = 0;
+	s.rune_pos  = 0;
+	s.non_ascii = first_wide;
+
+	if first_wide == n {
+		// ASCII only: one byte per rune.
+		s.rune_count = n;
+		s.width = 0;
+	} else {
+		s.rune_count = utf8.rune_count_in_string(contents);
+		// The cursor sits on rune 0, so cache that rune's width.
+		_, s.width = utf8.decode_rune_in_string(contents);
+	}
+	return s;
+}
+
+// to_string returns the string that `s` wraps (the `contents` field, as is).
+to_string :: proc(s: ^String) -> string {
+	return s.contents;
+}
+
+// len returns the cached rune count of `s` (not its length in bytes).
+// Note that within this package the name shadows the builtin `len`.
+len :: proc(s: ^String) -> int {
+	return s.rune_count;
+}
+
+
+// is_ascii reports whether the cached `width` is 0. `init` stores a width of 0
+// only when it finds no byte >= utf8.RUNE_SELF in the contents.
+is_ascii :: proc(s: ^String) -> bool {
+	return s.width == 0;
+}
+
+// at returns the rune at rune index `i` of `s`.
+//
+// `i` is bounds-checked against s.rune_count; a failure is reported at `loc`.
+// Indices below s.non_ascii are answered by indexing the bytes directly and do
+// not touch the cursor. For any other index the cursor (rune_pos, byte_pos,
+// width) is moved onto `i`:
+//   - the first rune, the last rune, and the runes at or next to the cursor
+//     each cost a single decode;
+//   - everything else is reached by a linear scan that starts from the
+//     beginning of the non-ASCII region, from the cursor, or from the end.
+//
+// On return from the non-ASCII path, s.byte_pos is the byte offset of rune `i`
+// and s.width is its encoded width; `slice` relies on this.
+at :: proc(s: ^String, i: int, loc := #caller_location) -> (r: rune) {
+	runtime.bounds_check_error_loc(loc, i, s.rune_count);
+
+	// Everything before the first non-ASCII byte is one byte per rune.
+	if i < s.non_ascii {
+		return rune(s.contents[i]);
+	}
+
+	switch i {
+	case 0:
+		// Start of the string.
+		r, s.width = utf8.decode_rune_in_string(s.contents);
+		s.rune_pos = 0;
+		s.byte_pos = 0;
+		return;
+
+	case s.rune_count-1:
+		// Last rune: it has to be decoded from the END of the string so that
+		// both the rune and the width used to place byte_pos are the last one's.
+		r, s.width = utf8.decode_last_rune_in_string(s.contents);
+		s.rune_pos = i;
+		s.byte_pos = _len(s.contents) - s.width;
+		return;
+
+	case s.rune_pos-1:
+		// One step back: the wanted rune is the last one in the bytes that
+		// precede the cursor.
+		r, s.width = utf8.decode_last_rune_in_string(s.contents[0:s.byte_pos]);
+		s.rune_pos = i;
+		s.byte_pos -= s.width;
+		return;
+
+	case s.rune_pos+1:
+		// One step forward: s.width is the width of the rune under the cursor,
+		// so skipping it lands on rune i; the decode is shared with the next case.
+		s.rune_pos = i;
+		s.byte_pos += s.width;
+		fallthrough;
+	case s.rune_pos:
+		r, s.width = utf8.decode_rune_in_string(s.contents[s.byte_pos:]);
+		return;
+	}
+
+	// Linear scan
+	// Pick a starting point and a direction: the start of the non-ASCII
+	// region (forward), the cursor (either direction) or the end (backward).
+	scan_forward := true;
+	if i < s.rune_pos {
+		if i < (s.rune_pos-s.non_ascii)/2 {
+			// Restart where bytes and runes still line up one-to-one.
+			s.byte_pos, s.rune_pos = s.non_ascii, s.non_ascii;
+		} else {
+			scan_forward = false;
+		}
+	} else if i-s.rune_pos < (s.rune_count-s.rune_pos)/2 {
+		// scan_forward = true;
+	} else {
+		s.byte_pos, s.rune_pos = _len(s.contents), s.rune_count;
+		scan_forward = false;
+	}
+
+	if scan_forward {
+		for {
+			// Decode first, then test: on exit the cursor is ON rune i.
+			r, s.width = utf8.decode_rune_in_string(s.contents[s.byte_pos:]);
+			if s.rune_pos == i {
+				return;
+			}
+			s.rune_pos += 1;
+			s.byte_pos += s.width;
+
+		}
+	} else {
+		for {
+			// Step the cursor back over the rune that ends at byte_pos.
+			r, s.width = utf8.decode_last_rune_in_string(s.contents[:s.byte_pos]);
+			s.rune_pos -= 1;
+			s.byte_pos -= s.width;
+			if s.rune_pos == i {
+				return;
+			}
+		}
+	}
+}
+
+// slice returns the substring of `s` covering the runes in the half-open
+// range [i, j), where both bounds are rune indices.
+//
+// The bounds are validated against s.rune_count and a failure is reported at
+// `loc`. If `j` lies before the first non-ASCII byte, rune and byte indices
+// coincide and the contents are sliced directly. Otherwise each bound is
+// converted to a byte offset; this calls `at` when needed, which moves the
+// cursor of `s`.
+slice :: proc(s: ^String, i, j: int, loc := #caller_location) -> string {
+	runtime.slice_expr_error_lo_hi_loc(loc, i, j, s.rune_count);
+
+	if j < s.non_ascii {
+		return s.contents[i:j];
+	}
+	if i == j {
+		return "";
+	}
+
+	byte_len := _len(s.contents);
+
+	// Lower bound: defaults to the end of the string (i == rune_count).
+	start := byte_len;
+	switch {
+	case i < s.non_ascii:
+		start = i;
+	case i != s.rune_count:
+		// `at` leaves byte_pos on the byte offset of rune i.
+		at(s, i, loc);
+		start = s.byte_pos;
+	}
+
+	// Upper bound: defaults to the end of the string (j == rune_count).
+	stop := byte_len;
+	if j != s.rune_count {
+		at(s, j, loc);
+		stop = s.byte_pos;
+	}
+
+	return s.contents[start:stop];
+}
author	gingerBill <bill@gingerbill.org>	2020-09-26 23:16:18 +0100
committer	gingerBill <bill@gingerbill.org>	2020-09-26 23:16:18 +0100
commit	b9076b0d5b5b2e84926eb85e81bc611597faf06c (patch)
tree	ff3631375e487e1de7e990a05033b352713bb91c /core/unicode
parent	c43b8ef387389a02749e2c99f7a80c0cc15f5e03 (diff)