aboutsummaryrefslogtreecommitdiff
path: root/core/unicode
diff options
context:
space:
mode:
authorgingerBill <bill@gingerbill.org>2020-11-10 16:47:56 +0000
committergingerBill <bill@gingerbill.org>2020-11-10 16:47:56 +0000
commit49e140f4db1f9fffa541c4d58efa91b7128c4ff4 (patch)
treefc1c22d7d6cd6c509c96e407388155dd090b8b59 /core/unicode
parent95b94a0f5669d8fb1c38da945bd73505f5c112d3 (diff)
Add utf8.full_rune
Diffstat (limited to 'core/unicode')
-rw-r--r--core/unicode/utf8/utf8.odin41
1 files changed, 41 insertions, 0 deletions
diff --git a/core/unicode/utf8/utf8.odin b/core/unicode/utf8/utf8.odin
index f008c3881..50d24d562 100644
--- a/core/unicode/utf8/utf8.odin
+++ b/core/unicode/utf8/utf8.odin
@@ -350,3 +350,44 @@ rune_size :: proc(r: rune) -> int {
}
return -1;
}
+
+// full_rune reports whether b begins with a complete UTF-8 encoding of a rune.
+// A malformed encoding counts as complete, because it converts to an error rune of width 1 (RUNE_ERROR).
+full_rune :: proc(b: []byte) -> bool {
+	count := len(b);
+	if count == 0 {
+		return false;
+	}
+	info := _first[b[0]];
+	if count >= int(info & 7) {
+		// The low three bits of the table entry give the length threshold for this leading byte.
+		return true;
+	}
+	// Fewer bytes than the threshold: the prefix is still complete if a byte that is
+	// already present lies outside its permitted range.
+	ranges := accept_ranges[info>>4];
+	second_out_of_range := count > 1 && (b[1] < ranges.lo || ranges.hi < b[1]);
+	third_out_of_range := count > 2 && (b[2] < LOCB || HICB < b[2]);
+	return second_out_of_range || third_out_of_range;
+}
+
+// full_rune_in_string is the string form of full_rune: it reports whether s begins with a complete UTF-8 encoding of a rune, where an invalid encoding also counts as complete (it converts to RUNE_ERROR, width 1).
+full_rune_in_string :: proc(s: string) -> bool {
+	raw := transmute([]byte)s;
+	return full_rune(raw);
+}
+
+// _first maps each possible leading byte to an info byte: full_rune compares the input length against (x & 7) and indexes accept_ranges with (x >> 4).
+_first := [256]u8{
+ 0x00..0x7f = 0xf0, // ascii, size 1
+ 0x80..0xc1 = 0xf1, // invalid, size 1
+ 0xc2..0xdf = 0x02, // accept 0, size 2
+ 0xe0 = 0x13, // accept 1, size 3
+ 0xe1..0xec = 0x03, // accept 0, size 3
+ 0xed = 0x23, // accept 2, size 3
+ 0xee..0xef = 0x03, // accept 0, size 3
+ 0xf0 = 0x34, // accept 3, size 4
+ 0xf1..0xf3 = 0x04, // accept 0, size 4
+ 0xf4 = 0x44, // accept 4, size 4
+ 0xf5..0xff = 0xf1, // invalid, size 1
+};