diff options
| author | gingerBill <bill@gingerbill.org> | 2021-08-31 22:21:13 +0100 |
|---|---|---|
| committer | gingerBill <bill@gingerbill.org> | 2021-08-31 22:21:13 +0100 |
| commit | 251da264ed6e0f039931683c7b0d4b97e88c8d99 (patch) | |
| tree | c7a9a088477d2452c2cf850458c62d994a211df6 /core/unicode | |
| parent | b176af27427a6c39448a71a8023e4a9877f0a51c (diff) | |
Remove unneeded semicolons from the core library
Diffstat (limited to 'core/unicode')
| -rw-r--r-- | core/unicode/letter.odin | 152 | ||||
| -rw-r--r-- | core/unicode/tables.odin | 42 | ||||
| -rw-r--r-- | core/unicode/utf16/utf16.odin | 66 | ||||
| -rw-r--r-- | core/unicode/utf8/utf8.odin | 344 |
4 files changed, 302 insertions, 302 deletions
diff --git a/core/unicode/letter.odin b/core/unicode/letter.odin index b498e4272..891c90bf3 100644 --- a/core/unicode/letter.odin +++ b/core/unicode/letter.odin @@ -1,195 +1,195 @@ package unicode -MAX_RUNE :: '\U00010fff'; // Maximum valid unicode code point -REPLACEMENT_CHAR :: '\ufffd'; // Represented an invalid code point -MAX_ASCII :: '\u007f'; // Maximum ASCII value -MAX_LATIN1 :: '\u00ff'; // Maximum Latin-1 value +MAX_RUNE :: '\U00010fff' // Maximum valid unicode code point +REPLACEMENT_CHAR :: '\ufffd' // Represented an invalid code point +MAX_ASCII :: '\u007f' // Maximum ASCII value +MAX_LATIN1 :: '\u00ff' // Maximum Latin-1 value binary_search :: proc(c: i32, table: []i32, length, stride: int) -> int { - n := length; - t := 0; + n := length + t := 0 for n > 1 { - m := n / 2; - p := t + m*stride; + m := n / 2 + p := t + m*stride if c >= table[p] { - t = p; - n = n-m; + t = p + n = n-m } else { - n = m; + n = m } } if n != 0 && c >= table[t] { - return t; + return t } - return -1; + return -1 } to_lower :: proc(r: rune) -> rune { - c := i32(r); - p := binary_search(c, to_lower_ranges[:], len(to_lower_ranges)/3, 3); + c := i32(r) + p := binary_search(c, to_lower_ranges[:], len(to_lower_ranges)/3, 3) if p >= 0 && to_lower_ranges[p] <= c && c <= to_lower_ranges[p+1] { - return rune(c + to_lower_ranges[p+2] - 500); + return rune(c + to_lower_ranges[p+2] - 500) } - p = binary_search(c, to_lower_singlets[:], len(to_lower_singlets)/2, 2); + p = binary_search(c, to_lower_singlets[:], len(to_lower_singlets)/2, 2) if p >= 0 && c == to_lower_singlets[p] { - return rune(c + to_lower_singlets[p+1] - 500); + return rune(c + to_lower_singlets[p+1] - 500) } - return rune(c); + return rune(c) } to_upper :: proc(r: rune) -> rune { - c := i32(r); - p := binary_search(c, to_upper_ranges[:], len(to_upper_ranges)/3, 3); + c := i32(r) + p := binary_search(c, to_upper_ranges[:], len(to_upper_ranges)/3, 3) if p >= 0 && to_upper_ranges[p] <= c && c <= to_upper_ranges[p+1] { - return rune(c + to_upper_ranges[p+2] - 500); + return rune(c + to_upper_ranges[p+2] - 500) } - p = binary_search(c, to_upper_singlets[:], len(to_upper_singlets)/2, 2); + p = binary_search(c, to_upper_singlets[:], len(to_upper_singlets)/2, 2) if p >= 0 && c == to_upper_singlets[p] { - return rune(c + to_upper_singlets[p+1] - 500); + return rune(c + to_upper_singlets[p+1] - 500) } - return rune(c); + return rune(c) } to_title :: proc(r: rune) -> rune { - c := i32(r); - p := binary_search(c, to_upper_singlets[:], len(to_title_singlets)/2, 2); + c := i32(r) + p := binary_search(c, to_upper_singlets[:], len(to_title_singlets)/2, 2) if p >= 0 && c == to_upper_singlets[p] { - return rune(c + to_title_singlets[p+1] - 500); + return rune(c + to_title_singlets[p+1] - 500) } - return rune(c); + return rune(c) } is_lower :: proc(r: rune) -> bool { if r <= MAX_ASCII { - return u32(r)-'a' < 26; + return u32(r)-'a' < 26 } - c := i32(r); - p := binary_search(c, to_upper_ranges[:], len(to_upper_ranges)/3, 3); + c := i32(r) + p := binary_search(c, to_upper_ranges[:], len(to_upper_ranges)/3, 3) if p >= 0 && to_upper_ranges[p] <= c && c <= to_upper_ranges[p+1] { - return true; + return true } - p = binary_search(c, to_upper_singlets[:], len(to_upper_singlets)/2, 2); + p = binary_search(c, to_upper_singlets[:], len(to_upper_singlets)/2, 2) if p >= 0 && c == to_upper_singlets[p] { - return true; + return true } - return false; + return false } is_upper :: proc(r: rune) -> bool { if r <= MAX_ASCII { - return u32(r)-'A' < 26; + return u32(r)-'A' < 26 } - c := i32(r); - p := binary_search(c, to_lower_ranges[:], len(to_lower_ranges)/3, 3); + c := i32(r) + p := binary_search(c, to_lower_ranges[:], len(to_lower_ranges)/3, 3) if p >= 0 && to_lower_ranges[p] <= c && c <= to_lower_ranges[p+1] { - return true; + return true } - p = binary_search(c, to_lower_singlets[:], len(to_lower_singlets)/2, 2); + p = binary_search(c, to_lower_singlets[:], len(to_lower_singlets)/2, 2) if p >= 0 && c == to_lower_singlets[p] { - return true; + return true } - return false; + return false } -is_alpha :: is_letter; +is_alpha :: is_letter is_letter :: proc(r: rune) -> bool { if u32(r) <= MAX_LATIN1 { - return char_properties[u8(r)]&pLmask != 0; + return char_properties[u8(r)]&pLmask != 0 } if is_upper(r) || is_lower(r) { - return true; + return true } - c := i32(r); - p := binary_search(c, alpha_ranges[:], len(alpha_ranges)/2, 2); + c := i32(r) + p := binary_search(c, alpha_ranges[:], len(alpha_ranges)/2, 2) if p >= 0 && alpha_ranges[p] <= c && c <= alpha_ranges[p+1] { - return true; + return true } - p = binary_search(c, alpha_singlets[:], len(alpha_singlets), 1); + p = binary_search(c, alpha_singlets[:], len(alpha_singlets), 1) if p >= 0 && c == alpha_singlets[p] { - return true; + return true } - return false; + return false } is_title :: proc(r: rune) -> bool { - return is_upper(r) && is_lower(r); + return is_upper(r) && is_lower(r) } is_digit :: proc(r: rune) -> bool { if r <= MAX_LATIN1 { - return '0' <= r && r <= '9'; + return '0' <= r && r <= '9' } - return false; + return false } -is_white_space :: is_space; +is_white_space :: is_space is_space :: proc(r: rune) -> bool { if u32(r) <= MAX_LATIN1 { switch r { case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xa0: - return true; + return true } - return false; + return false } - c := i32(r); - p := binary_search(c, space_ranges[:], len(space_ranges)/2, 2); + c := i32(r) + p := binary_search(c, space_ranges[:], len(space_ranges)/2, 2) if p >= 0 && space_ranges[p] <= c && c <= space_ranges[p+1] { - return true; + return true } - return false; + return false } is_combining :: proc(r: rune) -> bool { - c := i32(r); + c := i32(r) return c >= 0x0300 && (c <= 0x036f || (c >= 0x1ab0 && c <= 0x1aff) || (c >= 0x1dc0 && c <= 0x1dff) || (c >= 0x20d0 && c <= 0x20ff) || - (c >= 0xfe20 && c <= 0xfe2f)); + (c >= 0xfe20 && c <= 0xfe2f)) } is_graphic :: proc(r: rune) -> bool { if u32(r) <= MAX_LATIN1 { - return char_properties[u8(r)]&pg != 0; + return char_properties[u8(r)]&pg != 0 } - return false; + return false } is_print :: proc(r: rune) -> bool { if u32(r) <= MAX_LATIN1 { - return char_properties[u8(r)]&pp != 0; + return char_properties[u8(r)]&pp != 0 } - return false; + return false } is_control :: proc(r: rune) -> bool { if u32(r) <= MAX_LATIN1 { - return char_properties[u8(r)]&pC != 0; + return char_properties[u8(r)]&pC != 0 } - return false; + return false } is_number :: proc(r: rune) -> bool { if u32(r) <= MAX_LATIN1 { - return char_properties[u8(r)]&pN != 0; + return char_properties[u8(r)]&pN != 0 } - return false; + return false } is_punct :: proc(r: rune) -> bool { if u32(r) <= MAX_LATIN1 { - return char_properties[u8(r)]&pP != 0; + return char_properties[u8(r)]&pP != 0 } - return false; + return false } is_symbol :: proc(r: rune) -> bool { if u32(r) <= MAX_LATIN1 { - return char_properties[u8(r)]&pS != 0; + return char_properties[u8(r)]&pS != 0 } - return false; + return false } diff --git a/core/unicode/tables.odin b/core/unicode/tables.odin index ff4793402..f43827413 100644 --- a/core/unicode/tables.odin +++ b/core/unicode/tables.odin @@ -1,16 +1,16 @@ package unicode -@(private) pC :: 1<<0; // a control character. -@(private) pP :: 1<<1; // a punctuation character. -@(private) pN :: 1<<2; // a numeral. -@(private) pS :: 1<<3; // a symbolic character. -@(private) pZ :: 1<<4; // a spacing character. -@(private) pLu :: 1<<5; // an upper-case letter. -@(private) pLl :: 1<<6; // a lower-case letter. -@(private) pp :: 1<<7; // a printable character according to Go's definition. -@(private) pg :: pp | pZ; // a graphical character according to the Unicode definition. -@(private) pLo :: pLl | pLu; // a letter that is neither upper nor lower case. -@(private) pLmask :: pLo; +@(private) pC :: 1<<0 // a control character. +@(private) pP :: 1<<1 // a punctuation character. +@(private) pN :: 1<<2 // a numeral. +@(private) pS :: 1<<3 // a symbolic character. +@(private) pZ :: 1<<4 // a spacing character. +@(private) pLu :: 1<<5 // an upper-case letter. +@(private) pLl :: 1<<6 // a lower-case letter. +@(private) pp :: 1<<7 // a printable character according to Go's definition. +@(private) pg :: pp | pZ // a graphical character according to the Unicode definition. +@(private) pLo :: pLl | pLu // a letter that is neither upper nor lower case. +@(private) pLmask :: pLo char_properties := [MAX_LATIN1+1]u8{ 0x00 = pC, // '\x00' @@ -269,7 +269,7 @@ char_properties := [MAX_LATIN1+1]u8{ 0xFD = pLl | pp, // 'ý' 0xFE = pLl | pp, // 'þ' 0xFF = pLl | pp, // 'ÿ' -}; +} alpha_ranges := [?]i32{ @@ -425,7 +425,7 @@ alpha_ranges := [?]i32{ 0xffca, 0xffcf, 0xffd2, 0xffd7, 0xffda, 0xffdc, -}; +} alpha_singlets := [?]i32{ 0x00aa, @@ -460,7 +460,7 @@ alpha_singlets := [?]i32{ 0x2128, 0xfb3e, 0xfe74, -}; +} space_ranges := [?]i32{ 0x0009, 0x000d, // tab and newline @@ -475,7 +475,7 @@ space_ranges := [?]i32{ 0x205f, 0x205f, // medium mathematical space 0x3000, 0x3000, // ideographic space 0xfeff, 0xfeff, -}; +} unicode_spaces := [?]i32{ 0x0009, // tab @@ -492,7 +492,7 @@ unicode_spaces := [?]i32{ 0x205f, // medium mathematical space 0x3000, // ideographic space 0xfeff, // unknown -}; +} to_upper_ranges := [?]i32{ 0x0061, 0x007a, 468, // a-z A-Z @@ -530,7 +530,7 @@ to_upper_ranges := [?]i32{ 0x2170, 0x217f, 484, 0x24d0, 0x24e9, 474, 0xff41, 0xff5a, 468, -}; +} to_upper_singlets := [?]i32{ 0x00ff, 621, @@ -873,7 +873,7 @@ to_upper_singlets := [?]i32{ 0x1fc3, 509, 0x1fe5, 507, 0x1ff3, 509, -}; +} to_lower_ranges := [?]i32{ 0x0041, 0x005a, 532, // A-Z a-z @@ -912,7 +912,7 @@ to_lower_ranges := [?]i32{ 0x2160, 0x216f, 516, // - - 0x24b6, 0x24cf, 526, // - - 0xff21, 0xff3a, 532, // - - -}; +} to_lower_singlets := [?]i32{ 0x0100, 501, @@ -1248,7 +1248,7 @@ to_lower_singlets := [?]i32{ 0x1fcc, 491, 0x1fec, 493, 0x1ffc, 491, -}; +} to_title_singlets := [?]i32{ 0x01c4, 501, @@ -1259,4 +1259,4 @@ to_title_singlets := [?]i32{ 0x01cc, 499, 0x01f1, 501, 0x01f3, 499, -}; +} diff --git a/core/unicode/utf16/utf16.odin b/core/unicode/utf16/utf16.odin index 4c76956cc..27edf088d 100644 --- a/core/unicode/utf16/utf16.odin +++ b/core/unicode/utf16/utf16.odin @@ -1,82 +1,82 @@ package utf16 -REPLACEMENT_CHAR :: '\ufffd'; -MAX_RUNE :: '\U0010ffff'; +REPLACEMENT_CHAR :: '\ufffd' +MAX_RUNE :: '\U0010ffff' -_surr1 :: 0xd800; -_surr2 :: 0xdc00; -_surr3 :: 0xe000; -_surr_self :: 0x10000; +_surr1 :: 0xd800 +_surr2 :: 0xdc00 +_surr3 :: 0xe000 +_surr_self :: 0x10000 is_surrogate :: proc(r: rune) -> bool { - return _surr1 <= r && r < _surr3; + return _surr1 <= r && r < _surr3 } decode_surrogate_pair :: proc(r1, r2: rune) -> rune { if _surr1 <= r1 && r1 < _surr2 && _surr2 <= r2 && r2 < _surr3 { - return (r1-_surr1)<<10 | (r2 - _surr2) + _surr_self; + return (r1-_surr1)<<10 | (r2 - _surr2) + _surr_self } - return REPLACEMENT_CHAR; + return REPLACEMENT_CHAR } encode_surrogate_pair :: proc(c: rune) -> (r1, r2: rune) { - r := c; + r := c if r < _surr_self || r > MAX_RUNE { - return REPLACEMENT_CHAR, REPLACEMENT_CHAR; + return REPLACEMENT_CHAR, REPLACEMENT_CHAR } - r -= _surr_self; - return _surr1 + (r>>10)&0x3ff, _surr2 + r&0x3ff; + r -= _surr_self + return _surr1 + (r>>10)&0x3ff, _surr2 + r&0x3ff } encode :: proc(d: []u16, s: []rune) -> int { - n, m := 0, len(d); + n, m := 0, len(d) loop: for r in s { switch r { case 0..<_surr1, _surr3 ..< _surr_self: if m+1 < n { break loop; } - d[n] = u16(r); - n += 1; + d[n] = u16(r) + n += 1 case _surr_self ..= MAX_RUNE: if m+2 < n { break loop; } - r1, r2 := encode_surrogate_pair(r); - d[n] = u16(r1); - d[n+1] = u16(r2); - n += 2; + r1, r2 := encode_surrogate_pair(r) + d[n] = u16(r1) + d[n+1] = u16(r2) + n += 2 case: if m+1 < n { break loop; } - d[n] = u16(REPLACEMENT_CHAR); - n += 1; + d[n] = u16(REPLACEMENT_CHAR) + n += 1 } } - return n; + return n } encode_string :: proc(d: []u16, s: string) -> int { - n, m := 0, len(d); + n, m := 0, len(d) loop: for r in s { switch r { case 0..<_surr1, _surr3 ..< _surr_self: if m+1 < n { break loop; } - d[n] = u16(r); - n += 1; + d[n] = u16(r) + n += 1 case _surr_self ..= MAX_RUNE: if m+2 < n { break loop; } - r1, r2 := encode_surrogate_pair(r); - d[n] = u16(r1); - d[n+1] = u16(r2); - n += 2; + r1, r2 := encode_surrogate_pair(r) + d[n] = u16(r1) + d[n+1] = u16(r2) + n += 2 case: if m+1 < n { break loop; } - d[n] = u16(REPLACEMENT_CHAR); - n += 1; + d[n] = u16(REPLACEMENT_CHAR) + n += 1 } } - return n; + return n } diff --git a/core/unicode/utf8/utf8.odin b/core/unicode/utf8/utf8.odin index 61f54e07f..ba9bb6de0 100644 --- a/core/unicode/utf8/utf8.odin +++ b/core/unicode/utf8/utf8.odin @@ -1,36 +1,36 @@ package utf8 -RUNE_ERROR :: '\ufffd'; -RUNE_SELF :: 0x80; -RUNE_BOM :: 0xfeff; -RUNE_EOF :: ~rune(0); -MAX_RUNE :: '\U0010ffff'; -UTF_MAX :: 4; - -SURROGATE_MIN :: 0xd800; -SURROGATE_MAX :: 0xdfff; - -T1 :: 0b0000_0000; -TX :: 0b1000_0000; -T2 :: 0b1100_0000; -T3 :: 0b1110_0000; -T4 :: 0b1111_0000; -T5 :: 0b1111_1000; - -MASKX :: 0b0011_1111; -MASK2 :: 0b0001_1111; -MASK3 :: 0b0000_1111; -MASK4 :: 0b0000_0111; - -RUNE1_MAX :: 1<<7 - 1; -RUNE2_MAX :: 1<<11 - 1; -RUNE3_MAX :: 1<<16 - 1; +RUNE_ERROR :: '\ufffd' +RUNE_SELF :: 0x80 +RUNE_BOM :: 0xfeff +RUNE_EOF :: ~rune(0) +MAX_RUNE :: '\U0010ffff' +UTF_MAX :: 4 + +SURROGATE_MIN :: 0xd800 +SURROGATE_MAX :: 0xdfff + +T1 :: 0b0000_0000 +TX :: 0b1000_0000 +T2 :: 0b1100_0000 +T3 :: 0b1110_0000 +T4 :: 0b1111_0000 +T5 :: 0b1111_1000 + +MASKX :: 0b0011_1111 +MASK2 :: 0b0001_1111 +MASK3 :: 0b0000_1111 +MASK4 :: 0b0000_0111 + +RUNE1_MAX :: 1<<7 - 1 +RUNE2_MAX :: 1<<11 - 1 +RUNE3_MAX :: 1<<16 - 1 // The default lowest and highest continuation byte. -LOCB :: 0b1000_0000; -HICB :: 0b1011_1111; +LOCB :: 0b1000_0000 +HICB :: 0b1011_1111 -Accept_Range :: struct {lo, hi: u8}; +Accept_Range :: struct {lo, hi: u8} accept_ranges := [5]Accept_Range{ {0x80, 0xbf}, @@ -38,7 +38,7 @@ accept_ranges := [5]Accept_Range{ {0x80, 0x9f}, {0x90, 0xbf}, {0x80, 0x8f}, -}; +} accept_sizes := [256]u8{ 0x00..0x7f = 0xf0, @@ -52,329 +52,329 @@ accept_sizes := [256]u8{ 0xf1..0xf3 = 0x04, 0xf4 = 0x44, 0xf5..0xff = 0xf1, -}; +} encode_rune :: proc(c: rune) -> ([4]u8, int) { - r := c; + r := c - buf: [4]u8; - i := u32(r); - mask :: u8(0x3f); + buf: [4]u8 + i := u32(r) + mask :: u8(0x3f) if i <= 1<<7-1 { - buf[0] = u8(r); - return buf, 1; + buf[0] = u8(r) + return buf, 1 } if i <= 1<<11-1 { - buf[0] = 0xc0 | u8(r>>6); - buf[1] = 0x80 | u8(r) & mask; - return buf, 2; + buf[0] = 0xc0 | u8(r>>6) + buf[1] = 0x80 | u8(r) & mask + return buf, 2 } // Invalid or Surrogate range if i > 0x0010ffff || (0xd800 <= i && i <= 0xdfff) { - r = 0xfffd; + r = 0xfffd } if i <= 1<<16-1 { - buf[0] = 0xe0 | u8(r>>12); - buf[1] = 0x80 | u8(r>>6) & mask; - buf[2] = 0x80 | u8(r) & mask; - return buf, 3; + buf[0] = 0xe0 | u8(r>>12) + buf[1] = 0x80 | u8(r>>6) & mask + buf[2] = 0x80 | u8(r) & mask + return buf, 3 } - buf[0] = 0xf0 | u8(r>>18); - buf[1] = 0x80 | u8(r>>12) & mask; - buf[2] = 0x80 | u8(r>>6) & mask; - buf[3] = 0x80 | u8(r) & mask; - return buf, 4; + buf[0] = 0xf0 | u8(r>>18) + buf[1] = 0x80 | u8(r>>12) & mask + buf[2] = 0x80 | u8(r>>6) & mask + buf[3] = 0x80 | u8(r) & mask + return buf, 4 } decode_rune_in_string :: #force_inline proc(s: string) -> (rune, int) { - return decode_rune(transmute([]u8)s); + return decode_rune(transmute([]u8)s) } decode_rune :: proc(s: []u8) -> (rune, int) { - n := len(s); + n := len(s) if n < 1 { - return RUNE_ERROR, 0; + return RUNE_ERROR, 0 } - s0 := s[0]; - x := accept_sizes[s0]; + s0 := s[0] + x := accept_sizes[s0] if x >= 0xF0 { - mask := rune(x) << 31 >> 31; // NOTE(bill): Create 0x0000 or 0xffff. - return rune(s[0])&~mask | RUNE_ERROR&mask, 1; + mask := rune(x) << 31 >> 31 // NOTE(bill): Create 0x0000 or 0xffff. + return rune(s[0])&~mask | RUNE_ERROR&mask, 1 } - sz := x & 7; - accept := accept_ranges[x>>4]; + sz := x & 7 + accept := accept_ranges[x>>4] if n < int(sz) { - return RUNE_ERROR, 1; + return RUNE_ERROR, 1 } - b1 := s[1]; + b1 := s[1] if b1 < accept.lo || accept.hi < b1 { - return RUNE_ERROR, 1; + return RUNE_ERROR, 1 } if sz == 2 { - return rune(s0&MASK2)<<6 | rune(b1&MASKX), 2; + return rune(s0&MASK2)<<6 | rune(b1&MASKX), 2 } - b2 := s[2]; + b2 := s[2] if b2 < LOCB || HICB < b2 { - return RUNE_ERROR, 1; + return RUNE_ERROR, 1 } if sz == 3 { - return rune(s0&MASK3)<<12 | rune(b1&MASKX)<<6 | rune(b2&MASKX), 3; + return rune(s0&MASK3)<<12 | rune(b1&MASKX)<<6 | rune(b2&MASKX), 3 } - b3 := s[3]; + b3 := s[3] if b3 < LOCB || HICB < b3 { - return RUNE_ERROR, 1; + return RUNE_ERROR, 1 } - return rune(s0&MASK4)<<18 | rune(b1&MASKX)<<12 | rune(b2&MASKX)<<6 | rune(b3&MASKX), 4; + return rune(s0&MASK4)<<18 | rune(b1&MASKX)<<12 | rune(b2&MASKX)<<6 | rune(b3&MASKX), 4 } string_to_runes :: proc(s: string, allocator := context.allocator) -> (runes: []rune) { - n := rune_count_in_string(s); + n := rune_count_in_string(s) - runes = make([]rune, n, allocator); - i := 0; + runes = make([]rune, n, allocator) + i := 0 for r in s { - runes[i] = r; - i += 1; + runes[i] = r + i += 1 } - return; + return } runes_to_string :: proc(runes: []rune, allocator := context.allocator) -> string { - byte_count := 0; + byte_count := 0 for r in runes { - _, w := encode_rune(r); - byte_count += w; + _, w := encode_rune(r) + byte_count += w } - bytes := make([]byte, byte_count, allocator); - offset := 0; + bytes := make([]byte, byte_count, allocator) + offset := 0 for r in runes { - b, w := encode_rune(r); - copy(bytes[offset:], b[:w]); - offset += w; + b, w := encode_rune(r) + copy(bytes[offset:], b[:w]) + offset += w } - return string(bytes); + return string(bytes) } decode_last_rune_in_string :: #force_inline proc(s: string) -> (rune, int) { - return decode_last_rune(transmute([]u8)s); + return decode_last_rune(transmute([]u8)s) } decode_last_rune :: proc(s: []u8) -> (rune, int) { - r: rune; - size: int; - start, end, limit: int; + r: rune + size: int + start, end, limit: int - end = len(s); + end = len(s) if end == 0 { - return RUNE_ERROR, 0; + return RUNE_ERROR, 0 } - start = end-1; - r = rune(s[start]); + start = end-1 + r = rune(s[start]) if r < RUNE_SELF { - return r, 1; + return r, 1 } - limit = max(end - UTF_MAX, 0); + limit = max(end - UTF_MAX, 0) for start-=1; start >= limit; start-=1 { if rune_start(s[start]) { - break; + break } } - start = max(start, 0); - r, size = decode_rune(s[start:end]); + start = max(start, 0) + r, size = decode_rune(s[start:end]) if start+size != end { - return RUNE_ERROR, 1; + return RUNE_ERROR, 1 } - return r, size; + return r, size } rune_at_pos :: proc(s: string, pos: int) -> rune { if pos < 0 { - return RUNE_ERROR; + return RUNE_ERROR } - i := 0; + i := 0 for r in s { if i == pos { - return r; + return r } - i += 1; + i += 1 } - return RUNE_ERROR; + return RUNE_ERROR } rune_string_at_pos :: proc(s: string, pos: int) -> string { if pos < 0 { - return ""; + return "" } - i := 0; + i := 0 for c, offset in s { if i == pos { - w := rune_size(c); - return s[offset:][:w]; + w := rune_size(c) + return s[offset:][:w] } - i += 1; + i += 1 } - return ""; + return "" } rune_at :: proc(s: string, byte_index: int) -> rune { - r, _ := decode_rune_in_string(s[byte_index:]); - return r; + r, _ := decode_rune_in_string(s[byte_index:]) + return r } // Returns the byte position of rune at position pos in s with an optional start byte position. // Returns -1 if it runs out of the string. rune_offset :: proc(s: string, pos: int, start: int = 0) -> int { if pos < 0 { - return -1; + return -1 } - i := 0; + i := 0 for _, offset in s[start:] { if i == pos { - return offset+start; + return offset+start } - i += 1; + i += 1 } - return -1; + return -1 } valid_rune :: proc(r: rune) -> bool { if r < 0 { - return false; + return false } else if SURROGATE_MIN <= r && r <= SURROGATE_MAX { - return false; + return false } else if r > MAX_RUNE { - return false; + return false } - return true; + return true } valid_string :: proc(s: string) -> bool { - n := len(s); + n := len(s) for i := 0; i < n; { - si := s[i]; + si := s[i] if si < RUNE_SELF { // ascii - i += 1; - continue; + i += 1 + continue } - x := accept_sizes[si]; + x := accept_sizes[si] if x == 0xf1 { - return false; + return false } - size := int(x & 7); + size := int(x & 7) if i+size > n { - return false; + return false } - ar := accept_ranges[x>>4]; + ar := accept_ranges[x>>4] if b := s[i+1]; b < ar.lo || ar.hi < b { - return false; + return false } else if size == 2 { // Okay } else if c := s[i+2]; c < 0x80 || 0xbf < c { - return false; + return false } else if size == 3 { // Okay } else if d := s[i+3]; b < 0x80 || 0xbf < d { - return false; + return false } - i += size; + i += size } - return true; + return true } rune_start :: #force_inline proc(b: u8) -> bool { - return b&0xc0 != 0x80; + return b&0xc0 != 0x80 } rune_count_in_string :: #force_inline proc(s: string) -> int { - return rune_count(transmute([]u8)s); + return rune_count(transmute([]u8)s) } rune_count :: proc(s: []u8) -> int { - count := 0; - n := len(s); + count := 0 + n := len(s) for i := 0; i < n; { - defer count += 1; - si := s[i]; + defer count += 1 + si := s[i] if si < RUNE_SELF { // ascii - i += 1; - continue; + i += 1 + continue } - x := accept_sizes[si]; + x := accept_sizes[si] if x == 0xf1 { - i += 1; - continue; + i += 1 + continue } - size := int(x & 7); + size := int(x & 7) if i+size > n { - i += 1; - continue; + i += 1 + continue } - ar := accept_ranges[x>>4]; + ar := accept_ranges[x>>4] if b := s[i+1]; b < ar.lo || ar.hi < b { - size = 1; + size = 1 } else if size == 2 { // Okay } else if c := s[i+2]; c < 0x80 || 0xbf < c { - size = 1; + size = 1 } else if size == 3 { // Okay } else if d := s[i+3]; d < 0x80 || 0xbf < d { - size = 1; + size = 1 } - i += size; + i += size } - return count; + return count } rune_size :: proc(r: rune) -> int { switch { - case r < 0: return -1; - case r <= 1<<7 - 1: return 1; - case r <= 1<<11 - 1: return 2; - case SURROGATE_MIN <= r && r <= SURROGATE_MAX: return -1; - case r <= 1<<16 - 1: return 3; - case r <= MAX_RUNE: return 4; - } - return -1; + case r < 0: return -1 + case r <= 1<<7 - 1: return 1 + case r <= 1<<11 - 1: return 2 + case SURROGATE_MIN <= r && r <= SURROGATE_MAX: return -1 + case r <= 1<<16 - 1: return 3 + case r <= MAX_RUNE: return 4 + } + return -1 } // full_rune reports if the bytes in b begin with a full utf-8 encoding of a rune or not // An invalid encoding is considered a full rune since it will convert as an error rune of width 1 (RUNE_ERROR) full_rune :: proc(b: []byte) -> bool { - n := len(b); + n := len(b) if n == 0 { - return false; + return false } - x := _first[b[0]]; + x := _first[b[0]] if n >= int(x & 7) { - return true; + return true } - accept := accept_ranges[x>>4]; + accept := accept_ranges[x>>4] if n > 1 && (b[1] < accept.lo || accept.hi < b[1]) { - return true; + return true } else if n > 2 && (b[2] < LOCB || HICB < b[2]) { - return true; + return true } - return false; + return false } // full_rune_in_string reports if the bytes in s begin with a full utf-8 encoding of a rune or not // An invalid encoding is considered a full rune since it will convert as an error rune of width 1 (RUNE_ERROR) full_rune_in_string :: proc(s: string) -> bool { - return full_rune(transmute([]byte)s); + return full_rune(transmute([]byte)s) } @@ -390,4 +390,4 @@ _first := [256]u8{ 0xf1..0xf3 = 0x04, // accept 0, size 4 0xf4 = 0x44, // accept 4, size 4 0xf5..0xff = 0xf1, // ascii, size 1 -}; +} |