diff options
| author | skytrias <skytrias@protonmail.com> | 2022-12-01 04:20:50 +0100 |
|---|---|---|
| committer | skytrias <skytrias@protonmail.com> | 2022-12-18 23:11:23 +0100 |
| commit | 3f4bbbec29e663aab98ade2d0448456138c43ac8 (patch) | |
| tree | f479c9b74b3b165506c21bc9a3e39275ffd8b504 /core/text/lua | |
| parent | 70bd220f3477b31c4e34b9cc08ee2c975194d26a (diff) | |
add proper unicode walking
Diffstat (limited to 'core/text/lua')
| -rw-r--r-- | core/text/lua/strlib.odin | 325 |
1 files changed, 206 insertions, 119 deletions
diff --git a/core/text/lua/strlib.odin b/core/text/lua/strlib.odin index 47ca73d24..2d4543f75 100644 --- a/core/text/lua/strlib.odin +++ b/core/text/lua/strlib.odin @@ -1,5 +1,7 @@ package strlib +import "core:unicode" +import "core:unicode/utf8" import "core:strings" MAXCAPTURES :: 32 @@ -10,7 +12,7 @@ Capture :: struct { } Match :: struct { - start, end: int, + byte_start, byte_end: int, } Error :: enum { @@ -20,6 +22,7 @@ Error :: enum { Invalid_Pattern_Capture, Unfinished_Capture, Malformed_Pattern, + Rune_Error, } L_ESC :: '%' @@ -34,8 +37,8 @@ MatchState :: struct { capture: [MAXCAPTURES]Capture, } -match_class :: proc(c: u8, cl: u8) -> (res: bool) { - switch tolower(cl) { +match_class :: proc(c: rune, cl: rune) -> (res: bool) { + switch unicode.to_lower(cl) { case 'a': res = isalpha(c) case 'c': res = iscntrl(c) case 'd': res = isdigit(c) @@ -52,53 +55,92 @@ match_class :: proc(c: u8, cl: u8) -> (res: bool) { return islower(cl) ? res : !res } -isalpha :: proc(c: u8) -> bool { - return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') +isalpha :: proc(c: rune) -> bool { + return unicode.is_alpha(c) } -isdigit :: proc(c: u8) -> bool { - return '0' <= c && c <= '9' +isdigit :: proc(c: rune) -> bool { + return unicode.is_digit(c) } -isalnum :: proc(c: u8) -> bool { - return isalpha(c) || isdigit(c) +isalnum :: proc(c: rune) -> bool { + return unicode.is_alpha(c) || unicode.is_digit(c) } -iscntrl :: proc(c: u8) -> bool { - return c <= '\007' || (c >= '\010' && c <= '\017') || (c >= '\020' && c <= '\027') || (c >= '\030' && c <= '\037') || c == '\177' +iscntrl :: proc(c: rune) -> bool { + return unicode.is_control(c) } -islower :: proc(c: u8) -> bool { - return c >= 'a' && c <= 'z' +islower :: proc(c: rune) -> bool { + return unicode.is_lower(c) } -isupper :: proc(c: u8) -> bool { - return c >= 'A' && c <= 'Z' +isupper :: proc(c: rune) -> bool { + return unicode.is_upper(c) } -isgraph :: proc(c: u8) -> bool { - return isdigit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') +isgraph :: proc(c: rune) -> bool { + return unicode.is_digit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') } -ispunct :: proc(c: u8) -> bool { - return (c >= '{' && c <= '~') || (c == '`') || (c >= '[' && c <= '_') || (c == '@') || (c >= ':' && c <= '?') || (c >= '(' && c <= '/') || (c >= '!' && c <= '\'') +ispunct :: proc(c: rune) -> bool { + return unicode.is_punct(c) } -isxdigit :: proc(c: u8) -> bool { - return isdigit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') +isxdigit :: proc(c: rune) -> bool { + return unicode.is_digit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') } -isspace :: proc(c: u8) -> bool { - return c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r' || c == ' ' +isspace :: proc(c: rune) -> bool { + return unicode.is_space(c) } -// ascii safe -tolower :: proc(c: u8) -> u8 { - if c >= 65 && c <= 90 { // upper case - return c + 32 +utf8_peek :: proc(bytes: string) -> (c: rune, size: int, err: Error) { + c, size = utf8.decode_rune_in_string(bytes) + + if c == utf8.RUNE_ERROR { + err = .Rune_Error + } + + return +} + +utf8_advance :: proc(bytes: string, index: ^int) -> (c: rune, err: Error) { + size: int + c, size = utf8.decode_rune_in_string(bytes[index^:]) + + if c == utf8.RUNE_ERROR { + err = .Rune_Error + } + + index^ += size + return +} + +// continuation byte? +is_cont :: proc(b: byte) -> bool { + return b & 0xc0 == 0x80 +} + +utf8_prev :: proc(bytes: string, a, b: int) -> int { + b := b + + for a < b && is_cont(bytes[b - 1]) { + b -= 1 + } + + return a < b ? b - 1 : a +} + +utf8_next :: proc(bytes: string, a: int) -> int { + a := a + b := len(bytes) + + for a < b - 1 && is_cont(bytes[a + 1]) { + a += 1 } - return c + return a < b ? a + 1 : b } check_capture :: proc(ms: ^MatchState, l: rune) -> (int, Error) { @@ -125,54 +167,52 @@ capture_to_close :: proc(ms: ^MatchState) -> (int, Error) { return 0, .Invalid_Pattern_Capture } -classend :: proc(ms: ^MatchState, p: int) -> (int, Error) { - ch := ms.pattern[p] - p := p + 1 +classend :: proc(ms: ^MatchState, p: int) -> (step: int, err: Error) { + step = p + ch := utf8_advance(ms.pattern, &step) or_return switch ch { case L_ESC: { - // if > 0 { - // fmt.eprintln("ERR classend: not enough pattern length") - // return nil - // } + if step == len(ms.pattern) { + err = .Malformed_Pattern + return + } - return p + 1, .OK + utf8_advance(ms.pattern, &step) or_return } case '[': { - if ms.pattern[p] == '^' { - p += 1 + // fine with step by 1 + if ms.pattern[step] == '^' { + step += 1 } - for ms.pattern[p] != ']' { - // if p == len(ms.pattern) { - // return 0, .Malformed_Pattern - // } + // run till end is reached + for ms.pattern[step] != ']' { + if step == len(ms.pattern) { + err = .Malformed_Pattern + return + } - ch := ms.pattern[p] - p += 1 + // dont care about utf8 here + step += 1 - if p < len(ms.pattern) && ch == L_ESC { + if step < len(ms.pattern) && ms.pattern[step] == L_ESC { // skip escapes like '%' - p += 1 + step += 1 } - - // if ms.pattern[p] == ']' { - // break - // } } - return p + 1, .OK - } - - case: { - return p, .OK + // advance last time + step += 1 } } + + return } -matchbracketclass :: proc(ms: ^MatchState, c: u8, p, ec: int) -> bool { - sig := true +matchbracketclass :: proc(ms: ^MatchState, c: rune, p, ec: int) -> (sig: bool, err: Error) { + sig = true p := p if ms.pattern[p + 1] == '^' { @@ -180,98 +220,127 @@ matchbracketclass :: proc(ms: ^MatchState, c: u8, p, ec: int) -> bool { sig = false } - p += 1 - // while inside of class range for p < ec { - ch := ms.pattern[p] + char := utf8_advance(ms.pattern, &p) or_return // e.g. %a - if ms.pattern[p] == L_ESC { - p += 1 + if char == L_ESC { + next := utf8_advance(ms.pattern, &p) or_return - if match_class(c, ms.pattern[p]) { - return sig - } - } else if p + 2 < len(ms.pattern) && ms.pattern[p + 1] == '-' { - // e.g. [a-z] check - if ms.pattern[p] <= c && c <= ms.pattern[p + 2] { - return sig + if match_class(c, next) { + return } + } else { + next, next_size := utf8_peek(ms.pattern[p:]) or_return - p += 2 - } else if ms.pattern[p] == c { - return sig - } + // TODO test case for [a-???] where ??? is missing + if next == '-' && p + next_size < len(ms.pattern) { + // advance 2 codepoints + p += next_size + last := utf8_advance(ms.pattern, &p) or_return - p += 1 + if char <= c && c <= last { + return + } + } else if char == c { + return + } + } } - return !sig + sig = !sig + return } -singlematch :: proc(ms: ^MatchState, s, p, ep: int) -> bool { +singlematch :: proc(ms: ^MatchState, s, p, ep: int) -> (matched: bool, schar_size: int, err: Error) { if s >= len(ms.src) { - return false + return } - switch ms.pattern[p] { - case '.': return true - case L_ESC: return match_class(ms.src[s], ms.pattern[p + 1]) - case '[': return matchbracketclass(ms, ms.src[s], p, ep - 1) - case: return ms.src[s] == ms.pattern[p] + pchar, psize := utf8_peek(ms.pattern[p:]) or_return + schar, ssize := utf8_peek(ms.src[s:]) or_return + schar_size = ssize + + switch pchar { + case '.': matched = true + case L_ESC: { + pchar_next, _ := utf8_peek(ms.pattern[p + psize:]) or_return + matched = match_class(schar, pchar_next) + } + case '[': { + matched = matchbracketclass(ms, schar, p, ep - 1) or_return + } + case: { + matched = schar == pchar + } } + + return } -matchbalance :: proc(ms: ^MatchState, s, p: int) -> (int, Error) { +matchbalance :: proc(ms: ^MatchState, s, p: int) -> (unused: int, err: Error) { if p >= len(ms.pattern) - 1 { return INVALID, .Invalid_Pattern_Capture } + schar, ssize := utf8_peek(ms.src[s:]) or_return + pchar, psize := utf8_peek(ms.pattern[p:]) or_return + // skip until the src and pattern match - if ms.src[s] != ms.pattern[p] { + if schar != pchar { return INVALID, .OK } s_begin := s cont := 1 - s := s + 1 - begin := ms.pattern[p] - end := ms.pattern[p + 1] + s := s + ssize + begin := pchar + end, _ := utf8_peek(ms.pattern[p + psize:]) or_return for s < len(ms.src) { - ch := ms.src[s] + ch := utf8_advance(ms.src, &s) or_return if ch == end { cont -= 1 if cont == 0 { - return s + 1, .OK + return s, .OK } } else if ch == begin { cont += 1 } - - s += 1 } return INVALID, .OK } max_expand :: proc(ms: ^MatchState, s, p, ep: int) -> (res: int, err: Error) { - i := 0 - for singlematch(ms, s + i, p, ep) { - i += 1 + m := s + + // count up matches + for { + matched, size := singlematch(ms, m, p, ep) or_return + + if !matched { + break + } + + m += size } - for i >= 0 { - result := match(ms, s + i, ep + 1) or_return + for s <= m { + result := match(ms, m, ep + 1) or_return if result != INVALID { return result, .OK } - i -= 1 + if s == m { + break + } + + m = utf8_prev(ms.src, s, m) } return INVALID, .OK @@ -285,10 +354,15 @@ min_expand :: proc(ms: ^MatchState, s, p, ep: int) -> (res: int, err: Error) { if result != INVALID { return result, .OK - } else if singlematch(ms, s, p, ep) { - s += 1 } else { - return INVALID, .OK + // TODO receive next step maybe? + matched, rune_size := singlematch(ms, s, p, ep) or_return + + if matched { + s += rune_size + } else { + return INVALID, .OK + } } } } @@ -339,7 +413,9 @@ match :: proc(ms: ^MatchState, s, p: int) -> (unused: int, err: Error) { return s, .OK } - switch ms.pattern[p] { + // NOTE we can walk by ascii steps if we know the characters are ascii + char, _ := utf8_peek(ms.pattern[p:]) or_return + switch char { case '(': { if ms.pattern[p + 1] == ')' { s = start_capture(ms, s, p + 2, CAP_POSITION) or_return @@ -389,13 +465,23 @@ match :: proc(ms: ^MatchState, s, p: int) -> (unused: int, err: Error) { } ep := classend(ms, p) or_return - previous := s == 0 ? '\x00' : ms.src[s - 1] - // allow last character to count too - current := s >= len(ms.src) ? '\x00' : ms.src[s] + previous, current: rune + + // get previous + if s != 0 { + temp := utf8_prev(ms.src, 0, s) + previous, _ = utf8_peek(ms.src[temp:]) or_return + } + + // get current + if s != len(ms.src) { + current, _ = utf8_peek(ms.src[s:]) or_return + } + + m1 := matchbracketclass(ms, previous, p, ep - 1) or_return + m2 := matchbracketclass(ms, current, p, ep - 1) or_return - // fmt.eprintln("TRY", rune(ms.src[s]), ep) - if !matchbracketclass(ms, previous, p, ep - 1) && - matchbracketclass(ms, current, p, ep - 1) { + if !m1 && m2 { return match(ms, s, ep) } @@ -428,8 +514,9 @@ match :: proc(ms: ^MatchState, s, p: int) -> (unused: int, err: Error) { match_default :: proc(ms: ^MatchState, s, p: int) -> (unused: int, err: Error) { s := s ep := classend(ms, p) or_return + single_matched, ssize := singlematch(ms, s, p, ep) or_return - if !singlematch(ms, s, p, ep) { + if !single_matched { epc := ep < len(ms.pattern) ? ms.pattern[ep] : 0 if epc == '*' || epc == '?' || epc == '-' { @@ -442,7 +529,7 @@ match_default :: proc(ms: ^MatchState, s, p: int) -> (unused: int, err: Error) { switch epc { case '?': { - result := match(ms, s + 1, ep + 1) or_return + result := match(ms, s + ssize, ep + 1) or_return if result != INVALID { s = result @@ -452,7 +539,7 @@ match_default :: proc(ms: ^MatchState, s, p: int) -> (unused: int, err: Error) { } case '+': { - s = max_expand(ms, s + 1, p, ep) or_return + s = max_expand(ms, s + ssize, p, ep) or_return } case '*': { @@ -464,7 +551,7 @@ match_default :: proc(ms: ^MatchState, s, p: int) -> (unused: int, err: Error) { } case: { - return match(ms, s + 1, ep) + return match(ms, s + ssize, ep) } } } @@ -643,8 +730,8 @@ gmatch :: proc( ok = true first := length > 1 ? 1 : 0 cap := captures[first] - res = haystack[cap.start:cap.end] - haystack^ = haystack[cap.end:] + res = haystack[cap.byte_start:cap.byte_end] + haystack^ = haystack[cap.byte_end:] } } @@ -677,13 +764,13 @@ gsub_builder :: proc( cap := captures[0] // write front till capture - strings.write_string(builder, haystack[:cap.start]) + strings.write_string(builder, haystack[:cap.byte_start]) // write replacements strings.write_string(builder, replace) // advance string till end - haystack = haystack[cap.end:] + haystack = haystack[cap.byte_end:] } strings.write_string(builder, haystack[:]) @@ -722,11 +809,11 @@ gsub_with :: proc( cap := captures[0] - word := haystack[cap.start:cap.end] + word := haystack[cap.byte_start:cap.byte_end] call(data, word) // advance string till end - haystack = haystack[cap.end:] + haystack = haystack[cap.byte_end:] } } @@ -744,8 +831,8 @@ gfind :: proc( if length != 0 && err == .OK { ok = true cap := captures[0] - res = haystack[cap.start:cap.end] - haystack^ = haystack[cap.end:] + res = haystack[cap.byte_start:cap.byte_end] + haystack^ = haystack[cap.byte_end:] } } |