diff options
| author | gingerBill <gingerBill@users.noreply.github.com> | 2022-12-22 11:08:10 +0000 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2022-12-22 11:08:10 +0000 |
| commit | c8f05b7c0c02fb2fb03d592a751bdb91ec4369f1 (patch) | |
| tree | de5547c0885b386be4d9f5cbb326d902f86eda21 /core | |
| parent | b00c4a6a8f59c33afc44eebfc8463ce0a0aec8b9 (diff) | |
| parent | 63a0395a79505598ae89c756c7394765bd89c1aa (diff) | |
Merge pull request #2269 from Skytrias/luapattern
Add lua pattern matching to core:text with tests
Diffstat (limited to 'core')
| -rw-r--r-- | core/text/match/strlib.odin | 959 |
1 files changed, 959 insertions, 0 deletions
diff --git a/core/text/match/strlib.odin b/core/text/match/strlib.odin new file mode 100644 index 000000000..b8c2861fa --- /dev/null +++ b/core/text/match/strlib.odin @@ -0,0 +1,959 @@ +package text_match + +import "core:runtime" +import "core:unicode" +import "core:unicode/utf8" +import "core:strings" + +MAX_CAPTURES :: 32 + +Capture :: struct { + init: int, + len: int, +} + +Match :: struct { + byte_start, byte_end: int, +} + +Error :: enum { + OK, + OOB, + Invalid_Capture_Index, + Invalid_Pattern_Capture, + Unfinished_Capture, + Malformed_Pattern, + Rune_Error, + Match_Invalid, +} + +L_ESC :: '%' +CAP_POSITION :: -2 +CAP_UNFINISHED :: -1 +INVALID :: -1 + +Match_State :: struct { + src: string, + pattern: string, + level: int, + capture: [MAX_CAPTURES]Capture, +} + +match_class :: proc(c: rune, cl: rune) -> (res: bool) { + switch unicode.to_lower(cl) { + case 'a': res = is_alpha(c) + case 'c': res = is_cntrl(c) + case 'd': res = is_digit(c) + case 'g': res = is_graph(c) + case 'l': res = is_lower(c) + case 'p': res = is_punct(c) + case 's': res = is_space(c) + case 'u': res = is_upper(c) + case 'w': res = is_alnum(c) + case 'x': res = is_xdigit(c) + case: return cl == c + } + + return is_lower(cl) ? res : !res +} + +is_alpha :: unicode.is_alpha +is_digit :: unicode.is_digit +is_lower :: unicode.is_lower +is_upper :: unicode.is_upper +is_punct :: unicode.is_punct +is_space :: unicode.is_space +is_cntrl :: unicode.is_control + +is_alnum :: proc(c: rune) -> bool { + return unicode.is_alpha(c) || unicode.is_digit(c) +} + +is_graph :: proc(c: rune) -> bool { + return (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') || unicode.is_digit(c) +} + +is_xdigit :: proc(c: rune) -> bool { + return (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') || unicode.is_digit(c) +} + +// find the first utf8 charater and its size, return an error if the character is an error +utf8_peek :: proc(bytes: string) -> (c: rune, size: int, err: Error) { + c, size = utf8.decode_rune_in_string(bytes) + + if c == utf8.RUNE_ERROR { + err = .Rune_Error + } + + return +} + +// find the first utf8 charater and its size and advance the index +// return an error if the character is an error +utf8_advance :: proc(bytes: string, index: ^int) -> (c: rune, err: Error) { + size: int + c, size = utf8.decode_rune_in_string(bytes[index^:]) + + if c == utf8.RUNE_ERROR { + err = .Rune_Error + } + + index^ += size + return +} + +// continuation byte? +is_cont :: proc(b: byte) -> bool { + return b & 0xc0 == 0x80 +} + +utf8_prev :: proc(bytes: string, a, b: int) -> int { + b := b + + for a < b && is_cont(bytes[b - 1]) { + b -= 1 + } + + return a < b ? b - 1 : a +} + +utf8_next :: proc(bytes: string, a: int) -> int { + a := a + b := len(bytes) + + for a < b - 1 && is_cont(bytes[a + 1]) { + a += 1 + } + + return a < b ? a + 1 : b +} + +check_capture :: proc(ms: ^Match_State, l: rune) -> (int, Error) { + l := int(l - '1') + + if l < 0 || l >= ms.level || ms.capture[l].len == CAP_UNFINISHED { + return 0, .Invalid_Capture_Index + } + + return l, .OK +} + +capture_to_close :: proc(ms: ^Match_State) -> (int, Error) { + level := ms.level - 1 + + for level >= 0 { + if ms.capture[level].len == CAP_UNFINISHED { + return level, .OK + } + + level -= 1 + } + + return 0, .Invalid_Pattern_Capture +} + +class_end :: proc(ms: ^Match_State, p: int) -> (step: int, err: Error) { + step = p + ch := utf8_advance(ms.pattern, &step) or_return + + switch ch { + case L_ESC: + if step == len(ms.pattern) { + err = .Malformed_Pattern + return + } + + utf8_advance(ms.pattern, &step) or_return + + case '[': + // fine with step by 1 + if step + 1 < len(ms.pattern) && ms.pattern[step] == '^' { + step += 1 + } + + // run till end is reached + for { + if step == len(ms.pattern) { + err = .Malformed_Pattern + return + } + + if ms.pattern[step] == ']' { + break + } + + // dont care about utf8 here + step += 1 + + if step < len(ms.pattern) && ms.pattern[step] == L_ESC { + // skip escapes like '%' + step += 1 + } + } + + // advance last time + step += 1 + } + + return +} + +match_bracket_class :: proc(ms: ^Match_State, c: rune, p, ec: int) -> (sig: bool, err: Error) { + sig = true + p := p + + if ms.pattern[p + 1] == '^' { + p += 1 + sig = false + } + + // while inside of class range + for p < ec { + char := utf8_advance(ms.pattern, &p) or_return + + // e.g. %a + if char == L_ESC { + next := utf8_advance(ms.pattern, &p) or_return + + if match_class(c, next) { + return + } + } else { + next, next_size := utf8_peek(ms.pattern[p:]) or_return + + // TODO test case for [a-???] where ??? is missing + if next == '-' && p + next_size < len(ms.pattern) { + // advance 2 codepoints + p += next_size + last := utf8_advance(ms.pattern, &p) or_return + + if char <= c && c <= last { + return + } + } else if char == c { + return + } + } + } + + sig = !sig + return +} + +single_match :: proc(ms: ^Match_State, s, p, ep: int) -> (matched: bool, schar_size: int, err: Error) { + if s >= len(ms.src) { + return + } + + pchar, psize := utf8_peek(ms.pattern[p:]) or_return + schar, ssize := utf8_peek(ms.src[s:]) or_return + schar_size = ssize + + switch pchar { + case '.': matched = true + case L_ESC: + pchar_next, _ := utf8_peek(ms.pattern[p + psize:]) or_return + matched = match_class(schar, pchar_next) + case '[': matched = match_bracket_class(ms, schar, p, ep - 1) or_return + case: matched = schar == pchar + } + + return +} + +match_balance :: proc(ms: ^Match_State, s, p: int) -> (unused: int, err: Error) { + if p >= len(ms.pattern) - 1 { + return INVALID, .Invalid_Pattern_Capture + } + + schar, ssize := utf8_peek(ms.src[s:]) or_return + pchar, psize := utf8_peek(ms.pattern[p:]) or_return + + // skip until the src and pattern match + if schar != pchar { + return INVALID, .OK + } + + s_begin := s + cont := 1 + s := s + ssize + begin := pchar + end, _ := utf8_peek(ms.pattern[p + psize:]) or_return + + for s < len(ms.src) { + ch := utf8_advance(ms.src, &s) or_return + + switch ch{ + case end: + cont -= 1 + + if cont == 0 { + return s, .OK + } + + case begin: + cont += 1 + } + } + + return INVALID, .OK +} + +max_expand :: proc(ms: ^Match_State, s, p, ep: int) -> (res: int, err: Error) { + m := s + + // count up matches + for { + matched, size := single_match(ms, m, p, ep) or_return + + if !matched { + break + } + + m += size + } + + for s <= m { + result := match(ms, m, ep + 1) or_return + + if result != INVALID { + return result, .OK + } + + if s == m { + break + } + + m = utf8_prev(ms.src, s, m) + } + + return INVALID, .OK +} + +min_expand :: proc(ms: ^Match_State, s, p, ep: int) -> (res: int, err: Error) { + s := s + + for { + result := match(ms, s, ep + 1) or_return + + if result != INVALID { + return result, .OK + } else { + // TODO receive next step maybe? + matched, rune_size := single_match(ms, s, p, ep) or_return + + if matched { + s += rune_size + } else { + return INVALID, .OK + } + } + } +} + +start_capture :: proc(ms: ^Match_State, s, p, what: int) -> (res: int, err: Error) { + level := ms.level + + ms.capture[level].init = s + ms.capture[level].len = what + ms.level += 1 + + res = match(ms, s, p) or_return + if res == INVALID { + ms.level -= 1 + } + return +} + +end_capture :: proc(ms: ^Match_State, s, p: int) -> (res: int, err: Error) { + l := capture_to_close(ms) or_return + + // TODO double check, could do string as int index + ms.capture[l].len = s - ms.capture[l].init + + res = match(ms, s, p) or_return + if res == INVALID { + ms.capture[l].len = CAP_UNFINISHED + } + return +} + +match_capture :: proc(ms: ^Match_State, s: int, char: rune) -> (res: int, err: Error) { + index := check_capture(ms, char) or_return + length := ms.capture[index].len + + if len(ms.src) - s >= length { + return s + length, .OK + } + + return INVALID, .OK +} + +match :: proc(ms: ^Match_State, s, p: int) -> (unused: int, err: Error) { + s := s + p := p + + if p == len(ms.pattern) { + return s, .OK + } + + // NOTE we can walk by ascii steps if we know the characters are ascii + char, _ := utf8_peek(ms.pattern[p:]) or_return + switch char { + case '(': + if p + 1 < len(ms.pattern) && ms.pattern[p + 1] == ')' { + s = start_capture(ms, s, p + 2, CAP_POSITION) or_return + } else { + s = start_capture(ms, s, p + 1, CAP_UNFINISHED) or_return + } + + case ')': + s = end_capture(ms, s, p + 1) or_return + + case '$': + if p + 1 != len(ms.pattern) { + return match_default(ms, s, p) + } + + if len(ms.src) != s { + s = INVALID + } + + case L_ESC: + // stop short patterns like "%" only + if p + 1 >= len(ms.pattern) { + err = .OOB + return + } + + switch ms.pattern[p + 1] { + // balanced string + case 'b': + s = match_balance(ms, s, p + 2) or_return + + if s != INVALID { + // eg after %b() + return match(ms, s, p + 4) + } + + // frontier + case 'f': + p += 2 + + if ms.pattern[p] != '[' { + return INVALID, .Invalid_Pattern_Capture + } + + ep := class_end(ms, p) or_return + previous, current: rune + + // get previous + if s != 0 { + temp := utf8_prev(ms.src, 0, s) + previous, _ = utf8_peek(ms.src[temp:]) or_return + } + + // get current + if s != len(ms.src) { + current, _ = utf8_peek(ms.src[s:]) or_return + } + + m1 := match_bracket_class(ms, previous, p, ep - 1) or_return + m2 := match_bracket_class(ms, current, p, ep - 1) or_return + + if !m1 && m2 { + return match(ms, s, ep) + } + + s = INVALID + + // capture group + case '0'..<'9': + s = match_capture(ms, s, rune(ms.pattern[p + 1])) or_return + + if s != INVALID { + return match(ms, s, p + 2) + } + + case: return match_default(ms, s, p) + } + + case: + return match_default(ms, s, p) + } + + return s, .OK +} + +match_default :: proc(ms: ^Match_State, s, p: int) -> (unused: int, err: Error) { + s := s + ep := class_end(ms, p) or_return + single_matched, ssize := single_match(ms, s, p, ep) or_return + + if !single_matched { + epc := ep < len(ms.pattern) ? ms.pattern[ep] : 0 + + switch epc { + case '*', '?', '-': return match(ms, s, ep + 1) + case: s = INVALID + } + } else { + epc := ep < len(ms.pattern) ? ms.pattern[ep] : 0 + + switch epc { + case '?': + result := match(ms, s + ssize, ep + 1) or_return + + if result != INVALID { + s = result + } else { + return match(ms, s, ep + 1) + } + + case '+': s = max_expand(ms, s + ssize, p, ep) or_return + case '*': s = max_expand(ms, s, p, ep) or_return + case '-': s = min_expand(ms, s, p, ep) or_return + case: return match(ms, s + ssize, ep) + } + } + + return s, .OK +} + +push_onecapture :: proc(ms: ^Match_State, i: int, s: int, e: int, matches: []Match) -> (err: Error) { + if i >= ms.level { + if i == 0 { + matches[0] = { 0, e - s } + } else { + err = .Invalid_Capture_Index + } + } else { + init := ms.capture[i].init + length := ms.capture[i].len + + switch length { + case CAP_UNFINISHED: err = .Unfinished_Capture + case CAP_POSITION: matches[i] = { init, init + 1 } + case: matches[i] = { init, init + length } + } + } + + return +} + +push_captures :: proc( + ms: ^Match_State, + s: int, + e: int, + matches: []Match, +) -> (nlevels: int, err: Error) { + nlevels = 1 if ms.level == 0 && s != -1 else ms.level + + for i in 0..<nlevels { + push_onecapture(ms, i, s, e, matches) or_return + } + + return +} + +// SPECIALS := "^$*+?.([%-" +// all special characters inside a small ascii array +SPECIALS_TABLE := [256]bool { + '^' = true, + '$' = true, + '*' = true, + '+' = true, + '?' = true, + '.' = true, + '(' = true, + '[' = true, + '%' = true, + '-' = true, +} + +// helper call to quick search for special characters +index_special :: proc(text: string) -> int { + for i in 0..<len(text) { + if SPECIALS_TABLE[text[i]] { + return i + } + } + + return -1 +} + +lmem_find :: proc(s1, s2: string) -> int { + l1 := len(s1) + l2 := len(s2) + + if l2 == 0 { + return 0 + } else if l2 > l1 { + return -1 + } else { + init := strings.index_byte(s1, s2[0]) + end := init + l2 + + for end <= l1 && init != -1 { + init += 1 + + if s1[init - 1:end] == s2 { + return init - 1 + } else { + next := strings.index_byte(s1[init:], s2[0]) + + if next == -1 { + return -1 + } else { + init = init + next + end = init + l2 + } + } + } + } + + return -1 +} + +// find a pattern with in a haystack with an offset +// allow_memfind will speed up simple searches +find_aux :: proc( + haystack: string, + pattern: string, + offset: int, + allow_memfind: bool, + matches: ^[MAX_CAPTURES]Match, +) -> (captures: int, err: Error) { + s := offset + p := 0 + + specials_idx := index_special(pattern) + if allow_memfind && specials_idx == -1 { + if index := lmem_find(haystack[s:], pattern); index != -1 { + matches[0] = { index + s, index + s + len(pattern) } + captures = 1 + return + } else { + return + } + } + + pattern := pattern + anchor: bool + if len(pattern) > 0 && pattern[0] == '^' { + anchor = true + pattern = pattern[1:] + } + + ms := Match_State { + src = haystack, + pattern = pattern, + } + + for { + res := match(&ms, s, p) or_return + + if res != INVALID { + // disallow non advancing match + if s == res { + err = .Match_Invalid + } + + // NOTE(Skytrias): first result is reserved for a full match + matches[0] = { s, res } + + // rest are the actual captures + captures = push_captures(&ms, -1, -1, matches[1:]) or_return + captures += 1 + + return + } + + s += 1 + + if !(s < len(ms.src) && !anchor) { + break + } + } + + return +} + +// iterative matching which returns the 0th/1st match +// rest has to be used from captures +gmatch :: proc( + haystack: ^string, + pattern: string, + captures: ^[MAX_CAPTURES]Match, +) -> (res: string, ok: bool) { + if len(haystack) > 0 { + length, err := find_aux(haystack^, pattern, 0, false, captures) + + if length != 0 && err == .OK { + ok = true + first := length > 1 ? 1 : 0 + cap := captures[first] + res = haystack[cap.byte_start:cap.byte_end] + haystack^ = haystack[cap.byte_end:] + } + } + + return +} + +// gsub with builder, replace patterns found with the replace content +gsub_builder :: proc( + builder: ^strings.Builder, + haystack: string, + pattern: string, + replace: string, +) -> string { + // find matches + captures: [MAX_CAPTURES]Match + haystack := haystack + + for { + length, err := find_aux(haystack, pattern, 0, false, &captures) + + // done + if length == 0 { + break + } + + if err != .OK { + return {} + } + + cap := captures[0] + + // write front till capture + strings.write_string(builder, haystack[:cap.byte_start]) + + // write replacements + strings.write_string(builder, replace) + + // advance string till end + haystack = haystack[cap.byte_end:] + } + + strings.write_string(builder, haystack[:]) + return strings.to_string(builder^) +} + +// uses temp builder to build initial string - then allocates the result +gsub_allocator :: proc( + haystack: string, + pattern: string, + replace: string, + allocator := context.allocator, +) -> string { + builder := strings.builder_make(0, 256, context.temp_allocator) + return gsub_builder(&builder, haystack, pattern, replace) +} + +Gsub_Proc :: proc( + // optional passed data + data: rawptr, + // word match found + word: string, + // current haystack for found captures + haystack: string, + // found captures - empty for no captures + captures: []Match, +) + +// call a procedure on every match in the haystack +gsub_with :: proc( + haystack: string, + pattern: string, + data: rawptr, + call: Gsub_Proc, +) { + // find matches + captures: [MAX_CAPTURES]Match + haystack := haystack + + for { + length, err := find_aux(haystack, pattern, 0, false, &captures) + + // done + if length == 0 || err != .OK { + break + } + + cap := captures[0] + + word := haystack[cap.byte_start:cap.byte_end] + call(data, word, haystack, captures[1:length]) + + // advance string till end + haystack = haystack[cap.byte_end:] + } +} + +gsub :: proc { gsub_builder, gsub_allocator } + +// iterative find with zeroth capture only +gfind :: proc( + haystack: ^string, + pattern: string, + captures: ^[MAX_CAPTURES]Match, +) -> (res: string, ok: bool) { + if len(haystack) > 0 { + length, err := find_aux(haystack^, pattern, 0, true, captures) + + if length != 0 && err == .OK { + ok = true + cap := captures[0] + res = haystack[cap.byte_start:cap.byte_end] + haystack^ = haystack[cap.byte_end:] + } + } + + return +} + +// rebuilds a pattern into a case insensitive pattern +pattern_case_insensitive_builder :: proc( + builder: ^strings.Builder, + pattern: string, +) -> (res: string) { + p := pattern + last_percent: bool + + for len(p) > 0 { + char, size := utf8.decode_rune_in_string(p) + + if unicode.is_alpha(char) && !last_percent { + // write character class in manually + strings.write_byte(builder, '[') + strings.write_rune(builder, unicode.to_lower(char)) + strings.write_rune(builder, unicode.to_upper(char)) + strings.write_byte(builder, ']') + } else { + strings.write_rune(builder, char) + } + + last_percent = char == L_ESC + p = p[size:] + } + + return strings.to_string(builder^) +} + +pattern_case_insensitive_allocator :: proc( + pattern: string, + cap: int = 256, + allocator := context.allocator, +) -> (res: string) { + builder := strings.builder_make(0, cap, context.temp_allocator) + return pattern_case_insensitive_builder(&builder, pattern) +} + +pattern_case_insensitive :: proc { pattern_case_insensitive_builder, pattern_case_insensitive_allocator } + +// Matcher helper struct that stores optional data you might want to use or not +// as lua is far more dynamic this helps dealing with too much data +// this also allows use of find/match/gmatch at through one struct +Matcher :: struct { + haystack: string, + pattern: string, + captures: [MAX_CAPTURES]Match, + captures_length: int, + offset: int, + err: Error, + + // changing content for iterators + iter: string, + iter_index: int, +} + +// init using haystack & pattern and an optional byte offset +matcher_init :: proc(haystack, pattern: string, offset: int = 0) -> (res: Matcher) { + res.haystack = haystack + res.pattern = pattern + res.offset = offset + res.iter = haystack + return +} + +// find the first match and return the byte start / end position in the string, true on success +matcher_find :: proc(matcher: ^Matcher) -> (start, end: int, ok: bool) #no_bounds_check { + matcher.captures_length, matcher.err = find_aux( + matcher.haystack, + matcher.pattern, + matcher.offset, + true, + &matcher.captures, + ) + ok = matcher.captures_length > 0 && matcher.err == .OK + match := matcher.captures[0] + start = match.byte_start + end = match.byte_end + return +} + +// find the first match and return the matched word, true on success +matcher_match :: proc(matcher: ^Matcher) -> (word: string, ok: bool) #no_bounds_check { + matcher.captures_length, matcher.err = find_aux( + matcher.haystack, + matcher.pattern, + matcher.offset, + false, + &matcher.captures, + ) + ok = matcher.captures_length > 0 && matcher.err == .OK + match := matcher.captures[0] + word = matcher.haystack[match.byte_start:match.byte_end] + return +} + +// get the capture at the "correct" spot, as spot 0 is reserved for the first match +matcher_capture :: proc(matcher: ^Matcher, index: int, loc := #caller_location) -> string #no_bounds_check { + runtime.bounds_check_error_loc(loc, index + 1, MAX_CAPTURES - 1) + cap := matcher.captures[index + 1] + return matcher.haystack[cap.byte_start:cap.byte_end] +} + +// get the raw match out of the captures, skipping spot 0 +matcher_capture_raw :: proc(matcher: ^Matcher, index: int, loc := #caller_location) -> Match #no_bounds_check { + runtime.bounds_check_error_loc(loc, index + 1, MAX_CAPTURES - 1) + return matcher.captures[index + 1] +} + +// alias +matcher_gmatch :: matcher_match_iter + +// iteratively match the haystack till it cant find any matches +matcher_match_iter :: proc(matcher: ^Matcher) -> (res: string, index: int, ok: bool) { + if len(matcher.iter) > 0 { + matcher.captures_length, matcher.err = find_aux( + matcher.iter, + matcher.pattern, + matcher.offset, + false, + &matcher.captures, + ) + + if matcher.captures_length != 0 && matcher.err == .OK { + ok = true + first := matcher.captures_length > 1 ? 1 : 0 + match := matcher.captures[first] + + // output + res = matcher.iter[match.byte_start:match.byte_end] + index = matcher.iter_index + + // advance + matcher.iter_index += 1 + matcher.iter = matcher.iter[match.byte_end:] + } + } + + return +} + +// get a slice of all valid captures above the first match +matcher_captures_slice :: proc(matcher: ^Matcher) -> []Match { + return matcher.captures[1:matcher.captures_length] +} |