diff options
| author | Feoramund <161657516+Feoramund@users.noreply.github.com> | 2024-07-21 21:58:25 -0400 |
|---|---|---|
| committer | Feoramund <161657516+Feoramund@users.noreply.github.com> | 2024-07-22 14:25:12 -0400 |
| commit | cb0704d51c6a170bca8206a9bb3e9796c71c6341 (patch) | |
| tree | 2e275eb1a22d74ddd5d5052a4f89aa45ef9245b0 /core/text/regex/parser/parser.odin | |
| parent | ccf8b2764d4a3add4a575b2c88b710b2d15ebae8 (diff) | |
Add `core:text/regex`
Diffstat (limited to 'core/text/regex/parser/parser.odin')
| -rw-r--r-- | core/text/regex/parser/parser.odin | 580 |
1 files changed, 580 insertions, 0 deletions
diff --git a/core/text/regex/parser/parser.odin b/core/text/regex/parser/parser.odin new file mode 100644 index 000000000..1958ee399 --- /dev/null +++ b/core/text/regex/parser/parser.odin @@ -0,0 +1,580 @@ +package regex_parser + +import "base:intrinsics" +import "core:strconv" +import "core:strings" +import "core:text/regex/common" +import "core:text/regex/tokenizer" +import "core:unicode" +import "core:unicode/utf8" + +Token :: tokenizer.Token +Token_Kind :: tokenizer.Token_Kind +Tokenizer :: tokenizer.Tokenizer + +Rune_Class_Range :: struct { + lower, upper: rune, +} +Rune_Class_Data :: struct { + runes: [dynamic]rune, + ranges: [dynamic]Rune_Class_Range, +} + + +Node_Rune :: struct { + data: rune, +} + +Node_Rune_Class :: struct { + negating: bool, + using data: Rune_Class_Data, +} + +Node_Wildcard :: struct {} + +Node_Alternation :: struct { + left, right: Node, +} + +Node_Concatenation :: struct { + nodes: [dynamic]Node, +} + +Node_Repeat_Zero :: struct { + inner: Node, +} +Node_Repeat_Zero_Non_Greedy :: struct { + inner: Node, +} +Node_Repeat_One :: struct { + inner: Node, +} +Node_Repeat_One_Non_Greedy :: struct { + inner: Node, +} + +Node_Repeat_N :: struct { + inner: Node, + lower, upper: int, +} + +Node_Optional :: struct { + inner: Node, +} +Node_Optional_Non_Greedy :: struct { + inner: Node, +} + +Node_Group :: struct { + inner: Node, + capture_id: int, + capture: bool, +} + +Node_Anchor :: struct { + start: bool, +} +Node_Word_Boundary :: struct { + non_word: bool, +} + +Node_Match_All_And_Escape :: struct {} + +Node :: union { + ^Node_Rune, + ^Node_Rune_Class, + ^Node_Wildcard, + ^Node_Concatenation, + ^Node_Alternation, + ^Node_Repeat_Zero, + ^Node_Repeat_Zero_Non_Greedy, + ^Node_Repeat_One, + ^Node_Repeat_One_Non_Greedy, + ^Node_Repeat_N, + ^Node_Optional, + ^Node_Optional_Non_Greedy, + ^Node_Group, + ^Node_Anchor, + ^Node_Word_Boundary, + + // Optimized nodes (not created by the Parser): + ^Node_Match_All_And_Escape, +} + + +left_binding_power :: proc(kind: Token_Kind) -> int { + #partial switch kind { + case .Alternate: return 1 + case .Concatenate: return 2 + case .Repeat_Zero, .Repeat_One, + .Repeat_Zero_Non_Greedy, .Repeat_One_Non_Greedy, + .Repeat_N: return 3 + case .Optional, + .Optional_Non_Greedy: return 4 + case .Open_Paren, + .Open_Paren_Non_Capture: return 9 + } + return 0 +} + + +Expected_Token :: struct { + pos: int, + kind: Token_Kind, +} + +Invalid_Repetition :: struct { + pos: int, +} + +Invalid_Token :: struct { + pos: int, + kind: Token_Kind, +} + +Invalid_Unicode :: struct { + pos: int, +} + +Too_Many_Capture_Groups :: struct { + pos: int, +} + +Unexpected_EOF :: struct { + pos: int, +} + +Error :: union { + Expected_Token, + Invalid_Repetition, + Invalid_Token, + Invalid_Unicode, + Too_Many_Capture_Groups, + Unexpected_EOF, +} + + +Parser :: struct { + flags: common.Flags, + t: Tokenizer, + + cur_token: Token, + + groups: int, +} + + +@require_results +advance :: proc(p: ^Parser) -> Error { + p.cur_token = tokenizer.scan(&p.t) + if p.cur_token.kind == .Invalid { + return Invalid_Unicode { pos = 0 } + } + return nil +} + +expect :: proc(p: ^Parser, kind: Token_Kind) -> (err: Error) { + if p.cur_token.kind == kind { + advance(p) or_return + return + } + + return Expected_Token{ + pos = p.t.offset, + kind = kind, + } +} + +null_denotation :: proc(p: ^Parser, token: Token) -> (result: Node, err: Error) { + #partial switch token.kind { + case .Rune: + r: rune + for ru in token.text { + r = ru + break + } + assert(r != 0, "Parsed an empty Rune token.") + + if .Case_Insensitive in p.flags { + lower := unicode.to_lower(r) + upper := unicode.to_upper(r) + if lower != upper { + node := new(Node_Rune_Class) + append(&node.runes, lower) + append(&node.runes, upper) + return node, nil + } + } + + node := new(Node_Rune) + node ^= { r } + return node, nil + + case .Rune_Class: + if len(token.text) == 0 { + return nil, nil + } + + node := new(Node_Rune_Class) + + for i := 0; i < len(token.text); /**/ { + r, size := utf8.decode_rune(token.text[i:]) + if i == 0 && r == '^' { + node.negating = true + i += size + continue + } + i += size + + assert(size > 0, "RegEx tokenizer passed an incomplete Rune_Class to the parser.") + + if r == '\\' { + next_r, next_size := utf8.decode_rune(token.text[i:]) + i += next_size + assert(next_size > 0, "RegEx tokenizer passed an incomplete Rune_Class to the parser.") + + // @MetaCharacter + // NOTE: These must be kept in sync with the tokenizer. + switch next_r { + case 'f': append(&node.runes, '\f') + case 'n': append(&node.runes, '\n') + case 'r': append(&node.runes, '\r') + case 't': append(&node.runes, '\t') + + case 'd': + append(&node.ranges, Rune_Class_Range{ '0', '9' }) + case 's': + append(&node.runes, '\t') + append(&node.runes, '\n') + append(&node.runes, '\f') + append(&node.runes, '\r') + append(&node.runes, ' ') + case 'w': + append(&node.ranges, Rune_Class_Range{ '0', '9' }) + append(&node.ranges, Rune_Class_Range{ 'A', 'Z' }) + append(&node.runes, '_') + append(&node.ranges, Rune_Class_Range{ 'a', 'z' }) + case 'D': + append(&node.ranges, Rune_Class_Range{ 0, '0' - 1 }) + append(&node.ranges, Rune_Class_Range{ '9' + 1, max(rune) }) + case 'S': + append(&node.ranges, Rune_Class_Range{ 0, '\t' - 1 }) + // \t and \n are adjacent. + append(&node.runes, '\x0b') // Vertical Tab + append(&node.ranges, Rune_Class_Range{ '\r' + 1, ' ' - 1 }) + append(&node.ranges, Rune_Class_Range{ ' ' + 1, max(rune) }) + case 'W': + append(&node.ranges, Rune_Class_Range{ 0, '0' - 1 }) + append(&node.ranges, Rune_Class_Range{ '9' + 1, 'A' - 1 }) + append(&node.ranges, Rune_Class_Range{ 'Z' + 1, '_' - 1 }) + append(&node.ranges, Rune_Class_Range{ '_' + 1, 'a' - 1 }) + append(&node.ranges, Rune_Class_Range{ 'z' + 1, max(rune) }) + case: + append(&node.runes, next_r) + } + continue + } + + if r == '-' && len(node.runes) > 0 { + next_r, next_size := utf8.decode_rune(token.text[i:]) + if next_size > 0 { + last := pop(&node.runes) + i += next_size + + append(&node.ranges, Rune_Class_Range{ last, next_r }) + continue + } + } + + append(&node.runes, r) + } + + if .Case_Insensitive in p.flags { + length := len(node.runes) + #no_bounds_check for i := 0; i < length; i += 1 { + r := node.runes[i] + lower := unicode.to_lower(r) + upper := unicode.to_upper(r) + + if lower != upper { + if lower != r { + append(&node.runes, lower) + } else { + append(&node.runes, upper) + } + } + } + + length = len(node.ranges) + #no_bounds_check for i := 0; i < length; i += 1 { + range := &node.ranges[i] + + min_lower := unicode.to_lower(range.lower) + max_lower := unicode.to_lower(range.upper) + + min_upper := unicode.to_upper(range.lower) + max_upper := unicode.to_upper(range.upper) + + if min_lower != min_upper && max_lower != max_upper { + range.lower = min_lower + range.upper = max_lower + append(&node.ranges, Rune_Class_Range{ min_upper, max_upper }) + } + } + } + + result = node + + case .Wildcard: + node := new(Node_Wildcard) + result = node + + case .Open_Paren: + // Because of the recursive nature of the token parser, we take the + // group number first instead of afterwards, in order to construct + // group matches from the outside in. + p.groups += 1 + if p.groups == common.MAX_CAPTURE_GROUPS { + return nil, Too_Many_Capture_Groups{ pos = token.pos } + } + this_group := p.groups + + node := new(Node_Group) + node.capture = true + node.capture_id = this_group + + node.inner = parse_expression(p, 0) or_return + expect(p, .Close_Paren) or_return + result = node + case .Open_Paren_Non_Capture: + node := new(Node_Group) + node.inner = parse_expression(p, 0) or_return + expect(p, .Close_Paren) or_return + result = node + case .Close_Paren: + node := new(Node_Rune) + node ^= { ')' } + return node, nil + + case .Anchor_Start: + node := new(Node_Anchor) + node.start = true + result = node + case .Anchor_End: + node := new(Node_Anchor) + result = node + case .Word_Boundary: + node := new(Node_Word_Boundary) + result = node + case .Non_Word_Boundary: + node := new(Node_Word_Boundary) + node.non_word = true + result = node + + case .Alternate: + // A unary alternation with a left-side empty path, i.e. `|a`. + right, right_err := parse_expression(p, left_binding_power(.Alternate)) + #partial switch specific in right_err { + case Unexpected_EOF: + // This token is a NOP, i.e. `|`. + break + case nil: + break + case: + return nil, right_err + } + + node := new(Node_Alternation) + node.right = right + result = node + + case .EOF: + return nil, Unexpected_EOF{ pos = token.pos } + + case: + return nil, Invalid_Token{ pos = token.pos, kind = token.kind } + } + + return +} + +left_denotation :: proc(p: ^Parser, token: Token, left: Node) -> (result: Node, err: Error) { + #partial switch token.kind { + case .Alternate: + if p.cur_token.kind == .Close_Paren { + // `(a|)` + // parse_expression will fail, so intervene here. + node := new(Node_Alternation) + node.left = left + return node, nil + } + + right, right_err := parse_expression(p, left_binding_power(.Alternate)) + + #partial switch specific in right_err { + case nil: + break + case Unexpected_EOF: + // EOF is okay in an alternation; it's an edge case in the way of + // expressing an optional such as `a|`. + break + case: + return nil, right_err + } + + node := new(Node_Alternation) + node.left = left + node.right = right + result = node + + case .Concatenate: + right := parse_expression(p, left_binding_power(.Concatenate)) or_return + + // There should be no need to check if right is Node_Concatenation, due + // to how the parsing direction works. + #partial switch specific in left { + case ^Node_Concatenation: + append(&specific.nodes, right) + result = specific + case: + node := new(Node_Concatenation) + append(&node.nodes, left) + append(&node.nodes, right) + result = node + } + + case .Repeat_Zero: + node := new(Node_Repeat_Zero) + node.inner = left + result = node + case .Repeat_Zero_Non_Greedy: + node := new(Node_Repeat_Zero_Non_Greedy) + node.inner = left + result = node + case .Repeat_One: + node := new(Node_Repeat_One) + node.inner = left + result = node + case .Repeat_One_Non_Greedy: + node := new(Node_Repeat_One_Non_Greedy) + node.inner = left + result = node + + case .Repeat_N: + node := new(Node_Repeat_N) + node.inner = left + + comma := strings.index_byte(token.text, ',') + + switch comma { + case -1: // {N} + exact, ok := strconv.parse_u64_of_base(token.text, base = 10) + if !ok { + return nil, Invalid_Repetition{ pos = token.pos } + } + if exact == 0 { + return nil, Invalid_Repetition{ pos = token.pos } + } + + node.lower = cast(int)exact + node.upper = cast(int)exact + + case 0: // {,M} + upper, ok := strconv.parse_u64_of_base(token.text[1:], base = 10) + if !ok { + return nil, Invalid_Repetition{ pos = token.pos } + } + if upper == 0 { + return nil, Invalid_Repetition{ pos = token.pos } + } + + node.lower = -1 + node.upper = cast(int)upper + + case len(token.text) - 1: // {N,} + lower, ok := strconv.parse_u64_of_base(token.text[:comma], base = 10) + if !ok { + return nil, Invalid_Repetition{ pos = token.pos } + } + + node.lower = cast(int)lower + node.upper = -1 + + case: // {N,M} + lower, lower_ok := strconv.parse_u64_of_base(token.text[:comma], base = 10) + if !lower_ok { + return nil, Invalid_Repetition{ pos = token.pos } + } + upper, upper_ok := strconv.parse_u64_of_base(token.text[comma+1:], base = 10) + if !upper_ok { + return nil, Invalid_Repetition{ pos = token.pos } + } + if lower > upper { + return nil, Invalid_Repetition{ pos = token.pos } + } + if upper == 0 { + return nil, Invalid_Repetition{ pos = token.pos } + } + + node.lower = cast(int)lower + node.upper = cast(int)upper + } + + result = node + + case .Optional: + node := new(Node_Optional) + node.inner = left + result = node + case .Optional_Non_Greedy: + node := new(Node_Optional_Non_Greedy) + node.inner = left + result = node + + case .EOF: + return nil, Unexpected_EOF{ pos = token.pos } + + case: + return nil, Invalid_Token{ pos = token.pos, kind = token.kind } + } + + return +} + +parse_expression :: proc(p: ^Parser, rbp: int) -> (result: Node, err: Error) { + token := p.cur_token + + advance(p) or_return + left := null_denotation(p, token) or_return + + token = p.cur_token + for rbp < left_binding_power(token.kind) { + advance(p) or_return + left = left_denotation(p, token, left) or_return + token = p.cur_token + } + + return left, nil +} + +parse :: proc(str: string, flags: common.Flags) -> (result: Node, err: Error) { + if len(str) == 0 { + node := new(Node_Group) + return node, nil + } + + p: Parser + p.flags = flags + + tokenizer.init(&p.t, str, flags) + + p.cur_token = tokenizer.scan(&p.t) + if p.cur_token.kind == .Invalid { + return nil, Invalid_Unicode { pos = 0 } + } + + node := parse_expression(&p, 0) or_return + result = node + + return +} |