path: root/core/text/regex/parser/parser.odin
author    Feoramund <161657516+Feoramund@users.noreply.github.com>    2024-07-21 21:58:25 -0400
committer Feoramund <161657516+Feoramund@users.noreply.github.com>    2024-07-22 14:25:12 -0400
commit    cb0704d51c6a170bca8206a9bb3e9796c71c6341 (patch)
tree      2e275eb1a22d74ddd5d5052a4f89aa45ef9245b0 /core/text/regex/parser/parser.odin
parent    ccf8b2764d4a3add4a575b2c88b710b2d15ebae8 (diff)
Add `core:text/regex`
Diffstat (limited to 'core/text/regex/parser/parser.odin')
-rw-r--r--  core/text/regex/parser/parser.odin  580
1 file changed, 580 insertions, 0 deletions
diff --git a/core/text/regex/parser/parser.odin b/core/text/regex/parser/parser.odin
new file mode 100644
index 000000000..1958ee399
--- /dev/null
+++ b/core/text/regex/parser/parser.odin
@@ -0,0 +1,580 @@
+package regex_parser
+
+import "base:intrinsics"
+import "core:strconv"
+import "core:strings"
+import "core:text/regex/common"
+import "core:text/regex/tokenizer"
+import "core:unicode"
+import "core:unicode/utf8"
+
+Token :: tokenizer.Token
+Token_Kind :: tokenizer.Token_Kind
+Tokenizer :: tokenizer.Tokenizer
+
+Rune_Class_Range :: struct {
+ lower, upper: rune,
+}
+Rune_Class_Data :: struct {
+ runes: [dynamic]rune,
+ ranges: [dynamic]Rune_Class_Range,
+}
+
+
+Node_Rune :: struct {
+ data: rune,
+}
+
+Node_Rune_Class :: struct {
+ negating: bool,
+ using data: Rune_Class_Data,
+}
+
+Node_Wildcard :: struct {}
+
+Node_Alternation :: struct {
+ left, right: Node,
+}
+
+Node_Concatenation :: struct {
+ nodes: [dynamic]Node,
+}
+
+Node_Repeat_Zero :: struct {
+ inner: Node,
+}
+Node_Repeat_Zero_Non_Greedy :: struct {
+ inner: Node,
+}
+Node_Repeat_One :: struct {
+ inner: Node,
+}
+Node_Repeat_One_Non_Greedy :: struct {
+ inner: Node,
+}
+
+Node_Repeat_N :: struct {
+ inner: Node,
+ lower, upper: int,
+}
+
+Node_Optional :: struct {
+ inner: Node,
+}
+Node_Optional_Non_Greedy :: struct {
+ inner: Node,
+}
+
+Node_Group :: struct {
+ inner: Node,
+ capture_id: int,
+ capture: bool,
+}
+
+Node_Anchor :: struct {
+ start: bool,
+}
+Node_Word_Boundary :: struct {
+ non_word: bool,
+}
+
+Node_Match_All_And_Escape :: struct {}
+
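+// A parsed pattern is a tree of `Node`s; every variant of the union is a
+// pointer to one of the node structs above.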
+Node :: union {
+ ^Node_Rune,
+ ^Node_Rune_Class,
+ ^Node_Wildcard,
+ ^Node_Concatenation,
+ ^Node_Alternation,
+ ^Node_Repeat_Zero,
+ ^Node_Repeat_Zero_Non_Greedy,
+ ^Node_Repeat_One,
+ ^Node_Repeat_One_Non_Greedy,
+ ^Node_Repeat_N,
+ ^Node_Optional,
+ ^Node_Optional_Non_Greedy,
+ ^Node_Group,
+ ^Node_Anchor,
+ ^Node_Word_Boundary,
+
+ // Optimized nodes (not created by the Parser):
+ ^Node_Match_All_And_Escape,
+}
+
+
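+// Left binding powers for the Pratt-style (top-down operator precedence)
+// parser. Higher values bind more tightly; tokens not listed here return 0
+// and therefore cannot extend the expression to their left, which is what
+// terminates the loop in `parse_expression`.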
+left_binding_power :: proc(kind: Token_Kind) -> int {
+ #partial switch kind {
+ case .Alternate: return 1
+ case .Concatenate: return 2
+ case .Repeat_Zero, .Repeat_One,
+ .Repeat_Zero_Non_Greedy, .Repeat_One_Non_Greedy,
+ .Repeat_N: return 3
+ case .Optional,
+ .Optional_Non_Greedy: return 4
+ case .Open_Paren,
+ .Open_Paren_Non_Capture: return 9
+ }
+ return 0
+}
+
+
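+// Parse errors; `pos` is the offset into the pattern at which the problem
+// was detected.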
+Expected_Token :: struct {
+ pos: int,
+ kind: Token_Kind,
+}
+
+Invalid_Repetition :: struct {
+ pos: int,
+}
+
+Invalid_Token :: struct {
+ pos: int,
+ kind: Token_Kind,
+}
+
+Invalid_Unicode :: struct {
+ pos: int,
+}
+
+Too_Many_Capture_Groups :: struct {
+ pos: int,
+}
+
+Unexpected_EOF :: struct {
+ pos: int,
+}
+
+Error :: union {
+ Expected_Token,
+ Invalid_Repetition,
+ Invalid_Token,
+ Invalid_Unicode,
+ Too_Many_Capture_Groups,
+ Unexpected_EOF,
+}
+
+
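+// Parser state: the pattern flags, the tokenizer, a single token of
+// lookahead, and the number of capture groups opened so far.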
+Parser :: struct {
+ flags: common.Flags,
+ t: Tokenizer,
+
+ cur_token: Token,
+
+ groups: int,
+}
+
+
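+// Pull the next token from the tokenizer into the one-token lookahead.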
+@require_results
+advance :: proc(p: ^Parser) -> Error {
+ p.cur_token = tokenizer.scan(&p.t)
+ if p.cur_token.kind == .Invalid {
+ return Invalid_Unicode { pos = 0 }
+ }
+ return nil
+}
+
+expect :: proc(p: ^Parser, kind: Token_Kind) -> (err: Error) {
+ if p.cur_token.kind == kind {
+ advance(p) or_return
+ return
+ }
+
+ return Expected_Token{
+ pos = p.t.offset,
+ kind = kind,
+ }
+}
+
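+// Null denotation: turn a token that appears in prefix position (the start
+// of an expression or subexpression) into a node, e.g. literal runes, rune
+// classes, the wildcard, groups, anchors, and a leading `|`.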
+null_denotation :: proc(p: ^Parser, token: Token) -> (result: Node, err: Error) {
+ #partial switch token.kind {
+ case .Rune:
+ r: rune
+ for ru in token.text {
+ r = ru
+ break
+ }
+ assert(r != 0, "Parsed an empty Rune token.")
+
+ if .Case_Insensitive in p.flags {
+ lower := unicode.to_lower(r)
+ upper := unicode.to_upper(r)
+ if lower != upper {
+ node := new(Node_Rune_Class)
+ append(&node.runes, lower)
+ append(&node.runes, upper)
+ return node, nil
+ }
+ }
+
+ node := new(Node_Rune)
+ node ^= { r }
+ return node, nil
+
+ case .Rune_Class:
+ if len(token.text) == 0 {
+ return nil, nil
+ }
+
+ node := new(Node_Rune_Class)
+
+ for i := 0; i < len(token.text); /**/ {
+ r, size := utf8.decode_rune(token.text[i:])
+ if i == 0 && r == '^' {
+ node.negating = true
+ i += size
+ continue
+ }
+ i += size
+
+ assert(size > 0, "RegEx tokenizer passed an incomplete Rune_Class to the parser.")
+
+ if r == '\\' {
+ next_r, next_size := utf8.decode_rune(token.text[i:])
+ i += next_size
+ assert(next_size > 0, "RegEx tokenizer passed an incomplete Rune_Class to the parser.")
+
+ // @MetaCharacter
+ // NOTE: These must be kept in sync with the tokenizer.
+ switch next_r {
+ case 'f': append(&node.runes, '\f')
+ case 'n': append(&node.runes, '\n')
+ case 'r': append(&node.runes, '\r')
+ case 't': append(&node.runes, '\t')
+
+ case 'd':
+ append(&node.ranges, Rune_Class_Range{ '0', '9' })
+ case 's':
+ append(&node.runes, '\t')
+ append(&node.runes, '\n')
+ append(&node.runes, '\f')
+ append(&node.runes, '\r')
+ append(&node.runes, ' ')
+ case 'w':
+ append(&node.ranges, Rune_Class_Range{ '0', '9' })
+ append(&node.ranges, Rune_Class_Range{ 'A', 'Z' })
+ append(&node.runes, '_')
+ append(&node.ranges, Rune_Class_Range{ 'a', 'z' })
+ case 'D':
+ append(&node.ranges, Rune_Class_Range{ 0, '0' - 1 })
+ append(&node.ranges, Rune_Class_Range{ '9' + 1, max(rune) })
+ case 'S':
+ append(&node.ranges, Rune_Class_Range{ 0, '\t' - 1 })
+ // \t and \n are adjacent.
+ append(&node.runes, '\x0b') // Vertical Tab
+ append(&node.ranges, Rune_Class_Range{ '\r' + 1, ' ' - 1 })
+ append(&node.ranges, Rune_Class_Range{ ' ' + 1, max(rune) })
+ case 'W':
+ append(&node.ranges, Rune_Class_Range{ 0, '0' - 1 })
+ append(&node.ranges, Rune_Class_Range{ '9' + 1, 'A' - 1 })
+ append(&node.ranges, Rune_Class_Range{ 'Z' + 1, '_' - 1 })
+ append(&node.ranges, Rune_Class_Range{ '_' + 1, 'a' - 1 })
+ append(&node.ranges, Rune_Class_Range{ 'z' + 1, max(rune) })
+ case:
+ append(&node.runes, next_r)
+ }
+ continue
+ }
+
+ if r == '-' && len(node.runes) > 0 {
+ next_r, next_size := utf8.decode_rune(token.text[i:])
+ if next_size > 0 {
+ last := pop(&node.runes)
+ i += next_size
+
+ append(&node.ranges, Rune_Class_Range{ last, next_r })
+ continue
+ }
+ }
+
+ append(&node.runes, r)
+ }
+
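+ // For case-insensitive matching, widen the class so it covers both
+ // cases: add the opposite-case counterpart of each cased rune, and
+ // duplicate cased ranges in their lower- and upper-case forms.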
+ if .Case_Insensitive in p.flags {
+ length := len(node.runes)
+ #no_bounds_check for i := 0; i < length; i += 1 {
+ r := node.runes[i]
+ lower := unicode.to_lower(r)
+ upper := unicode.to_upper(r)
+
+ if lower != upper {
+ if lower != r {
+ append(&node.runes, lower)
+ } else {
+ append(&node.runes, upper)
+ }
+ }
+ }
+
+ length = len(node.ranges)
+ #no_bounds_check for i := 0; i < length; i += 1 {
+ range := &node.ranges[i]
+
+ min_lower := unicode.to_lower(range.lower)
+ max_lower := unicode.to_lower(range.upper)
+
+ min_upper := unicode.to_upper(range.lower)
+ max_upper := unicode.to_upper(range.upper)
+
+ if min_lower != min_upper && max_lower != max_upper {
+ range.lower = min_lower
+ range.upper = max_lower
+ append(&node.ranges, Rune_Class_Range{ min_upper, max_upper })
+ }
+ }
+ }
+
+ result = node
+
+ case .Wildcard:
+ node := new(Node_Wildcard)
+ result = node
+
+ case .Open_Paren:
+ // Because of the recursive nature of the token parser, we take the
+ // group number first instead of afterwards, in order to construct
+ // group matches from the outside in.
+ p.groups += 1
+ if p.groups == common.MAX_CAPTURE_GROUPS {
+ return nil, Too_Many_Capture_Groups{ pos = token.pos }
+ }
+ this_group := p.groups
+
+ node := new(Node_Group)
+ node.capture = true
+ node.capture_id = this_group
+
+ node.inner = parse_expression(p, 0) or_return
+ expect(p, .Close_Paren) or_return
+ result = node
+ case .Open_Paren_Non_Capture:
+ node := new(Node_Group)
+ node.inner = parse_expression(p, 0) or_return
+ expect(p, .Close_Paren) or_return
+ result = node
+ case .Close_Paren:
+ node := new(Node_Rune)
+ node ^= { ')' }
+ return node, nil
+
+ case .Anchor_Start:
+ node := new(Node_Anchor)
+ node.start = true
+ result = node
+ case .Anchor_End:
+ node := new(Node_Anchor)
+ result = node
+ case .Word_Boundary:
+ node := new(Node_Word_Boundary)
+ result = node
+ case .Non_Word_Boundary:
+ node := new(Node_Word_Boundary)
+ node.non_word = true
+ result = node
+
+ case .Alternate:
+ // A unary alternation with a left-side empty path, i.e. `|a`.
+ right, right_err := parse_expression(p, left_binding_power(.Alternate))
+ #partial switch specific in right_err {
+ case Unexpected_EOF:
+ // This token is a NOP, i.e. `|`.
+ break
+ case nil:
+ break
+ case:
+ return nil, right_err
+ }
+
+ node := new(Node_Alternation)
+ node.right = right
+ result = node
+
+ case .EOF:
+ return nil, Unexpected_EOF{ pos = token.pos }
+
+ case:
+ return nil, Invalid_Token{ pos = token.pos, kind = token.kind }
+ }
+
+ return
+}
+
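+// Left denotation: combine the already-parsed `left` node with the operator
+// token that follows it, i.e. alternation, concatenation, repetition, and
+// the optional operators.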
+left_denotation :: proc(p: ^Parser, token: Token, left: Node) -> (result: Node, err: Error) {
+ #partial switch token.kind {
+ case .Alternate:
+ if p.cur_token.kind == .Close_Paren {
+ // `(a|)`
+ // parse_expression will fail, so intervene here.
+ node := new(Node_Alternation)
+ node.left = left
+ return node, nil
+ }
+
+ right, right_err := parse_expression(p, left_binding_power(.Alternate))
+
+ #partial switch specific in right_err {
+ case nil:
+ break
+ case Unexpected_EOF:
+ // EOF is okay in an alternation; it's an edge case used to express an
+ // optional, such as `a|`.
+ break
+ case:
+ return nil, right_err
+ }
+
+ node := new(Node_Alternation)
+ node.left = left
+ node.right = right
+ result = node
+
+ case .Concatenate:
+ right := parse_expression(p, left_binding_power(.Concatenate)) or_return
+
+ // There should be no need to check if right is Node_Concatenation, due
+ // to how the parsing direction works.
+ #partial switch specific in left {
+ case ^Node_Concatenation:
+ append(&specific.nodes, right)
+ result = specific
+ case:
+ node := new(Node_Concatenation)
+ append(&node.nodes, left)
+ append(&node.nodes, right)
+ result = node
+ }
+
+ case .Repeat_Zero:
+ node := new(Node_Repeat_Zero)
+ node.inner = left
+ result = node
+ case .Repeat_Zero_Non_Greedy:
+ node := new(Node_Repeat_Zero_Non_Greedy)
+ node.inner = left
+ result = node
+ case .Repeat_One:
+ node := new(Node_Repeat_One)
+ node.inner = left
+ result = node
+ case .Repeat_One_Non_Greedy:
+ node := new(Node_Repeat_One_Non_Greedy)
+ node.inner = left
+ result = node
+
+ case .Repeat_N:
+ node := new(Node_Repeat_N)
+ node.inner = left
+
+ comma := strings.index_byte(token.text, ',')
+
+ switch comma {
+ case -1: // {N}
+ exact, ok := strconv.parse_u64_of_base(token.text, base = 10)
+ if !ok {
+ return nil, Invalid_Repetition{ pos = token.pos }
+ }
+ if exact == 0 {
+ return nil, Invalid_Repetition{ pos = token.pos }
+ }
+
+ node.lower = cast(int)exact
+ node.upper = cast(int)exact
+
+ case 0: // {,M}
+ upper, ok := strconv.parse_u64_of_base(token.text[1:], base = 10)
+ if !ok {
+ return nil, Invalid_Repetition{ pos = token.pos }
+ }
+ if upper == 0 {
+ return nil, Invalid_Repetition{ pos = token.pos }
+ }
+
+ node.lower = -1
+ node.upper = cast(int)upper
+
+ case len(token.text) - 1: // {N,}
+ lower, ok := strconv.parse_u64_of_base(token.text[:comma], base = 10)
+ if !ok {
+ return nil, Invalid_Repetition{ pos = token.pos }
+ }
+
+ node.lower = cast(int)lower
+ node.upper = -1
+
+ case: // {N,M}
+ lower, lower_ok := strconv.parse_u64_of_base(token.text[:comma], base = 10)
+ if !lower_ok {
+ return nil, Invalid_Repetition{ pos = token.pos }
+ }
+ upper, upper_ok := strconv.parse_u64_of_base(token.text[comma+1:], base = 10)
+ if !upper_ok {
+ return nil, Invalid_Repetition{ pos = token.pos }
+ }
+ if lower > upper {
+ return nil, Invalid_Repetition{ pos = token.pos }
+ }
+ if upper == 0 {
+ return nil, Invalid_Repetition{ pos = token.pos }
+ }
+
+ node.lower = cast(int)lower
+ node.upper = cast(int)upper
+ }
+
+ result = node
+
+ case .Optional:
+ node := new(Node_Optional)
+ node.inner = left
+ result = node
+ case .Optional_Non_Greedy:
+ node := new(Node_Optional_Non_Greedy)
+ node.inner = left
+ result = node
+
+ case .EOF:
+ return nil, Unexpected_EOF{ pos = token.pos }
+
+ case:
+ return nil, Invalid_Token{ pos = token.pos, kind = token.kind }
+ }
+
+ return
+}
+
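+// Core Pratt loop: parse the token in prefix position with `null_denotation`,
+// then keep folding the following operator tokens into `left` with
+// `left_denotation` for as long as their binding power exceeds `rbp`.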
+parse_expression :: proc(p: ^Parser, rbp: int) -> (result: Node, err: Error) {
+ token := p.cur_token
+
+ advance(p) or_return
+ left := null_denotation(p, token) or_return
+
+ token = p.cur_token
+ for rbp < left_binding_power(token.kind) {
+ advance(p) or_return
+ left = left_denotation(p, token, left) or_return
+ token = p.cur_token
+ }
+
+ return left, nil
+}
+
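+// Parse `str` into an AST. An empty pattern yields a bare group node.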
+parse :: proc(str: string, flags: common.Flags) -> (result: Node, err: Error) {
+ if len(str) == 0 {
+ node := new(Node_Group)
+ return node, nil
+ }
+
+ p: Parser
+ p.flags = flags
+
+ tokenizer.init(&p.t, str, flags)
+
+ p.cur_token = tokenizer.scan(&p.t)
+ if p.cur_token.kind == .Invalid {
+ return nil, Invalid_Unicode { pos = 0 }
+ }
+
+ node := parse_expression(&p, 0) or_return
+ result = node
+
+ return
+}