about | summary | refs | log | tree | commit | diff
path: root/core/text
diff options
context:
space:
mode:
author: Jeroen van Rijn <Kelimion@users.noreply.github.com> 2025-05-24 15:38:26 +0200
committer: GitHub <noreply@github.com> 2025-05-24 15:38:26 +0200
commit: 142dd58b279b6d7291d75a1f8158fe315b7104c0 (patch)
tree: 9e0528bcf3f192b4a7699825f3498d11bd1fce25 /core/text
parent: 7b0b5d9adffe532a47b2ad9fb94afb64ca49bb29 (diff)
parent: 5d01acc04f5f90925c94a64dca0508d104b6241d (diff)
Merge pull request #5209 from Feoramund/regex-fixes
Fix RegEx iterator, remove `.Global`, make patterns unanchored by default (breaking change)
Diffstat (limited to 'core/text')
-rw-r--r-- core/text/regex/common/common.odin | 3
-rw-r--r-- core/text/regex/compiler/compiler.odin | 14
-rw-r--r-- core/text/regex/regex.odin | 71
-rw-r--r-- core/text/regex/virtual_machine/virtual_machine.odin | 6
4 files changed, 74 insertions, 20 deletions
diff --git a/core/text/regex/common/common.odin b/core/text/regex/common/common.odin
index 4a303e0a3..e60bef58f 100644
--- a/core/text/regex/common/common.odin
+++ b/core/text/regex/common/common.odin
@@ -15,8 +15,6 @@ MAX_PROGRAM_SIZE :: int(max(i16))
MAX_CLASSES :: int(max(u8))
Flag :: enum u8 {
- // Global: try to match the pattern anywhere in the string.
- Global,
// Multiline: treat `^` and `$` as if they also match newlines.
Multiline,
// Case Insensitive: treat `a-z` as if it was also `A-Z`.
@@ -36,7 +34,6 @@ Flags :: bit_set[Flag; u8]
@(rodata)
Flag_To_Letter := #sparse[Flag]u8 {
- .Global = 'g',
.Multiline = 'm',
.Case_Insensitive = 'i',
.Ignore_Whitespace = 'x',
diff --git a/core/text/regex/compiler/compiler.odin b/core/text/regex/compiler/compiler.odin
index b3ded0104..07ace7b5d 100644
--- a/core/text/regex/compiler/compiler.odin
+++ b/core/text/regex/compiler/compiler.odin
@@ -401,7 +401,7 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data:
pc_open := 0
- add_global: if .Global in flags {
+ optimize_opening: {
// Check if the opening to the pattern is predictable.
// If so, use one of the optimized Wait opcodes.
iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
@@ -412,7 +412,7 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data:
pc_open += size_of(Opcode)
inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
pc_open += size_of(u8)
- break add_global
+ break optimize_opening
case .Rune:
operand := intrinsics.unaligned_load(cast(^rune)&code[pc+1])
@@ -420,24 +420,28 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data:
pc_open += size_of(Opcode)
inject_raw(&code, pc_open, operand)
pc_open += size_of(rune)
- break add_global
+ break optimize_opening
case .Rune_Class:
inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class)
pc_open += size_of(Opcode)
inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
pc_open += size_of(u8)
- break add_global
+ break optimize_opening
case .Rune_Class_Negated:
inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class_Negated)
pc_open += size_of(Opcode)
inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
pc_open += size_of(u8)
- break add_global
+ break optimize_opening
case .Save:
continue
+
+ case .Assert_Start:
+ break optimize_opening
+
case:
break seek_loop
}
diff --git a/core/text/regex/regex.odin b/core/text/regex/regex.odin
index c805740f7..94a4b163a 100644
--- a/core/text/regex/regex.odin
+++ b/core/text/regex/regex.odin
@@ -77,6 +77,8 @@ Match_Iterator :: struct {
vm: virtual_machine.Machine,
idx: int,
temp: runtime.Allocator,
+ threads: int,
+ done: bool,
}
/*
@@ -101,7 +103,6 @@ create :: proc(
permanent_allocator := context.allocator,
temporary_allocator := context.temp_allocator,
) -> (result: Regular_Expression, err: Error) {
-
// For the sake of speed and simplicity, we first run all the intermediate
// processes such as parsing and compilation through the temporary
// allocator.
@@ -166,7 +167,6 @@ to escape the delimiter if found in the middle of the string.
All runes after the closing delimiter will be parsed as flags:
-- 'g': Global
- 'm': Multiline
- 'i': Case_Insensitive
- 'x': Ignore_Whitespace
@@ -243,7 +243,6 @@ create_by_user :: proc(
// to `end` here.
for r in pattern[start + end:] {
switch r {
- case 'g': flags += { .Global }
case 'm': flags += { .Multiline }
case 'i': flags += { .Case_Insensitive }
case 'x': flags += { .Ignore_Whitespace }
@@ -282,8 +281,6 @@ create_iterator :: proc(
permanent_allocator := context.allocator,
temporary_allocator := context.temp_allocator,
) -> (result: Match_Iterator, err: Error) {
- flags := flags
- flags += {.Global} // We're iterating over a string, so the next match could start anywhere
if .Multiline in flags {
return {}, .Unsupported_Flag
@@ -294,6 +291,7 @@ create_iterator :: proc(
result.temp = temporary_allocator
result.vm = virtual_machine.create(result.regex.program, str)
result.vm.class_data = result.regex.class_data
+ result.threads = max(1, virtual_machine.opcode_count(result.vm.code) - 1)
return
}
@@ -457,8 +455,27 @@ match_iterator :: proc(it: ^Match_Iterator) -> (result: Capture, index: int, ok:
assert(len(it.capture.pos) >= common.MAX_CAPTURE_GROUPS,
"Pre-allocated RegEx capture `pos` must be at least 10 elements long.")
+ // Guard against situations in which the iterator should finish.
+ if it.done {
+ return
+ }
+
runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
+ if it.idx > 0 {
+ // Reset the state needed to `virtual_machine.run` again.
+ it.vm.top_thread = 0
+ it.vm.current_rune = rune(0)
+ it.vm.current_rune_size = 0
+ for i in 0..<it.threads {
+ it.vm.threads[i] = {}
+ it.vm.next_threads[i] = {}
+ }
+ }
+
+ // Take note of where the string pointer is before we start.
+ sp_before := it.vm.string_pointer
+
saved: ^[2 * common.MAX_CAPTURE_GROUPS]int
{
context.allocator = it.temp
@@ -469,6 +486,28 @@ match_iterator :: proc(it: ^Match_Iterator) -> (result: Capture, index: int, ok:
}
}
+ if !ok {
+ // Match failed, bail out.
+ return
+ }
+
+ if it.vm.string_pointer == sp_before {
+ // The string pointer did not move, but there was a match.
+ //
+ // At this point, the pattern supplied to the iterator will infinitely
+ // loop if we do not intervene.
+ it.done = true
+ }
+ if it.vm.string_pointer == len(it.vm.memory) {
+ // The VM hit the end of the string.
+ //
+ // We do not check at the start, because a match of pattern `$`
+ // against string "" is valid and must return a match.
+ //
+ // This check prevents a double-match of `$` against a non-empty string.
+ it.done = true
+ }
+
str := string(it.vm.memory)
num_groups: int
@@ -488,9 +527,7 @@ match_iterator :: proc(it: ^Match_Iterator) -> (result: Capture, index: int, ok:
num_groups = n
}
- defer if ok {
- it.idx += 1
- }
+ defer it.idx += 1
if num_groups > 0 {
result = {it.capture.pos[:num_groups], it.capture.groups[:num_groups]}
@@ -504,8 +541,24 @@ match :: proc {
match_iterator,
}
+/*
+Reset an iterator, allowing it to be run again as if new.
+
+Inputs:
+- it: The iterator to reset.
+*/
reset :: proc(it: ^Match_Iterator) {
- it.idx = 0
+ it.done = false
+ it.idx = 0
+ it.vm.string_pointer = 0
+
+ it.vm.top_thread = 0
+ it.vm.current_rune = rune(0)
+ it.vm.current_rune_size = 0
+ for i in 0..<it.threads {
+ it.vm.threads[i] = {}
+ it.vm.next_threads[i] = {}
+ }
}
/*
diff --git a/core/text/regex/virtual_machine/virtual_machine.odin b/core/text/regex/virtual_machine/virtual_machine.odin
index ab1dfbec1..32b772802 100644
--- a/core/text/regex/virtual_machine/virtual_machine.odin
+++ b/core/text/regex/virtual_machine/virtual_machine.odin
@@ -329,10 +329,10 @@ add_thread :: proc(vm: ^Machine, saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, pc:
run :: proc(vm: ^Machine, $UNICODE_MODE: bool) -> (saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, ok: bool) #no_bounds_check {
when UNICODE_MODE {
- vm.next_rune, vm.next_rune_size = utf8.decode_rune_in_string(vm.memory)
+ vm.next_rune, vm.next_rune_size = utf8.decode_rune_in_string(vm.memory[vm.string_pointer:])
} else {
if len(vm.memory) > 0 {
- vm.next_rune = cast(rune)vm.memory[0]
+ vm.next_rune = cast(rune)vm.memory[vm.string_pointer]
vm.next_rune_size = 1
}
}
@@ -652,4 +652,4 @@ destroy :: proc(vm: Machine, allocator := context.allocator) {
delete(vm.busy_map)
free(vm.threads)
free(vm.next_threads)
-}
\ No newline at end of file
+}