about summary refs log tree commit diff
diff options
context:
space:
mode:
author Feoramund <161657516+Feoramund@users.noreply.github.com> 2025-05-26 14:28:40 -0400
committer Feoramund <161657516+Feoramund@users.noreply.github.com> 2025-05-26 14:48:45 -0400
commit 35b157ac8392e6b60d554ea7943cb004504f2327 (patch)
tree 926feaec6c351e33b656bb98f38c1d5d6f148cb5
parent 4f7ed35435a908d5118a917f329db97434d922ef (diff)
Fix multiline RegEx iteration
In `.Multiline` mode:
- `^` is now defined to assert the start of the string or that a "\n" or "\r" rune was parsed on the last VM dispatch.
- `$` is now defined to consume a newline sequence of "\n", "\r", or "\r\n", or to assert the end of the string.
-rw-r--r-- core/text/regex/compiler/compiler.odin 10
-rw-r--r-- core/text/regex/regex.odin 5
-rw-r--r-- core/text/regex/virtual_machine/doc.odin 40
-rw-r--r-- core/text/regex/virtual_machine/util.odin 2
-rw-r--r-- core/text/regex/virtual_machine/virtual_machine.odin 52
-rw-r--r-- tests/core/text/regex/test_core_text_regex.odin 57
6 files changed, 113 insertions, 53 deletions
diff --git a/core/text/regex/compiler/compiler.odin b/core/text/regex/compiler/compiler.odin
index 07ace7b5d..2f0f183e9 100644
--- a/core/text/regex/compiler/compiler.odin
+++ b/core/text/regex/compiler/compiler.odin
@@ -195,8 +195,12 @@ generate_code :: proc(c: ^Compiler, node: Node) -> (code: Program) {
case ^Node_Anchor:
if .Multiline in c.flags {
- append(&code, Opcode.Multiline_Open)
- append(&code, Opcode.Multiline_Close)
+ if specific.start {
+ append(&code, Opcode.Assert_Start_Multiline)
+ } else {
+ append(&code, Opcode.Multiline_Open)
+ append(&code, Opcode.Multiline_Close)
+ }
} else {
if specific.start {
append(&code, Opcode.Assert_Start)
@@ -439,7 +443,7 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data:
case .Save:
continue
- case .Assert_Start:
+ case .Assert_Start, .Assert_Start_Multiline:
break optimize_opening
case:
diff --git a/core/text/regex/regex.odin b/core/text/regex/regex.odin
index 94a4b163a..7456634ac 100644
--- a/core/text/regex/regex.odin
+++ b/core/text/regex/regex.odin
@@ -282,10 +282,6 @@ create_iterator :: proc(
temporary_allocator := context.temp_allocator,
) -> (result: Match_Iterator, err: Error) {
- if .Multiline in flags {
- return {}, .Unsupported_Flag
- }
-
result.regex = create(pattern, flags, permanent_allocator, temporary_allocator) or_return
result.capture = preallocate_capture()
result.temp = temporary_allocator
@@ -555,6 +551,7 @@ reset :: proc(it: ^Match_Iterator) {
it.vm.top_thread = 0
it.vm.current_rune = rune(0)
it.vm.current_rune_size = 0
+ it.vm.last_rune = rune(0)
for i in 0..<it.threads {
it.vm.threads[i] = {}
it.vm.next_threads[i] = {}
diff --git a/core/text/regex/virtual_machine/doc.odin b/core/text/regex/virtual_machine/doc.odin
index 1b0694565..d599dbb1c 100644
--- a/core/text/regex/virtual_machine/doc.odin
+++ b/core/text/regex/virtual_machine/doc.odin
@@ -109,34 +109,42 @@ For more information, see: https://swtch.com/~rsc/regexp/regexp2.html
(0x0A) Assert_Start
- Asserts that the thread is at the beginning of a string.
+ Asserts that the thread is at the beginning of the string.
- (0x0B) Assert_End
+ (0x0B) Assert_Start_Multiline
- Asserts that the thread is at the end of a string.
+ This opcode is compiled in only when the `Multiline` flag is present as a
+ replacement for the `^` text anchor.
- (0x0C) Assert_Word_Boundary
+ Asserts that the thread is at the beginning of the string or previously
+ parsed either a "\n" or "\r".
+
+ (0x0C) Assert_End
+
+ Asserts that the thread is at the end of the string.
+
+ (0x0D) Assert_Word_Boundary
Asserts that the thread is on a word boundary, which can be the start or
end of the text. This examines both the current rune and the next rune.
- (0x0D) Assert_Non_Word_Boundary
+ (0x0E) Assert_Non_Word_Boundary
A modified version of Assert_Word_Boundary that returns the opposite value.
- (0x0E) Multiline_Open
+ (0x0F) Multiline_Open
- This opcode is compiled in only when the `Multiline` flag is present, and
- it replaces both `^` and `$` text anchors.
+ This opcode is compiled in only when the `Multiline` flag is present as a
+ replacement for the `$` text anchor.
- It asserts that either the current thread is on one of the string
- boundaries, or it consumes a `\n` or `\r` character.
+ It asserts that either the current thread is at the end of the string,
+ or it consumes a `\n` or `\r` character.
If a `\r` character is consumed, the PC will be advanced to the sibling
`Multiline_Close` opcode to optionally consume a `\n` character on the next
frame.
- (0x0F) Multiline_Close
+ (0x10) Multiline_Close
This opcode is always present after `Multiline_Open`.
@@ -144,10 +152,10 @@ For more information, see: https://swtch.com/~rsc/regexp/regexp2.html
For example, Windows newlines are represented by the characters `\r\n`,
whereas UNIX newlines are `\n` and Macintosh newlines are `\r`.
- (0x10) Wait_For_Byte
- (0x11) Wait_For_Rune
- (0x12) Wait_For_Rune_Class
- (0x13) Wait_For_Rune_Class_Negated
+ (0x11) Wait_For_Byte
+ (0x12) Wait_For_Rune
+ (0x13) Wait_For_Rune_Class
+ (0x14) Wait_For_Rune_Class_Negated
These opcodes are an optimization around restarting threads on failed
matches when the beginning to a pattern is predictable and the Global flag
@@ -156,7 +164,7 @@ For more information, see: https://swtch.com/~rsc/regexp/regexp2.html
They will cause the VM to wait for the next rune to match before splitting,
as would happen in the un-optimized version.
- (0x14) Match_All_And_Escape
+ (0x15) Match_All_And_Escape
This opcode is an optimized version of `.*$` or `.+$` that causes the
active thread to immediately work on escaping the program by following all
diff --git a/core/text/regex/virtual_machine/util.odin b/core/text/regex/virtual_machine/util.odin
index fa94a139f..79c781e4a 100644
--- a/core/text/regex/virtual_machine/util.odin
+++ b/core/text/regex/virtual_machine/util.odin
@@ -34,6 +34,7 @@ iterate_opcodes :: proc(iter: ^Opcode_Iterator) -> (opcode: Opcode, pc: int, ok:
case .Split: iter.pc += size_of(Opcode) + 2 * size_of(u16)
case .Save: iter.pc += size_of(Opcode) + size_of(u8)
case .Assert_Start: iter.pc += size_of(Opcode)
+ case .Assert_Start_Multiline: iter.pc += size_of(Opcode)
case .Assert_End: iter.pc += size_of(Opcode)
case .Assert_Word_Boundary: iter.pc += size_of(Opcode)
case .Assert_Non_Word_Boundary: iter.pc += size_of(Opcode)
@@ -64,6 +65,7 @@ opcode_to_name :: proc(opcode: Opcode) -> (str: string) {
case .Split: str = "Split"
case .Save: str = "Save"
case .Assert_Start: str = "Assert_Start"
+ case .Assert_Start_Multiline: str = "Assert_Start_Multiline"
case .Assert_End: str = "Assert_End"
case .Assert_Word_Boundary: str = "Assert_Word_Boundary"
case .Assert_Non_Word_Boundary: str = "Assert_Non_Word_Boundary"
diff --git a/core/text/regex/virtual_machine/virtual_machine.odin b/core/text/regex/virtual_machine/virtual_machine.odin
index 32b772802..c292b0e99 100644
--- a/core/text/regex/virtual_machine/virtual_machine.odin
+++ b/core/text/regex/virtual_machine/virtual_machine.odin
@@ -37,16 +37,17 @@ Opcode :: enum u8 {
Split = 0x08, // | u16, u16
Save = 0x09, // | u8
Assert_Start = 0x0A, // |
- Assert_End = 0x0B, // |
- Assert_Word_Boundary = 0x0C, // |
- Assert_Non_Word_Boundary = 0x0D, // |
- Multiline_Open = 0x0E, // |
- Multiline_Close = 0x0F, // |
- Wait_For_Byte = 0x10, // | u8
- Wait_For_Rune = 0x11, // | i32
- Wait_For_Rune_Class = 0x12, // | u8
- Wait_For_Rune_Class_Negated = 0x13, // | u8
- Match_All_And_Escape = 0x14, // |
+ Assert_Start_Multiline = 0x0B, // |
+ Assert_End = 0x0C, // |
+ Assert_Word_Boundary = 0x0D, // |
+ Assert_Non_Word_Boundary = 0x0E, // |
+ Multiline_Open = 0x0F, // |
+ Multiline_Close = 0x10, // |
+ Wait_For_Byte = 0x11, // | u8
+ Wait_For_Rune = 0x12, // | i32
+ Wait_For_Rune_Class = 0x13, // | u8
+ Wait_For_Rune_Class_Negated = 0x14, // | u8
+ Match_All_And_Escape = 0x15, // |
}
Thread :: struct {
@@ -77,6 +78,8 @@ Machine :: struct {
current_rune_size: int,
next_rune: rune,
next_rune_size: int,
+
+ last_rune: rune,
}
@@ -169,6 +172,12 @@ add_thread :: proc(vm: ^Machine, saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, pc:
pc += size_of(Opcode)
continue
}
+ case .Assert_Start_Multiline:
+ sp := vm.string_pointer+vm.current_rune_size
+ if sp == 0 || vm.last_rune == '\n' || vm.last_rune == '\r' {
+ pc += size_of(Opcode)
+ continue
+ }
case .Assert_End:
sp := vm.string_pointer+vm.current_rune_size
if sp == len(vm.memory) {
@@ -177,24 +186,12 @@ add_thread :: proc(vm: ^Machine, saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, pc:
}
case .Multiline_Open:
sp := vm.string_pointer+vm.current_rune_size
- if sp == 0 || sp == len(vm.memory) {
- if vm.next_rune == '\r' || vm.next_rune == '\n' {
- // The VM is currently on a newline at the string boundary,
- // so consume the newline next frame.
- when common.ODIN_DEBUG_REGEX {
- io.write_string(common.debug_stream, "*** New thread added [PC:")
- common.write_padded_hex(common.debug_stream, pc, 4)
- io.write_string(common.debug_stream, "]\n")
- }
- vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
- vm.top_thread += 1
- } else {
- // Skip the `Multiline_Close` opcode.
- pc += 2 * size_of(Opcode)
- continue
- }
+ if sp == len(vm.memory) {
+ // Skip the `Multiline_Close` opcode.
+ pc += 2 * size_of(Opcode)
+ continue
} else {
- // Not on a string boundary.
+ // Not at the end of the string.
// Try to consume a newline next frame in the other opcode loop.
when common.ODIN_DEBUG_REGEX {
io.write_string(common.debug_stream, "*** New thread added [PC:")
@@ -613,6 +610,7 @@ run :: proc(vm: ^Machine, $UNICODE_MODE: bool) -> (saved: ^[2 * common.MAX_CAPTU
break
}
+ vm.last_rune = vm.current_rune
vm.string_pointer += vm.current_rune_size
}
diff --git a/tests/core/text/regex/test_core_text_regex.odin b/tests/core/text/regex/test_core_text_regex.odin
index aed3091e1..8369444b9 100644
--- a/tests/core/text/regex/test_core_text_regex.odin
+++ b/tests/core/text/regex/test_core_text_regex.odin
@@ -699,15 +699,15 @@ test_case_insensitive :: proc(t: ^testing.T) {
test_multiline :: proc(t: ^testing.T) {
{
EXPR :: `^hellope$world$`
- check_expression(t, EXPR, "\nhellope\nworld\n", "\nhellope\nworld\n", extra_flags = { .Multiline })
+ check_expression(t, EXPR, "hellope\nworld\n", "hellope\nworld\n", extra_flags = { .Multiline })
check_expression(t, EXPR, "hellope\nworld", "hellope\nworld", extra_flags = { .Multiline })
check_expression(t, EXPR, "hellope\rworld", "hellope\rworld", extra_flags = { .Multiline })
check_expression(t, EXPR, "hellope\r\nworld", "hellope\r\nworld", extra_flags = { .Multiline })
}
{
- EXPR :: `^?.$`
- check_expression(t, EXPR, "\nh", "\nh", extra_flags = { .Multiline })
+ EXPR :: `^.$`
check_expression(t, EXPR, "h", "h", extra_flags = { .Multiline })
+ check_expression(t, EXPR, "h\n", "h\n", extra_flags = { .Multiline })
}
{
EXPR :: `^$`
@@ -1219,6 +1219,57 @@ iterator_vectors := []Iterator_Test{
{pos = {{3, 3}}, groups = {""}},
},
},
+ // Multiline iteration is supported, but it must follow the `^...$` scheme.
+ //
+ // Any usage outside of this strict syntax will produce predictable but
+ // unusual outputs, as `^` is defined to assert the start of a string or
+ // that a newline sequence was previously consumed, and `$` consumes a
+ // newline sequence or asserts the end of the string.
+ {
+ "foo1\nfoo2\r\nfoo3\rfoo4", `^foo.$`, {.Multiline},
+ {
+ {pos = {{0, 5}}, groups = {"foo1\n"}},
+ {pos = {{5, 11}}, groups = {"foo2\r\n"}},
+ {pos = {{11, 16}}, groups = {"foo3\r"}},
+ {pos = {{16, 20}}, groups = {"foo4"}},
+ },
+ },
+ {
+ "a\nb\n\r", `^$`, {.Multiline},
+ {},
+ },
+ {
+ "a\nb\n", `^$`, {.Multiline},
+ {},
+ },
+ {
+ "a\nb", `^$`, {.Multiline},
+ {},
+ },
+ // Multiline anchors must work within groups, as people are going to end up
+ // using them in there and we do not forbid it.
+ {
+ "a\nb\na\nb", `(?:^a$|^b$)`, {.Multiline},
+ {
+ {pos = {{0, 2}}, groups = {"a\n"}},
+ {pos = {{2, 4}}, groups = {"b\n"}},
+ {pos = {{4, 6}}, groups = {"a\n"}},
+ {pos = {{6, 7}}, groups = {"b"}},
+ },
+ },
+ // The following patterns are valid uses of optional anchors and must match.
+ {
+ "a\nb\na\nb", `^a(?:b|$)`, {.Multiline},
+ {
+ {pos = {{0, 2}}, groups = {"a\n"}},
+ },
+ },
+ {
+ "a\nb\na\nb", `^ab?$?`, {.Multiline},
+ {
+ {pos = {{0, 2}}, groups = {"a\n"}},
+ },
+ },
}
@test