diff options
| author | Feoramund <161657516+Feoramund@users.noreply.github.com> | 2025-05-26 14:28:40 -0400 |
|---|---|---|
| committer | Feoramund <161657516+Feoramund@users.noreply.github.com> | 2025-05-26 14:48:45 -0400 |
| commit | 35b157ac8392e6b60d554ea7943cb004504f2327 (patch) | |
| tree | 926feaec6c351e33b656bb98f38c1d5d6f148cb5 /core/text/regex | |
| parent | 4f7ed35435a908d5118a917f329db97434d922ef (diff) | |
Fix multiline RegEx iteration
In `.Multiline` mode:
- `^` is now defined to assert the start of the string or that a "\n" or
"\r" rune was parsed on the last VM dispatch.
- `$` is now defined to consume a newline sequence of "\n", "\r", or
"\r\n" or to assert the end of the string.
Diffstat (limited to 'core/text/regex')
| -rw-r--r-- | core/text/regex/compiler/compiler.odin | 10 | ||||
| -rw-r--r-- | core/text/regex/regex.odin | 5 | ||||
| -rw-r--r-- | core/text/regex/virtual_machine/doc.odin | 40 | ||||
| -rw-r--r-- | core/text/regex/virtual_machine/util.odin | 2 | ||||
| -rw-r--r-- | core/text/regex/virtual_machine/virtual_machine.odin | 52 |
5 files changed, 59 insertions, 50 deletions
diff --git a/core/text/regex/compiler/compiler.odin b/core/text/regex/compiler/compiler.odin index 07ace7b5d..2f0f183e9 100644 --- a/core/text/regex/compiler/compiler.odin +++ b/core/text/regex/compiler/compiler.odin @@ -195,8 +195,12 @@ generate_code :: proc(c: ^Compiler, node: Node) -> (code: Program) { case ^Node_Anchor: if .Multiline in c.flags { - append(&code, Opcode.Multiline_Open) - append(&code, Opcode.Multiline_Close) + if specific.start { + append(&code, Opcode.Assert_Start_Multiline) + } else { + append(&code, Opcode.Multiline_Open) + append(&code, Opcode.Multiline_Close) + } } else { if specific.start { append(&code, Opcode.Assert_Start) @@ -439,7 +443,7 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data: case .Save: continue - case .Assert_Start: + case .Assert_Start, .Assert_Start_Multiline: break optimize_opening case: diff --git a/core/text/regex/regex.odin b/core/text/regex/regex.odin index 94a4b163a..7456634ac 100644 --- a/core/text/regex/regex.odin +++ b/core/text/regex/regex.odin @@ -282,10 +282,6 @@ create_iterator :: proc( temporary_allocator := context.temp_allocator, ) -> (result: Match_Iterator, err: Error) { - if .Multiline in flags { - return {}, .Unsupported_Flag - } - result.regex = create(pattern, flags, permanent_allocator, temporary_allocator) or_return result.capture = preallocate_capture() result.temp = temporary_allocator @@ -555,6 +551,7 @@ reset :: proc(it: ^Match_Iterator) { it.vm.top_thread = 0 it.vm.current_rune = rune(0) it.vm.current_rune_size = 0 + it.vm.last_rune = rune(0) for i in 0..<it.threads { it.vm.threads[i] = {} it.vm.next_threads[i] = {} diff --git a/core/text/regex/virtual_machine/doc.odin b/core/text/regex/virtual_machine/doc.odin index 1b0694565..d599dbb1c 100644 --- a/core/text/regex/virtual_machine/doc.odin +++ b/core/text/regex/virtual_machine/doc.odin @@ -109,34 +109,42 @@ For more information, see: https://swtch.com/~rsc/regexp/regexp2.html (0x0A) Assert_Start - Asserts 
that the thread is at the beginning of a string. + Asserts that the thread is at the beginning of the string. - (0x0B) Assert_End + (0x0B) Assert_Start_Multiline - Asserts that the thread is at the end of a string. + This opcode is compiled in only when the `Multiline` flag is present as a + replacement for the `^` text anchor. - (0x0C) Assert_Word_Boundary + Asserts that the thread is at the beginning of the string or previously + parsed either a "\n" or "\r". + + (0x0C) Assert_End + + Asserts that the thread is at the end of the string. + + (0x0D) Assert_Word_Boundary Asserts that the thread is on a word boundary, which can be the start or end of the text. This examines both the current rune and the next rune. - (0x0D) Assert_Non_Word_Boundary + (0x0E) Assert_Non_Word_Boundary A modified version of Assert_Word_Boundary that returns the opposite value. - (0x0E) Multiline_Open + (0x0F) Multiline_Open - This opcode is compiled in only when the `Multiline` flag is present, and - it replaces both `^` and `$` text anchors. + This opcode is compiled in only when the `Multiline` flag is present as a + replacement for the `$` text anchor. - It asserts that either the current thread is on one of the string - boundaries, or it consumes a `\n` or `\r` character. + It asserts that either the current thread is at the end of the string, + or it consumes a `\n` or `\r` character. If a `\r` character is consumed, the PC will be advanced to the sibling `Multiline_Close` opcode to optionally consume a `\n` character on the next frame. - (0x0F) Multiline_Close + (0x10) Multiline_Close This opcode is always present after `Multiline_Open`. @@ -144,10 +152,10 @@ For more information, see: https://swtch.com/~rsc/regexp/regexp2.html For example, Windows newlines are represented by the characters `\r\n`, whereas UNIX newlines are `\n` and Macintosh newlines are `\r`. 
- (0x10) Wait_For_Byte - (0x11) Wait_For_Rune - (0x12) Wait_For_Rune_Class - (0x13) Wait_For_Rune_Class_Negated + (0x11) Wait_For_Byte + (0x12) Wait_For_Rune + (0x13) Wait_For_Rune_Class + (0x14) Wait_For_Rune_Class_Negated These opcodes are an optimization around restarting threads on failed matches when the beginning to a pattern is predictable and the Global flag @@ -156,7 +164,7 @@ For more information, see: https://swtch.com/~rsc/regexp/regexp2.html They will cause the VM to wait for the next rune to match before splitting, as would happen in the un-optimized version. - (0x14) Match_All_And_Escape + (0x15) Match_All_And_Escape This opcode is an optimized version of `.*$` or `.+$` that causes the active thread to immediately work on escaping the program by following all diff --git a/core/text/regex/virtual_machine/util.odin b/core/text/regex/virtual_machine/util.odin index fa94a139f..79c781e4a 100644 --- a/core/text/regex/virtual_machine/util.odin +++ b/core/text/regex/virtual_machine/util.odin @@ -34,6 +34,7 @@ iterate_opcodes :: proc(iter: ^Opcode_Iterator) -> (opcode: Opcode, pc: int, ok: case .Split: iter.pc += size_of(Opcode) + 2 * size_of(u16) case .Save: iter.pc += size_of(Opcode) + size_of(u8) case .Assert_Start: iter.pc += size_of(Opcode) + case .Assert_Start_Multiline: iter.pc += size_of(Opcode) case .Assert_End: iter.pc += size_of(Opcode) case .Assert_Word_Boundary: iter.pc += size_of(Opcode) case .Assert_Non_Word_Boundary: iter.pc += size_of(Opcode) @@ -64,6 +65,7 @@ opcode_to_name :: proc(opcode: Opcode) -> (str: string) { case .Split: str = "Split" case .Save: str = "Save" case .Assert_Start: str = "Assert_Start" + case .Assert_Start_Multiline: str = "Assert_Start_Multiline" case .Assert_End: str = "Assert_End" case .Assert_Word_Boundary: str = "Assert_Word_Boundary" case .Assert_Non_Word_Boundary: str = "Assert_Non_Word_Boundary" diff --git a/core/text/regex/virtual_machine/virtual_machine.odin 
b/core/text/regex/virtual_machine/virtual_machine.odin index 32b772802..c292b0e99 100644 --- a/core/text/regex/virtual_machine/virtual_machine.odin +++ b/core/text/regex/virtual_machine/virtual_machine.odin @@ -37,16 +37,17 @@ Opcode :: enum u8 { Split = 0x08, // | u16, u16 Save = 0x09, // | u8 Assert_Start = 0x0A, // | - Assert_End = 0x0B, // | - Assert_Word_Boundary = 0x0C, // | - Assert_Non_Word_Boundary = 0x0D, // | - Multiline_Open = 0x0E, // | - Multiline_Close = 0x0F, // | - Wait_For_Byte = 0x10, // | u8 - Wait_For_Rune = 0x11, // | i32 - Wait_For_Rune_Class = 0x12, // | u8 - Wait_For_Rune_Class_Negated = 0x13, // | u8 - Match_All_And_Escape = 0x14, // | + Assert_Start_Multiline = 0x0B, // | + Assert_End = 0x0C, // | + Assert_Word_Boundary = 0x0D, // | + Assert_Non_Word_Boundary = 0x0E, // | + Multiline_Open = 0x0F, // | + Multiline_Close = 0x10, // | + Wait_For_Byte = 0x11, // | u8 + Wait_For_Rune = 0x12, // | i32 + Wait_For_Rune_Class = 0x13, // | u8 + Wait_For_Rune_Class_Negated = 0x14, // | u8 + Match_All_And_Escape = 0x15, // | } Thread :: struct { @@ -77,6 +78,8 @@ Machine :: struct { current_rune_size: int, next_rune: rune, next_rune_size: int, + + last_rune: rune, } @@ -169,6 +172,12 @@ add_thread :: proc(vm: ^Machine, saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, pc: pc += size_of(Opcode) continue } + case .Assert_Start_Multiline: + sp := vm.string_pointer+vm.current_rune_size + if sp == 0 || vm.last_rune == '\n' || vm.last_rune == '\r' { + pc += size_of(Opcode) + continue + } case .Assert_End: sp := vm.string_pointer+vm.current_rune_size if sp == len(vm.memory) { @@ -177,24 +186,12 @@ add_thread :: proc(vm: ^Machine, saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, pc: } case .Multiline_Open: sp := vm.string_pointer+vm.current_rune_size - if sp == 0 || sp == len(vm.memory) { - if vm.next_rune == '\r' || vm.next_rune == '\n' { - // The VM is currently on a newline at the string boundary, - // so consume the newline next frame. 
- when common.ODIN_DEBUG_REGEX { - io.write_string(common.debug_stream, "*** New thread added [PC:") - common.write_padded_hex(common.debug_stream, pc, 4) - io.write_string(common.debug_stream, "]\n") - } - vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved } - vm.top_thread += 1 - } else { - // Skip the `Multiline_Close` opcode. - pc += 2 * size_of(Opcode) - continue - } + if sp == len(vm.memory) { + // Skip the `Multiline_Close` opcode. + pc += 2 * size_of(Opcode) + continue } else { - // Not on a string boundary. + // Not at the end of the string. // Try to consume a newline next frame in the other opcode loop. when common.ODIN_DEBUG_REGEX { io.write_string(common.debug_stream, "*** New thread added [PC:") @@ -613,6 +610,7 @@ run :: proc(vm: ^Machine, $UNICODE_MODE: bool) -> (saved: ^[2 * common.MAX_CAPTU break } + vm.last_rune = vm.current_rune vm.string_pointer += vm.current_rune_size } |