diff options
| author | Feoramund <161657516+Feoramund@users.noreply.github.com> | 2024-08-10 07:17:03 -0400 |
|---|---|---|
| committer | Feoramund <161657516+Feoramund@users.noreply.github.com> | 2024-08-10 07:17:03 -0400 |
| commit | c69fa87d53297325d235d1a5ff57d84655ae2217 (patch) | |
| tree | 73182746f3ad8bc8493848d069593c56f0e026a2 /core/bytes/bytes.odin | |
| parent | e7e7fe766a2e9171b7edff58cfdf889a41e1094e (diff) | |
Merge `core:simd/util` into `core:bytes`
Diffstat (limited to 'core/bytes/bytes.odin')
| -rw-r--r-- | core/bytes/bytes.odin | 151 |
1 files changed, 130 insertions, 21 deletions
diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin index e130502a1..e09859a19 100644 --- a/core/bytes/bytes.odin +++ b/core/bytes/bytes.odin @@ -2,10 +2,21 @@ package bytes import "base:intrinsics" import "core:mem" -@require import simd_util "core:simd/util" import "core:unicode" import "core:unicode/utf8" + +@private SIMD_SCAN_WIDTH :: 32 + +@(private, rodata) +simd_scanner_indices := #simd[SIMD_SCAN_WIDTH]u8 { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, +} + + clone :: proc(s: []byte, allocator := context.allocator, loc := #caller_location) -> []byte { c := make([]byte, len(s), allocator, loc) copy(c, s) @@ -295,43 +306,141 @@ split_after_iterator :: proc(s: ^[]byte, sep: []byte) -> ([]byte, bool) { return _split_iterator(s, sep, len(sep)) } +/* +Scan a slice of bytes for a specific byte. + +This procedure safely handles slices of any length, including empty slices. -index_byte :: proc(s: []byte, c: byte) -> int { - _index_byte :: #force_inline proc "contextless" (s: []byte, c: byte) -> int { - for ch, i in s { - if ch == c { +Inputs: +- data: A slice of bytes. +- c: The byte to search for. + +Returns: +- index: The index of the byte `c`, or -1 if it was not found. +*/ +index_byte :: proc(s: []byte, c: byte) -> (index: int) #no_bounds_check { + length := len(s) + i := 0 + + // Guard against small strings. + if length < SIMD_SCAN_WIDTH { + for /**/; i < length; i += 1 { + if s[i] == c { return i } } return -1 } - // NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a - // significant speedup when compiling in either Size or Speed mode. - // The SIMD version is usually 2-3x slower without optimizations on. - when ODIN_OPTIMIZATION_MODE > .Minimal { - return #force_inline simd_util.index_byte(s, c) - } else { - return _index_byte(s, c) + ptr := cast(int)cast(uintptr)raw_data(s) + + alignment_start := (SIMD_SCAN_WIDTH - ptr % SIMD_SCAN_WIDTH) % SIMD_SCAN_WIDTH + + // Iterate as a scalar until the data is aligned on a `SIMD_SCAN_WIDTH` boundary. + // + // This way, every load in the vector loop will be aligned, which should be + // the fastest possible scenario. + for /**/; i < alignment_start; i += 1 { + if s[i] == c { + return i + } + } + + // Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level. + scanner: #simd[SIMD_SCAN_WIDTH]u8 = c + tail := length - (length - alignment_start) % SIMD_SCAN_WIDTH + + for /**/; i < tail; i += SIMD_SCAN_WIDTH { + load := (cast(^#simd[SIMD_SCAN_WIDTH]u8)(&s[i]))^ + comparison := intrinsics.simd_lanes_eq(load, scanner) + match := intrinsics.simd_reduce_or(comparison) + if match > 0 { + sentinel: #simd[SIMD_SCAN_WIDTH]u8 = u8(0xFF) + index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel) + index_reduce := intrinsics.simd_reduce_min(index_select) + return i + cast(int)index_reduce + } + } + + // Iterate as a scalar over the remaining unaligned portion. + for /**/; i < length; i += 1 { + if s[i] == c { + return i + } } + + return -1 } -// Returns -1 if c is not present -last_index_byte :: proc(s: []byte, c: byte) -> int { - _last_index_byte :: #force_inline proc "contextless" (s: []byte, c: byte) -> int { - #reverse for ch, i in s { - if ch == c { +/* +Scan a slice of bytes for a specific byte, starting from the end and working +backwards to the start. + +This procedure safely handles slices of any length, including empty slices. + +Inputs: +- data: A slice of bytes. +- c: The byte to search for. + +Returns: +- index: The index of the byte `c`, or -1 if it was not found. +*/ +last_index_byte :: proc(s: []byte, c: byte) -> int #no_bounds_check { + length := len(s) + i := length - 1 + + // Guard against small strings. + if length < SIMD_SCAN_WIDTH { + for /**/; i >= 0; i -= 1 { + if s[i] == c { return i } } return -1 } - when ODIN_OPTIMIZATION_MODE > .Minimal { - return #force_inline simd_util.last_index_byte(s, c) - } else { - return _last_index_byte(s, c) + ptr := cast(int)cast(uintptr)raw_data(s) + + tail := length - (ptr + length) % SIMD_SCAN_WIDTH + + // Iterate as a scalar until the data is aligned on a `SIMD_SCAN_WIDTH` boundary. + // + // This way, every load in the vector loop will be aligned, which should be + // the fastest possible scenario. + for /**/; i >= tail; i -= 1 { + if s[i] == c { + return i + } } + + // Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level. + scanner: #simd[SIMD_SCAN_WIDTH]u8 = c + alignment_start := (SIMD_SCAN_WIDTH - ptr % SIMD_SCAN_WIDTH) % SIMD_SCAN_WIDTH + + i -= SIMD_SCAN_WIDTH - 1 + + for /**/; i >= alignment_start; i -= SIMD_SCAN_WIDTH { + load := (cast(^#simd[SIMD_SCAN_WIDTH]u8)(&s[i]))^ + comparison := intrinsics.simd_lanes_eq(load, scanner) + match := intrinsics.simd_reduce_or(comparison) + if match > 0 { + sentinel: #simd[SIMD_SCAN_WIDTH]u8 + index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel) + index_reduce := intrinsics.simd_reduce_max(index_select) + return i + cast(int)index_reduce + } + } + + // Iterate as a scalar over the remaining unaligned portion. + i += SIMD_SCAN_WIDTH - 1 + + for /**/; i >= 0; i -= 1 { + if s[i] == c { + return i + } + } + + return -1 } |