author	Feoramund <161657516+Feoramund@users.noreply.github.com>	2024-08-04 15:47:00 -0400
committer	Feoramund <161657516+Feoramund@users.noreply.github.com>	2024-08-06 15:19:01 -0400
commit	8deeb40e5de1d33b571f0b1faf7b8dea678cd91b (patch)
tree	ef17c047bf2fe4dc22511eca7dbec0b8e930e212
parent	39d557bcb450026d31bc6b098b7988c07e58ef09 (diff)
Add vectorized `index_byte` and `last_index_byte`
-rw-r--r--	core/simd/util/util.odin	| 188
1 file changed, 188 insertions(+), 0 deletions(-)
diff --git a/core/simd/util/util.odin b/core/simd/util/util.odin
new file mode 100644
index 000000000..ac523b42a
--- /dev/null
+++ b/core/simd/util/util.odin
@@ -0,0 +1,188 @@
+/*
+ (c) Copyright 2024 Feoramund <rune@swevencraft.org>.
+ Made available under Odin's BSD-3 license.
+
+ List of contributors:
+ Feoramund: `index_byte` procedures.
+*/
+
+// package simd_util implements compositions of SIMD operations for optimizing
+// the core library where available.
+
+//+build i386, amd64
+package simd_util
+
+import "base:intrinsics"
+import "core:simd/x86"
+
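+// Each iteration of the scan loops below compares SCAN_WIDTH bytes at a time,
+// spread across SCAN_REGISTERS 128-bit SSE2 registers.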
+@private SCAN_REGISTER_SIZE :: 16
+@private SCAN_REGISTERS :: 4
+@private SCAN_WIDTH :: SCAN_REGISTERS * SCAN_REGISTER_SIZE
+
+// The minimum length a slice should have before any of the `index_*`
+// procedures in this package are likely to be worth using.
+RECOMMENDED_SCAN_SIZE :: SCAN_REGISTER_SIZE
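+//
+// For example (illustrative only), a caller might use a plain scalar loop
+// whenever `len(data) < RECOMMENDED_SCAN_SIZE` and call `index_byte`
+// otherwise.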
+
+/*
+Scan a slice of bytes for a specific byte.
+
+This procedure safely handles slices of any length, including empty slices, by
+padding out any partial chunk before scanning it.
+
+Inputs:
+- data: A slice of bytes.
+- c: The byte to search for.
+
+Returns:
+- index: The index of the byte `c`, or -1 if it was not found.
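+
+Example (an illustrative usage sketch; the `simd_util` import alias and the
+literal data are assumptions):
+
+	import simd_util "core:simd/util"
+
+	example :: proc() {
+		data := []u8{'h', 'e', 'l', 'l', 'o'}
+		assert(simd_util.index_byte(data, 'l') == 2)
+		assert(simd_util.index_byte(data, 'z') == -1)
+	}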
+*/
+@(enable_target_feature="sse2")
+index_byte :: proc(data: []u8, c: byte) -> (index: int) #no_bounds_check {
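+	// Broadcast the target byte across all 16 lanes of a 128-bit register so
+	// that each comparison below checks SCAN_REGISTER_SIZE bytes at once.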
+ scanner_data: [SCAN_REGISTER_SIZE]u8 = c
+ scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0])
+
+ i: int
+ length := len(data)
+ full_chunks_length := length - length % SCAN_WIDTH
+
+ for /**/; i < full_chunks_length; i += SCAN_WIDTH {
+ simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i])
+
+ #unroll for j in 0..<SCAN_REGISTERS {
+ cmp := x86._mm_cmpeq_epi8(simd_load[j], scanner)
+ mask := x86._mm_movemask_epi8(cmp)
+
+ // NOTE(Feoramund): I experimented with ORing all the masks onto a
+ // 128-bit integer before performing the `mask != 0` check to see
+ // if that might be faster. However, the cost to avoid 3
+ // compares resulted in a marginally slower runtime on my machine.
+ //
+ // Simpler won out here.
+ if mask != 0 {
+ ctz := intrinsics.count_trailing_zeros(mask)
+ return i + j * SCAN_REGISTER_SIZE + cast(int)ctz
+ }
+ }
+ }
+
+ if i < length {
+		// The length of the data is not an exact multiple of SCAN_WIDTH, and we
+		// haven't found what we're looking for yet, so we must pad out the end,
+		// then run our algorithm on it.
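+		//
+		// For example, with a hypothetical length of 100: full_chunks_length
+		// is 64, so the loop above covered data[0:64], and the remaining 36
+		// bytes (100 % SCAN_WIDTH) are copied into the padded buffer below.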
+ padded_data_end: [SCAN_WIDTH]u8 = ---
+ remnant_length := length % SCAN_WIDTH
+ intrinsics.mem_copy_non_overlapping(
+ &padded_data_end[0],
+ &raw_data(data)[full_chunks_length],
+ remnant_length,
+ )
+
+ simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&padded_data_end[0])
+
+ #unroll for j in 0..<SCAN_REGISTERS {
+ cmp := x86._mm_cmpeq_epi8(simd_load[j], scanner)
+ mask := x86._mm_movemask_epi8(cmp)
+
+ // Because this data is padded out, it's possible that we could
+ // match on uninitialized memory, so we must guard against that.
+
+			// Create a relevancy mask.
+			//
+			// Worked example, assuming remnant_length = 17 and j = 1:
+			//
+			// max(u64) = 0xFFFF_FFFF_FFFF_FFFF
+			//
+			// Shift the all-ones value by how many remnant bytes remain from
+			// this register's starting position onwards; the subtraction
+			// selects which SCAN_REGISTER_SIZE-byte section of the overall
+			// mask we should apply.
+			//
+			// max(u64) << (17 - (1 * SCAN_REGISTER_SIZE))
+			// = 0xFFFF_FFFF_FFFF_FFFE
+			//
+			submask := max(u64) << u64(remnant_length - (j * SCAN_REGISTER_SIZE))
+			//
+			// Inverting the shifted value turns it into a run of on-bits, one
+			// per relevant byte:
+			//
+			// ~submask = 0x0000_0000_0000_0001
+			// (submask >> 63) = 0x0000_0000_0000_0001
+			//
+			// The multiplication is a guard against zero: when the shift
+			// pushes every bit out (`submask == 0`), `submask >> 63` is 0, so
+			// the result is zeroed rather than `~submask` keeping every bit.
+			//
+			submask = ~submask * (submask >> 63)
+ //
+ // Finally, mask out any irrelevant bits with the submask.
+ mask &= i32(submask)
+
+ if mask != 0 {
+ ctz := int(intrinsics.count_trailing_zeros(mask))
+ return i + j * SCAN_REGISTER_SIZE + ctz
+ }
+ }
+ }
+
+ return -1
+}
+
+/*
+Scan a slice of bytes for a specific byte, starting from the end and working
+backwards to the start.
+
+This procedure safely handles slices of any length, including empty slices, by
+padding out any partial chunk before scanning it.
+
+Inputs:
+- data: A slice of bytes.
+- c: The byte to search for.
+
+Returns:
+- index: The index of the byte `c`, or -1 if it was not found.
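+
+Example (an illustrative usage sketch; the `simd_util` import alias and the
+literal data are assumptions):
+
+	import simd_util "core:simd/util"
+
+	example :: proc() {
+		data := []u8{'h', 'e', 'l', 'l', 'o'}
+		assert(simd_util.last_index_byte(data, 'l') == 3)
+		assert(simd_util.last_index_byte(data, 'z') == -1)
+	}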
+*/
+@(enable_target_feature="sse2")
+last_index_byte :: proc(data: []u8, c: byte) -> int #no_bounds_check {
+ scanner_data: [SCAN_REGISTER_SIZE]u8 = c
+ scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0])
+
+ i := len(data) - SCAN_WIDTH
+
+ for /**/; i >= 0; i -= SCAN_WIDTH {
+ simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i])
+
+		// There is no #reverse #unroll at the time of this writing, so `j`
+		// counts up while the register index `SCAN_REGISTERS - j` counts down
+		// from the last register to the first.
+ #unroll for j in 1..=SCAN_REGISTERS {
+ cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner)
+ mask := x86._mm_movemask_epi8(cmp)
+
+ if mask != 0 {
+ // CLZ is used instead to get the on-bit from the other end.
+ clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask))
+ return i + SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz
+ }
+ }
+ }
+
+ if i < 0 {
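+		// Any bytes not covered by the loop above lie at the start of the
+		// slice, since full chunks were consumed from the end. Pad that
+		// remnant out and scan it the same way.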
+ padded_data_end: [SCAN_WIDTH]u8 = ---
+ remnant_length := len(data) % SCAN_WIDTH
+ intrinsics.mem_copy_non_overlapping(
+ &padded_data_end[0],
+ &raw_data(data)[0],
+ remnant_length,
+ )
+
+ simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&padded_data_end[0])
+
+ #unroll for j in 1..=SCAN_REGISTERS {
+ cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner)
+ mask := x86._mm_movemask_epi8(cmp)
+
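+			// Build the same relevancy submask as in `index_byte` above; here
+			// the register being examined is `SCAN_REGISTERS - j`, so that is
+			// what selects the section of the remnant to keep.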
+ submask := max(u64) << u64(remnant_length - (SCAN_REGISTERS-j) * SCAN_REGISTER_SIZE)
+ submask = ~submask * (submask >> 63)
+
+ mask &= i32(submask)
+
+ if mask != 0 {
+ clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask))
+ return SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz
+ }
+ }
+ }
+
+ return -1
+}