author	Feoramund <161657516+Feoramund@users.noreply.github.com>	2024-08-09 17:39:19 -0400
committer	Feoramund <161657516+Feoramund@users.noreply.github.com>	2024-08-09 18:54:04 -0400
commit	12dd0cb72a586a99129280c78697089caab0500a (patch)
tree	80bbcb47b01bcefff2595c61147dd15318054b00 /core
parent	793811b219e77b21a1c765323957e4b74ce13e64 (diff)
Simplify and make `simd_util` cross-platform
The new algorithm uses a Scalar->Vector->Scalar iteration loop, which requires no masking off of incomplete data chunks. The scan width was also reduced from 64 bytes to 32, as I found this to be about as fast as the previous 64-byte x86 version.
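For readers who want the new forward scan in one place before digging through the hunks, here is a condensed, self-contained sketch assembled from the added lines below. The package and procedure names (simd_util_sketch, index_byte_sketch) are illustrative only; the committed code lives in core/simd/util/util.odin, with core/bytes and core/strings dispatching to it.

package simd_util_sketch

import "base:intrinsics"

SCAN_WIDTH :: 32

@(rodata)
simd_scanner_indices := #simd[SCAN_WIDTH]u8 {
	 0,  1,  2,  3,  4,  5,  6,  7,
	 8,  9, 10, 11, 12, 13, 14, 15,
	16, 17, 18, 19, 20, 21, 22, 23,
	24, 25, 26, 27, 28, 29, 30, 31,
}

index_byte_sketch :: proc(data: []u8, c: byte) -> (index: int) #no_bounds_check {
	length := len(data)
	i := 0

	// Scalar: slices shorter than one vector are scanned byte-by-byte.
	if length < SCAN_WIDTH {
		for /**/; i < length; i += 1 {
			if data[i] == c {
				return i
			}
		}
		return -1
	}

	// Scalar: walk forward until the cursor sits on a SCAN_WIDTH boundary.
	// For example, ptr % 32 == 7 gives alignment_start == 25, so 25 bytes are
	// checked here and every vector load afterwards is aligned.
	ptr := cast(int)cast(uintptr)raw_data(data)
	alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
	for /**/; i < alignment_start; i += 1 {
		if data[i] == c {
			return i
		}
	}

	// Vector: compare 32 bytes per iteration. On a hit, select the lane
	// indices of the matching bytes (0xFF elsewhere) and take the minimum,
	// which is the first match within the chunk.
	scanner: #simd[SCAN_WIDTH]u8 = c
	tail := length - (length - alignment_start) % SCAN_WIDTH
	for /**/; i < tail; i += SCAN_WIDTH {
		load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
		comparison := intrinsics.simd_lanes_eq(load, scanner)
		if intrinsics.simd_reduce_or(comparison) > 0 {
			sentinel: #simd[SCAN_WIDTH]u8 = u8(0xFF)
			index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
			return i + cast(int)intrinsics.simd_reduce_min(index_select)
		}
	}

	// Scalar: whatever remains after the last full aligned chunk.
	for /**/; i < length; i += 1 {
		if data[i] == c {
			return i
		}
	}
	return -1
}

The backward scan in last_index_byte mirrors the same Scalar->Vector->Scalar shape, walking from the end of the slice and reducing with simd_reduce_max over a zero sentinel instead.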
Diffstat (limited to 'core')
-rw-r--r--	core/bytes/bytes.odin	18
-rw-r--r--	core/simd/util/util.odin	216
-rw-r--r--	core/strings/strings.odin	18
3 files changed, 102 insertions, 150 deletions
diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin
index dcd4931e2..136c98f6b 100644
--- a/core/bytes/bytes.odin
+++ b/core/bytes/bytes.odin
@@ -309,14 +309,8 @@ index_byte :: proc(s: []byte, c: byte) -> int {
// NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a
// significant speedup when compiling in either Size or Speed mode.
// The SIMD version is usually 2-3x slower without optimizations on.
- when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
- // SIMD's benefits are noticeable only past a certain threshold of data.
- // For small data, use the plain old algorithm.
- if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
- return simd_util.index_byte(s, c)
- } else {
- return _index_byte(s, c)
- }
+ when ODIN_OPTIMIZATION_MODE > .Minimal {
+ return #force_inline simd_util.index_byte(s, c)
} else {
return _index_byte(s, c)
}
@@ -333,12 +327,8 @@ last_index_byte :: proc(s: []byte, c: byte) -> int {
return -1
}
- when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
- if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
- return simd_util.last_index_byte(s, c)
- } else {
- return _last_index_byte(s, c)
- }
+ when ODIN_OPTIMIZATION_MODE > .Minimal {
+ return #force_inline simd_util.last_index_byte(s, c)
} else {
return _last_index_byte(s, c)
}
diff --git a/core/simd/util/util.odin b/core/simd/util/util.odin
index ac523b42a..b209a44ea 100644
--- a/core/simd/util/util.odin
+++ b/core/simd/util/util.odin
@@ -8,26 +8,24 @@
// package simd_util implements compositions of SIMD operations for optimizing
// the core library where available.
-
-//+build i386, amd64
package simd_util
import "base:intrinsics"
-import "core:simd/x86"
-@private SCAN_REGISTER_SIZE :: 16
-@private SCAN_REGISTERS :: 4
-@private SCAN_WIDTH :: SCAN_REGISTERS * SCAN_REGISTER_SIZE
+@private SCAN_WIDTH :: 32
-// How long should a string be before using any of the `index_*` procedures in
-// this package.
-RECOMMENDED_SCAN_SIZE :: SCAN_REGISTER_SIZE
+@(private, rodata)
+simd_scanner_indices := #simd[SCAN_WIDTH]u8 {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+}
/*
Scan a slice of bytes for a specific byte.
-This procedure safely handles padding out slices of any length, including empty
-slices.
+This procedure safely handles slices of any length, including empty slices.
Inputs:
- data: A slice of bytes.
@@ -36,83 +34,54 @@ Inputs:
Returns:
- index: The index of the byte `c`, or -1 if it was not found.
*/
-@(enable_target_feature="sse2")
index_byte :: proc(data: []u8, c: byte) -> (index: int) #no_bounds_check {
- scanner_data: [SCAN_REGISTER_SIZE]u8 = c
- scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0])
-
- i: int
length := len(data)
- full_chunks_length := length - length % SCAN_WIDTH
-
- for /**/; i < full_chunks_length; i += SCAN_WIDTH {
- simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i])
-
- #unroll for j in 0..<SCAN_REGISTERS {
- cmp := x86._mm_cmpeq_epi8(simd_load[j], scanner)
- mask := x86._mm_movemask_epi8(cmp)
-
- // NOTE(Feoramund): I experimented with ORing all the masks onto a
- // 128-bit integer before performing the `mask != 0` check to see
- // if that might be faster. However, the cost to avoid 3
- // compares resulted in a marginally slower runtime on my machine.
- //
- // Simpler won out here.
- if mask != 0 {
- ctz := intrinsics.count_trailing_zeros(mask)
- return i + j * SCAN_REGISTER_SIZE + cast(int)ctz
+ i := 0
+
+ // Guard against small strings.
+ if length < SCAN_WIDTH {
+ for /**/; i < length; i += 1 {
+ if data[i] == c {
+ return i
}
}
+ return -1
}
- if i < length {
- // The data is not exactly divisible by SCAN_WIDTH, and we haven't found
- // what we're looking for yet, so we must pad out the end, then run our
- // algorithm on it.
- padded_data_end: [SCAN_WIDTH]u8 = ---
- remnant_length := length % SCAN_WIDTH
- intrinsics.mem_copy_non_overlapping(
- &padded_data_end[0],
- &raw_data(data)[full_chunks_length],
- remnant_length,
- )
-
- simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&padded_data_end[0])
-
- #unroll for j in 0..<SCAN_REGISTERS {
- cmp := x86._mm_cmpeq_epi8(simd_load[j], scanner)
- mask := x86._mm_movemask_epi8(cmp)
-
- // Because this data is padded out, it's possible that we could
- // match on uninitialized memory, so we must guard against that.
-
- // Create a relevancy mask: (Example)
- //
- // max(u64) = 0xFFFF_FFFF_FFFF_FFFF
- //
- // Convert an integer into a stream of on-bits by using the
- // shifted negation of the maximum. The subtraction selects which
- // section of the overall mask we should apply.
- //
- // << 17 - (1 * SCAN_REGISTER_SIZE)
- // = 0xFFFF_FFFF_FFFF_FFFE
- //
- submask := max(u64) << u64(remnant_length - (j * SCAN_REGISTER_SIZE))
- //
- // ~submask = 0x0000_0000_0000_0001
- // (submask >> 63) = 0x0000_0000_0000_0001
- //
- // The multiplication is a guard against zero.
- //
- submask = ~submask * (submask >> 63)
- //
- // Finally, mask out any irrelevant bits with the submask.
- mask &= i32(submask)
-
- if mask != 0 {
- ctz := int(intrinsics.count_trailing_zeros(mask))
- return i + j * SCAN_REGISTER_SIZE + ctz
- }
+ ptr := cast(int)cast(uintptr)raw_data(data)
+
+ alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
+
+ // Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary.
+ //
+ // This way, every load in the vector loop will be aligned, which should be
+ // the fastest possible scenario.
+ for /**/; i < alignment_start; i += 1 {
+ if data[i] == c {
+ return i
+ }
+ }
+
+ // Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
+ scanner: #simd[SCAN_WIDTH]u8 = c
+ tail := length - (length - alignment_start) % SCAN_WIDTH
+
+ for /**/; i < tail; i += SCAN_WIDTH {
+ load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
+ comparison := intrinsics.simd_lanes_eq(load, scanner)
+ match := intrinsics.simd_reduce_or(comparison)
+ if match > 0 {
+ sentinel: #simd[SCAN_WIDTH]u8 = u8(0xFF)
+ index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
+ index_reduce := intrinsics.simd_reduce_min(index_select)
+ return i + cast(int)index_reduce
+ }
+ }
+
+ // Iterate as a scalar over the remaining unaligned portion.
+ for /**/; i < length; i += 1 {
+ if data[i] == c {
+ return i
}
}
@@ -123,8 +92,7 @@ index_byte :: proc(data: []u8, c: byte) -> (index: int) #no_bounds_check {
Scan a slice of bytes for a specific byte, starting from the end and working
backwards to the start.
-This procedure safely handles padding out slices of any length, including empty
-slices.
+This procedure safely handles slices of any length, including empty slices.
Inputs:
- data: A slice of bytes.
@@ -133,54 +101,58 @@ Inputs:
Returns:
- index: The index of the byte `c`, or -1 if it was not found.
*/
-@(enable_target_feature="sse2")
last_index_byte :: proc(data: []u8, c: byte) -> int #no_bounds_check {
- scanner_data: [SCAN_REGISTER_SIZE]u8 = c
- scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0])
-
- i := len(data) - SCAN_WIDTH
-
- for /**/; i >= 0; i -= SCAN_WIDTH {
- simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i])
-
- // There is no #reverse #unroll at the time of this writing, so we use
- // `j` to count down by subtraction.
- #unroll for j in 1..=SCAN_REGISTERS {
- cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner)
- mask := x86._mm_movemask_epi8(cmp)
+ length := len(data)
+ i := length - 1
- if mask != 0 {
- // CLZ is used instead to get the on-bit from the other end.
- clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask))
- return i + SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz
+ // Guard against small strings.
+ if length < SCAN_WIDTH {
+ for /**/; i >= 0; i -= 1 {
+ if data[i] == c {
+ return i
}
}
+ return -1
}
- if i < 0 {
- padded_data_end: [SCAN_WIDTH]u8 = ---
- remnant_length := len(data) % SCAN_WIDTH
- intrinsics.mem_copy_non_overlapping(
- &padded_data_end[0],
- &raw_data(data)[0],
- remnant_length,
- )
-
- simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&padded_data_end[0])
+ ptr := cast(int)cast(uintptr)raw_data(data)
- #unroll for j in 1..=SCAN_REGISTERS {
- cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner)
- mask := x86._mm_movemask_epi8(cmp)
+ tail := length - (ptr + length) % SCAN_WIDTH
- submask := max(u64) << u64(remnant_length - (SCAN_REGISTERS-j) * SCAN_REGISTER_SIZE)
- submask = ~submask * (submask >> 63)
+ // Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary.
+ //
+ // This way, every load in the vector loop will be aligned, which should be
+ // the fastest possible scenario.
+ for /**/; i >= tail; i -= 1 {
+ if data[i] == c {
+ return i
+ }
+ }
- mask &= i32(submask)
+ // Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
+ scanner: #simd[SCAN_WIDTH]u8 = c
+ alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
+
+ i -= SCAN_WIDTH - 1
+
+ for /**/; i >= alignment_start; i -= SCAN_WIDTH {
+ load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
+ comparison := intrinsics.simd_lanes_eq(load, scanner)
+ match := intrinsics.simd_reduce_or(comparison)
+ if match > 0 {
+ sentinel: #simd[SCAN_WIDTH]u8
+ index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
+ index_reduce := intrinsics.simd_reduce_max(index_select)
+ return i + cast(int)index_reduce
+ }
+ }
- if mask != 0 {
- clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask))
- return SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz
- }
+ // Iterate as a scalar over the remaining unaligned portion.
+ i += SCAN_WIDTH - 1
+
+ for /**/; i >= 0; i -= 1 {
+ if data[i] == c {
+ return i
}
}
diff --git a/core/strings/strings.odin b/core/strings/strings.odin
index 9d3e88165..b8e43f90d 100644
--- a/core/strings/strings.odin
+++ b/core/strings/strings.odin
@@ -1438,14 +1438,8 @@ index_byte :: proc(s: string, c: byte) -> (res: int) {
// NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a
// significant speedup when compiling in either Size or Speed mode.
// The SIMD version is usually 2-3x slower without optimizations on.
- when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
- // SIMD's benefits are noticeable only past a certain threshold of data.
- // For small data, use the plain old algorithm.
- if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
- return simd_util.index_byte(transmute([]u8)s, c)
- } else {
- return _index_byte(s, c)
- }
+ when ODIN_OPTIMIZATION_MODE > .Minimal {
+ return #force_inline simd_util.index_byte(transmute([]u8)s, c)
} else {
return _index_byte(s, c)
}
@@ -1492,12 +1486,8 @@ last_index_byte :: proc(s: string, c: byte) -> (res: int) {
return -1
}
- when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
- if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
- return simd_util.last_index_byte(transmute([]u8)s, c)
- } else {
- return _last_index_byte(s, c)
- }
+ when ODIN_OPTIMIZATION_MODE > .Minimal {
+ return #force_inline simd_util.last_index_byte(transmute([]u8)s, c)
} else {
return _last_index_byte(s, c)
}
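
As a quick sanity check of the call sites above, a hypothetical snippet exercising the public entry points that now route through simd_util on every target whenever optimizations are enabled (the package name and expected indices are illustrative, assuming ASCII input):

package example

import "core:bytes"
import "core:fmt"
import "core:strings"

main :: proc() {
	s := "hello, world"
	fmt.println(strings.index_byte(s, 'w'))                    // 7
	fmt.println(strings.last_index_byte(s, 'l'))               // 10
	fmt.println(bytes.index_byte(transmute([]u8)s, 'w'))       // 7
	fmt.println(bytes.last_index_byte(transmute([]u8)s, 'x'))  // -1
}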