| author | Feoramund <161657516+Feoramund@users.noreply.github.com> | 2024-08-09 17:39:19 -0400 |
|---|---|---|
| committer | Feoramund <161657516+Feoramund@users.noreply.github.com> | 2024-08-09 18:54:04 -0400 |
| commit | 12dd0cb72a586a99129280c78697089caab0500a | |
| tree | 80bbcb47b01bcefff2595c61147dd15318054b00 | |
| parent | 793811b219e77b21a1c765323957e4b74ce13e64 | |
Simplify and make `simd_util` cross-platform
This new algorithm uses a Scalar->Vector->Scalar iteration loop that requires
no masking of incomplete data chunks.
The scan width was also reduced from 64 bytes to 32, as I found this to be
about as fast as the previous 64-byte x86 version.
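
In outline, the new scan has the shape sketched below. This is a condensed, hypothetical illustration of the Scalar->Vector->Scalar structure described above, using the same `base:intrinsics` SIMD calls the commit relies on (`simd_lanes_eq`, `simd_reduce_or`); it is not the committed code, which additionally recovers the exact matching lane with an index table, `simd_select`, and `simd_reduce_min` (see `core/simd/util/util.odin` in the diff below).

```odin
package simd_scan_sketch

import "base:intrinsics"

SCAN_WIDTH :: 32

// Condensed sketch of the Scalar->Vector->Scalar loop described above.
index_byte_sketch :: proc(data: []u8, c: byte) -> int #no_bounds_check {
	length := len(data)
	i := 0

	// Scalar: walk forward until the next SCAN_WIDTH-aligned address (or the
	// end of the data, whichever comes first). No chunk padding is needed.
	ptr := cast(int)cast(uintptr)raw_data(data)
	alignment_start := min(length, (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH)
	for /**/; i < alignment_start; i += 1 {
		if data[i] == c {
			return i
		}
	}

	// Vector: compare 32 bytes per iteration over the aligned middle chunks.
	scanner: #simd[SCAN_WIDTH]u8 = c
	tail := length - (length - i) % SCAN_WIDTH
	for /**/; i < tail; i += SCAN_WIDTH {
		load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
		comparison := intrinsics.simd_lanes_eq(load, scanner)
		if intrinsics.simd_reduce_or(comparison) > 0 {
			// The committed version finds the first matching lane with
			// simd_select over an index table and simd_reduce_min; a plain
			// scalar rescan of the chunk is used here for brevity.
			for j in 0..<SCAN_WIDTH {
				if data[i+j] == c {
					return i + j
				}
			}
		}
	}

	// Scalar: finish the unaligned remainder.
	for /**/; i < length; i += 1 {
		if data[i] == c {
			return i
		}
	}
	return -1
}
```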
| -rw-r--r-- | core/bytes/bytes.odin | 18 |
| -rw-r--r-- | core/simd/util/util.odin | 216 |
| -rw-r--r-- | core/strings/strings.odin | 18 |
| -rw-r--r-- | tests/benchmark/simd/util/benchmark_simd_util.odin | 1 |
| -rw-r--r-- | tests/core/simd/util/test_core_simd_util.odin | 1 |
5 files changed, 102 insertions, 152 deletions
diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin
index dcd4931e2..136c98f6b 100644
--- a/core/bytes/bytes.odin
+++ b/core/bytes/bytes.odin
@@ -309,14 +309,8 @@ index_byte :: proc(s: []byte, c: byte) -> int {
 	// NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a
 	// significant speedup when compiling in either Size or Speed mode.
 	// The SIMD version is usually 2-3x slower without optimizations on.
-	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
-		// SIMD's benefits are noticeable only past a certain threshold of data.
-		// For small data, use the plain old algorithm.
-		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
-			return simd_util.index_byte(s, c)
-		} else {
-			return _index_byte(s, c)
-		}
+	when ODIN_OPTIMIZATION_MODE > .Minimal {
+		return #force_inline simd_util.index_byte(s, c)
 	} else {
 		return _index_byte(s, c)
 	}
@@ -333,12 +327,8 @@ last_index_byte :: proc(s: []byte, c: byte) -> int {
 		return -1
 	}
 
-	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
-		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
-			return simd_util.last_index_byte(s, c)
-		} else {
-			return _last_index_byte(s, c)
-		}
+	when ODIN_OPTIMIZATION_MODE > .Minimal {
+		return #force_inline simd_util.last_index_byte(s, c)
 	} else {
 		return _last_index_byte(s, c)
 	}
diff --git a/core/simd/util/util.odin b/core/simd/util/util.odin
index ac523b42a..b209a44ea 100644
--- a/core/simd/util/util.odin
+++ b/core/simd/util/util.odin
@@ -8,26 +8,24 @@
 // package simd_util implements compositions of SIMD operations for optimizing
 // the core library where available.
-
-//+build i386, amd64
 package simd_util
 
 import "base:intrinsics"
-import "core:simd/x86"
 
-@private SCAN_REGISTER_SIZE :: 16
-@private SCAN_REGISTERS :: 4
-@private SCAN_WIDTH :: SCAN_REGISTERS * SCAN_REGISTER_SIZE
+@private SCAN_WIDTH :: 32
 
-// How long should a string be before using any of the `index_*` procedures in
-// this package.
-RECOMMENDED_SCAN_SIZE :: SCAN_REGISTER_SIZE
+@(private, rodata)
+simd_scanner_indices := #simd[SCAN_WIDTH]u8 {
+	 0,  1,  2,  3,  4,  5,  6,  7,
+	 8,  9, 10, 11, 12, 13, 14, 15,
+	16, 17, 18, 19, 20, 21, 22, 23,
+	24, 25, 26, 27, 28, 29, 30, 31,
+}
 
 /*
 Scan a slice of bytes for a specific byte.
 
-This procedure safely handles padding out slices of any length, including empty
-slices.
+This procedure safely handles slices of any length, including empty slices.
 
 Inputs:
 - data: A slice of bytes.
@@ -36,83 +34,54 @@ Inputs:
 Returns:
 - index: The index of the byte `c`, or -1 if it was not found.
 */
-@(enable_target_feature="sse2")
 index_byte :: proc(data: []u8, c: byte) -> (index: int) #no_bounds_check {
-	scanner_data: [SCAN_REGISTER_SIZE]u8 = c
-	scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0])
-
-	i: int
 	length := len(data)
-	full_chunks_length := length - length % SCAN_WIDTH
-
-	for /**/; i < full_chunks_length; i += SCAN_WIDTH {
-		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i])
-
-		#unroll for j in 0..<SCAN_REGISTERS {
-			cmp := x86._mm_cmpeq_epi8(simd_load[j], scanner)
-			mask := x86._mm_movemask_epi8(cmp)
-
-			// NOTE(Feoramund): I experimented with ORing all the masks onto a
-			// 128-bit integer before performing the `mask != 0` check to see
-			// if that might be faster. However, the cost to avoid 3
-			// compares resulted in a marginally slower runtime on my machine.
-			//
-			// Simpler won out here.
-			if mask != 0 {
-				ctz := intrinsics.count_trailing_zeros(mask)
-				return i + j * SCAN_REGISTER_SIZE + cast(int)ctz
+	i := 0
+
+	// Guard against small strings.
+	if length < SCAN_WIDTH {
+		for /**/; i < length; i += 1 {
+			if data[i] == c {
+				return i
 			}
 		}
+		return -1
 	}
 
-	if i < length {
-		// The data is not exactly divisible by SCAN_WIDTH, and we haven't found
-		// what we're looking for yet, so we must pad out the end, then run our
-		// algorithm on it.
-		padded_data_end: [SCAN_WIDTH]u8 = ---
-		remnant_length := length % SCAN_WIDTH
-		intrinsics.mem_copy_non_overlapping(
-			&padded_data_end[0],
-			&raw_data(data)[full_chunks_length],
-			remnant_length,
-		)
-
-		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&padded_data_end[0])
-
-		#unroll for j in 0..<SCAN_REGISTERS {
-			cmp := x86._mm_cmpeq_epi8(simd_load[j], scanner)
-			mask := x86._mm_movemask_epi8(cmp)
-
-			// Because this data is padded out, it's possible that we could
-			// match on uninitialized memory, so we must guard against that.
-
-			// Create a relevancy mask: (Example)
-			//
-			// max(u64) = 0xFFFF_FFFF_FFFF_FFFF
-			//
-			// Convert an integer into a stream of on-bits by using the
-			// shifted negation of the maximum. The subtraction selects which
-			// section of the overall mask we should apply.
-			//
-			// << 17 - (1 * SCAN_REGISTER_SIZE)
-			// = 0xFFFF_FFFF_FFFF_FFFE
-			//
-			submask := max(u64) << u64(remnant_length - (j * SCAN_REGISTER_SIZE))
-			//
-			// ~submask = 0x0000_0000_0000_0001
-			// (submask >> 63) = 0x0000_0000_0000_0001
-			//
-			// The multiplication is a guard against zero.
-			//
-			submask = ~submask * (submask >> 63)
-			//
-			// Finally, mask out any irrelevant bits with the submask.
-			mask &= i32(submask)
-
-			if mask != 0 {
-				ctz := int(intrinsics.count_trailing_zeros(mask))
-				return i + j * SCAN_REGISTER_SIZE + ctz
-			}
+	ptr := cast(int)cast(uintptr)raw_data(data)
+
+	alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
+
+	// Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary.
+	//
+	// This way, every load in the vector loop will be aligned, which should be
+	// the fastest possible scenario.
+	for /**/; i < alignment_start; i += 1 {
+		if data[i] == c {
+			return i
+		}
+	}
+
+	// Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
+	scanner: #simd[SCAN_WIDTH]u8 = c
+	tail := length - (length - alignment_start) % SCAN_WIDTH
+
+	for /**/; i < tail; i += SCAN_WIDTH {
+		load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
+		comparison := intrinsics.simd_lanes_eq(load, scanner)
+		match := intrinsics.simd_reduce_or(comparison)
+		if match > 0 {
+			sentinel: #simd[SCAN_WIDTH]u8 = u8(0xFF)
+			index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
+			index_reduce := intrinsics.simd_reduce_min(index_select)
+			return i + cast(int)index_reduce
+		}
+	}
+
+	// Iterate as a scalar over the remaining unaligned portion.
+	for /**/; i < length; i += 1 {
+		if data[i] == c {
+			return i
 		}
 	}
 
@@ -123,8 +92,7 @@ index_byte :: proc(data: []u8, c: byte) -> (index: int) #no_bounds_check {
 Scan a slice of bytes for a specific byte, starting from the end and working
 backwards to the start.
 
-This procedure safely handles padding out slices of any length, including empty
-slices.
+This procedure safely handles slices of any length, including empty slices.
 
 Inputs:
 - data: A slice of bytes.
@@ -133,54 +101,58 @@ Inputs:
 Returns:
 - index: The index of the byte `c`, or -1 if it was not found.
 */
-@(enable_target_feature="sse2")
 last_index_byte :: proc(data: []u8, c: byte) -> int #no_bounds_check {
-	scanner_data: [SCAN_REGISTER_SIZE]u8 = c
-	scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0])
-
-	i := len(data) - SCAN_WIDTH
-
-	for /**/; i >= 0; i -= SCAN_WIDTH {
-		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i])
-
-		// There is no #reverse #unroll at the time of this writing, so we use
-		// `j` to count down by subtraction.
-		#unroll for j in 1..=SCAN_REGISTERS {
-			cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner)
-			mask := x86._mm_movemask_epi8(cmp)
+	length := len(data)
+	i := length - 1
 
-			if mask != 0 {
-				// CLZ is used instead to get the on-bit from the other end.
-				clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask))
-				return i + SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz
+	// Guard against small strings.
+	if length < SCAN_WIDTH {
+		for /**/; i >= 0; i -= 1 {
+			if data[i] == c {
+				return i
 			}
 		}
+		return -1
 	}
 
-	if i < 0 {
-		padded_data_end: [SCAN_WIDTH]u8 = ---
-		remnant_length := len(data) % SCAN_WIDTH
-		intrinsics.mem_copy_non_overlapping(
-			&padded_data_end[0],
-			&raw_data(data)[0],
-			remnant_length,
-		)
-
-		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&padded_data_end[0])
+	ptr := cast(int)cast(uintptr)raw_data(data)
 
-		#unroll for j in 1..=SCAN_REGISTERS {
-			cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner)
-			mask := x86._mm_movemask_epi8(cmp)
+	tail := length - (ptr + length) % SCAN_WIDTH
 
-			submask := max(u64) << u64(remnant_length - (SCAN_REGISTERS-j) * SCAN_REGISTER_SIZE)
-			submask = ~submask * (submask >> 63)
+	// Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary.
+	//
+	// This way, every load in the vector loop will be aligned, which should be
+	// the fastest possible scenario.
+	for /**/; i >= tail; i -= 1 {
+		if data[i] == c {
+			return i
+		}
+	}
 
-			mask &= i32(submask)
+	// Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
+	scanner: #simd[SCAN_WIDTH]u8 = c
+	alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
+
+	i -= SCAN_WIDTH - 1
+
+	for /**/; i >= alignment_start; i -= SCAN_WIDTH {
+		load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
+		comparison := intrinsics.simd_lanes_eq(load, scanner)
+		match := intrinsics.simd_reduce_or(comparison)
+		if match > 0 {
+			sentinel: #simd[SCAN_WIDTH]u8
+			index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
+			index_reduce := intrinsics.simd_reduce_max(index_select)
+			return i + cast(int)index_reduce
+		}
+	}
 
-			if mask != 0 {
-				clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask))
-				return SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz
-			}
+	// Iterate as a scalar over the remaining unaligned portion.
+	i += SCAN_WIDTH - 1
+
+	for /**/; i >= 0; i -= 1 {
+		if data[i] == c {
+			return i
 		}
 	}
diff --git a/core/strings/strings.odin b/core/strings/strings.odin
index 9d3e88165..b8e43f90d 100644
--- a/core/strings/strings.odin
+++ b/core/strings/strings.odin
@@ -1438,14 +1438,8 @@ index_byte :: proc(s: string, c: byte) -> (res: int) {
 	// NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a
 	// significant speedup when compiling in either Size or Speed mode.
 	// The SIMD version is usually 2-3x slower without optimizations on.
-	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
-		// SIMD's benefits are noticeable only past a certain threshold of data.
-		// For small data, use the plain old algorithm.
-		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
-			return simd_util.index_byte(transmute([]u8)s, c)
-		} else {
-			return _index_byte(s, c)
-		}
+	when ODIN_OPTIMIZATION_MODE > .Minimal {
+		return #force_inline simd_util.index_byte(transmute([]u8)s, c)
 	} else {
 		return _index_byte(s, c)
 	}
@@ -1492,12 +1486,8 @@ last_index_byte :: proc(s: string, c: byte) -> (res: int) {
 		return -1
 	}
 
-	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
-		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
-			return simd_util.last_index_byte(transmute([]u8)s, c)
-		} else {
-			return _last_index_byte(s, c)
-		}
+	when ODIN_OPTIMIZATION_MODE > .Minimal {
+		return #force_inline simd_util.last_index_byte(transmute([]u8)s, c)
 	} else {
 		return _last_index_byte(s, c)
 	}
diff --git a/tests/benchmark/simd/util/benchmark_simd_util.odin b/tests/benchmark/simd/util/benchmark_simd_util.odin
index 4538c6612..18fa0a9e3 100644
--- a/tests/benchmark/simd/util/benchmark_simd_util.odin
+++ b/tests/benchmark/simd/util/benchmark_simd_util.odin
@@ -1,4 +1,3 @@
-//+build i386, amd64
 package benchmark_simd_util
 
 import "core:fmt"
diff --git a/tests/core/simd/util/test_core_simd_util.odin b/tests/core/simd/util/test_core_simd_util.odin
index 65bf566c0..ff7e1f9aa 100644
--- a/tests/core/simd/util/test_core_simd_util.odin
+++ b/tests/core/simd/util/test_core_simd_util.odin
@@ -1,4 +1,3 @@
-//+build i386, amd64
 package test_core_simd_util
 
 import simd_util "core:simd/util"
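
For callers of `core:bytes` and `core:strings`, the visible effect is that the SIMD path is now taken whenever the optimization mode is above `.Minimal`, with no SSE2 or minimum-length check. A small hypothetical snippet (not part of the commit) showing the entry point:

```odin
package example

import "core:bytes"
import "core:fmt"

main :: proc() {
	s := "hello, world"
	// With -o:speed or -o:size, this forwards to simd_util.index_byte on any
	// target; without optimization it falls back to the scalar _index_byte.
	fmt.println(bytes.index_byte(transmute([]u8)s, 'w')) // prints 7
}
```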