author    Feoramund <161657516+Feoramund@users.noreply.github.com>    2025-05-29 15:32:58 -0400
committer Feoramund <161657516+Feoramund@users.noreply.github.com>    2025-05-29 16:29:13 -0400
commit    34698288b812147202cd30cc357b47f306cc8f41 (patch)
tree      a8315aebbe4924ea8a953e8e0fba4d9a50acaa16 /base/runtime
parent    827a6f90454cc7540bb3a809657b8d4162545f3c (diff)
Vectorize `runtime.memory_*` comparison procedures
Diffstat (limited to 'base/runtime')
-rw-r--r--  base/runtime/internal.odin  198
1 file changed, 140 insertions(+), 58 deletions(-)
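
The core of the new `memory_compare` loops is a lane-index extraction idiom: compare two chunks lane-wise with `simd_lanes_ne`, and if any lane differs, keep the lane index for differing lanes, substitute an out-of-range sentinel (0xFF) for matching lanes, and take the lane-wise minimum to locate the first mismatching byte. A minimal, self-contained sketch of that idiom for a single 16-byte chunk follows; the `first_diff_16` helper and its package name are hypothetical and not part of the patch:

package simd_compare_sketch

import "base:intrinsics"

// Hypothetical helper (not in the patch): returns the index of the first
// differing byte between two 16-byte chunks, or -1 if the chunks are equal.
first_diff_16 :: proc "contextless" (x, y: rawptr) -> int {
	a := intrinsics.unaligned_load(cast(^#simd[16]u8)x)
	b := intrinsics.unaligned_load(cast(^#simd[16]u8)y)
	ne := intrinsics.simd_lanes_ne(a, b)             // 0xFF in lanes that differ, 0x00 elsewhere
	if intrinsics.simd_reduce_or(ne) == 0 {
		return -1                                // all 16 bytes match
	}
	sentinel: #simd[16]u8 = u8(0xFF)                 // larger than any real lane index (0..15)
	indices := intrinsics.simd_indices(#simd[16]u8)  // 0, 1, 2, ..., 15
	selected := intrinsics.simd_select(ne, indices, sentinel)
	return int(intrinsics.simd_reduce_min(selected)) // position of the first mismatch
}

The patch applies this same pattern in `memory_compare` at 32-, 16-, and 8-byte widths (the 32-byte path gated behind AVX2 on amd64), with a plain byte loop handling the tail.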
diff --git a/base/runtime/internal.odin b/base/runtime/internal.odin
index bddbcaa22..f51d01a9d 100644
--- a/base/runtime/internal.odin
+++ b/base/runtime/internal.odin
@@ -234,91 +234,173 @@ memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
case n == 0: return true
case x == y: return true
}
- a, b := ([^]byte)(x), ([^]byte)(y)
- length := uint(n)
+ a, b := cast([^]byte)x, cast([^]byte)y
+
+ n := uint(n)
+ i := uint(0)
+ m := uint(0)
+
+ if n >= 8 {
+ when !SIMD_IS_EMULATED {
+ // Avoid using 256-bit SIMD on platforms where its emulation is
+ // likely to be less than ideal.
+ when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+ m = n / 32 * 32
+ for /**/; i < m; i += 32 {
+ load_a := intrinsics.unaligned_load(cast(^#simd[32]u8)&a[i])
+ load_b := intrinsics.unaligned_load(cast(^#simd[32]u8)&b[i])
+ ne := intrinsics.simd_lanes_ne(load_a, load_b)
+ if intrinsics.simd_reduce_or(ne) != 0 {
+ return false
+ }
+ }
+ }
+ }
+
+ m = (n-i) / 16 * 16
+ for /**/; i < m; i += 16 {
+ load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)&a[i])
+ load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)&b[i])
+ ne := intrinsics.simd_lanes_ne(load_a, load_b)
+ if intrinsics.simd_reduce_or(ne) != 0 {
+ return false
+ }
+ }
- for i := uint(0); i < length; i += 1 {
+ m = (n-i) / 8 * 8
+ for /**/; i < m; i += 8 {
+ if intrinsics.unaligned_load(cast(^uintptr)&a[i]) != intrinsics.unaligned_load(cast(^uintptr)&b[i]) {
+ return false
+ }
+ }
+ }
+
+ for /**/; i < n; i += 1 {
if a[i] != b[i] {
return false
}
}
return true
}
-memory_compare :: proc "contextless" (a, b: rawptr, n: int) -> int #no_bounds_check {
+
+memory_compare :: proc "contextless" (x, y: rawptr, n: int) -> int #no_bounds_check {
switch {
- case a == b: return 0
- case a == nil: return -1
- case b == nil: return +1
- }
-
- x := uintptr(a)
- y := uintptr(b)
- n := uintptr(n)
-
- SU :: size_of(uintptr)
- fast := n/SU + 1
- offset := (fast-1)*SU
- curr_block := uintptr(0)
- if n < SU {
- fast = 0
- }
-
- for /**/; curr_block < fast; curr_block += 1 {
- va := (^uintptr)(x + curr_block * size_of(uintptr))^
- vb := (^uintptr)(y + curr_block * size_of(uintptr))^
- if va ~ vb != 0 {
- for pos := curr_block*SU; pos < n; pos += 1 {
- a := (^byte)(x+pos)^
- b := (^byte)(y+pos)^
- if a ~ b != 0 {
- return -1 if (int(a) - int(b)) < 0 else +1
+ case x == y: return 0
+ case x == nil: return -1
+ case y == nil: return +1
+ }
+ a, b := cast([^]byte)x, cast([^]byte)y
+
+ n := uint(n)
+ i := uint(0)
+ m := uint(0)
+
+ when !SIMD_IS_EMULATED {
+ when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+ m = n / 32 * 32
+ for /**/; i < m; i += 32 {
+ load_a := intrinsics.unaligned_load(cast(^#simd[32]u8)&a[i])
+ load_b := intrinsics.unaligned_load(cast(^#simd[32]u8)&b[i])
+ comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+ if intrinsics.simd_reduce_or(comparison) != 0 {
+ sentinel: #simd[32]u8 = u8(0xFF)
+ indices := intrinsics.simd_indices(#simd[32]u8)
+ index_select := intrinsics.simd_select(comparison, indices, sentinel)
+ index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
+ return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
}
}
}
}
- for /**/; offset < n; offset += 1 {
- a := (^byte)(x+offset)^
- b := (^byte)(y+offset)^
- if a ~ b != 0 {
- return -1 if (int(a) - int(b)) < 0 else +1
+ m = (n-i) / 16 * 16
+ for /**/; i < m; i += 16 {
+ load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)&a[i])
+ load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)&b[i])
+ comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+ if intrinsics.simd_reduce_or(comparison) != 0 {
+ sentinel: #simd[16]u8 = u8(0xFF)
+ indices := intrinsics.simd_indices(#simd[16]u8)
+ index_select := intrinsics.simd_select(comparison, indices, sentinel)
+ index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
+ return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
}
}
+ // 64-bit SIMD is faster than using a `uintptr` to detect a difference then
+ // re-iterating with the byte-by-byte loop, at least on AMD64.
+ m = (n-i) / 8 * 8
+ for /**/; i < m; i += 8 {
+ load_a := intrinsics.unaligned_load(cast(^#simd[8]u8)&a[i])
+ load_b := intrinsics.unaligned_load(cast(^#simd[8]u8)&b[i])
+ comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+ if intrinsics.simd_reduce_or(comparison) != 0 {
+ sentinel: #simd[8]u8 = u8(0xFF)
+ indices := intrinsics.simd_indices(#simd[8]u8)
+ index_select := intrinsics.simd_select(comparison, indices, sentinel)
+ index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
+ return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
+ }
+ }
+
+ for /**/; i < n; i += 1 {
+ if a[i] ~ b[i] != 0 {
+ return -1 if int(a[i]) - int(b[i]) < 0 else +1
+ }
+ }
return 0
}
memory_compare_zero :: proc "contextless" (a: rawptr, n: int) -> int #no_bounds_check {
- x := uintptr(a)
- n := uintptr(n)
-
- SU :: size_of(uintptr)
- fast := n/SU + 1
- offset := (fast-1)*SU
- curr_block := uintptr(0)
- if n < SU {
- fast = 0
- }
-
- for /**/; curr_block < fast; curr_block += 1 {
- va := (^uintptr)(x + curr_block * size_of(uintptr))^
- if va ~ 0 != 0 {
- for pos := curr_block*SU; pos < n; pos += 1 {
- a := (^byte)(x+pos)^
- if a ~ 0 != 0 {
- return -1 if int(a) < 0 else +1
+ n := uint(n)
+ i := uint(0)
+ m := uint(0)
+
+ // Because we're comparing against zero, we never return -1, as that would
+ // indicate the compared value is less than zero.
+ //
+ // Note that a zero return value here means equality.
+
+ bytes := ([^]u8)(a)
+
+ if n >= 8 {
+ when !SIMD_IS_EMULATED {
+ when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+ scanner32: #simd[32]u8
+ m = n / 32 * 32
+ for /**/; i < m; i += 32 {
+ load := intrinsics.unaligned_load(cast(^#simd[32]u8)&bytes[i])
+ ne := intrinsics.simd_lanes_ne(scanner32, load)
+ if intrinsics.simd_reduce_or(ne) > 0 {
+ return 1
+ }
}
}
}
- }
- for /**/; offset < n; offset += 1 {
- a := (^byte)(x+offset)^
- if a ~ 0 != 0 {
- return -1 if int(a) < 0 else +1
+ scanner16: #simd[16]u8
+ m = (n-i) / 16 * 16
+ for /**/; i < m; i += 16 {
+ load := intrinsics.unaligned_load(cast(^#simd[16]u8)&bytes[i])
+ ne := intrinsics.simd_lanes_ne(scanner16, load)
+ if intrinsics.simd_reduce_or(ne) != 0 {
+ return 1
+ }
+ }
+
+ m = (n-i) / 8 * 8
+ for /**/; i < m; i += 8 {
+ if intrinsics.unaligned_load(cast(^uintptr)&bytes[i]) != 0 {
+ return 1
+ }
}
}
+ for /**/; i < n; i += 1 {
+ if bytes[i] != 0 {
+ return 1
+ }
+ }
return 0
}
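
Not part of the commit, but for orientation, the three procedures above can be exercised directly. A minimal usage sketch, assuming they remain reachable from user code as ordinary package-level procedures in `base:runtime` (the package name and example values below are illustrative only):

package memory_compare_example

import "base:runtime"
import "core:fmt"

main :: proc() {
	a := [9]byte{1, 2, 3, 4, 5, 6, 7, 8, 9}
	b := [9]byte{1, 2, 3, 4, 5, 6, 7, 8, 10}
	zeros: [64]byte

	// The last byte differs, so the buffers are not equal.
	fmt.println(runtime.memory_equal(&a[0], &b[0], len(a)))         // false
	// a's first differing byte (9) is less than b's (10), so the result is -1.
	fmt.println(runtime.memory_compare(&a[0], &b[0], len(a)))       // -1
	// Every byte is zero, so comparison against zero reports equality.
	fmt.println(runtime.memory_compare_zero(&zeros[0], len(zeros))) // 0
}

The -1/+1 convention is memcmp-style ordering: the sign reflects which buffer holds the smaller byte at the first mismatch, and `memory_compare_zero` can only ever report 0 or +1, as its comment notes.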