author    Jeroen van Rijn <Kelimion@users.noreply.github.com>  2025-05-29 23:32:19 +0200
committer GitHub <noreply@github.com>                          2025-05-29 23:32:19 +0200
commit    3142aaf497d7a784eb85b9596e05fd329554c5a8 (patch)
tree      0d1cf96e11db30a134a640a72356fcf8ab1ecdc5
parent    0d0f311df1e31102f4138c1a28110f86dcf60fca (diff)
parent    45219f240efdb5ced113ea10398abe864ee21632 (diff)

Merge pull request #4063 from Feoramund/simd-memory

Vectorize `base:runtime.memory_*`
-rw-r--r--  base/runtime/internal.odin                           253
-rw-r--r--  core/bytes/bytes.odin                                  4
-rw-r--r--  core/crypto/_chacha20/simd128/chacha20_simd128.odin    2
-rw-r--r--  core/simd/simd.odin                                   13
-rw-r--r--  tests/benchmark/bytes/benchmark_bytes.odin             9
-rw-r--r--  tests/benchmark/runtime/benchmark_runtime.odin       227
-rw-r--r--  tests/core/runtime/test_core_runtime.odin             76
7 files changed, 457 insertions, 127 deletions
diff --git a/base/runtime/internal.odin b/base/runtime/internal.odin
index 38b7f662c..a35dbff8a 100644
--- a/base/runtime/internal.odin
+++ b/base/runtime/internal.odin
@@ -16,6 +16,12 @@ RUNTIME_REQUIRE :: false // !ODIN_TILDE
@(private)
__float16 :: f16 when __ODIN_LLVM_F16_SUPPORTED else u16
+HAS_HARDWARE_SIMD :: false when (ODIN_ARCH == .amd64 || ODIN_ARCH == .i386) && !intrinsics.has_target_feature("sse2") else
+ false when (ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32) && !intrinsics.has_target_feature("neon") else
+ false when (ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32) && !intrinsics.has_target_feature("simd128") else
+ false when (ODIN_ARCH == .riscv64) && !intrinsics.has_target_feature("v") else
+ true
+
@(private)
byte_slice :: #force_inline proc "contextless" (data: rawptr, len: int) -> []byte #no_bounds_check {
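The `HAS_HARDWARE_SIMD` constant added above gates every vector path in this patch at compile time. A minimal sketch of how such a gate is consumed; `sum_bytes` is a hypothetical helper, not part of this patch:

package example

import "base:intrinsics"
import "base:runtime"

// Sums bytes with a 16-byte SIMD fast path that is only compiled in when the
// target guarantees hardware SIMD; the scalar loop handles the tail, and the
// whole input on targets where SIMD would be emulated.
sum_bytes :: proc "contextless" (data: []u8) -> (acc: u8) #no_bounds_check {
	i := uint(0)
	n := uint(len(data))
	when runtime.HAS_HARDWARE_SIMD {
		acc_vec: #simd[16]u8
		for /**/; i + 16 <= n; i += 16 {
			load := intrinsics.unaligned_load(cast(^#simd[16]u8)&data[i])
			acc_vec = intrinsics.simd_add(acc_vec, load)
		}
		acc = intrinsics.simd_reduce_add_ordered(acc_vec)
	}
	for /**/; i < n; i += 1 {
		acc += data[i]
	}
	return
}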
@@ -229,150 +235,173 @@ memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
case n == 0: return true
case x == y: return true
}
- a, b := ([^]byte)(x), ([^]byte)(y)
- length := uint(n)
-
- for i := uint(0); i < length; i += 1 {
- if a[i] != b[i] {
- return false
- }
- }
- return true
-
-/*
-
- when size_of(uint) == 8 {
- if word_length := length >> 3; word_length != 0 {
- for _ in 0..<word_length {
- if intrinsics.unaligned_load((^u64)(a)) != intrinsics.unaligned_load((^u64)(b)) {
- return false
+ a, b := cast([^]byte)x, cast([^]byte)y
+
+ n := uint(n)
+ i := uint(0)
+ m := uint(0)
+
+ if n >= 8 {
+ when HAS_HARDWARE_SIMD {
+ // Avoid using 256-bit SIMD on platforms where its emulation is
+ // likely to be less than ideal.
+ when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+ m = n / 32 * 32
+ for /**/; i < m; i += 32 {
+ load_a := intrinsics.unaligned_load(cast(^#simd[32]u8)&a[i])
+ load_b := intrinsics.unaligned_load(cast(^#simd[32]u8)&b[i])
+ ne := intrinsics.simd_lanes_ne(load_a, load_b)
+ if intrinsics.simd_reduce_or(ne) != 0 {
+ return false
+ }
}
- a = a[size_of(u64):]
- b = b[size_of(u64):]
}
}
-
- if length & 4 != 0 {
- if intrinsics.unaligned_load((^u32)(a)) != intrinsics.unaligned_load((^u32)(b)) {
+
+ m = (n-i) / 16 * 16
+ for /**/; i < m; i += 16 {
+ load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)&a[i])
+ load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)&b[i])
+ ne := intrinsics.simd_lanes_ne(load_a, load_b)
+ if intrinsics.simd_reduce_or(ne) != 0 {
return false
}
- a = a[size_of(u32):]
- b = b[size_of(u32):]
}
-
- if length & 2 != 0 {
- if intrinsics.unaligned_load((^u16)(a)) != intrinsics.unaligned_load((^u16)(b)) {
+
+ m = (n-i) / 8 * 8
+ for /**/; i < m; i += 8 {
+ if intrinsics.unaligned_load(cast(^uintptr)&a[i]) != intrinsics.unaligned_load(cast(^uintptr)&b[i]) {
return false
}
- a = a[size_of(u16):]
- b = b[size_of(u16):]
}
-
- if length & 1 != 0 && a[0] != b[0] {
- return false
- }
- return true
- } else {
- if word_length := length >> 2; word_length != 0 {
- for _ in 0..<word_length {
- if intrinsics.unaligned_load((^u32)(a)) != intrinsics.unaligned_load((^u32)(b)) {
- return false
- }
- a = a[size_of(u32):]
- b = b[size_of(u32):]
- }
- }
-
- length &= 3
-
- if length != 0 {
- for i in 0..<length {
- if a[i] != b[i] {
- return false
- }
- }
- }
-
- return true
}
-*/
+ for /**/; i < n; i += 1 {
+ if a[i] != b[i] {
+ return false
+ }
+ }
+ return true
}
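Each chunk comparison in `memory_equal` reduces a lane-wise inequality mask with a single OR: if any lane differs, the reduction is non-zero. A standalone sketch of that check, runnable on any target since emulated SIMD is still correct, just slower:

package example

import "base:intrinsics"

main :: proc() {
	a: #simd[16]u8 = 0xAA
	b := a

	// Identical vectors: every mask lane is zero, so the OR-reduction is zero.
	assert(intrinsics.simd_reduce_or(intrinsics.simd_lanes_ne(a, b)) == 0)

	// Change one lane: a single non-zero mask lane makes the reduction non-zero.
	b = intrinsics.simd_replace(b, 5, 0x55)
	assert(intrinsics.simd_reduce_or(intrinsics.simd_lanes_ne(a, b)) != 0)
}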
-memory_compare :: proc "contextless" (a, b: rawptr, n: int) -> int #no_bounds_check {
+
+memory_compare :: proc "contextless" (x, y: rawptr, n: int) -> int #no_bounds_check {
switch {
- case a == b: return 0
- case a == nil: return -1
- case b == nil: return +1
- }
-
- x := uintptr(a)
- y := uintptr(b)
- n := uintptr(n)
-
- SU :: size_of(uintptr)
- fast := n/SU + 1
- offset := (fast-1)*SU
- curr_block := uintptr(0)
- if n < SU {
- fast = 0
- }
-
- for /**/; curr_block < fast; curr_block += 1 {
- va := (^uintptr)(x + curr_block * size_of(uintptr))^
- vb := (^uintptr)(y + curr_block * size_of(uintptr))^
- if va ~ vb != 0 {
- for pos := curr_block*SU; pos < n; pos += 1 {
- a := (^byte)(x+pos)^
- b := (^byte)(y+pos)^
- if a ~ b != 0 {
- return -1 if (int(a) - int(b)) < 0 else +1
+ case x == y: return 0
+ case x == nil: return -1
+ case y == nil: return +1
+ }
+ a, b := cast([^]byte)x, cast([^]byte)y
+
+ n := uint(n)
+ i := uint(0)
+ m := uint(0)
+
+ when HAS_HARDWARE_SIMD {
+ when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+ m = n / 32 * 32
+ for /**/; i < m; i += 32 {
+ load_a := intrinsics.unaligned_load(cast(^#simd[32]u8)&a[i])
+ load_b := intrinsics.unaligned_load(cast(^#simd[32]u8)&b[i])
+ comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+ if intrinsics.simd_reduce_or(comparison) != 0 {
+ sentinel: #simd[32]u8 = u8(0xFF)
+ indices := intrinsics.simd_indices(#simd[32]u8)
+ index_select := intrinsics.simd_select(comparison, indices, sentinel)
+ index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
+ return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
}
}
}
}
- for /**/; offset < n; offset += 1 {
- a := (^byte)(x+offset)^
- b := (^byte)(y+offset)^
- if a ~ b != 0 {
- return -1 if (int(a) - int(b)) < 0 else +1
+ m = (n-i) / 16 * 16
+ for /**/; i < m; i += 16 {
+ load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)&a[i])
+ load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)&b[i])
+ comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+ if intrinsics.simd_reduce_or(comparison) != 0 {
+ sentinel: #simd[16]u8 = u8(0xFF)
+ indices := intrinsics.simd_indices(#simd[16]u8)
+ index_select := intrinsics.simd_select(comparison, indices, sentinel)
+ index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
+ return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
}
}
+ // 64-bit SIMD is faster than using a `uintptr` to detect a difference and
+ // then re-iterating with the byte-by-byte loop, at least on AMD64.
+ m = (n-i) / 8 * 8
+ for /**/; i < m; i += 8 {
+ load_a := intrinsics.unaligned_load(cast(^#simd[8]u8)&a[i])
+ load_b := intrinsics.unaligned_load(cast(^#simd[8]u8)&b[i])
+ comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+ if intrinsics.simd_reduce_or(comparison) != 0 {
+ sentinel: #simd[8]u8 = u8(0xFF)
+ indices := intrinsics.simd_indices(#simd[8]u8)
+ index_select := intrinsics.simd_select(comparison, indices, sentinel)
+ index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
+ return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
+ }
+ }
+
+ for /**/; i < n; i += 1 {
+ if a[i] ~ b[i] != 0 {
+ return -1 if int(a[i]) - int(b[i]) < 0 else +1
+ }
+ }
return 0
}
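The sentinel pattern in `memory_compare` turns a mismatch mask into the position of the first differing byte: lanes that differ keep their own index, equal lanes become 0xFF, and the minimum across lanes is then the earliest mismatch. The same idea as a standalone procedure, written for 16 lanes purely as an illustration:

package example

import "base:intrinsics"

// Returns the index of the first lane where `a` and `b` differ, if any.
first_mismatch :: proc "contextless" (a, b: #simd[16]u8) -> (index: uint, found: bool) {
	comparison := intrinsics.simd_lanes_ne(a, b)
	if intrinsics.simd_reduce_or(comparison) == 0 {
		return 0, false
	}
	sentinel: #simd[16]u8 = u8(0xFF)
	indices := intrinsics.simd_indices(#simd[16]u8)
	// Differing lanes select their own index; equal lanes select the
	// sentinel, which can never win the minimum over 16 lanes.
	index_select := intrinsics.simd_select(comparison, indices, sentinel)
	return cast(uint)intrinsics.simd_reduce_min(index_select), true
}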
memory_compare_zero :: proc "contextless" (a: rawptr, n: int) -> int #no_bounds_check {
- x := uintptr(a)
- n := uintptr(n)
-
- SU :: size_of(uintptr)
- fast := n/SU + 1
- offset := (fast-1)*SU
- curr_block := uintptr(0)
- if n < SU {
- fast = 0
- }
-
- for /**/; curr_block < fast; curr_block += 1 {
- va := (^uintptr)(x + curr_block * size_of(uintptr))^
- if va ~ 0 != 0 {
- for pos := curr_block*SU; pos < n; pos += 1 {
- a := (^byte)(x+pos)^
- if a ~ 0 != 0 {
- return -1 if int(a) < 0 else +1
+ n := uint(n)
+ i := uint(0)
+ m := uint(0)
+
+ // Because we're comparing against zero, we never return -1, as that would
+ // indicate the compared value is less than zero.
+ //
+ // Note that a zero return value here means equality.
+
+ bytes := ([^]u8)(a)
+
+ if n >= 8 {
+ when HAS_HARDWARE_SIMD {
+ when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+ scanner32: #simd[32]u8
+ m = n / 32 * 32
+ for /**/; i < m; i += 32 {
+ load := intrinsics.unaligned_load(cast(^#simd[32]u8)&bytes[i])
+ ne := intrinsics.simd_lanes_ne(scanner32, load)
+ if intrinsics.simd_reduce_or(ne) > 0 {
+ return 1
+ }
}
}
}
- }
- for /**/; offset < n; offset += 1 {
- a := (^byte)(x+offset)^
- if a ~ 0 != 0 {
- return -1 if int(a) < 0 else +1
+ scanner16: #simd[16]u8
+ m = (n-i) / 16 * 16
+ for /**/; i < m; i += 16 {
+ load := intrinsics.unaligned_load(cast(^#simd[16]u8)&bytes[i])
+ ne := intrinsics.simd_lanes_ne(scanner16, load)
+ if intrinsics.simd_reduce_or(ne) != 0 {
+ return 1
+ }
+ }
+
+ m = (n-i) / 8 * 8
+ for /**/; i < m; i += 8 {
+ if intrinsics.unaligned_load(cast(^uintptr)&bytes[i]) != 0 {
+ return 1
+ }
}
}
+ for /**/; i < n; i += 1 {
+ if bytes[i] != 0 {
+ return 1
+ }
+ }
return 0
}
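As the comment in `memory_compare_zero` notes, its only possible results are 0 (all bytes zero) and 1 (some byte non-zero). A small usage sketch of the three procedures, relying only on what this patch exports:

package example

import "base:runtime"

main :: proc() {
	left:  [32]u8
	right: [32]u8

	assert(runtime.memory_equal(&left[0], &right[0], 32))
	assert(runtime.memory_compare(&left[0], &right[0], 32) == 0)
	assert(runtime.memory_compare_zero(&left[0], 32) == 0)

	right[17] = 0x80
	assert(!runtime.memory_equal(&left[0], &right[0], 32))
	assert(runtime.memory_compare(&left[0], &right[0], 32) == -1) // left[17] < right[17]
	assert(runtime.memory_compare_zero(&right[0], 32) == 1)
}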
diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin
index c0d25bcce..71b6ef70c 100644
--- a/core/bytes/bytes.odin
+++ b/core/bytes/bytes.odin
@@ -350,7 +350,7 @@ index_byte :: proc "contextless" (s: []byte, c: byte) -> (index: int) #no_bounds
}
c_vec: simd.u8x16 = c
- when !simd.IS_EMULATED {
+ when simd.HAS_HARDWARE_SIMD {
// Note: While this is something that could also logically take
// advantage of AVX512, the various downclocking and power
// consumption related woes make it premature to have a dedicated
@@ -485,7 +485,7 @@ last_index_byte :: proc "contextless" (s: []byte, c: byte) -> int #no_bounds_che
}
c_vec: simd.u8x16 = c
- when !simd.IS_EMULATED {
+ when simd.HAS_HARDWARE_SIMD {
// Note: While this is something that could also logically take
// advantage of AVX512, the various downclocking and power
// consumption related woes make it premature to have a dedicated
diff --git a/core/crypto/_chacha20/simd128/chacha20_simd128.odin b/core/crypto/_chacha20/simd128/chacha20_simd128.odin
index 6b37b8d61..4bf40e240 100644
--- a/core/crypto/_chacha20/simd128/chacha20_simd128.odin
+++ b/core/crypto/_chacha20/simd128/chacha20_simd128.odin
@@ -39,7 +39,7 @@ when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
// Some targets lack runtime feature detection, and will flat out refuse
// to load binaries that have unknown instructions. This is distinct from
-// `simd.IS_EMULATED` as actually good designs support runtime feature
+// `simd.HAS_HARDWARE_SIMD`, as actually good designs support runtime feature
// detection and that constant establishes a baseline.
//
// See:
diff --git a/core/simd/simd.odin b/core/simd/simd.odin
index a97155f58..b4779b5ff 100644
--- a/core/simd/simd.odin
+++ b/core/simd/simd.odin
@@ -21,20 +21,17 @@ package simd
import "base:builtin"
import "base:intrinsics"
+import "base:runtime"
/*
Check if SIMD is software-emulated on a target platform.
-This value is `false`, when the compile-time target has the hardware support for
-at 128-bit (or wider) SIMD. If the compile-time target lacks the hardware support
-for 128-bit SIMD, this value is `true`, and all SIMD operations will likely be
+This value is `true` when the compile-time target has hardware support for
+128-bit (or wider) SIMD. If the compile-time target lacks hardware support
+for 128-bit SIMD, this value is `false`, and all SIMD operations will likely be
emulated.
*/
-IS_EMULATED :: true when (ODIN_ARCH == .amd64 || ODIN_ARCH == .i386) && !intrinsics.has_target_feature("sse2") else
- true when (ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32) && !intrinsics.has_target_feature("neon") else
- true when (ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32) && !intrinsics.has_target_feature("simd128") else
- true when (ODIN_ARCH == .riscv64) && !intrinsics.has_target_feature("v") else
- false
+HAS_HARDWARE_SIMD :: runtime.HAS_HARDWARE_SIMD
/*
Vector of 16 `u8` lanes (128 bits).
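A sketch of the guard pattern this constant exists for, mirroring the `core:bytes` call sites above; the procedure itself is a placeholder, not library code:

package example

import "core:simd"

index_byte_sketch :: proc "contextless" (s: []byte, c: byte) -> int {
	when simd.HAS_HARDWARE_SIMD {
		// The 128-bit scanning loop would go here, as in `core:bytes.index_byte`.
	}
	// Scalar fallback; with emulated SIMD, this is likely the faster path anyway.
	for b, i in s {
		if b == c {
			return i
		}
	}
	return -1
}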
diff --git a/tests/benchmark/bytes/benchmark_bytes.odin b/tests/benchmark/bytes/benchmark_bytes.odin
index 13ef8f9a5..ee3a91d64 100644
--- a/tests/benchmark/bytes/benchmark_bytes.odin
+++ b/tests/benchmark/bytes/benchmark_bytes.odin
@@ -54,14 +54,15 @@ run_trial_size :: proc(p: proc "contextless" ([]u8, byte) -> int, size: int, idx
accumulator: int
+ watch: time.Stopwatch
+
+ time.stopwatch_start(&watch)
for _ in 0..<runs {
- start := time.now()
accumulator += p(data, 'z')
- done := time.since(start)
- timing += done
}
+ time.stopwatch_stop(&watch)
- timing /= time.Duration(runs)
+ timing = time.stopwatch_duration(watch)
log.debug(accumulator)
return
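This change moves the clock reads out of the hot loop: one stopwatch around all runs instead of a `time.now()`/`time.since()` pair per iteration. The general pattern, with `measure` as a hypothetical helper:

package example

import "core:time"

// Times `runs` calls of `p` with a single start/stop, keeping timer overhead
// out of the measurement; divide by `runs` afterwards for a per-call figure.
measure :: proc(runs: int, p: proc()) -> time.Duration {
	watch: time.Stopwatch
	time.stopwatch_start(&watch)
	for _ in 0..<runs {
		p()
	}
	time.stopwatch_stop(&watch)
	return time.stopwatch_duration(watch)
}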
diff --git a/tests/benchmark/runtime/benchmark_runtime.odin b/tests/benchmark/runtime/benchmark_runtime.odin
new file mode 100644
index 000000000..871fb05e6
--- /dev/null
+++ b/tests/benchmark/runtime/benchmark_runtime.odin
@@ -0,0 +1,227 @@
+package benchmark_runtime
+
+import "base:runtime"
+import "core:fmt"
+import "core:log"
+import "core:testing"
+import "core:strings"
+import "core:text/table"
+import "core:time"
+
+RUNS_PER_SIZE :: 2500
+
+sizes := [?]int {
+ 7, 8, 9,
+ 15, 16, 17,
+ 31, 32, 33,
+ 63, 64, 65,
+ 95, 96, 97,
+ 128,
+ 256,
+ 512,
+ 1024,
+ 4096,
+ 1024 * 1024,
+}
+
+// These are the normal, unoptimized algorithms.
+
+plain_memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
+ switch {
+ case n == 0: return true
+ case x == y: return true
+ }
+ a, b := ([^]byte)(x), ([^]byte)(y)
+ length := uint(n)
+
+ for i := uint(0); i < length; i += 1 {
+ if a[i] != b[i] {
+ return false
+ }
+ }
+ return true
+}
+
+plain_memory_compare :: proc "contextless" (a, b: rawptr, n: int) -> int #no_bounds_check {
+ switch {
+ case a == b: return 0
+ case a == nil: return -1
+ case b == nil: return +1
+ }
+
+ x := uintptr(a)
+ y := uintptr(b)
+ n := uintptr(n)
+
+ SU :: size_of(uintptr)
+ fast := n/SU + 1
+ offset := (fast-1)*SU
+ curr_block := uintptr(0)
+ if n < SU {
+ fast = 0
+ }
+
+ for /**/; curr_block < fast; curr_block += 1 {
+ va := (^uintptr)(x + curr_block * size_of(uintptr))^
+ vb := (^uintptr)(y + curr_block * size_of(uintptr))^
+ if va ~ vb != 0 {
+ for pos := curr_block*SU; pos < n; pos += 1 {
+ a := (^byte)(x+pos)^
+ b := (^byte)(y+pos)^
+ if a ~ b != 0 {
+ return -1 if (int(a) - int(b)) < 0 else +1
+ }
+ }
+ }
+ }
+
+ for /**/; offset < n; offset += 1 {
+ a := (^byte)(x+offset)^
+ b := (^byte)(y+offset)^
+ if a ~ b != 0 {
+ return -1 if (int(a) - int(b)) < 0 else +1
+ }
+ }
+
+ return 0
+}
+
+plain_memory_compare_zero :: proc "contextless" (a: rawptr, n: int) -> int #no_bounds_check {
+ x := uintptr(a)
+ n := uintptr(n)
+
+ SU :: size_of(uintptr)
+ fast := n/SU + 1
+ offset := (fast-1)*SU
+ curr_block := uintptr(0)
+ if n < SU {
+ fast = 0
+ }
+
+ for /**/; curr_block < fast; curr_block += 1 {
+ va := (^uintptr)(x + curr_block * size_of(uintptr))^
+ if va ~ 0 != 0 {
+ for pos := curr_block*SU; pos < n; pos += 1 {
+ a := (^byte)(x+pos)^
+ if a ~ 0 != 0 {
+ return -1 if int(a) < 0 else +1
+ }
+ }
+ }
+ }
+
+ for /**/; offset < n; offset += 1 {
+ a := (^byte)(x+offset)^
+ if a ~ 0 != 0 {
+ return -1 if int(a) < 0 else +1
+ }
+ }
+
+ return 0
+}
+
+run_trial_size_cmp :: proc(p: proc "contextless" (rawptr, rawptr, int) -> $R, size: int, idx: int, runs: int, loc := #caller_location) -> (timing: time.Duration) {
+ left := make([]u8, size)
+ right := make([]u8, size)
+ defer {
+ delete(left)
+ delete(right)
+ }
+
+ right[idx] = 0x01
+
+ accumulator: int
+
+ watch: time.Stopwatch
+
+ time.stopwatch_start(&watch)
+ for _ in 0..<runs {
+ result := p(&left[0], &right[0], size)
+ when R == bool {
+ assert(result == false, loc = loc)
+ accumulator += 1
+ } else when R == int {
+ assert(result == -1, loc = loc)
+ accumulator += result
+ }
+ }
+ time.stopwatch_stop(&watch)
+ timing = time.stopwatch_duration(watch)
+
+ log.debug(accumulator)
+ return
+}
+
+run_trial_size_zero :: proc(p: proc "contextless" (rawptr, int) -> int, size: int, idx: int, runs: int, loc := #caller_location) -> (timing: time.Duration) {
+ data := make([]u8, size)
+ defer delete(data)
+
+ data[idx] = 0x01
+
+ accumulator: int
+
+ watch: time.Stopwatch
+
+ time.stopwatch_start(&watch)
+ for _ in 0..<runs {
+ result := p(&data[0], size)
+ assert(result == 1, loc = loc)
+ accumulator += result
+ }
+ time.stopwatch_stop(&watch)
+ timing = time.stopwatch_duration(watch)
+
+ log.debug(accumulator)
+ return
+}
+
+run_trial_size :: proc {
+ run_trial_size_cmp,
+ run_trial_size_zero,
+}
+
+
+bench_table :: proc(algo_name: string, plain, simd: $P) {
+ string_buffer := strings.builder_make()
+ defer strings.builder_destroy(&string_buffer)
+
+ tbl: table.Table
+ table.init(&tbl)
+ defer table.destroy(&tbl)
+
+ table.aligned_header_of_values(&tbl, .Right, "Algorithm", "Size", "Iterations", "Scalar", "SIMD", "SIMD Relative (%)", "SIMD Relative (x)")
+
+ for size in sizes {
+ // Place the non-zero byte somewhere in the middle.
+ needle_index := size / 2
+
+ plain_timing := run_trial_size(plain, size, needle_index, RUNS_PER_SIZE)
+ simd_timing := run_trial_size(simd, size, needle_index, RUNS_PER_SIZE)
+
+ _plain := fmt.tprintf("%8M", plain_timing)
+ _simd := fmt.tprintf("%8M", simd_timing)
+ _relp := fmt.tprintf("%.3f %%", f64(simd_timing) / f64(plain_timing) * 100.0)
+ _relx := fmt.tprintf("%.3f x", 1 / (f64(simd_timing) / f64(plain_timing)))
+
+ table.aligned_row_of_values(
+ &tbl,
+ .Right,
+ algo_name,
+ size, RUNS_PER_SIZE, _plain, _simd, _relp, _relx)
+ }
+
+ builder_writer := strings.to_writer(&string_buffer)
+
+ fmt.sbprintln(&string_buffer)
+ table.write_plain_table(builder_writer, &tbl)
+
+ my_table_string := strings.to_string(string_buffer)
+ log.info(my_table_string)
+}
+
+@test
+benchmark_memory_procs :: proc(t: ^testing.T) {
+ bench_table("memory_equal", plain_memory_equal, runtime.memory_equal)
+ bench_table("memory_compare", plain_memory_compare, runtime.memory_compare)
+ bench_table("memory_compare_zero", plain_memory_compare_zero, runtime.memory_compare_zero)
+}
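Both relative columns in `bench_table` come from the same ratio: `SIMD Relative (%)` is simd_timing / plain_timing * 100, and `SIMD Relative (x)` is its reciprocal. For example, a 400µs scalar trial against a 100µs SIMD trial is reported as 25.000 % and 4.000 x.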
diff --git a/tests/core/runtime/test_core_runtime.odin b/tests/core/runtime/test_core_runtime.odin
index 472a5527d..6bbb9fb8a 100644
--- a/tests/core/runtime/test_core_runtime.odin
+++ b/tests/core/runtime/test_core_runtime.odin
@@ -4,6 +4,7 @@ package test_core_runtime
import "base:intrinsics"
import "core:mem"
import "base:runtime"
+import "core:slice"
import "core:testing"
// Tests that having space for the allocation, but not for the allocation and alignment
@@ -177,3 +178,78 @@ test_map_get :: proc(t: ^testing.T) {
check(t, m)
}
}
+
+@(test)
+test_memory_equal :: proc(t: ^testing.T) {
+ data: [256]u8
+ cmp: [256]u8
+
+ slice.fill(data[:], 0xAA)
+ slice.fill(cmp[:], 0xAA)
+
+ for offset in 0..<len(data) {
+ subdata := data[offset:]
+ subcmp := cmp[offset:]
+ for idx in 0..<len(subdata) {
+ if !testing.expect_value(t, runtime.memory_equal(&subdata[0], &subcmp[0], len(subdata)), true) {
+ return
+ }
+
+ subcmp[idx] = 0x55
+ if !testing.expect_value(t, runtime.memory_equal(&subdata[0], &subcmp[0], len(subdata)), false) {
+ return
+ }
+ subcmp[idx] = 0xAA
+ }
+ }
+}
+
+@(test)
+test_memory_compare :: proc(t: ^testing.T) {
+ data: [256]u8
+ cmp: [256]u8
+
+ for offset in 0..<len(data) {
+ subdata := data[offset:]
+ subcmp := cmp[offset:]
+ for idx in 0..<len(subdata) {
+ if !testing.expect_value(t, runtime.memory_compare(&subdata[0], &subcmp[0], len(subdata)), 0) {
+ return
+ }
+
+ subdata[idx] = 0x7F
+ subcmp[idx] = 0xFF
+ if !testing.expect_value(t, runtime.memory_compare(&subdata[0], &subcmp[0], len(subdata)), -1) {
+ return
+ }
+
+ subdata[idx] = 0xFF
+ subcmp[idx] = 0x7F
+ if !testing.expect_value(t, runtime.memory_compare(&subdata[0], &subcmp[0], len(subdata)), 1) {
+ return
+ }
+
+ subdata[idx] = 0
+ subcmp[idx] = 0
+ }
+ }
+}
+
+@(test)
+test_memory_compare_zero :: proc(t: ^testing.T) {
+ data: [256]u8
+
+ for offset in 0..<len(data) {
+ subdata := data[offset:]
+ for idx in 0..<len(subdata) {
+ if !testing.expect_value(t, runtime.memory_compare_zero(&subdata[0], len(subdata)), 0) {
+ return
+ }
+ subdata[idx] = 0xFF
+ if !testing.expect_value(t, runtime.memory_compare_zero(&subdata[0], len(subdata)), 1) {
+ return
+ }
+ subdata[idx] = 0
+ }
+ }
+}