Add static SIMD support to XXH3 in core:hash/xxhash.

This uses compile-time features to decide how large a SIMD vector to use. It currently checks amd64/i386 targets to size its vectors for SSE2/AVX2/AVX512 as appropriate. The generalized SIMD procedures could also be useful for multiversioning the hash procs, to allow run-time dispatch based on available CPU features.
author: Barinzaya <barinzaya@gmail.com> 2025-07-30 12:37:12 -0400
committer: Barinzaya <barinzaya@gmail.com> 2025-07-31 13:05:08 -0400
commit: 9d40f371bebaca6c74b615fa03dd5574eb51e327 (patch)
tree: ae72123bc6c73555866ffaa71ee39047f12e1149 /core/hash/xxhash/xxhash_3.odin
parent: 393e00bec3e855475659de0c6c38d3898a36cb36 (diff)
1 files changed, 84 insertions, 4 deletions
diff --git a/core/hash/xxhash/xxhash_3.odin b/core/hash/xxhash/xxhash_3.odin
index 293e98528..8e88d4a90 100644
--- a/core/hash/xxhash/xxhash_3.odin
+++ b/core/hash/xxhash/xxhash_3.odin
@@ -52,6 +52,7 @@ XXH3_SECRET_SIZE_MIN    :: 136
 #assert(len(XXH3_kSecret) == 192 && len(XXH3_kSecret) > XXH3_SECRET_SIZE_MIN)
 
 XXH_ACC_ALIGN           :: 8   /* scalar */
+XXH_MAX_WIDTH           :: #config(XXH_MAX_WIDTH, 512) / 64 /* build-time config value (default 512) divided by 64; presumably bits -> 64-bit lanes, TODO confirm */
 
 /*
 	This is the optimal update size for incremental hashing.
@@ -733,10 +734,6 @@ XXH3_accumulate_512_f       :: #type proc(acc: []xxh_u64, input:  []u8, secret:
 XXH3_scramble_accumulator_f :: #type proc(acc: []xxh_u64, secret: []u8)
 XXH3_init_custom_secret_f   :: #type proc(custom_secret: []u8, seed64: xxh_u64)
 
-XXH3_accumulate_512       : XXH3_accumulate_512_f       = XXH3_accumulate_512_scalar
-XXH3_scramble_accumulator : XXH3_scramble_accumulator_f = XXH3_scramble_accumulator_scalar
-XXH3_init_custom_secret   : XXH3_init_custom_secret_f   = XXH3_init_custom_secret_scalar
-
 /* scalar variants - universal */
 @(optimization_mode="favor_size")
 XXH3_accumulate_512_scalar :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8) {
@@ -785,6 +782,89 @@ XXH3_init_custom_secret_scalar :: #force_inline proc(custom_secret: []u8, seed64
 	}
 }
 
+/* generalized SIMD variants */
+@(optimization_mode="favor_size")
+XXH3_accumulate_512_simd_generic :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8, $W: uint) { // SIMD accumulate step; W is the compile-time lane count (only 2, 4 or 8 compile)
+	u32xW :: #simd[W]u32
+	u64xW :: #simd[W]u64
+
+	#no_bounds_check for i in uint(0)..<XXH_ACC_NB/W { // one W-lane vector per iteration: offsets advance 8*W bytes in input/secret and W elements in acc
+		data_val := XXH64_read64_simd(input[8 * W * i:], W) // presumably loads W u64 lanes; the helper is defined outside this view
+		sec      := XXH64_read64_simd(secret[8 * W * i:], W)
+		data_key := data_val ~ sec // lane-wise XOR of the input with the secret
+
+		// Swap adjacent lanes
+		when W == 2 {
+			data_val = swizzle(data_val, 1, 0)
+		} else when W == 4 {
+			data_val = swizzle(data_val, 1, 0, 3, 2)
+		} else when W == 8 {
+			data_val = swizzle(data_val, 1, 0, 3, 2, 5, 4, 7, 6)
+		} else {
+			#panic("Unsupported vector size!") // compile-time failure for any other W
+		}
+
+		a := XXH64_read64_simd(acc[W * i:], W)
+		a += data_val // adds the lane-swapped input
+		a += u64xW(u32xW(data_key)) * intrinsics.simd_shr(data_key, 32) // per lane: data_key narrowed to u32 (intended as its low 32 bits) times data_key >> 32
+		XXH64_write64_simd(acc[W * i:], a) // store the updated lanes back into acc
+	}
+}
+
+XXH3_scramble_accumulator_simd_generic :: #force_inline proc(acc: []xxh_u64, secret: []u8, $W: uint) { // scrambles acc in place, W 64-bit lanes per iteration; W is a compile-time constant
+	u64xW :: #simd[W]u64 // NOTE(review): this alias is not referenced anywhere in this body
+	#no_bounds_check for i in uint(0)..<XXH_ACC_NB/W { // offsets advance 8*W bytes in secret and W elements in acc
+		key64 := XXH64_read64_simd(secret[8 * W * i:], W) // presumably loads W u64 lanes; the helper is defined outside this view
+		acc64 := XXH64_read64_simd(acc[W * i:], W)
+		acc64 ~= intrinsics.simd_shr(acc64, 47) // xorshift: acc ^= acc >> 47
+		acc64 ~= key64 // mix in the secret lanes
+		acc64 *= XXH_PRIME32_1 // lane-wise multiply by the prime constant
+		XXH64_write64_simd(acc[W * i:], acc64) // store the scrambled lanes back into acc
+	}
+}
+
+@(optimization_mode="favor_size")
+XXH3_init_custom_secret_simd_generic :: #force_inline proc(custom_secret: []u8, seed64: xxh_u64, $W: uint) { // fills custom_secret from XXH3_kSecret offset by seed64, W 64-bit lanes at a time
+	u64xW :: #simd[W]u64
+
+	seedVec := u64xW(seed64) // start with seed64 in every lane
+	for i in 0..<W/2 { // visit the odd lanes 1, 3, ..., W-1
+		j := 2*i + 1
+		seedVec = intrinsics.simd_replace(seedVec, j, -intrinsics.simd_extract(seedVec, j)) // negate lane j, so even lanes add the seed and odd lanes subtract it
+	}
+
+	nbRounds := XXH_SECRET_DEFAULT_SIZE / 8 / W // number of W-lane (8*W byte) chunks processed
+	#no_bounds_check for i in uint(0)..<nbRounds {
+		block := XXH64_read64_simd(XXH3_kSecret[8 * W * i:], W) // presumably loads W u64 lanes; the helper is defined outside this view
+		block += seedVec
+		XXH64_write64_simd(custom_secret[8 * W * i:], block) // write the adjusted chunk at the same offset in custom_secret
+	}
+}
+
+XXH3_accumulate_512 :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8) { // compile-time dispatch between the generic SIMD and scalar accumulate variants
+	when XXH_NATIVE_WIDTH > 1 { // XXH_NATIVE_WIDTH is defined outside this view; it is passed on as the SIMD lane count W
+		XXH3_accumulate_512_simd_generic(acc, input, secret, XXH_NATIVE_WIDTH)
+	} else {
+		XXH3_accumulate_512_scalar(acc, input, secret) // scalar fallback when the width is 1 or less
+	}
+}
+
+XXH3_scramble_accumulator :: #force_inline proc(acc: []xxh_u64, secret: []u8) { // compile-time dispatch between the generic SIMD and scalar scramble variants
+	when XXH_NATIVE_WIDTH > 1 { // XXH_NATIVE_WIDTH is defined outside this view; it is passed on as the SIMD lane count W
+		XXH3_scramble_accumulator_simd_generic(acc, secret, XXH_NATIVE_WIDTH)
+	} else {
+		XXH3_scramble_accumulator_scalar(acc, secret) // scalar fallback when the width is 1 or less
+	}
+}
+
+XXH3_init_custom_secret :: #force_inline proc(custom_secret: []u8, seed64: xxh_u64) { // compile-time dispatch between the generic SIMD and scalar secret-initialization variants
+	when XXH_NATIVE_WIDTH > 1 { // XXH_NATIVE_WIDTH is defined outside this view; it is passed on as the SIMD lane count W
+		XXH3_init_custom_secret_simd_generic(custom_secret, seed64, XXH_NATIVE_WIDTH)
+	} else {
+		XXH3_init_custom_secret_scalar(custom_secret, seed64) // scalar fallback when the width is 1 or less
+	}
+}
+
 XXH_PREFETCH_DIST :: 320
 
 /*
author	Barinzaya <barinzaya@gmail.com>	2025-07-30 12:37:12 -0400
committer	Barinzaya <barinzaya@gmail.com>	2025-07-31 13:05:08 -0400
commit	9d40f371bebaca6c74b615fa03dd5574eb51e327 (patch)
tree	ae72123bc6c73555866ffaa71ee39047f12e1149 /core/hash/xxhash/xxhash_3.odin
parent	393e00bec3e855475659de0c6c38d3898a36cb36 (diff)