author	Jeroen van Rijn <Kelimion@users.noreply.github.com>	2021-09-11 15:27:51 +0200
committer	Jeroen van Rijn <Kelimion@users.noreply.github.com>	2021-09-11 15:28:49 +0200
commit	eaefbc43cb10282039cdb5818ffc9d1bde8d2f43 (patch)
tree	e78a0b4050b59aea8a2b60bf365eb5b4ebe50afa /core
parent	1f1d8ef884fa21d5ccb37f0281e1668544e1b87d (diff)
xxhash: Add XXH3_128 + test vectors.
Diffstat (limited to 'core')
-rw-r--r--	core/hash/xxhash/common.odin	24
-rw-r--r--	core/hash/xxhash/xxhash_3.odin	914
-rw-r--r--	core/hash/xxhash/xxhash_32.odin	3
3 files changed, 937 insertions(+), 4 deletions(-)
diff --git a/core/hash/xxhash/common.odin b/core/hash/xxhash/common.odin
index 93a4230c4..4fa0b9d55 100644
--- a/core/hash/xxhash/common.odin
+++ b/core/hash/xxhash/common.odin
@@ -11,7 +11,9 @@ package xxhash
import "core:intrinsics"
import "core:runtime"
-mem_copy :: runtime.mem_copy
+import "core:sys/llvm"
+mem_copy :: runtime.mem_copy
+byte_swap :: intrinsics.byte_swap
/*
Version definition
@@ -43,6 +45,22 @@ Error :: enum {
Error,
}
+XXH_DISABLE_PREFETCH :: #config(XXH_DISABLE_PREFETCH, false)
+
+when !XXH_DISABLE_PREFETCH {
+ prefetch_address :: #force_inline proc(address: rawptr) {
+ llvm.prefetch(address, .Read, .High, .Data)
+ }
+ prefetch_offset :: #force_inline proc(address: rawptr, auto_cast offset: uintptr) {
+ ptr := rawptr(uintptr(address) + offset)
+ prefetch_address(ptr)
+ }
+} else {
+ prefetch_address :: #force_inline proc(address: rawptr) {}
+ prefetch_offset :: #force_inline proc(address: rawptr, auto_cast offset: uintptr) {}
+}
+prefetch :: proc { prefetch_address, prefetch_offset, }
+
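+/*
+ Usage note: `XXH_DISABLE_PREFETCH` is read through `#config`, so, assuming Odin's usual
+ define mechanism, it can be toggled at build time, e.g.:
+
+ odin build . -define:XXH_DISABLE_PREFETCH=true
+*/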
@(optimization_mode="speed")
XXH_rotl32 :: #force_inline proc(x, r: u32) -> (res: u32) {
return ((x << r) | (x >> (32 - r)))
@@ -54,7 +72,7 @@ XXH_rotl64 :: #force_inline proc(x, r: u64) -> (res: u64) {
}
@(optimization_mode="speed")
-XXH32_read32 :: #force_inline proc(buf: []u8, alignment: Alignment) -> (res: u32) {
+XXH32_read32 :: #force_inline proc(buf: []u8, alignment := Alignment.Unaligned) -> (res: u32) {
if XXH_FORCE_MEMORY_ACCESS == 2 || alignment == .Aligned {
#no_bounds_check b := (^u32le)(&buf[0])^
return u32(b)
@@ -66,7 +84,7 @@ XXH32_read32 :: #force_inline proc(buf: []u8, alignment: Alignment) -> (res: u32
}
@(optimization_mode="speed")
-XXH64_read64 :: #force_inline proc(buf: []u8, alignment: Alignment) -> (res: u64) {
+XXH64_read64 :: #force_inline proc(buf: []u8, alignment := Alignment.Unaligned) -> (res: u64) {
if XXH_FORCE_MEMORY_ACCESS == 2 || alignment == .Aligned {
#no_bounds_check b := (^u64le)(&buf[0])^
return u64(b)
diff --git a/core/hash/xxhash/xxhash_3.odin b/core/hash/xxhash/xxhash_3.odin
new file mode 100644
index 000000000..327a4c847
--- /dev/null
+++ b/core/hash/xxhash/xxhash_3.odin
@@ -0,0 +1,914 @@
+/*
+ An implementation of Yann Collet's [xxhash Fast Hash Algorithm](https://cyan4973.github.io/xxHash/).
+ Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+
+ Made available under Odin's BSD-3 license, based on the original C code.
+
+ List of contributors:
+ Jeroen van Rijn: Initial implementation.
+*/
+package xxhash
+import "core:intrinsics"
+
+/* *********************************************************************
+* XXH3
+* New generation hash designed for speed on small keys and vectorization
+************************************************************************
+* One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
+* remaining a true 64-bit/128-bit hash function.
+*
+* This is done by prioritizing a subset of 64-bit operations that can be
+* emulated without too many steps on the average 32-bit machine.
+*
+* For example, these two lines seem similar, and run equally fast on 64-bit:
+*
+* xxh_u64 x;
+* x ^= (x >> 47); // good
+* x ^= (x >> 13); // bad
+*
+* However, to a 32-bit machine, there is a major difference.
+*
+* x ^= (x >> 47) looks like this:
+*
+* x.lo ^= (x.hi >> (47 - 32));
+*
+* while x ^= (x >> 13) looks like this:
+*
+* // note: funnel shifts are not usually cheap.
+* x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
+* x.hi ^= (x.hi >> 13);
+*
+* The first one is significantly faster than the second, simply because the
+* shift is larger than 32. This means:
+* - All the bits we need are in the upper 32 bits, so we can ignore the lower
+* 32 bits in the shift.
+* - The shift result will always fit in the lower 32 bits, and therefore,
+* we can ignore the upper 32 bits in the xor.
+*
+* Thanks to this optimization, XXH3 only requires these features to be efficient:
+*
+* - Usable unaligned access
+* - A 32-bit or 64-bit ALU
+* - If 32-bit, a decent ADC instruction
+* - A 32 or 64-bit multiply with a 64-bit result
+* - For the 128-bit variant, a decent byteswap helps short inputs.
+*
+* The first two are already required by XXH32, and almost all 32-bit and 64-bit
+* platforms which can run XXH32 can run XXH3 efficiently.
+*
+* Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
+* notable exception.
+*
+* First of all, Thumb-1 lacks support for the UMULL instruction which
+* performs the important long multiply. This means numerous __aeabi_lmul
+* calls.
+*
+* Second of all, the 8 functional registers are just not enough.
+* Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
+* Lo registers, and this shuffling results in thousands more MOVs than A32.
+*
+* A32 and T32 don't have this limitation. They can access all 14 registers,
+* do a 32->64 multiply with UMULL, and the flexible operand allowing free
+* shifts is helpful, too.
+*
+* Therefore, we do a quick sanity check.
+*
+* If compiling Thumb-1 for a target which supports ARM instructions, we will
+* emit a warning, as it is not a "sane" platform to compile for.
+*
+* Usually, if this happens, it is because of an accident and you probably need
+* to specify -march, as you likely meant to compile for a newer architecture.
+*
+* Credit: large sections of the vectorial and asm source code paths
+* have been contributed by @easyaspi314
+*/
+
+XXH_ACC_ALIGN :: 8 /* scalar */
+
+/* ==========================================
+ * XXH3 default settings
+ * ========================================== */
+
+XXH3_SECRET_SIZE_MIN :: 136
+XXH_SECRET_DEFAULT_SIZE :: max(XXH3_SECRET_SIZE_MIN, #config(XXH_SECRET_DEFAULT_SIZE, 192))
+
+XXH3_kSecret :: [?]u8{
+ 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
+ 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
+ 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
+ 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
+ 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
+ 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
+ 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
+ 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
+ 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
+ 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
+ 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
+ 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+}
+#assert(size_of(XXH3_kSecret) == 192)
+
+
+/************************************************************************
+* XXH3 128-bit variant
+************************************************************************/
+
+/*
+ Stored in little endian order, although the fields themselves are in native endianness.
+*/
+xxh_u128 :: u128
+XXH3_128_hash :: u128
+XXH3_128_DEFAULT_SEED :: xxh_u64(0)
+
+XXH128_hash_t :: struct #raw_union {
+ using raw: struct {
+ low: XXH64_hash, /*!< `value & 0xFFFFFFFFFFFFFFFF` */
+ high: XXH64_hash, /*!< `value >> 64` */
+ },
+ h: xxh_u128,
+}
+#assert(size_of(xxh_u128) == size_of(XXH128_hash_t))
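+
+/*
+ Illustrative example: on a little-endian target, the raw union views the u128 as its
+ two 64-bit halves:
+
+ t := XXH128_hash_t{ h = 0x0123456789abcdef_fedcba9876543210 }
+ assert(t.low == 0xfedcba9876543210)
+ assert(t.high == 0x0123456789abcdef)
+*/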
+
+@(optimization_mode="speed")
+XXH_mul_32_to_64 :: #force_inline proc(x, y: xxh_u32) -> (res: xxh_u64) {
+ return u64(x) * u64(y)
+}
+
+@(optimization_mode="speed")
+XXH_mul_64_to_128 :: #force_inline proc(lhs, rhs: xxh_u64) -> (res: xxh_u128) {
+ return xxh_u128(lhs) * xxh_u128(rhs)
+}
+
+/*
+ The reason for the separate procedure is to prevent passing too many structs
+ around by value. The multiply itself is force-inlined below.
+
+ @param lhs, rhs The 64-bit integers to multiply
+ @return The low 64 bits of the product XOR'd by the high 64 bits.
+*/
+@(optimization_mode="speed")
+XXH_mul_64_to_128_fold_64 :: #force_inline proc(lhs, rhs: xxh_u64) -> (res: xxh_u64) {
+ t := XXH128_hash_t{}
+ t.h = #force_inline XXH_mul_64_to_128(lhs, rhs)
+ return t.low ~ t.high
+}
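+/*
+ For example, XXH_mul_64_to_128_fold_64(2, 3) == 6: the full 128-bit product fits in
+ the low 64 bits, so the high half folded in by the XOR is zero.
+*/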
+
+@(optimization_mode="speed")
+XXH_xorshift_64 :: #force_inline proc(v: xxh_u64, auto_cast shift: uint) -> (res: xxh_u64) {
+ return v ~ (v >> shift)
+}
+
+/*
+ This is a fast avalanche stage, suitable when input bits are already partially mixed
+*/
+@(optimization_mode="speed")
+XXH3_avalanche :: #force_inline proc(h64: xxh_u64) -> (res: xxh_u64) {
+ res = XXH_xorshift_64(h64, 37)
+ res *= 0x165667919E3779F9
+ res = XXH_xorshift_64(res, 32)
+ return
+}
+
+/*
+ This is a stronger avalanche, inspired by Pelle Evensen's rrmxmx
+ preferable when input has not been previously mixed
+*/
+@(optimization_mode="speed")
+XXH3_rrmxmx :: #force_inline proc(h64, length: xxh_u64) -> (res: xxh_u64) {
+ /* this mix is inspired by Pelle Evensen's rrmxmx */
+ res = h64
+ res ~= XXH_rotl64(res, 49) ~ XXH_rotl64(res, 24)
+ res *= 0x9FB21C651E98DF25
+ res ~= (res >> 35) + length
+ res *= 0x9FB21C651E98DF25
+ return XXH_xorshift_64(res, 28)
+}
+
+/*
+ ==========================================
+ XXH3 128 bits (a.k.a XXH128)
+ ==========================================
+ XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
+ even without counting the significantly larger output size.
+
+ For example, extra steps are taken to avoid the seed-dependent collisions
+ in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
+
+ This strength naturally comes at the cost of some speed, especially on short
+ lengths. Note that longer inputs hash about as fast as the 64-bit variant,
+ since this path is only a slight modification of the 64-bit loop.
+
+ XXH128 is also more oriented towards 64-bit machines. It is still extremely
+ fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
+*/
+
+@(optimization_mode="speed")
+XXH3_len_1to3_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
+ /* A doubled version of 1to3_64b with different constants. */
+ length := len(input)
+ /*
+ * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
+ * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
+ * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
+ */
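+ /*
+ * Worked example: for input = []u8{0xAB}, so length = 1,
+ * combinedl = (0xAB << 16) | (0xAB << 24) | (0xAB << 0) | (1 << 8) = 0xABAB01AB.
+ */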
+ #no_bounds_check {
+ c1 := input[ 0]
+ c2 := input[length >> 1]
+ c3 := input[length - 1]
+ combinedl := (u32(c1) << 16) | (u32(c2) << 24) | (u32(c3) << 0) | (u32(length) << 8)
+ combinedh := XXH_rotl32(byte_swap(combinedl), 13)
+ bitflipl := u64(XXH32_read32(secret[0:]) ~ XXH32_read32(secret[4: ])) + seed
+ bitfliph := u64(XXH32_read32(secret[8:]) ~ XXH32_read32(secret[12:])) - seed
+ keyed_lo := u64(combinedl) ~ bitflipl
+ keyed_hi := u64(combinedh) ~ bitfliph
+
+ return xxh_u128(XXH64_avalanche(keyed_lo)) | xxh_u128(XXH64_avalanche(keyed_hi)) << 64
+ }
+}
+
+@(optimization_mode="speed")
+XXH3_len_4to8_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
+ length := len(input)
+ seed := seed
+
+ seed ~= u64(byte_swap(u32(seed))) << 32
+ #no_bounds_check {
+ input_lo := u64(XXH32_read32(input[0:]))
+ input_hi := u64(XXH32_read32(input[length - 4:]))
+ input_64 := u64(input_lo) + u64(input_hi) << 32
+ bitflip := (XXH64_read64(secret[16:]) ~ XXH64_read64(secret[24:])) + seed
+ keyed := input_64 ~ bitflip
+
+ /* Shift len to the left to ensure it is even; this avoids even multiplies. */
+ m128 := XXH128_hash_t{
+ h = XXH_mul_64_to_128(keyed, u64(XXH_PRIME64_1) + (u64(length) << 2)),
+ }
+ m128.high += (m128.low << 1)
+ m128.low ~= (m128.high >> 3)
+
+ m128.low = XXH_xorshift_64(m128.low, 35)
+ m128.low *= 0x9FB21C651E98DF25
+ m128.low = XXH_xorshift_64(m128.low, 28)
+ m128.high = XXH3_avalanche(m128.high)
+
+ return m128.h
+ }
+}
+
+@(optimization_mode="speed")
+XXH3_len_9to16_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
+ length := len(input)
+
+ #no_bounds_check {
+ bitflipl := (XXH64_read64(secret[32:]) ~ XXH64_read64(secret[40:])) - seed
+ bitfliph := (XXH64_read64(secret[48:]) ~ XXH64_read64(secret[56:])) + seed
+ input_lo := XXH64_read64(input[0:])
+ input_hi := XXH64_read64(input[length - 8:])
+ m128 := XXH128_hash_t{
+ h = XXH_mul_64_to_128(input_lo ~ input_hi ~ bitflipl, XXH_PRIME64_1),
+ }
+ /*
+ * Put len in the middle of m128 to ensure that the length gets mixed to
+ * both the low and high bits in the 128x64 multiply below.
+ */
+ m128.low += u64(length - 1) << 54
+ input_hi ~= bitfliph
+ /*
+ * Add the high 32 bits of input_hi to the high 32 bits of m128, then
+ * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
+ * the high 64 bits of m128.
+ *
+ * The best approach to this operation is different on 32-bit and 64-bit.
+ */
+ when size_of(rawptr) == 4 { /* 32-bit */
+ /*
+ * 32-bit optimized version, which is more readable.
+ *
+ * On 32-bit, it removes an ADC and delays a dependency between the two
+ * halves of m128.high64, but it generates an extra mask on 64-bit.
+ */
+ m128.high += (input_hi & 0xFFFFFFFF00000000) + XXH_mul_32_to_64(u32(input_hi), XXH_PRIME32_2)
+ } else {
+ /*
+ * 64-bit optimized (albeit more confusing) version.
+ *
+ * Uses some properties of addition and multiplication to remove the mask:
+ *
+ * Let:
+ * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
+ * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
+ * c = XXH_PRIME32_2
+ *
+ * a + (b * c)
+ * Inverse Property: x + y - x == y
+ * a + (b * (1 + c - 1))
+ * Distributive Property: x * (y + z) == (x * y) + (x * z)
+ * a + (b * 1) + (b * (c - 1))
+ * Identity Property: x * 1 == x
+ * a + b + (b * (c - 1))
+ *
+ * Substitute a, b, and c:
+ * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
+ *
+ * Since input_hi.hi + input_hi.lo == input_hi, we get this:
+ * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
+ */
+ m128.high += input_hi + XXH_mul_32_to_64(u32(input_hi), XXH_PRIME32_2 - 1)
+ }
+ /* m128 ^= XXH_swap64(m128 >> 64); */
+ m128.low ~= byte_swap(m128.high)
+ { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
+ h128 := XXH128_hash_t{
+ h = XXH_mul_64_to_128(m128.low, XXH_PRIME64_2),
+ }
+ h128.high += m128.high * XXH_PRIME64_2
+ h128.low = XXH3_avalanche(h128.low)
+ h128.high = XXH3_avalanche(h128.high)
+ return h128.h
+ }
+ }
+}
+
+/*
+ Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
+*/
+@(optimization_mode="speed")
+XXH3_len_0to16_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
+ length := len(input)
+
+ switch {
+ case length > 8: return XXH3_len_9to16_128b(input, secret, seed)
+ case length >= 4: return XXH3_len_4to8_128b (input, secret, seed)
+ case length > 0: return XXH3_len_1to3_128b (input, secret, seed)
+ case:
+ #no_bounds_check bitflipl := XXH64_read64(secret[64:]) ~ XXH64_read64(secret[72:])
+ #no_bounds_check bitfliph := XXH64_read64(secret[80:]) ~ XXH64_read64(secret[88:])
+ return xxh_u128(XXH64_avalanche(seed ~ bitflipl)) | xxh_u128(XXH64_avalanche(seed ~ bitfliph)) << 64
+ }
+}
+
+/*
+ A bit slower than XXH3_mix16B, but handles multiply by zero better.
+*/
+@(optimization_mode="speed")
+XXH128_mix32B :: #force_inline proc(acc: xxh_u128, input_1: []u8, input_2: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
+ acc128 := XXH128_hash_t{
+ h = acc,
+ }
+ #no_bounds_check {
+ acc128.low += XXH3_mix16B (input_1, secret[0:], seed)
+ acc128.low ~= XXH64_read64(input_2[0:]) + XXH64_read64(input_2[8:])
+ acc128.high += XXH3_mix16B (input_2, secret[16:], seed)
+ acc128.high ~= XXH64_read64(input_1) + XXH64_read64(input_1[8:])
+ return acc128.h
+ }
+}
+
+
+
+
+@(optimization_mode="speed")
+XXH3_len_17to128_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
+ length := len(input)
+
+ acc := XXH128_hash_t{}
+ acc.low = xxh_u64(length) * XXH_PRIME64_1
+
+ switch {
+ case length > 96:
+ #no_bounds_check acc.h = XXH128_mix32B(acc.h, input[48:], input[length - 64:], secret[96:], seed)
+ fallthrough
+ case length > 64:
+ #no_bounds_check acc.h = XXH128_mix32B(acc.h, input[32:], input[length - 48:], secret[64:], seed)
+ fallthrough
+ case length > 32:
+ #no_bounds_check acc.h = XXH128_mix32B(acc.h, input[16:], input[length - 32:], secret[32:], seed)
+ fallthrough
+ case:
+ #no_bounds_check acc.h = XXH128_mix32B(acc.h, input, input[length - 16:], secret, seed)
+
+ h128 := XXH128_hash_t{}
+ h128.low = acc.low + acc.high
+ h128.high = (acc.low * XXH_PRIME64_1) + (acc.high * XXH_PRIME64_4) + ((u64(length) - seed) * XXH_PRIME64_2)
+ h128.low = XXH3_avalanche(h128.low)
+ h128.high = u64(i64(0) - i64(XXH3_avalanche(h128.high)))
+ return h128.h
+ }
+ unreachable()
+}
+
+@(optimization_mode="speed")
+XXH3_len_129to240_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
+ length := len(input)
+
+ #no_bounds_check {
+ acc := XXH128_hash_t{}
+ acc.low = u64(length) * XXH_PRIME64_1
+
+ nbRounds := length / 32
+
+ i: int
+ #no_bounds_check for i = 0; i < 4; i += 1 {
+ acc.h = XXH128_mix32B(acc.h,
+ input[32 * i:],
+ input [32 * i + 16:],
+ secret[32 * i:],
+ seed)
+ }
+ acc.low = XXH3_avalanche(acc.low)
+ acc.high = XXH3_avalanche(acc.high)
+
+ #no_bounds_check for i = 4; i < nbRounds; i += 1 {
+ acc.h = XXH128_mix32B(acc.h,
+ input[32 * i:], input[32 * i + 16:],
+ secret[XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)):],
+ seed)
+ }
+ /* last bytes */
+ #no_bounds_check acc.h = XXH128_mix32B(acc.h,
+ input[length - 16:],
+ input[length - 32:],
+ secret[XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16:],
+ u64(i64(0) - i64(seed)))
+
+ #no_bounds_check {
+ h128 := XXH128_hash_t{}
+ h128.low = acc.low + acc.high
+ h128.high = u64(
+ u128(acc.low * XXH_PRIME64_1) \
+ + u128(acc.high * XXH_PRIME64_4) \
+ + u128((u64(length) - seed) * XXH_PRIME64_2))
+ h128.low = XXH3_avalanche(h128.low)
+ h128.high = u64(i64(0) - i64(XXH3_avalanche(h128.high)))
+ return h128.h
+ }
+ }
+ unreachable()
+}
+
+XXH3_INIT_ACC :: [XXH_ACC_NB]xxh_u64{
+ XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3,
+ XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1,
+}
+
+XXH_SECRET_MERGEACCS_START :: 11
+
+@(optimization_mode="speed")
+XXH3_hashLong_128b_internal :: #force_inline proc(
+ input: []u8,
+ secret: []u8,
+ f_acc512: XXH3_accumulate_512_f,
+ f_scramble: XXH3_scramble_accumulator_f) -> (res: XXH3_128_hash) {
+
+ acc := XXH3_INIT_ACC
+ #assert(size_of(acc) == 64)
+
+ XXH3_hashLong_internal_loop(acc[:], input, secret, f_acc512, f_scramble)
+
+ /* converge into final hash */
+ {
+ length := len(input)
+ secret_size := len(secret)
+
+ h128 := XXH128_hash_t{}
+ h128.low = XXH3_mergeAccs(acc[:], secret[XXH_SECRET_MERGEACCS_START:], u64(length) * XXH_PRIME64_1)
+ h128.high = XXH3_mergeAccs(acc[:], secret[secret_size - size_of(acc) - XXH_SECRET_MERGEACCS_START:],
+ ~(u64(length) * XXH_PRIME64_2))
+ return h128.h
+ }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH3_hashLong_128b_default :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
+ k_secret := XXH3_kSecret
+ return XXH3_hashLong_128b_internal(input, k_secret[:], XXH3_accumulate_512, XXH3_scramble_accumulator)
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH3_hashLong_128b_withSecret :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
+ return XXH3_hashLong_128b_internal(input, secret, XXH3_accumulate_512, XXH3_scramble_accumulator)
+}
+
+XXH3_hashLong_128b_withSeed_internal :: #force_inline proc(
+ input: []u8, seed: xxh_u64, secret: []u8,
+ f_acc512: XXH3_accumulate_512_f,
+ f_scramble: XXH3_scramble_accumulator_f,
+ f_initSec: XXH3_init_custom_secret_f) -> (res: XXH3_128_hash) {
+
+ if seed == 0 {
+ k := XXH3_kSecret
+ return XXH3_hashLong_128b_internal(input, k[:], f_acc512, f_scramble)
+ }
+
+ {
+ secret := [XXH_SECRET_DEFAULT_SIZE]u8{}
+ f_initSec(secret[:], seed)
+ return XXH3_hashLong_128b_internal(input, secret[:], f_acc512, f_scramble)
+ }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH3_hashLong_128b_withSeed :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
+ return XXH3_hashLong_128b_withSeed_internal(input, seed, secret, XXH3_accumulate_512, XXH3_scramble_accumulator , XXH3_init_custom_secret)
+}
+
+XXH3_hashLong128_f :: #type proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash)
+
+XXH3_128bits_internal :: #force_inline proc(
+ input: []u8, seed: xxh_u64, secret: []u8, f_hl128: XXH3_hashLong128_f) -> (res: XXH3_128_hash) {
+
+ assert(len(secret) >= XXH3_SECRET_SIZE_MIN)
+ /*
+ * If an action is to be taken if `secret` conditions are not respected,
+ * it should be done here.
+ * For now, it's a contract pre-condition.
+ * Adding a check and a branch here would cost performance at every hash.
+ */
+ length := len(input)
+
+ switch {
+ case length <= 16:
+ return XXH3_len_0to16_128b(input, secret, seed)
+ case length <= 128:
+ return XXH3_len_17to128_128b(input, secret, seed)
+ case length <= XXH3_MIDSIZE_MAX:
+ return XXH3_len_129to240_128b(input, secret, seed)
+ case:
+ return f_hl128(input, seed, secret)
+ }
+}
+
+/* === Public XXH128 API === */
+
+XXH3_128bits :: proc(input: []u8) -> (hash: XXH3_128_hash) {
+ k := XXH3_kSecret
+ return XXH3_128bits_internal(input, XXH3_128_DEFAULT_SEED, k[:], XXH3_hashLong_128b_default)
+}
+
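+/*
+ Usage sketch: the helper below is hypothetical (not part of the package API) and only
+ shows the public XXH3_128bits call and how XXH128_hash_t splits the 128-bit digest
+ into its 64-bit halves.
+*/
+example_xxh3_128_usage :: proc(s: string) -> (low, high: u64) {
+ digest := XXH3_128bits(transmute([]u8)s)
+ t := XXH128_hash_t{ h = digest }
+ return u64(t.low), u64(t.high)
+}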
+
+
+/*
+ ==========================================
+ Short keys
+ ==========================================
+ One of the shortcomings of XXH32 and XXH64 was that their performance was
+ sub-optimal on short lengths. They used an iterative algorithm which strongly
+ favored lengths that were a multiple of 4 or 8.
+
+ Instead of iterating over individual inputs, we use a set of single shot
+ functions which piece together a range of lengths and operate in constant time.
+ Additionally, the number of multiplies has been significantly reduced. This
+ reduces latency, especially when emulating 64-bit multiplies on 32-bit.
+
+ Depending on the platform, this may or may not be faster than XXH32, but it
+ is almost guaranteed to be faster than XXH64.
+*/
+
+/*
+ At very short lengths, there isn't enough input to fully hide secrets, or use the entire secret.
+
+ There is also only a limited amount of mixing we can do before significantly impacting performance.
+
+ Therefore, we use different sections of the secret and always mix two secret samples with an XOR.
+ This should have no effect on performance on the seedless or withSeed variants because everything
+ _should_ be constant folded by modern compilers.
+
+ The XOR mixing hides individual parts of the secret and increases entropy.
+ This adds an extra layer of strength for custom secrets.
+*/
+@(optimization_mode="speed")
+XXH3_len_1to3_64b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
+ length := u32(len(input))
+ assert(input != nil)
+ assert(1 <= length && length <= 3)
+ assert(secret != nil)
+ /*
+ len = 1: combined = { input[0], 0x01, input[0], input[0] }
+ len = 2: combined = { input[1], 0x02, input[0], input[1] }
+ len = 3: combined = { input[2], 0x03, input[0], input[1] }
+ */
+ #no_bounds_check {
+ c1 := u32(input[0 ])
+ c2 := u32(input[length >> 1])
+ c3 := u32(input[length - 1])
+
+ combined := c1 << 16 | c2 << 24 | c3 << 0 | length << 8
+ bitflip := (u64(XXH32_read32(secret)) ~ u64(XXH32_read32(secret[4:]))) + seed
+ keyed := u64(combined) ~ bitflip
+ return XXH64_avalanche(keyed)
+ }
+}
+
+@(optimization_mode="speed")
+XXH3_len_4to8_64b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
+ length := u32(len(input))
+ assert(input != nil)
+ assert(4 <= length && length <= 8)
+ assert(secret != nil)
+ seed := seed
+
+ seed ~= u64(byte_swap(u32(seed))) << 32
+ #no_bounds_check {
+ input1 := XXH32_read32(input)
+ input2 := XXH32_read32(input[length - 4:])
+ bitflip := (XXH64_read64(secret[8:]) ~ XXH64_read64(secret[16:])) - seed
+ input64 := u64(input2) + (u64(input1) << 32)
+ keyed := input64 ~ bitflip
+ return XXH3_rrmxmx(keyed, u64(length))
+ }
+}
+
+@(optimization_mode="speed")
+XXH3_len_9to16_64b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
+ length := u64(len(input))
+ assert(input != nil)
+ assert(9 <= length && length <= 16)
+ assert(secret != nil)
+ #no_bounds_check {
+ bitflip1 := (XXH64_read64(secret[24:]) ~ XXH64_read64(secret[32:])) + seed
+ bitflip2 := (XXH64_read64(secret[40:]) ~ XXH64_read64(secret[48:])) - seed
+ input_lo := XXH64_read64(input) ~ bitflip1
+ input_hi := XXH64_read64(input[length - 8:]) ~ bitflip2
+ acc := length + byte_swap(input_lo) + input_hi \
+ + XXH_mul_64_to_128_fold_64(input_lo, input_hi)
+ return XXH3_avalanche(acc)
+ }
+}
+
+@(optimization_mode="speed")
+XXH3_len_0to16_64b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
+ length := u64(len(input))
+ assert(input != nil)
+ assert(length <= 16)
+ #no_bounds_check {
+ switch {
+ case length > 8: return #force_inline XXH3_len_9to16_64b(input, secret, seed)
+ case length >= 4: return #force_inline XXH3_len_4to8_64b (input, secret, seed)
+ case length > 0: return #force_inline XXH3_len_1to3_64b (input, secret, seed)
+ case:
+ return #force_inline XXH64_avalanche(seed ~ (XXH64_read64(secret[56:]) ~ XXH64_read64(secret[64:])))
+ }
+ }
+}
+
+/*
+ DISCLAIMER: There are known *seed-dependent* multicollisions here due to
+ multiplication by zero, affecting hashes of lengths 17 to 240.
+
+ However, they are very unlikely.
+
+ Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
+ unseeded non-cryptographic hashes, it does not attempt to defend itself
+ against specially crafted inputs, only random inputs.
+
+ Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
+ cancelling out the secret is taken an arbitrary number of times (addressed
+ in XXH3_accumulate_512), this collision is very unlikely with random inputs
+ and/or proper seeding:
+
+ This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
+ function that is only called up to 16 times per hash with up to 240 bytes of
+ input.
+
+ This is not too bad for a non-cryptographic hash function, especially with
+ only 64 bit outputs.
+
+ The 128-bit variant (which trades some speed for strength) is NOT affected
+ by this, although it is always a good idea to use a proper seed if you care
+ about strength.
+*/
+@(optimization_mode="speed")
+XXH3_mix16B :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
+ input_lo := XXH64_read64(input[0:])
+ input_hi := XXH64_read64(input[8:])
+
+ input_lo ~= (XXH64_read64(secret[0:]) + seed)
+ input_hi ~= (XXH64_read64(secret[8:]) - seed)
+ return XXH_mul_64_to_128_fold_64(input_lo, input_hi)
+}
+
+/* For mid-range keys, XXH3 uses a Mum-hash variant. */
+@(optimization_mode="speed")
+XXH3_len_17to128_64b :: proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
+ assert(len(secret) >= XXH3_SECRET_SIZE_MIN)
+ length := len(input)
+ assert(16 < length && length <= 128)
+
+ #no_bounds_check {
+ acc := u64(length) * XXH_PRIME64_1
+ switch {
+ case length > 96:
+ acc += XXH3_mix16B(input[48: ], secret[96: ], seed)
+ acc += XXH3_mix16B(input[length - 64:], secret[112:], seed)
+ fallthrough
+ case length > 64:
+ acc += XXH3_mix16B(input[32: ], secret[64: ], seed)
+ acc += XXH3_mix16B(input[length - 48:], secret[80: ], seed)
+ fallthrough
+ case length > 32:
+ acc += XXH3_mix16B(input[16: ], secret[32: ], seed)
+ acc += XXH3_mix16B(input[length - 32:], secret[48: ], seed)
+ fallthrough
+ case:
+ acc += XXH3_mix16B(input[0: ], secret[0: ], seed)
+ acc += XXH3_mix16B(input[length - 16:], secret[16: ], seed)
+ }
+ return XXH3_avalanche(acc)
+ }
+}
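+/*
+ Worked example for the ladder above: a 100-byte input enters at `length > 96` and
+ falls through every case, mixing eight 16-byte blocks:
+ input[48:64], input[36:52], input[32:48], input[52:68],
+ input[16:32], input[68:84], input[0:16] and input[84:100].
+*/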
+
+XXH3_MIDSIZE_MAX :: 240
+XXH3_MIDSIZE_STARTOFFSET :: 3
+XXH3_MIDSIZE_LASTOFFSET :: 17
+
+@(optimization_mode="speed")
+XXH3_len_129to240_64b :: proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
+ assert(len(secret) >= XXH3_SECRET_SIZE_MIN)
+ length := len(input)
+ assert(128 < length && length <= XXH3_MIDSIZE_MAX)
+
+ #no_bounds_check {
+ acc := u64(length) * XXH_PRIME64_1
+ nbRounds := length / 16
+
+ i: int
+ for i = 0; i < 8; i += 1 {
+ acc += XXH3_mix16B(input[16 * i:], secret[16 * i:], seed)
+ }
+
+ acc = XXH3_avalanche(acc)
+ assert(nbRounds >= 8)
+
+ for i = 8; i < nbRounds; i += 1 {
+ acc += XXH3_mix16B(input[16 * i:], secret[(16 * (i - 8)) + XXH3_MIDSIZE_STARTOFFSET:], seed)
+ }
+ /* last bytes */
+ acc += XXH3_mix16B(input[length - 16:], secret[XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET:], seed)
+ return XXH3_avalanche(acc)
+ }
+}
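+/*
+ For example, a 240-byte input gives nbRounds = 15: the first eight 16-byte blocks are
+ mixed against the start of the secret, the next seven against
+ secret[XXH3_MIDSIZE_STARTOFFSET:], and the final 16 bytes are mixed last.
+*/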
+
+/* ======= Long Keys ======= */
+
+XXH_STRIPE_LEN :: 64
+XXH_SECRET_CONSUME_RATE :: 8 /* nb of secret bytes consumed at each accumulation */
+XXH_ACC_NB :: (XXH_STRIPE_LEN / size_of(xxh_u64))
+
+@(optimization_mode="speed")
+XXH_writeLE64 :: #force_inline proc(dst: []u8, v64: u64le) {
+ v := v64
+ mem_copy(raw_data(dst), &v, size_of(v64))
+}
+
+/*
+ * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
+ *
+ * It is a hardened version of UMAC, based on FARSH's implementation.
+ *
+ * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
+ * implementations, and it is ridiculously fast.
+ *
+ * We harden it by mixing the original input to the accumulators as well as the product.
+ *
+ * This means that in the (relatively likely) case of a multiply by zero, the
+ * original input is preserved.
+ *
+ * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
+ * cross-pollination, as otherwise the upper and lower halves would be
+ * essentially independent.
+ *
+ * This doesn't matter on 64-bit hashes since they all get merged together in
+ * the end, so we skip the extra step.
+ *
+ * Both XXH3_64bits and XXH3_128bits use this subroutine.
+ */
+
+XXH3_accumulate_512_f :: #type proc(acc: []xxh_u64, input: []u8, secret: []u8)
+XXH3_scramble_accumulator_f :: #type proc(acc: []xxh_u64, secret: []u8)
+XXH3_init_custom_secret_f :: #type proc(custom_secret: []u8, seed64: xxh_u64)
+
+XXH3_accumulate_512 : XXH3_accumulate_512_f = XXH3_accumulate_512_scalar
+XXH3_scramble_accumulator : XXH3_scramble_accumulator_f = XXH3_scramble_accumulator_scalar
+XXH3_init_custom_secret : XXH3_init_custom_secret_f = XXH3_init_custom_secret_scalar
+
+/* scalar variants - universal */
+@(optimization_mode="speed")
+XXH3_accumulate_512_scalar :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8) {
+ xacc := acc /* presumed aligned */
+ xinput := input /* no alignment restriction */
+ xsecret := secret /* no alignment restriction */
+
+ assert(uintptr(raw_data(acc)) & uintptr(XXH_ACC_ALIGN - 1) == 0)
+
+ #no_bounds_check for i := uint(0); i < XXH_ACC_NB; i += 1 {
+ data_val := XXH64_read64(xinput[8 * i:])
+ data_key := data_val ~ XXH64_read64(xsecret[8 * i:])
+ xacc[i ~ 1] += data_val /* swap adjacent lanes */
+ xacc[i ] += XXH_mul_32_to_64(u32(data_key & 0xFFFFFFFF), u32(data_key >> 32))
+ }
+}
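+/*
+ Note: `i ~ 1` above pairs adjacent lanes (0 <-> 1, 2 <-> 3, ...), which implements the
+ "swap adjacent lanes" cross-pollination described earlier.
+*/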
+
+@(optimization_mode="speed")
+XXH3_scramble_accumulator_scalar :: #force_inline proc(acc: []xxh_u64, secret: []u8) {
+ xacc := acc /* presumed aligned */
+ xsecret := secret /* no alignment restriction */
+
+ assert(uintptr(raw_data(acc)) & uintptr(XXH_ACC_ALIGN - 1) == 0)
+
+ #no_bounds_check for i := uint(0); i < XXH_ACC_NB; i += 1 {
+ key64 := XXH64_read64(xsecret[8 * i:])
+ acc64 := xacc[i]
+ acc64 = XXH_xorshift_64(acc64, 47)
+ acc64 ~= key64
+ acc64 *= u64(XXH_PRIME32_1)
+ xacc[i] = acc64
+ }
+}
+
+@(optimization_mode="speed")
+XXH3_init_custom_secret_scalar :: #force_inline proc(custom_secret: []u8, seed64: xxh_u64) {
+ #assert((XXH_SECRET_DEFAULT_SIZE & 15) == 0)
+
+ kSecretPtr := XXH3_kSecret
+
+ nbRounds := XXH_SECRET_DEFAULT_SIZE / 16
+ #no_bounds_check for i := 0; i < nbRounds; i += 1 {
+ lo := XXH64_read64(kSecretPtr[16 * i: ]) + seed64
+ hi := XXH64_read64(kSecretPtr[16 * i + 8:]) - seed64
+ XXH_writeLE64(custom_secret[16 * i: ], u64le(lo))
+ XXH_writeLE64(custom_secret[16 * i + 8:], u64le(hi))
+ }
+}
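+/*
+ Note: with the default XXH_SECRET_DEFAULT_SIZE of 192, this runs 12 rounds, deriving
+ each 16-byte block of the custom secret as (kSecret_lo + seed, kSecret_hi - seed).
+*/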
+
+XXH_PREFETCH_DIST :: 320
+
+/*
+ * XXH3_accumulate()
+ * Loops over XXH3_accumulate_512().
+ * Assumption: nbStripes will not overflow the secret size
+ */
+@(optimization_mode="speed")
+XXH3_accumulate :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8, nbStripes: uint,
+ f_acc512: XXH3_accumulate_512_f) {
+
+ for n := uint(0); n < nbStripes; n += 1 {
+ when !XXH_DISABLE_PREFETCH {
+ in_ptr := &input[n * XXH_STRIPE_LEN]
+ prefetch(in_ptr, XXH_PREFETCH_DIST)
+ }
+ f_acc512(acc, input[n * XXH_STRIPE_LEN:], secret[n * XXH_SECRET_CONSUME_RATE:])
+ }
+}
+
+@(optimization_mode="speed")
+XXH3_hashLong_internal_loop :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8,
+ f_acc512: XXH3_accumulate_512_f, f_scramble: XXH3_scramble_accumulator_f) {
+
+ length := uint(len(input))
+ secret_size := uint(len(secret))
+ stripes_per_block := (secret_size - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE
+
+ block_len := XXH_STRIPE_LEN * stripes_per_block
+ blocks := (length - 1) / block_len
+
+ #no_bounds_check for n := uint(0); n < blocks; n += 1 {
+ XXH3_accumulate(acc, input[n * block_len:], secret, stripes_per_block, f_acc512)
+ f_scramble(acc, secret[secret_size - XXH_STRIPE_LEN:])
+ }
+
+ /* last partial block */
+ #no_bounds_check {
+ stripes := ((length - 1) - (block_len * blocks)) / XXH_STRIPE_LEN
+
+ XXH3_accumulate(acc, input[blocks * block_len:], secret, stripes, f_acc512)
+
+ /* last stripe */
+ #no_bounds_check {
+ p := input[length - XXH_STRIPE_LEN:]
+ XXH_SECRET_LASTACC_START :: 7 /* not aligned on 8, last secret is different from acc & scrambler */
+ f_acc512(acc, p, secret[secret_size - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START:])
+ }
+ }
+}
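+/*
+ Worked example: with the default 192-byte secret,
+ stripes_per_block = (192 - 64) / 8 = 16 and block_len = 64 * 16 = 1024, so the
+ accumulators are scrambled once per 1024 bytes of input.
+*/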
+
+@(optimization_mode="speed")
+XXH3_mix2Accs :: #force_inline proc(acc: []xxh_u64, secret: []u8) -> (res: xxh_u64) {
+ return XXH_mul_64_to_128_fold_64(
+ acc[0] ~ XXH64_read64(secret),
+ acc[1] ~ XXH64_read64(secret[8:]))
+}
+
+@(optimization_mode="speed")
+XXH3_mergeAccs :: #force_inline proc(acc: []xxh_u64, secret: []u8, start: xxh_u64) -> (res: xxh_u64) {
+ result64 := start
+ #no_bounds_check for i := 0; i < 4; i += 1 {
+ result64 += XXH3_mix2Accs(acc[2 * i:], secret[16 * i:])
+ }
+ return XXH3_avalanche(result64)
+}
\ No newline at end of file
diff --git a/core/hash/xxhash/xxhash_32.odin b/core/hash/xxhash/xxhash_32.odin
index fe1a686ce..f41161133 100644
--- a/core/hash/xxhash/xxhash_32.odin
+++ b/core/hash/xxhash/xxhash_32.odin
@@ -15,6 +15,7 @@ import "core:intrinsics"
32-bit hash functions
*/
XXH32_hash :: u32
+xxh_u32 :: u32
XXH32_DEFAULT_SEED :: XXH32_hash(0)
XXH32_state :: struct {
@@ -153,7 +154,7 @@ XXH32_endian_align :: #force_inline proc(input: []u8, seed := XXH32_DEFAULT_SEED
v3 := seed + 0
v4 := seed - XXH_PRIME32_1
- for len(buf) >= 15 {
+ for len(buf) >= 16 {
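+ /*
+ Each iteration consumes 16 bytes (four 4-byte lanes), so the guard must require at
+ least 16 remaining bytes; the previous `>= 15` bound could read past the end of buf.
+ */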
#no_bounds_check v1 = XXH32_round(v1, XXH32_read32(buf, alignment)); buf = buf[4:]
#no_bounds_check v2 = XXH32_round(v2, XXH32_read32(buf, alignment)); buf = buf[4:]
#no_bounds_check v3 = XXH32_round(v3, XXH32_read32(buf, alignment)); buf = buf[4:]