author	Jeroen van Rijn <Kelimion@users.noreply.github.com>	2021-09-11 15:27:51 +0200
committer	Jeroen van Rijn <Kelimion@users.noreply.github.com>	2021-09-11 15:28:49 +0200
commit	eaefbc43cb10282039cdb5818ffc9d1bde8d2f43 (patch)
tree	e78a0b4050b59aea8a2b60bf365eb5b4ebe50afa /core
parent	1f1d8ef884fa21d5ccb37f0281e1668544e1b87d (diff)
xxhash: Add XXH3_128 + test vectors.
Diffstat (limited to 'core')
-rw-r--r--	core/hash/xxhash/common.odin	24
-rw-r--r--	core/hash/xxhash/xxhash_3.odin	914
-rw-r--r--	core/hash/xxhash/xxhash_32.odin	3
3 files changed, 937 insertions(+), 4 deletions(-)
diff --git a/core/hash/xxhash/common.odin b/core/hash/xxhash/common.odin
index 93a4230c4..4fa0b9d55 100644
--- a/core/hash/xxhash/common.odin
+++ b/core/hash/xxhash/common.odin
@@ -11,7 +11,9 @@ package xxhash
import "core:intrinsics"
import "core:runtime"
-mem_copy :: runtime.mem_copy
+import "core:sys/llvm"
+mem_copy :: runtime.mem_copy
+byte_swap :: intrinsics.byte_swap
/*
Version definition
@@ -43,6 +45,22 @@ Error :: enum {
Error,
}
+XXH_DISABLE_PREFETCH :: #config(XXH_DISABLE_PREFETCH, false)
+
+when !XXH_DISABLE_PREFETCH {
+ prefetch_address :: #force_inline proc(address: rawptr) {
+ llvm.prefetch(address, .Read, .High, .Data)
+ }
+ prefetch_offset :: #force_inline proc(address: rawptr, auto_cast offset: uintptr) {
+ ptr := rawptr(uintptr(address) + offset)
+ prefetch_address(ptr)
+ }
+} else {
+ prefetch_address :: #force_inline proc(address: rawptr) {}
+ prefetch_offset :: #force_inline proc(address: rawptr, auto_cast offset: uintptr) {}
+}
+prefetch :: proc { prefetch_address, prefetch_offset, }
+
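+/*
+ Usage note: `XXH_DISABLE_PREFETCH` is read through `#config`, so, assuming Odin's usual
+ define mechanism, it can be toggled at build time, e.g.:
+
+ odin build . -define:XXH_DISABLE_PREFETCH=true
+*/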
@(optimization_mode="speed")
XXH_rotl32 :: #force_inline proc(x, r: u32) -> (res: u32) {
return ((x << r) | (x >> (32 - r)))
@@ -54,7 +72,7 @@ XXH_rotl64 :: #force_inline proc(x, r: u64) -> (res: u64) {
}
@(optimization_mode="speed")
-XXH32_read32 :: #force_inline proc(buf: []u8, alignment: Alignment) -> (res: u32) {
+XXH32_read32 :: #force_inline proc(buf: []u8, alignment := Alignment.Unaligned) -> (res: u32) {
if XXH_FORCE_MEMORY_ACCESS == 2 || alignment == .Aligned {
#no_bounds_check b := (^u32le)(&buf[0])^
return u32(b)
@@ -66,7 +84,7 @@ XXH32_read32 :: #force_inline proc(buf: []u8, alignment: Alignment) -> (res: u32
}
@(optimization_mode="speed")
-XXH64_read64 :: #force_inline proc(buf: []u8, alignment: Alignment) -> (res: u64) {
+XXH64_read64 :: #force_inline proc(buf: []u8, alignment := Alignment.Unaligned) -> (res: u64) {
if XXH_FORCE_MEMORY_ACCESS == 2 || alignment == .Aligned {
#no_bounds_check b := (^u64le)(&buf[0])^
return u64(b)
diff --git a/core/hash/xxhash/xxhash_3.odin b/core/hash/xxhash/xxhash_3.odin
new file mode 100644
index 000000000..327a4c847
--- /dev/null
+++ b/core/hash/xxhash/xxhash_3.odin
@@ -0,0 +1,914 @@
+/*
+ An implementation of Yann Collet's [xxhash Fast Hash Algorithm](https://cyan4973.github.io/xxHash/).
+ Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+
+ Made available under Odin's BSD-3 license, based on the original C code.
+
+ List of contributors:
+ Jeroen van Rijn: Initial implementation.
+*/
+package xxhash
+import "core:intrinsics"
+
+/* *********************************************************************
+* XXH3
+* New generation hash designed for speed on small keys and vectorization
+************************************************************************
+* One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
+* remaining a true 64-bit/128-bit hash function.
+*
+* This is done by prioritizing a subset of 64-bit operations that can be
+* emulated without too many steps on the average 32-bit machine.
+*
+* For example, these two lines seem similar, and run equally fast on 64-bit:
+*
+* xxh_u64 x;
+* x ^= (x >> 47); // good
+* x ^= (x >> 13); // bad
+*
+* However, to a 32-bit machine, there is a major difference.
+*
+* x ^= (x >> 47) looks like this:
+*
+* x.lo ^= (x.hi >> (47 - 32));
+*
+* while x ^= (x >> 13) looks like this:
+*
+* // note: funnel shifts are not usually cheap.
+* x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
+* x.hi ^= (x.hi >> 13);
+*
+* The first one is significantly faster than the second, simply because the
+* shift is larger than 32. This means:
+* - All the bits we need are in the upper 32 bits, so we can ignore the lower
+* 32 bits in the shift.
+* - The shift result will always fit in the lower 32 bits, and therefore,
+* we can ignore the upper 32 bits in the xor.
+*
+* Thanks to this optimization, XXH3 only requires these features to be efficient:
+*
+* - Usable unaligned access
+* - A 32-bit or 64-bit ALU
+* - If 32-bit, a decent ADC instruction
+* - A 32 or 64-bit multiply with a 64-bit result
+* - For the 128-bit variant, a decent byteswap helps short inputs.
+*
+* The first two are already required by XXH32, and almost all 32-bit and 64-bit
+* platforms which can run XXH32 can run XXH3 efficiently.
+*
+* Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
+* notable exception.
+*
+* First of all, Thumb-1 lacks support for the UMULL instruction which
+* performs the important long multiply. This means numerous __aeabi_lmul
+* calls.
+*
+* Second of all, the 8 functional registers are just not enough.
+* Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
+* Lo registers, and this shuffling results in thousands more MOVs than A32.
+*
+* A32 and T32 don't have this limitation. They can access all 14 registers,
+* do a 32->64 multiply with UMULL, and the flexible operand allowing free
+* shifts is helpful, too.
+*
+* Therefore, we do a quick sanity check.
+*
+* If compiling Thumb-1 for a target which supports ARM instructions, we will
+* emit a warning, as it is not a "sane" platform to compile for.
+*
+* Usually, if this happens, it is because of an accident and you probably need
+* to specify -march, as you likely meant to compile for a newer architecture.
+*
+* Credit: large sections of the vectorial and asm source code paths
+* have been contributed by @easyaspi314
+*/
+
+XXH_ACC_ALIGN :: 8 /* scalar */
+
+/* ==========================================
+ * XXH3 default settings
+ * ========================================== */
+
+XXH3_SECRET_SIZE_MIN :: 136
+XXH_SECRET_DEFAULT_SIZE :: max(XXH3_SECRET_SIZE_MIN, #config(XXH_SECRET_DEFAULT_SIZE, 192))
+
+XXH3_kSecret :: [?]u8{
+ 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
+ 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
+ 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
+ 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
+ 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
+ 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
+ 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
+ 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
+ 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
+ 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
+ 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
+ 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+}
+#assert(size_of(XXH3_kSecret) == 192)
+
+
+/************************************************************************
+* XXH3 128-bit variant
+************************************************************************/
+
+/*
+ Stored in little endian order, although the fields themselves are in native endianness.
+*/
+xxh_u128 :: u128
+XXH3_128_hash :: u128
+XXH3_128_DEFAULT_SEED :: xxh_u64(0)
+
+XXH128_hash_t :: struct #raw_union {
+ using raw: struct {
+ low: XXH64_hash, /*!< `value & 0xFFFFFFFFFFFFFFFF` */
+ high: XXH64_hash, /*!< `value >> 64` */
+ },
+ h: xxh_u128,
+}
+#assert(size_of(xxh_u128) == size_of(XXH128_hash_t))
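+
+/*
+ Illustrative example: on a little-endian target, the raw union views the u128 as its
+ two 64-bit halves:
+
+ t := XXH128_hash_t{ h = 0x0123456789abcdef_fedcba9876543210 }
+ assert(t.low == 0xfedcba9876543210)
+ assert(t.high == 0x0123456789abcdef)
+*/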
+
+@(optimization_mode="speed")
+XXH_mul_32_to_64 :: #force_inline proc(x, y: xxh_u32) -> (res: xxh_u64) {
+ return u64(x) * u64(y)
+}
+
+@(optimization_mode="speed")
+XXH_mul_64_to_128 :: #force_inline proc(lhs, rhs: xxh_u64) -> (res: xxh_u128) {
+ return xxh_u128(lhs) * xxh_u128(rhs)
+}
+
+/*
+ The reason for the separate procedure is to prevent passing too many structs
+ around by value. The multiply itself is force-inlined below.
+
+ @param lhs, rhs The 64-bit integers to multiply
+ @return The low 64 bits of the product XOR'd by the high 64 bits.
+*/
+@(optimization_mode="speed")
+XXH_mul_64_to_128_fold_64 :: #force_inline proc(lhs, rhs: xxh_u64) -> (res: xxh_u64) {
+ t := XXH128_hash_t{}
+ t.h = #force_inline XXH_mul_64_to_128(lhs, rhs)
+ return t.low ~ t.high
+}
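+/*
+ For example, XXH_mul_64_to_128_fold_64(2, 3) == 6: the full 128-bit product fits in
+ the low 64 bits, so the high half folded in by the XOR is zero.
+*/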
+
+@(optimization_mode="speed")
+XXH_xorshift_64 :: #force_inline proc(v: xxh_u64, auto_cast shift: uint) -> (res: xxh_u64) {
+ return v ~ (v >> shift)
+}
+
+/*
+ This is a fast avalanche stage, suitable when input bits are already partially mixed
+*/
+@(optimization_mode="speed")
+XXH3_avalanche :: #force_inline proc(h64: xxh_u64) -> (res: xxh_u64) {
+ res = XXH_xorshift_64(h64, 37)
+ res *= 0x165667919E3779F9
+ res = XXH_xorshift_64(res, 32)
+ return
+}
+
+/*
+ This is a stronger avalanche, inspired by Pelle Evensen's rrmxmx
+ preferable when input has not been previously mixed
+*/
+@(optimization_mode="speed")
+XXH3_rrmxmx :: #force_inline proc(h64, length: xxh_u64) -> (res: xxh_u64) {
+ /* this mix is inspired by Pelle Evensen's rrmxmx */
+ res = h64
+ res ~= XXH_rotl64(res, 49) ~ XXH_rotl64(res, 24)
+ res *= 0x9FB21C651E98DF25
+ res ~= (res >> 35) + length
+ res *= 0x9FB21C651E98DF25
+ return XXH_xorshift_64(res, 28)
+}
+
+/*
+ ==========================================
+ XXH3 128 bits (a.k.a XXH128)
+ ==========================================
+ XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
+ even without counting the significantly larger output size.
+
+ For example, extra steps are taken to avoid the seed-dependent collisions
+ in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
+
+ This strength naturally comes at the cost of some speed, especially on short
+ lengths. Note that longer inputs hash about as fast as the 64-bit variant,
+ since this path is only a slight modification of the 64-bit loop.
+
+ XXH128 is also more oriented towards 64-bit machines. It is still extremely
+ fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
+*/
+
+@(optimization_mode="speed")
+XXH3_len_1to3_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
+ /* A doubled version of 1to3_64b with different constants. */
+ length := len(input)
+ /*
+ * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
+ * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
+ * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
+ */
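+ /*
+ * Worked example: for input = []u8{0xAB}, so length = 1,
+ * combinedl = (0xAB << 16) | (0xAB << 24) | (0xAB << 0) | (1 << 8) = 0xABAB01AB.
+ */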
+ #no_bounds_check {
+ c1 := input[ 0]
+ c2 := input[length >> 1]
+ c3 := input[length - 1]
+ combinedl := (u32(c1) << 16) | (u32(c2) << 24) | (u32(c3) << 0) | (u32(length) << 8)
+ combinedh := XXH_rotl32(byte_swap(combinedl), 13)
+ bitflipl := u64(XXH32_read32(secret[0:]) ~ XXH32_read32(secret[4: ])) + seed
+ bitfliph := u64(XXH32_read32(secret[8:]) ~ XXH32_read32(secret[12:])) - seed
+ keyed_lo := u64(combinedl) ~ bitflipl
+ keyed_hi := u64(combinedh) ~ bitfliph
+
+ return xxh_u128(XXH64_avalanche(keyed_lo)) | xxh_u128(XXH64_avalanche(keyed_hi)) << 64
+ }
+}
+
+@(optimization_mode="speed")
+XXH3_len_4to8_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
+ length := len(input)
+ seed := seed
+
+ seed ~= u64(byte_swap(u32(seed))) << 32
+ #no_bounds_check {
+ input_lo := u64(XXH32_read32(input[0:]))
+ input_hi := u64(XXH32_read32(input[length - 4:]))
+ input_64 := u64(input_lo) + u64(input_hi) << 32
+ bitflip := (XXH64_read64(secret[16:]) ~ XXH64_read64(secret[24:])) + seed
+ keyed := input_64 ~ bitflip
+
+ /* Shift len to the left to ensure it is even; this avoids even multiplies. */
+ m128 := XXH128_hash_t{
+ h = XXH_mul_64_to_128(keyed, u64(XXH_PRIME64_1) + (u64(length) << 2)),
+ }
+ m128.high += (m128.low << 1)
+ m128.low ~= (m128.high >> 3)
+
+ m128.low = XXH_xorshift_64(m128.low, 35)
+ m128.low *= 0x9FB21C651E98DF25
+ m128.low = XXH_xorshift_64(m128.low, 28)
+ m128.high = XXH3_avalanche(m128.high)
+
+ return m128.h
+ }
+}
+
+@(optimization_mode="speed")
+XXH3_len_9to16_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
+ length := len(input)
+
+ #no_bounds_check {
+ bitflipl := (XXH64_read64(secret[32:]) ~ XXH64_read64(secret[40:])) - seed
+ bitfliph := (XXH64_read64(secret[48:]) ~ XXH64_read64(secret[56:])) + seed
+ input_lo := XXH64_read64(input[0:])
+ input_hi := XXH64_read64(input[length - 8:])
+ m128 := XXH128_hash_t{
+ h = XXH_mul_64_to_128(input_lo ~ input_hi ~ bitflipl, XXH_PRIME64_1),
+ }
+ /*
+ * Put len in the middle of m128 to ensure that the length gets mixed to
+ * both the low and high bits in the 128x64 multiply below.
+ */
+ m128.low += u64(length - 1) << 54
+ input_hi ~= bitfliph
+ /*
+ * Add the high 32 bits of input_hi to the high 32 bits of m128, then
+ * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
+ * the high 64 bits of m128.
+ *
+ * The best approach to this operation is different on 32-bit and 64-bit.
+ */
+ when size_of(rawptr) == 4 { /* 32-bit */
+ /*
+ * 32-bit optimized version, which is more readable.
+ *
+ * On 32-bit, it removes an ADC and delays a dependency between the two
+ * halves of m128.high64, but it generates an extra mask on 64-bit.
+ */
+ m128.high += (input_hi & 0xFFFFFFFF00000000) + XXH_mul_32_to_64(u32(input_hi), XXH_PRIME32_2)
+ } else {
+ /*
+ * 64-bit optimized (albeit more confusing) version.
+ *
+ * Uses some properties of addition and multiplication to remove the mask:
+ *
+ * Let:
+ * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
+ * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
+ * c = XXH_PRIME32_2
+ *
+ * a + (b * c)
+ * Inverse Property: x + y - x == y
+ * a + (b * (1 + c - 1))
+ * Distributive Property: x * (y + z) == (x * y) + (x * z)
+ * a + (b * 1) + (b * (c - 1))
+ * Identity Property: x * 1 == x
+ * a + b + (b * (c - 1))
+ *
+ * Substitute a, b, and c:
+ * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
+ *
+ * Since input_hi.hi + input_hi.lo == input_hi, we get this:
+ * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
+ */
+ m128.high += input_hi + XXH_mul_32_to_64(u32(input_hi), XXH_PRIME32_2 - 1)
+ }
+ /* m128 ^= XXH_swap64(m128 >> 64); */
+ m128.low ~= byte_swap(m128.high)
+ { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
+ h128 := XXH128_hash_t{
+ h = XXH_mul_64_to_128(m128.low, XXH_PRIME64_2),
+ }
+ h128.high += m128.high * XXH_PRIME64_2
+ h128.low = XXH3_avalanche(h128.low)
+ h128.high = XXH3_avalanche(h128.high)
+ return h128.h
+ }
+ }
+}
+
+/*
+ Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
+*/
+@(optimization_mode="speed")
+XXH3_len_0to16_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
+ length := len(input)
+
+ switch {
+ case length > 8: return XXH3_len_9to16_128b(input, secret, seed)
+ case length >= 4: return XXH3_len_4to8_128b (input, secret, seed)
+ case length > 0: return XXH3_len_1to3_128b (input, secret, seed)
+ case:
+ #no_bounds_check bitflipl := XXH64_read64(secret[64:]) ~ XXH64_read64(secret[72:])
+ #no_bounds_check bitfliph := XXH64_read64(secret[80:]) ~ XXH64_read64(secret[88:])
+ return xxh_u128(XXH64_avalanche(seed ~ bitflipl)) | xxh_u128(XXH64_avalanche(seed ~ bitfliph)) << 64
+ }
+}
+
+/*
+ A bit slower than XXH3_mix16B, but handles multiply by zero better.
+*/
+@(optimization_mode="speed")
+XXH128_mix32B :: #force_inline proc(acc: xxh_u128, input_1: []u8, input_2: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
+ acc128 := XXH128_hash_t{
+ h = acc,
+ }
+ #no_bounds_check {
+ acc128.low += XXH3_mix16B (input_1, secret[0:], seed)
+ acc128.low ~= XXH64_read64(input_2[0:]) + XXH64_read64(input_2[8:])
+ acc128.high += XXH3_mix16B (input_2, secret[16:], seed)
+ acc128.high ~= XXH64_read64(input_1) + XXH64_read64(input_1[8:])
+ return acc128.h
+ }
+}
+
+
+
+
+@(optimization_mode="speed")
+XXH3_len_17to128_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
+ length := len(input)
+
+ acc := XXH128_hash_t{}
+ acc.low = xxh_u64(length) * XXH_PRIME64_1
+
+ switch {
+ case length > 96:
+ #no_bounds_check acc.h = XXH128_mix32B(acc.h, input[48:], input[length - 64:], secret[96:], seed)
+ fallthrough
+ case length > 64:
+ #no_bounds_check acc.h = XXH128_mix32B(acc.h, input[32:], input[length - 48:], secret[64:], seed)
+ fallthrough
+ case length > 32:
+ #no_bounds_check acc.h = XXH128_mix32B(acc.h, input[16:], input[length - 32:], secret[32:], seed)
+ fallthrough
+ case:
+ #no_bounds_check acc.h = XXH128_mix32B(acc.h, input, input[length - 16:], secret, seed)
+
+ h128 := XXH128_hash_t{}
+ h128.low = acc.low + acc.high
+ h128.high = (acc.low * XXH_PRIME64_1) + (acc.high * XXH_PRIME64_4) + ((u64(length) - seed) * XXH_PRIME64_2)
+ h128.low = XXH3_avalanche(h128.low)
+ h128.high = u64(i64(0) - i64(XXH3_avalanche(h128.high)))
+ return h128.h
+ }
+ unreachable()
+}
+
+@(optimization_mode="speed")
+XXH3_len_129to240_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
+ length := len(input)
+
+ #no_bounds_check {
+ acc := XXH128_hash_t{}
+ acc.low = u64(length) * XXH_PRIME64_1
+
+ nbRounds := length / 32
+
+ i: int
+ #no_bounds_check for i = 0; i < 4; i += 1 {
+ acc.h = XXH128_mix32B(acc.h,
+ input[32 * i:],
+ input [32 * i + 16:],
+ secret[32 * i:],
+ seed)
+ }
+ acc.low = XXH3_avalanche(acc.low)
+ acc.high = XXH3_avalanche(acc.high)
+
+ #no_bounds_check for i = 4; i < nbRounds; i += 1 {
+ acc.h = XXH128_mix32B(acc.h,
+ input[32 * i:], input[32 * i + 16:],
+ secret[XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)):],
+ seed)
+ }
+ /* last bytes */
+ #no_bounds_check acc.h = XXH128_mix32B(acc.h,
+ input[length - 16:],
+ input[length - 32:],
+ secret[XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16:],
+ u64(i64(0) - i64(seed)))
+
+ #no_bounds_check {
+ h128 := XXH128_hash_t{}
+ h128.low = acc.low + acc.high
+ h128.high = u64(
+ u128(acc.low * XXH_PRIME64_1) \
+ + u128(acc.high * XXH_PRIME64_4) \
+ + u128((u64(length) - seed) * XXH_PRIME64_2))
+ h128.low = XXH3_avalanche(h128.low)
+ h128.high = u64(i64(0) - i64(XXH3_avalanche(h128.high)))
+ return h128.h
+ }
+ }
+ unreachable()
+}
+
+XXH3_INIT_ACC :: [XXH_ACC_NB]xxh_u64{
+ XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3,
+ XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1,
+}
+
+XXH_SECRET_MERGEACCS_START :: 11
+
+@(optimization_mode="speed")
+XXH3_hashLong_128b_internal :: #force_inline proc(
+ input: []u8,
+ secret: []u8,
+ f_acc512: XXH3_accumulate_512_f,
+ f_scramble: XXH3_scramble_accumulator_f) -> (res: XXH3_128_hash) {
+
+ acc := XXH3_INIT_ACC
+ #assert(size_of(acc) == 64)
+
+ XXH3_hashLong_internal_loop(acc[:], input, secret, f_acc512, f_scramble)
+
+ /* converge into final hash */
+ {
+ length := len(input)
+ secret_size := len(secret)
+
+ h128 := XXH128_hash_t{}
+ h128.low = XXH3_mergeAccs(acc[:], secret[XXH_SECRET_MERGEACCS_START:], u64(length) * XXH_PRIME64_1)
+ h128.high = XXH3_mergeAccs(acc[:], secret[secret_size - size_of(acc) - XXH_SECRET_MERGEACCS_START:],
+ ~(u64(length) * XXH_PRIME64_2))
+ return h128.h
+ }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH3_hashLong_128b_default :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
+ k_secret := XXH3_kSecret
+ return XXH3_hashLong_128b_internal(input, k_secret[:], XXH3_accumulate_512, XXH3_scramble_accumulator)
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH3_hashLong_128b_withSecret :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
+ return XXH3_hashLong_128b_internal(input, secret, XXH3_accumulate_512, XXH3_scramble_accumulator)
+}
+
+XXH3_hashLong_128b_withSeed_internal :: #force_inline proc(
+ input: []u8, seed: xxh_u64, secret: []u8,
+ f_acc512: XXH3_accumulate_512_f,
+ f_scramble: XXH3_scramble_accumulator_f,
+ f_initSec: XXH3_init_custom_secret_f) -> (res: XXH3_128_hash) {
+
+ if seed == 0 {
+ k := XXH3_kSecret
+ return XXH3_hashLong_128b_internal(input, k[:], f_acc512, f_scramble)
+ }
+
+ {
+ secret := [XXH_SECRET_DEFAULT_SIZE]u8{}
+ f_initSec(secret[:], seed)
+ return XXH3_hashLong_128b_internal(input, secret[:], f_acc512, f_scramble)
+ }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH3_hashLong_128b_withSeed :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
+ return XXH3_hashLong_128b_withSeed_internal(input, seed, secret, XXH3_accumulate_512, XXH3_scramble_accumulator , XXH3_init_custom_secret)
+}
+
+XXH3_hashLong128_f :: #type proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash)
+
+XXH3_128bits_internal :: #force_inline proc(
+ input: []u8, seed: xxh_u64, secret: []u8, f_hl128: XXH3_hashLong128_f) -> (res: XXH3_128_hash) {
+
+ assert(len(secret) >= XXH3_SECRET_SIZE_MIN)
+ /*
+ * If an action is to be taken if `secret` conditions are not respected,
+ * it should be done here.
+ * For now, it's a contract pre-condition.
+ * Adding a check and a branch here would cost performance at every hash.
+ */
+ length := len(input)
+
+ switch {
+ case length <= 16:
+ return XXH3_len_0to16_128b(input, secret, seed)
+ case length <= 128:
+ return XXH3_len_17to128_128b(input, secret, seed)
+ case length <= XXH3_MIDSIZE_MAX:
+ return XXH3_len_129to240_128b(input, secret, seed)
+ case:
+ return f_hl128(input, seed, secret)
+ }
+}
+
+/* === Public XXH128 API === */
+
+XXH3_128bits :: proc(input: []u8) -> (hash: XXH3_128_hash) {
+ k := XXH3_kSecret
+ return XXH3_128bits_internal(input, XXH3_128_DEFAULT_SEED, k[:], XXH3_hashLong_128b_default)
+}
+
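+/*
+ Usage sketch: the helper below is hypothetical (not part of the package API) and only
+ shows the public XXH3_128bits call and how XXH128_hash_t splits the 128-bit digest
+ into its 64-bit halves.
+*/
+example_xxh3_128_usage :: proc(s: string) -> (low, high: u64) {
+ digest := XXH3_128bits(transmute([]u8)s)
+ t := XXH128_hash_t{ h = digest }
+ return u64(t.low), u64(t.high)
+}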
+
+
+/*
+ ==========================================
+ Short keys
+ ==========================================
+ One of the shortcomings of XXH32 and XXH64 was that their performance was
+ sub-optimal on short lengths. They used an iterative algorithm which strongly
+ favored lengths that were a multiple of 4 or 8.
+
+ Instead of iterating over individual inputs, we use a set of single shot
+ functions which piece together a range of lengths and operate in constant time.
+ Additionally, the number of multiplies has been significantly reduced. This
+ reduces latency, especially when emulating 64-bit multiplies on 32-bit.
+
+ Depending on the platform, this may or may not be faster than XXH32, but it
+ is almost guaranteed to be faster than XXH64.
+*/
+
+/*
+ At very short lengths, there isn't enough input to fully hide secrets, or use the entire secret.
+
+ There is also only a limited amount of mixing we can do before significantly impacting performance.
+
+ Therefore, we use different sections of the secret and always mix two secret samples with an XOR.
+ This should have no effect on performance on the seedless or withSeed variants because everything
+ _should_ be constant folded by modern compilers.
+
+ The XOR mixing hides individual parts of the secret and increases entropy.
+ This adds an extra layer of strength for custom secrets.
+*/
+@(optimization_mode="speed")
+XXH3_len_1to3_64b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
+ length := u32(len(input))
+ assert(input != nil)
+ assert(1 <= length && length <= 3)
+ assert(secret != nil)
+ /*
+ len = 1: combined = { input[0], 0x01, input[0], input[0] }
+ len = 2: combined = { input[1], 0x02, input[0], input[1] }
+ len = 3: combined = { input[2], 0x03, input[0], input[1] }
+ */
+ #no_bounds_check {
+ c1 := u32(input[0 ])
+ c2 := u32(input[length >> 1])
+ c3 := u32(input[length - 1])
+
+ combined := c1 << 16 | c2 << 24 | c3 << 0 | length << 8
+ bitflip := (u64(XXH32_read32(secret)) ~ u64(XXH32_read32(secret[4:]))) + seed
+ keyed := u64(combined) ~ bitflip
+ return XXH64_avalanche(keyed)
+ }
+}
+
+@(optimization_mode="speed")
+XXH3_len_4to8_64b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
+ length := u32(len(input))
+ assert(input != nil)
+ assert(4 <= length && length <= 8)
+ assert(secret != nil)
+ seed := seed
+
+ seed ~= u64(byte_swap(u32(seed))) << 32
+ #no_bounds_check {
+ input1 := XXH32_read32(input)
+ input2 := XXH32_read32(input[length - 4:])
+ bitflip := (XXH64_read64(secret[8:]) ~ XXH64_read64(secret[16:])) - seed
+ input64 := u64(input2) + (u64(input1) << 32)
+ keyed := input64 ~ bitflip
+ return XXH3_rrmxmx(keyed, u64(length))
+ }
+}
+
+@(optimization_mode="speed")
+XXH3_len_9to16_64b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
+ length := u64(len(input))
+ assert(input != nil)
+ assert(9 <= length && length <= 16)
+ assert(secret != nil)
+ #no_bounds_check {
+ bitflip1 := (XXH64_read64(secret[24:]) ~ XXH64_read64(secret[32:])) + seed
+ bitflip2 := (XXH64_read64(secret[40:]) ~ XXH64_read64(secret[48:])) - seed
+ input_lo := XXH64_read64(input) ~ bitflip1
+ input_hi := XXH64_read64(input[length - 8:]) ~ bitflip2
+ acc := length + byte_swap(input_lo) + input_hi \
+ + XXH_mul_64_to_128_fold_64(input_lo, input_hi)
+ return XXH3_avalanche(acc)
+ }
+}
+
+@(optimization_mode="speed")
+XXH3_len_0to16_64b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
+ length := u64(len(input))
+ assert(input != nil)
+ assert(length <= 16)
+ #no_bounds_check {
+ switch {
+ case length > 8: return #force_inline XXH3_len_9to16_64b(input, secret, seed)
+ case length >= 4: return #force_inline XXH3_len_4to8_64b (input, secret, seed)
+ case length > 0: return #force_inline XXH3_len_1to3_64b (input, secret, seed)
+ case:
+ return #force_inline XXH64_avalanche(seed ~ (XXH64_read64(secret[56:]) ~ XXH64_read64(secret[64:])))
+ }
+ }
+}
+
+/*
+ DISCLAIMER: There are known *seed-dependent* multicollisions here due to
+ multiplication by zero, affecting hashes of lengths 17 to 240.
+
+ However, they are very unlikely.
+
+ Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
+ unseeded non-cryptographic hashes, it does not attempt to defend itself
+ against specially crafted inputs, only random inputs.
+
+ Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
+ cancelling out the secret is taken an arbitrary number of times (addressed
+ in XXH3_accumulate_512), this collision is very unlikely with random inputs
+ and/or proper seeding:
+
+ This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
+ function that is only called up to 16 times per hash with up to 240 bytes of
+ input.
+
+ This is not too bad for a non-cryptographic hash function, especially with
+ only 64 bit outputs.
+
+ The 128-bit variant (which trades some speed for strength) is NOT affected
+ by this, although it is always a good idea to use a proper seed if you care
+ about strength.
+*/
+@(optimization_mode="speed")
+XXH3_mix16B :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
+ input_lo := XXH64_read64(input[0:])
+ input_hi := XXH64_read64(input[8:])
+
+ input_lo ~= (XXH64_read64(secret[0:]) + seed)
+ input_hi ~= (XXH64_read64(secret[8:]) - seed)
+ return XXH_mul_64_to_128_fold_64(input_lo, input_hi)
+}
+
+/* For mid-range keys, XXH3 uses a Mum-hash variant. */
+@(optimization_mode="speed")
+XXH3_len_17to128_64b :: proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
+ assert(len(secret) >= XXH3_SECRET_SIZE_MIN)
+ length := len(input)
+ assert(16 < length && length <= 128)
+
+ #no_bounds_check {
+ acc := u64(length) * XXH_PRIME64_1
+ switch {
+ case length > 96:
+ acc += XXH3_mix16B(input[48: ], secret[96: ], seed)
+ acc += XXH3_mix16B(input[length - 64:], secret[112:], seed)
+ fallthrough
+ case length > 64:
+ acc += XXH3_mix16B(input[32: ], secret[64: ], seed)
+ acc += XXH3_mix16B(input[length - 48:], secret[80: ], seed)
+ fallthrough
+ case length > 32:
+ acc += XXH3_mix16B(input[16: ], secret[32: ], seed)
+ acc += XXH3_mix16B(input[length - 32:], secret[48: ], seed)
+ fallthrough
+ case:
+ acc += XXH3_mix16B(input[0: ], secret[0: ], seed)
+ acc += XXH3_mix16B(input[length - 16:], secret[16: ], seed)
+ }
+ return XXH3_avalanche(acc)
+ }
+}
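+/*
+ Worked example for the ladder above: a 100-byte input enters at `length > 96` and
+ falls through every case, mixing eight 16-byte blocks:
+ input[48:64], input[36:52], input[32:48], input[52:68],
+ input[16:32], input[68:84], input[0:16] and input[84:100].
+*/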
+
+XXH3_MIDSIZE_MAX :: 240
+XXH3_MIDSIZE_STARTOFFSET :: 3
+XXH3_MIDSIZE_LASTOFFSET :: 17
+
+@(optimization_mode="speed")
+XXH3_len_129to240_64b :: proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
+ assert(len(secret) >= XXH3_SECRET_SIZE_MIN)
+ length := len(input)
+ assert(128 < length && length <= XXH3_MIDSIZE_MAX)
+
+ #no_bounds_check {
+ acc := u64(length) * XXH_PRIME64_1
+ nbRounds := length / 16
+
+ i: int
+ for i = 0; i < 8; i += 1 {
+ acc += XXH3_mix16B(input[16 * i:], secret[16 * i:], seed)
+ }
+
+ acc = XXH3_avalanche(acc)
+ assert(nbRounds >= 8)
+
+ for i = 8; i < nbRounds; i += 1 {
+ acc += XXH3_mix16B(input[16 * i:], secret[(16 * (i - 8)) + XXH3_MIDSIZE_STARTOFFSET:], seed)
+ }
+ /* last bytes */
+ acc += XXH3_mix16B(input[length - 16:], secret[XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET:], seed)
+ return XXH3_avalanche(acc)
+ }
+}
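+/*
+ For example, a 240-byte input gives nbRounds = 15: the first eight 16-byte blocks are
+ mixed against the start of the secret, the next seven against
+ secret[XXH3_MIDSIZE_STARTOFFSET:], and the final 16 bytes are mixed last.
+*/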
+
+/* ======= Long Keys ======= */
+
+XXH_STRIPE_LEN :: 64
+XXH_SECRET_CONSUME_RATE :: 8 /* nb of secret bytes consumed at each accumulation */
+XXH_ACC_NB :: (XXH_STRIPE_LEN / size_of(xxh_u64))
+
+@(optimization_mode="speed")
+XXH_writeLE64 :: #force_inline proc(dst: []u8, v64: u64le) {
+ v := v64
+ mem_copy(raw_data(dst), &v, size_of(v64))
+}
+
+/*
+ * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
+ *
+ * It is a hardened version of UMAC, based on FARSH's implementation.
+ *
+ * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
+ * implementations, and it is ridiculously fast.
+ *
+ * We harden it by mixing the original input to the accumulators as well as the product.
+ *
+ * This means that in the (relatively likely) case of a multiply by zero, the
+ * original input is preserved.
+ *
+ * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
+ * cross-pollination, as otherwise the upper and lower halves would be
+ * essentially independent.
+ *
+ * This doesn't matter on 64-bit hashes since they all get merged together in
+ * the end, so we skip the extra step.
+ *
+ * Both XXH3_64bits and XXH3_128bits use this subroutine.
+ */
+
+XXH3_accumulate_512_f :: #type proc(acc: []xxh_u64, input: []u8, secret: []u8)
+XXH3_scramble_accumulator_f :: #type proc(acc: []xxh_u64, secret: []u8)
+XXH3_init_custom_secret_f :: #type proc(custom_secret: []u8, seed64: xxh_u64)
+
+XXH3_accumulate_512 : XXH3_accumulate_512_f = XXH3_accumulate_512_scalar
+XXH3_scramble_accumulator : XXH3_scramble_accumulator_f = XXH3_scramble_accumulator_scalar
+XXH3_init_custom_secret : XXH3_init_custom_secret_f = XXH3_init_custom_secret_scalar
+
+/* scalar variants - universal */
+@(optimization_mode="speed")
+XXH3_accumulate_512_scalar :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8) {
+ xacc := acc /* presumed aligned */
+ xinput := input /* no alignment restriction */
+ xsecret := secret /* no alignment restriction */
+
+ assert(uintptr(raw_data(acc)) & uintptr(XXH_ACC_ALIGN - 1) == 0)
+
+ #no_bounds_check for i := uint(0); i < XXH_ACC_NB; i += 1 {
+ data_val := XXH64_read64(xinput[8 * i:])
+ data_key := data_val ~ XXH64_read64(xsecret[8 * i:])
+ xacc[i ~ 1] += data_val /* swap adjacent lanes */
+ xacc[i ] += XXH_mul_32_to_64(u32(data_key & 0xFFFFFFFF), u32(data_key >> 32))
+ }
+}
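+/*
+ Note: `i ~ 1` above pairs adjacent lanes (0 <-> 1, 2 <-> 3, ...), which implements the
+ "swap adjacent lanes" cross-pollination described earlier.
+*/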
+
+@(optimization_mode="speed")
+XXH3_scramble_accumulator_scalar :: #force_inline proc(acc: []xxh_u64, secret: []u8) {
+ xacc := acc /* presumed aligned */
+ xsecret := secret /* no alignment restriction */
+
+ assert(uintptr(raw_data(acc)) & uintptr(XXH_ACC_ALIGN - 1) == 0)
+
+ #no_bounds_check for i := uint(0); i < XXH_ACC_NB; i += 1 {
+ key64 := XXH64_read64(xsecret[8 * i:])
+ acc64 := xacc[i]
+ acc64 = XXH_xorshift_64(acc64, 47)
+ acc64 ~= key64
+ acc64 *= u64(XXH_PRIME32_1)
+ xacc[i] = acc64
+ }
+}
+
+@(optimization_mode="speed")
+XXH3_init_custom_secret_scalar :: #force_inline proc(custom_secret: []u8, seed64: xxh_u64) {
+ #assert((XXH_SECRET_DEFAULT_SIZE & 15) == 0)
+
+ kSecretPtr := XXH3_kSecret
+
+ nbRounds := XXH_SECRET_DEFAULT_SIZE / 16
+ #no_bounds_check for i := 0; i < nbRounds; i += 1 {
+ lo := XXH64_read64(kSecretPtr[16 * i: ]) + seed64
+ hi := XXH64_read64(kSecretPtr[16 * i + 8:]) - seed64
+ XXH_writeLE64(custom_secret[16 * i: ], u64le(lo))
+ XXH_writeLE64(custom_secret[16 * i + 8:], u64le(hi))
+ }
+}
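+/*
+ Note: with the default XXH_SECRET_DEFAULT_SIZE of 192, this runs 12 rounds, deriving
+ each 16-byte block of the custom secret as (kSecret_lo + seed, kSecret_hi - seed).
+*/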
+
+XXH_PREFETCH_DIST :: 320
+
+/*
+ * XXH3_accumulate()
+ * Loops over XXH3_accumulate_512().
+ * Assumption: nbStripes will not overflow the secret size
+ */
+@(optimization_mode="speed")
+XXH3_accumulate :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8, nbStripes: uint,
+ f_acc512: XXH3_accumulate_512_f) {
+
+ for n := uint(0); n < nbStripes; n += 1 {
+ when !XXH_DISABLE_PREFETCH {
+ in_ptr := &input[n * XXH_STRIPE_LEN]
+ prefetch(in_ptr, XXH_PREFETCH_DIST)
+ }
+ f_acc512(acc, input[n * XXH_STRIPE_LEN:], secret[n * XXH_SECRET_CONSUME_RATE:])
+ }
+}
+
+@(optimization_mode="speed")
+XXH3_hashLong_internal_loop :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8,
+ f_acc512: XXH3_accumulate_512_f, f_scramble: XXH3_scramble_accumulator_f) {
+
+ length := uint(len(input))
+ secret_size := uint(len(secret))
+ stripes_per_block := (secret_size - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE
+
+ block_len := XXH_STRIPE_LEN * stripes_per_block
+ blocks := (length - 1) / block_len
+
+ #no_bounds_check for n := uint(0); n < blocks; n += 1 {
+ XXH3_accumulate(acc, input[n * block_len:], secret, stripes_per_block, f_acc512)
+ f_scramble(acc, secret[secret_size - XXH_STRIPE_LEN:])
+ }
+
+ /* last partial block */
+ #no_bounds_check {
+ stripes := ((length - 1) - (block_len * blocks)) / XXH_STRIPE_LEN
+
+ XXH3_accumulate(acc, input[blocks * block_len:], secret, stripes, f_acc512)
+
+ /* last stripe */
+ #no_bounds_check {
+ p := input[length - XXH_STRIPE_LEN:]
+ XXH_SECRET_LASTACC_START :: 7 /* not aligned on 8, last secret is different from acc & scrambler */
+ f_acc512(acc, p, secret[secret_size - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START:])
+ }
+ }
+}
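+/*
+ Worked example: with the default 192-byte secret,
+ stripes_per_block = (192 - 64) / 8 = 16 and block_len = 64 * 16 = 1024, so the
+ accumulators are scrambled once per 1024 bytes of input.
+*/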
+
+@(optimization_mode="speed")
+XXH3_mix2Accs :: #force_inline proc(acc: []xxh_u64, secret: []u8) -> (res: xxh_u64) {
+ return XXH_mul_64_to_128_fold_64(
+ acc[0] ~ XXH64_read64(secret),
+ acc[1] ~ XXH64_read64(secret[8:]))
+}
+
+@(optimization_mode="speed")
+XXH3_mergeAccs :: #force_inline proc(acc: []xxh_u64, secret: []u8, start: xxh_u64) -> (res: xxh_u64) {
+ result64 := start
+ #no_bounds_check for i := 0; i < 4; i += 1 {
+ result64 += XXH3_mix2Accs(acc[2 * i:], secret[16 * i:])
+ }
+ return XXH3_avalanche(result64)
+}
\ No newline at end of file
diff --git a/core/hash/xxhash/xxhash_32.odin b/core/hash/xxhash/xxhash_32.odin
index fe1a686ce..f41161133 100644
--- a/core/hash/xxhash/xxhash_32.odin
+++ b/core/hash/xxhash/xxhash_32.odin
@@ -15,6 +15,7 @@ import "core:intrinsics"
32-bit hash functions
*/
XXH32_hash :: u32
+xxh_u32 :: u32
XXH32_DEFAULT_SEED :: XXH32_hash(0)
XXH32_state :: struct {
@@ -153,7 +154,7 @@ XXH32_endian_align :: #force_inline proc(input: []u8, seed := XXH32_DEFAULT_SEED
v3 := seed + 0
v4 := seed - XXH_PRIME32_1
- for len(buf) >= 15 {
+ for len(buf) >= 16 {
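+ /*
+ Each iteration consumes 16 bytes (four 4-byte lanes), so the guard must require at
+ least 16 remaining bytes; the previous `>= 15` bound could read past the end of buf.
+ */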
#no_bounds_check v1 = XXH32_round(v1, XXH32_read32(buf, alignment)); buf = buf[4:]
#no_bounds_check v2 = XXH32_round(v2, XXH32_read32(buf, alignment)); buf = buf[4:]
#no_bounds_check v3 = XXH32_round(v3, XXH32_read32(buf, alignment)); buf = buf[4:]