| author | Yawning Angel <yawning@schwanenlied.me> | 2025-11-25 04:06:07 +0900 |
|---|---|---|
| committer | Yawning Angel <yawning@schwanenlied.me> | 2025-11-29 16:16:52 +0900 |
| commit | ea1f1e275d7ba62afddd185532ec60024aa16b6b (patch) | |
| tree | c571b29dc03533518b045af9de392c85ef22e424 | /base/runtime |
| parent | e1ba69ea5192a245263b6a5ea5b4359cae7c0220 (diff) | |
base/runtime: Add chacha8rand as the default RNG
Diffstat (limited to 'base/runtime')
| -rw-r--r-- | base/runtime/random_generator.odin | 85 |
| -rw-r--r-- | base/runtime/random_generator_chacha8.odin | 164 |
| -rw-r--r-- | base/runtime/random_generator_chacha8_ref.odin | 145 |
| -rw-r--r-- | base/runtime/random_generator_chacha8_simd128.odin | 290 |
| -rw-r--r-- | base/runtime/random_generator_chacha8_simd256.odin | 197 |
5 files changed, 796 insertions, 85 deletions
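Both the PCG-style generator removed below and its ChaCha8Rand replacement plug into the same Random_Generator interface: a `procedure` pointer plus opaque `data`, driven with the `.Read`, `.Reset`, and `.Query_Info` modes. A minimal usage sketch, not part of this commit (the package and helper name are hypothetical; the byte-slicing mirrors `random_generator_reset_u64` in the hunk below):

```odin
package example

import "base:runtime"

// Hypothetical helper: pull one u64 out of the (thread-local) default
// generator by invoking its procedure directly, which hits the 8-byte
// fast path in default_random_generator_proc.
example_read_u64 :: proc() -> u64 {
	rg := runtime.default_random_generator()
	v: u64
	rg.procedure(rg.data, .Read, ([^]byte)(&v)[:size_of(v)])
	return v
}
```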
diff --git a/base/runtime/random_generator.odin b/base/runtime/random_generator.odin index ca5c008d0..7d873fe33 100644 --- a/base/runtime/random_generator.odin +++ b/base/runtime/random_generator.odin @@ -41,88 +41,3 @@ random_generator_reset_u64 :: proc(rg: Random_Generator, p: u64) { rg.procedure(rg.data, .Reset, ([^]byte)(&p)[:size_of(p)]) } } - - -Default_Random_State :: struct { - state: u64, - inc: u64, -} - -default_random_generator_proc :: proc(data: rawptr, mode: Random_Generator_Mode, p: []byte) { - @(require_results) - read_u64 :: proc "contextless" (r: ^Default_Random_State) -> u64 { - old_state := r.state - r.state = old_state * 6364136223846793005 + (r.inc|1) - xor_shifted := (((old_state >> 59) + 5) ~ old_state) * 12605985483714917081 - rot := (old_state >> 59) - return (xor_shifted >> rot) | (xor_shifted << ((-rot) & 63)) - } - - @(thread_local) - global_rand_seed: Default_Random_State - - init :: proc "contextless" (r: ^Default_Random_State, seed: u64) { - seed := seed - if seed == 0 { - seed = u64(intrinsics.read_cycle_counter()) - } - r.state = 0 - r.inc = (seed << 1) | 1 - _ = read_u64(r) - r.state += seed - _ = read_u64(r) - } - - r: ^Default_Random_State = --- - if data == nil { - r = &global_rand_seed - } else { - r = cast(^Default_Random_State)data - } - - switch mode { - case .Read: - if r.state == 0 && r.inc == 0 { - init(r, 0) - } - - switch len(p) { - case size_of(u64): - // Fast path for a 64-bit destination. - intrinsics.unaligned_store((^u64)(raw_data(p)), read_u64(r)) - case: - // All other cases. - pos := i8(0) - val := u64(0) - for &v in p { - if pos == 0 { - val = read_u64(r) - pos = 8 - } - v = byte(val) - val >>= 8 - pos -= 1 - } - } - - case .Reset: - seed: u64 - mem_copy_non_overlapping(&seed, raw_data(p), min(size_of(seed), len(p))) - init(r, seed) - - case .Query_Info: - if len(p) != size_of(Random_Generator_Query_Info) { - return - } - info := (^Random_Generator_Query_Info)(raw_data(p)) - info^ += {.Uniform, .Resettable} - } -} - -@(require_results) -default_random_generator :: proc "contextless" (state: ^Default_Random_State = nil) -> Random_Generator { - return { - procedure = default_random_generator_proc, - data = state, - } -}
\ No newline at end of file diff --git a/base/runtime/random_generator_chacha8.odin b/base/runtime/random_generator_chacha8.odin new file mode 100644 index 000000000..02acb4e9f --- /dev/null +++ b/base/runtime/random_generator_chacha8.odin @@ -0,0 +1,164 @@ +package runtime + +import "base:intrinsics" + +// This is an implementation of the ChaCha8Rand DRBG, as specified +// in https://github.com/C2SP/C2SP/blob/main/chacha8rand.md +// +// There is a tradeoff to be made between state size and performance, +// in terms of the amount of RNG output buffered. +// +// The sensible buffer sizes are: +// - 256-bytes: 128-bit SIMD with 16x vector registers (SSE2) +// - 512-bytes: 128-bit SIMD with 32x vector registers (ARMv8), +// 256-bit SIMD with 16x vector registers (AVX2) +// - 1024-bytes: AVX-512 +// +// Notes: +// - Smaller than 256-bytes is possible but would require redundant +// calls to the ChaCha8 function, which is prohibitively expensive. +// - Larger than 1024-bytes is possible but pointless as the construct +// is defined around 992-bytes of RNG output and 32-bytes of input +// per iteration. +// +// This implementation opts for a 1024-byte buffer for simplicity, +// under the rationale that modern, extremely memory-constrained targets +// provide suitable functionality in hardware, and the language makes +// supporting the various SIMD flavors easy. + +@(private = "file") +RNG_SEED_SIZE :: 32 +@(private) +RNG_OUTPUT_PER_ITER :: 1024 - RNG_SEED_SIZE + +@(private) +CHACHA_SIGMA_0: u32 : 0x61707865 +@(private) +CHACHA_SIGMA_1: u32 : 0x3320646e +@(private) +CHACHA_SIGMA_2: u32 : 0x79622d32 +@(private) +CHACHA_SIGMA_3: u32 : 0x6b206574 +@(private) +CHACHA_ROUNDS :: 8 + +Default_Random_State :: struct { + _buf: [1024]byte, + _off: int, + _seeded: bool, +} + +@(require_results) +default_random_generator :: proc "contextless" (state: ^Default_Random_State = nil) -> Random_Generator { + return { + procedure = default_random_generator_proc, + data = state, + } +} + +default_random_generator_proc :: proc(data: rawptr, mode: Random_Generator_Mode, p: []byte) { + @(thread_local) + state: Default_Random_State + + r: ^Default_Random_State = &state + if data != nil { + r = cast(^Default_Random_State)data + } + next_seed := r._buf[RNG_OUTPUT_PER_ITER:] + + switch mode { + case .Read: + if !r._seeded { // Unlikely. + rand_bytes(next_seed) + r._off = RNG_OUTPUT_PER_ITER // Force refill. + r._seeded = true + } + + assert(r._off <= RNG_OUTPUT_PER_ITER, "chacha8rand/BUG: output key material") + if r._off >= RNG_OUTPUT_PER_ITER { // Unlikely. + chacha8rand_refill(r) + } + + // We are guaranteed to have at least some RNG output buffered. + // + // As an invariant each read will consume a multiple of 8-bytes + // of output at a time. + assert(r._off <= RNG_OUTPUT_PER_ITER - 8, "chacha8rand/BUG: less than 8-bytes of output available") + assert(r._off % 8 == 0, "chacha8rand/BUG: buffered output is not a multiple of 8-bytes") + + p_len := len(p) + if p_len == size_of(u64) { + #no_bounds_check { + // Fast path for a 64-bit destination. 
+ src := (^u64)(raw_data(r._buf[r._off:])) + intrinsics.unaligned_store((^u64)(raw_data(p)), src^) + src^ = 0 // Erasure (backtrack resistance) + r._off += 8 + } + return + } + + p_ := p + for remaining := p_len; remaining > 0; { + sz := min(remaining, RNG_OUTPUT_PER_ITER - r._off) + #no_bounds_check { + copy(p_[:sz], r._buf[r._off:]) + p_ = p_[sz:] + remaining -= sz + } + rounded_sz := ((sz + 7) / 8) * 8 + new_off := r._off + rounded_sz + #no_bounds_check if new_off < RNG_OUTPUT_PER_ITER { + // Erasure (backtrack resistance) + intrinsics.mem_zero(raw_data(r._buf[r._off:]), rounded_sz) + r._off = new_off + } else { + // Can omit erasure since we are overwriting the entire + // buffer. + chacha8rand_refill(r) + } + } + + case .Reset: + // If no seed is passed, the next call to .Read will attempt to + // reseed from the system entropy source. + if len(p) == 0 { + r._seeded = false + return + } + + // The cryptographic security of the output depends entirely + // on the quality of the entropy in the seed. We will allow + // re-seeding (as it makes testing easier), but callers that + // decide to provide arbitrary seeds are on their own as far + // as ensuring high-quality entropy goes. + intrinsics.mem_zero(raw_data(next_seed), RNG_SEED_SIZE) + copy(next_seed, p) + r._seeded = true + r._off = RNG_OUTPUT_PER_ITER // Force a refill. + + case .Query_Info: + if len(p) != size_of(Random_Generator_Query_Info) { + return + } + info := (^Random_Generator_Query_Info)(raw_data(p)) + info^ += {.Uniform, .Cryptographic, .Resettable} + } +} + +@(private = "file") +chacha8rand_refill :: proc(r: ^Default_Random_State) { + assert(r._seeded == true, "chacha8rand/BUG: unseeded refill") + + // i386 has insufficient vector registers to use the + // accelerated path at the moment. + when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") { + chacha8rand_refill_simd256(r) + } else when HAS_HARDWARE_SIMD && ODIN_ARCH != .i386 { + chacha8rand_refill_simd128(r) + } else { + chacha8rand_refill_ref(r) + } + + r._off = 0 +} diff --git a/base/runtime/random_generator_chacha8_ref.odin b/base/runtime/random_generator_chacha8_ref.odin new file mode 100644 index 000000000..b1e812c3f --- /dev/null +++ b/base/runtime/random_generator_chacha8_ref.odin @@ -0,0 +1,145 @@ +package runtime + +import "base:intrinsics" + +@(private) +chacha8rand_refill_ref :: proc(r: ^Default_Random_State) { + // Initialize the base state. + k: [^]u32 = (^u32)(raw_data(r._buf[RNG_OUTPUT_PER_ITER:])) + when ODIN_ENDIAN == .Little { + s4 := k[0] + s5 := k[1] + s6 := k[2] + s7 := k[3] + s8 := k[4] + s9 := k[5] + s10 := k[6] + s11 := k[7] + } else { + s4 := intrinsics.byte_swap(k[0]) + s5 := intrinsics.byte_swap(k[1]) + s6 := intrinsics.byte_swap(k[2]) + s7 := intrinsics.byte_swap(k[3]) + s8 := intrinsics.byte_swap(k[4]) + s9 := intrinsics.byte_swap(k[5]) + s10 := intrinsics.byte_swap(k[6]) + s11 := intrinsics.byte_swap(k[7]) + } + s12: u32 // Counter starts at 0. + s13, s14, s15: u32 // IV of all 0s. + + dst: [^]u32 = (^u32)(raw_data(r._buf[:])) + + // At least with LLVM 21, force_inline produces identical perf to + // manual inlining, yay. 
+ quarter_round := #force_inline proc "contextless" (a, b, c, d: u32) -> (u32, u32, u32, u32) { + a, b, c, d := a, b, c, d + + a += b + d ~= a + d = rotl(d, 16) + + c += d + b ~= c + b = rotl(b, 12) + + a += b + d ~= a + d = rotl(d, 8) + + c += d + b ~= c + b = rotl(b, 7) + + return a, b, c, d + } + + // Filippo Valsorda made the observation that only one of the column + // rounds depends on the counter (s12), so the other three are worth + // precomputing and reusing across multiple blocks. As far as I know, + // only Go's chacha implementation does this. + + p1, p5, p9, p13 := quarter_round(CHACHA_SIGMA_1, s5, s9, s13) + p2, p6, p10, p14 := quarter_round(CHACHA_SIGMA_2, s6, s10, s14) + p3, p7, p11, p15 := quarter_round(CHACHA_SIGMA_3, s7, s11, s15) + + // 4 groups + for g := 0; g < 4; g = g + 1 { + // 4 blocks per group + for n := 0; n < 4; n = n + 1 { + // First column round that depends on the counter + p0, p4, p8, p12 := quarter_round(CHACHA_SIGMA_0, s4, s8, s12) + + // First diagonal round + x0, x5, x10, x15 := quarter_round(p0, p5, p10, p15) + x1, x6, x11, x12 := quarter_round(p1, p6, p11, p12) + x2, x7, x8, x13 := quarter_round(p2, p7, p8, p13) + x3, x4, x9, x14 := quarter_round(p3, p4, p9, p14) + + for i := CHACHA_ROUNDS - 2; i > 0; i = i - 2 { + x0, x4, x8, x12 = quarter_round(x0, x4, x8, x12) + x1, x5, x9, x13 = quarter_round(x1, x5, x9, x13) + x2, x6, x10, x14 = quarter_round(x2, x6, x10, x14) + x3, x7, x11, x15 = quarter_round(x3, x7, x11, x15) + + x0, x5, x10, x15 = quarter_round(x0, x5, x10, x15) + x1, x6, x11, x12 = quarter_round(x1, x6, x11, x12) + x2, x7, x8, x13 = quarter_round(x2, x7, x8, x13) + x3, x4, x9, x14 = quarter_round(x3, x4, x9, x14) + } + + // Interleave 4 blocks + // NB: The additions of sigma and the counter are omitted + STRIDE :: 4 + d_ := dst[n:] + when ODIN_ENDIAN == .Little { + d_[STRIDE*0] = x0 + d_[STRIDE*1] = x1 + d_[STRIDE*2] = x2 + d_[STRIDE*3] = x3 + d_[STRIDE*4] = x4 + s4 + d_[STRIDE*5] = x5 + s5 + d_[STRIDE*6] = x6 + s6 + d_[STRIDE*7] = x7 + s7 + d_[STRIDE*8] = x8 + s8 + d_[STRIDE*9] = x9 + s9 + d_[STRIDE*10] = x10 + s10 + d_[STRIDE*11] = x11 + s11 + d_[STRIDE*12] = x12 + d_[STRIDE*13] = x13 + s13 + d_[STRIDE*14] = x14 + s14 + d_[STRIDE*15] = x15 + s15 + } else { + d_[STRIDE*0] = intrinsics.byte_swap(x0) + d_[STRIDE*1] = intrinsics.byte_swap(x1) + d_[STRIDE*2] = intrinsics.byte_swap(x2) + d_[STRIDE*3] = intrinsics.byte_swap(x3) + d_[STRIDE*4] = intrinsics.byte_swap(x4 + s4) + d_[STRIDE*5] = intrinsics.byte_swap(x5 + s5) + d_[STRIDE*6] = intrinsics.byte_swap(x6 + s6) + d_[STRIDE*7] = intrinsics.byte_swap(x7 + s7) + d_[STRIDE*8] = intrinsics.byte_swap(x8 + s8) + d_[STRIDE*9] = intrinsics.byte_swap(x9 + s9) + d_[STRIDE*10] = intrinsics.byte_swap(x10 + s10) + d_[STRIDE*11] = intrinsics.byte_swap(x11 + s11) + d_[STRIDE*12] = intrinsics.byte_swap(x12) + d_[STRIDE*13] = intrinsics.byte_swap(x13 + s13) + d_[STRIDE*14] = intrinsics.byte_swap(x14 + s14) + d_[STRIDE*15] = intrinsics.byte_swap(x15 + s15) + } + + s12 = s12 + 1 // Increment the counter + } + + dst = dst[16*4:] + } +} + +// This replicates `rotate_left32` from `core:math/bits`, under the +// assumption that this will live in `base:runtime`. 
+@(require_results, private = "file") +rotl :: #force_inline proc "contextless" (x: u32, k: int) -> u32 { + n :: 32 + s := uint(k) & (n-1) + return x << s | x >> (n-s) +} diff --git a/base/runtime/random_generator_chacha8_simd128.odin b/base/runtime/random_generator_chacha8_simd128.odin new file mode 100644 index 000000000..d63d92620 --- /dev/null +++ b/base/runtime/random_generator_chacha8_simd128.odin @@ -0,0 +1,290 @@ +#+build !i386 +package runtime + +import "base:intrinsics" + +@(private = "file") +u32x4 :: #simd[4]u32 + +@(private = "file") +S0: u32x4 : {CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0} +@(private = "file") +S1: u32x4 : {CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1} +@(private = "file") +S2: u32x4 : {CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2} +@(private = "file") +S3: u32x4 : {CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3} + +@(private = "file") +_ROT_7L: u32x4 : {7, 7, 7, 7} +@(private = "file") +_ROT_7R: u32x4 : {25, 25, 25, 25} +@(private = "file") +_ROT_12L: u32x4 : {12, 12, 12, 12} +@(private = "file") +_ROT_12R: u32x4 : {20, 20, 20, 20} +@(private = "file") +_ROT_8L: u32x4 : {8, 8, 8, 8} +@(private = "file") +_ROT_8R: u32x4 : {24, 24, 24, 24} +@(private = "file") +_ROT_16: u32x4 : {16, 16, 16, 16} +@(private = "file") +_CTR_INC_4: u32x4 : {4, 4, 4, 4} +@(private = "file") +_CTR_INC_8: u32x4 : {8, 8, 8, 8} + +when ODIN_ENDIAN == .Big { + @(private = "file") + _byteswap_u32x4 :: #force_inline proc "contextless" (v: u32x4) -> u32x4 { + u8x16 :: #simd[16]u8 + return( + transmute(u32x4)intrinsics.simd_shuffle( + transmute(u8x16)v, + transmute(u8x16)v, + 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, + ) + ) + } +} + +@(private) +chacha8rand_refill_simd128 :: proc(r: ^Default_Random_State) { + // Initialize the base state. + k: [^]u32 = (^u32)(raw_data(r._buf[RNG_OUTPUT_PER_ITER:])) + when ODIN_ENDIAN == .Little { + s4_ := k[0] + s5_ := k[1] + s6_ := k[2] + s7_ := k[3] + s8_ := k[4] + s9_ := k[5] + s10_ := k[6] + s11_ := k[7] + } else { + s4_ := intrinsics.byte_swap(k[0]) + s5_ := intrinsics.byte_swap(k[1]) + s6_ := intrinsics.byte_swap(k[2]) + s7_ := intrinsics.byte_swap(k[3]) + s8_ := intrinsics.byte_swap(k[4]) + s9_ := intrinsics.byte_swap(k[5]) + s10_ := intrinsics.byte_swap(k[6]) + s11_ := intrinsics.byte_swap(k[7]) + } + + // 4-lane ChaCha8. 
+ s4 := u32x4{s4_, s4_, s4_, s4_} + s5 := u32x4{s5_, s5_, s5_, s5_} + s6 := u32x4{s6_, s6_, s6_, s6_} + s7 := u32x4{s7_, s7_, s7_, s7_} + s8 := u32x4{s8_, s8_, s8_, s8_} + s9 := u32x4{s9_, s9_, s9_, s9_} + s10 := u32x4{s10_, s10_, s10_, s10_} + s11 := u32x4{s11_, s11_, s11_, s11_} + s12 := u32x4{0, 1, 2, 3} + s13, s14, s15: u32x4 + + dst: [^]u32x4 = (^u32x4)(raw_data(r._buf[:])) + + quarter_round := #force_inline proc "contextless" (a, b, c, d: u32x4) -> (u32x4, u32x4, u32x4, u32x4) { + a, b, c, d := a, b, c, d + + a = intrinsics.simd_add(a, b) + d = intrinsics.simd_bit_xor(d, a) + d = intrinsics.simd_bit_xor(intrinsics.simd_shl(d, _ROT_16), intrinsics.simd_shr(d, _ROT_16)) + + c = intrinsics.simd_add(c, d) + b = intrinsics.simd_bit_xor(b, c) + b = intrinsics.simd_bit_xor(intrinsics.simd_shl(b, _ROT_12L), intrinsics.simd_shr(b, _ROT_12R)) + + a = intrinsics.simd_add(a, b) + d = intrinsics.simd_bit_xor(d, a) + d = intrinsics.simd_bit_xor(intrinsics.simd_shl(d, _ROT_8L), intrinsics.simd_shr(d, _ROT_8R)) + + c = intrinsics.simd_add(c, d) + b = intrinsics.simd_bit_xor(b, c) + b = intrinsics.simd_bit_xor(intrinsics.simd_shl(b, _ROT_7L), intrinsics.simd_shr(b, _ROT_7R)) + + return a, b, c, d + } + + // 8 blocks at a time. + // + // Note: + // This uses a ton of registers so it is only worth it on targets + // that have something like 32 128-bit registers. This is currently + // all ARMv8 targets, and RISC-V Zvl128b (`V` application profile) + // targets. + // + // While our current definition of `.arm32` is 32-bit ARMv8, this + // may change in the future (ARMv7 is still relevant), and things + // like the Cortex-A8/A9 do "pretend" 128-bit SIMD 64 bits at a time + // and thus need benchmarking. + when ODIN_ARCH == .arm64 || ODIN_ARCH == .riscv64 { + for _ in 0..<2 { + x0_0, x1_0, x2_0, x3_0 := S0, S1, S2, S3 + x4_0, x5_0, x6_0, x7_0 := s4, s5, s6, s7 + x8_0, x9_0, x10_0, x11_0 := s8, s9, s10, s11 + x12_0, x13_0, x14_0, x15_0 := s12, s13, s14, s15 + + x0_1, x1_1, x2_1, x3_1 := S0, S1, S2, S3 + x4_1, x5_1, x6_1, x7_1 := s4, s5, s6, s7 + x8_1, x9_1, x10_1, x11_1 := s8, s9, s10, s11 + x12_1 := intrinsics.simd_add(s12, _CTR_INC_4) + x13_1, x14_1, x15_1 := s13, s14, s15 + + for i := CHACHA_ROUNDS; i > 0; i = i - 2 { + x0_0, x4_0, x8_0, x12_0 = quarter_round(x0_0, x4_0, x8_0, x12_0) + x0_1, x4_1, x8_1, x12_1 = quarter_round(x0_1, x4_1, x8_1, x12_1) + x1_0, x5_0, x9_0, x13_0 = quarter_round(x1_0, x5_0, x9_0, x13_0) + x1_1, x5_1, x9_1, x13_1 = quarter_round(x1_1, x5_1, x9_1, x13_1) + x2_0, x6_0, x10_0, x14_0 = quarter_round(x2_0, x6_0, x10_0, x14_0) + x2_1, x6_1, x10_1, x14_1 = quarter_round(x2_1, x6_1, x10_1, x14_1) + x3_0, x7_0, x11_0, x15_0 = quarter_round(x3_0, x7_0, x11_0, x15_0) + x3_1, x7_1, x11_1, x15_1 = quarter_round(x3_1, x7_1, x11_1, x15_1) + + x0_0, x5_0, x10_0, x15_0 = quarter_round(x0_0, x5_0, x10_0, x15_0) + x0_1, x5_1, x10_1, x15_1 = quarter_round(x0_1, x5_1, x10_1, x15_1) + x1_0, x6_0, x11_0, x12_0 = quarter_round(x1_0, x6_0, x11_0, x12_0) + x1_1, x6_1, x11_1, x12_1 = quarter_round(x1_1, x6_1, x11_1, x12_1) + x2_0, x7_0, x8_0, x13_0 = quarter_round(x2_0, x7_0, x8_0, x13_0) + x2_1, x7_1, x8_1, x13_1 = quarter_round(x2_1, x7_1, x8_1, x13_1) + x3_0, x4_0, x9_0, x14_0 = quarter_round(x3_0, x4_0, x9_0, x14_0) + x3_1, x4_1, x9_1, x14_1 = quarter_round(x3_1, x4_1, x9_1, x14_1) + } + + when ODIN_ENDIAN == .Little { + intrinsics.unaligned_store((^u32x4)(dst[0:]), x0_0) + intrinsics.unaligned_store((^u32x4)(dst[1:]), x1_0) + intrinsics.unaligned_store((^u32x4)(dst[2:]), x2_0) + 
intrinsics.unaligned_store((^u32x4)(dst[3:]), x3_0) + intrinsics.unaligned_store((^u32x4)(dst[4:]), intrinsics.simd_add(x4_0, s4)) + intrinsics.unaligned_store((^u32x4)(dst[5:]), intrinsics.simd_add(x5_0, s5)) + intrinsics.unaligned_store((^u32x4)(dst[6:]), intrinsics.simd_add(x6_0, s6)) + intrinsics.unaligned_store((^u32x4)(dst[7:]), intrinsics.simd_add(x7_0, s7)) + intrinsics.unaligned_store((^u32x4)(dst[8:]), intrinsics.simd_add(x8_0, s8)) + intrinsics.unaligned_store((^u32x4)(dst[9:]), intrinsics.simd_add(x9_0, s9)) + intrinsics.unaligned_store((^u32x4)(dst[10:]), intrinsics.simd_add(x10_0, s10)) + intrinsics.unaligned_store((^u32x4)(dst[11:]), intrinsics.simd_add(x11_0, s11)) + intrinsics.unaligned_store((^u32x4)(dst[12:]), x12_0) + intrinsics.unaligned_store((^u32x4)(dst[13:]), intrinsics.simd_add(x13_0, s13)) + intrinsics.unaligned_store((^u32x4)(dst[14:]), intrinsics.simd_add(x14_0, s14)) + intrinsics.unaligned_store((^u32x4)(dst[15:]), intrinsics.simd_add(x15_0, s15)) + + intrinsics.unaligned_store((^u32x4)(dst[16:]), x0_1) + intrinsics.unaligned_store((^u32x4)(dst[17:]), x1_1) + intrinsics.unaligned_store((^u32x4)(dst[18:]), x2_1) + intrinsics.unaligned_store((^u32x4)(dst[19:]), x3_1) + intrinsics.unaligned_store((^u32x4)(dst[20:]), intrinsics.simd_add(x4_1, s4)) + intrinsics.unaligned_store((^u32x4)(dst[21:]), intrinsics.simd_add(x5_1, s5)) + intrinsics.unaligned_store((^u32x4)(dst[22:]), intrinsics.simd_add(x6_1, s6)) + intrinsics.unaligned_store((^u32x4)(dst[23:]), intrinsics.simd_add(x7_1, s7)) + intrinsics.unaligned_store((^u32x4)(dst[24:]), intrinsics.simd_add(x8_1, s8)) + intrinsics.unaligned_store((^u32x4)(dst[25:]), intrinsics.simd_add(x9_1, s9)) + intrinsics.unaligned_store((^u32x4)(dst[26:]), intrinsics.simd_add(x10_1, s10)) + intrinsics.unaligned_store((^u32x4)(dst[27:]), intrinsics.simd_add(x11_1, s11)) + intrinsics.unaligned_store((^u32x4)(dst[28:]), x12_1) + intrinsics.unaligned_store((^u32x4)(dst[29:]), intrinsics.simd_add(x13_1, s13)) + intrinsics.unaligned_store((^u32x4)(dst[30:]), intrinsics.simd_add(x14_1, s14)) + intrinsics.unaligned_store((^u32x4)(dst[31:]), intrinsics.simd_add(x15_1, s15)) + } else { + intrinsics.unaligned_store((^u32x4)(dst[0:]), _byteswap_u32x4(x0_0)) + intrinsics.unaligned_store((^u32x4)(dst[1:]), _byteswap_u32x4(x1_0)) + intrinsics.unaligned_store((^u32x4)(dst[2:]), _byteswap_u32x4(x2_0)) + intrinsics.unaligned_store((^u32x4)(dst[3:]), _byteswap_u32x4(x3_0)) + intrinsics.unaligned_store((^u32x4)(dst[4:]), _byteswap_u32x4(intrinsics.simd_add(x4_0, s4))) + intrinsics.unaligned_store((^u32x4)(dst[5:]), _byteswap_u32x4(intrinsics.simd_add(x5_0, s5))) + intrinsics.unaligned_store((^u32x4)(dst[6:]), _byteswap_u32x4(intrinsics.simd_add(x6_0, s6))) + intrinsics.unaligned_store((^u32x4)(dst[7:]), _byteswap_u32x4(intrinsics.simd_add(x7_0, s7))) + intrinsics.unaligned_store((^u32x4)(dst[8:]), _byteswap_u32x4(intrinsics.simd_add(x8_0, s8))) + intrinsics.unaligned_store((^u32x4)(dst[9:]), _byteswap_u32x4(intrinsics.simd_add(x9_0, s9))) + intrinsics.unaligned_store((^u32x4)(dst[10:]), _byteswap_u32x4(intrinsics.simd_add(x10_0, s10))) + intrinsics.unaligned_store((^u32x4)(dst[11:]), _byteswap_u32x4(intrinsics.simd_add(x11_0, s11))) + intrinsics.unaligned_store((^u32x4)(dst[12:]), _byteswap_u32x4(x12_0)) + intrinsics.unaligned_store((^u32x4)(dst[13:]), _byteswap_u32x4(intrinsics.simd_add(x13_0, s13))) + intrinsics.unaligned_store((^u32x4)(dst[14:]), _byteswap_u32x4(intrinsics.simd_add(x14_0, s14))) + intrinsics.unaligned_store((^u32x4)(dst[15:]), 
_byteswap_u32x4(intrinsics.simd_add(x15_0, s15))) + + intrinsics.unaligned_store((^u32x4)(dst[16:]), _byteswap_u32x4(x0_1)) + intrinsics.unaligned_store((^u32x4)(dst[17:]), _byteswap_u32x4(x1_1)) + intrinsics.unaligned_store((^u32x4)(dst[18:]), _byteswap_u32x4(x2_1)) + intrinsics.unaligned_store((^u32x4)(dst[19:]), _byteswap_u32x4(x3_1)) + intrinsics.unaligned_store((^u32x4)(dst[20:]), _byteswap_u32x4(intrinsics.simd_add(x4_1, s4))) + intrinsics.unaligned_store((^u32x4)(dst[21:]), _byteswap_u32x4(intrinsics.simd_add(x5_1, s5))) + intrinsics.unaligned_store((^u32x4)(dst[22:]), _byteswap_u32x4(intrinsics.simd_add(x6_1, s6))) + intrinsics.unaligned_store((^u32x4)(dst[23:]), _byteswap_u32x4(intrinsics.simd_add(x7_1, s7))) + intrinsics.unaligned_store((^u32x4)(dst[24:]), _byteswap_u32x4(intrinsics.simd_add(x8_1, s8))) + intrinsics.unaligned_store((^u32x4)(dst[25:]), _byteswap_u32x4(intrinsics.simd_add(x9_1, s9))) + intrinsics.unaligned_store((^u32x4)(dst[26:]), _byteswap_u32x4(intrinsics.simd_add(x10_1, s10))) + intrinsics.unaligned_store((^u32x4)(dst[27:]), _byteswap_u32x4(intrinsics.simd_add(x11_1, s11))) + intrinsics.unaligned_store((^u32x4)(dst[28:]), _byteswap_u32x4(x12_1)) + intrinsics.unaligned_store((^u32x4)(dst[29:]), _byteswap_u32x4(intrinsics.simd_add(x13_1, s13))) + intrinsics.unaligned_store((^u32x4)(dst[30:]), _byteswap_u32x4(intrinsics.simd_add(x14_1, s14))) + intrinsics.unaligned_store((^u32x4)(dst[31:]), _byteswap_u32x4(intrinsics.simd_add(x15_1, s15))) + } + + s12 = intrinsics.simd_add(s12, _CTR_INC_8) + + dst = dst[32:] + } + } else { + for _ in 0..<4 { + x0, x1, x2, x3 := S0, S1, S2, S3 + x4, x5, x6, x7 := s4, s5, s6, s7 + x8, x9, x10, x11 := s8, s9, s10, s11 + x12, x13, x14, x15 := s12, s13, s14, s15 + + for i := CHACHA_ROUNDS; i > 0; i = i - 2 { + x0, x4, x8, x12 = quarter_round(x0, x4, x8, x12) + x1, x5, x9, x13 = quarter_round(x1, x5, x9, x13) + x2, x6, x10, x14 = quarter_round(x2, x6, x10, x14) + x3, x7, x11, x15 = quarter_round(x3, x7, x11, x15) + + x0, x5, x10, x15 = quarter_round(x0, x5, x10, x15) + x1, x6, x11, x12 = quarter_round(x1, x6, x11, x12) + x2, x7, x8, x13 = quarter_round(x2, x7, x8, x13) + x3, x4, x9, x14 = quarter_round(x3, x4, x9, x14) + } + + when ODIN_ENDIAN == .Little { + intrinsics.unaligned_store((^u32x4)(dst[0:]), x0) + intrinsics.unaligned_store((^u32x4)(dst[1:]), x1) + intrinsics.unaligned_store((^u32x4)(dst[2:]), x2) + intrinsics.unaligned_store((^u32x4)(dst[3:]), x3) + intrinsics.unaligned_store((^u32x4)(dst[4:]), intrinsics.simd_add(x4, s4)) + intrinsics.unaligned_store((^u32x4)(dst[5:]), intrinsics.simd_add(x5, s5)) + intrinsics.unaligned_store((^u32x4)(dst[6:]), intrinsics.simd_add(x6, s6)) + intrinsics.unaligned_store((^u32x4)(dst[7:]), intrinsics.simd_add(x7, s7)) + intrinsics.unaligned_store((^u32x4)(dst[8:]), intrinsics.simd_add(x8, s8)) + intrinsics.unaligned_store((^u32x4)(dst[9:]), intrinsics.simd_add(x9, s9)) + intrinsics.unaligned_store((^u32x4)(dst[10:]), intrinsics.simd_add(x10, s10)) + intrinsics.unaligned_store((^u32x4)(dst[11:]), intrinsics.simd_add(x11, s11)) + intrinsics.unaligned_store((^u32x4)(dst[12:]), x12) + intrinsics.unaligned_store((^u32x4)(dst[13:]), intrinsics.simd_add(x13, s13)) + intrinsics.unaligned_store((^u32x4)(dst[14:]), intrinsics.simd_add(x14, s14)) + intrinsics.unaligned_store((^u32x4)(dst[15:]), intrinsics.simd_add(x15, s15)) + } else { + intrinsics.unaligned_store((^u32x4)(dst[0:]), _byteswap_u32x4(x0)) + intrinsics.unaligned_store((^u32x4)(dst[1:]), _byteswap_u32x4(x1)) + 
intrinsics.unaligned_store((^u32x4)(dst[2:]), _byteswap_u32x4(x2)) + intrinsics.unaligned_store((^u32x4)(dst[3:]), _byteswap_u32x4(x3)) + intrinsics.unaligned_store((^u32x4)(dst[4:]), _byteswap_u32x4(intrinsics.simd_add(x4, s4))) + intrinsics.unaligned_store((^u32x4)(dst[5:]), _byteswap_u32x4(intrinsics.simd_add(x5, s5))) + intrinsics.unaligned_store((^u32x4)(dst[6:]), _byteswap_u32x4(intrinsics.simd_add(x6, s6))) + intrinsics.unaligned_store((^u32x4)(dst[7:]), _byteswap_u32x4(intrinsics.simd_add(x7, s7))) + intrinsics.unaligned_store((^u32x4)(dst[8:]), _byteswap_u32x4(intrinsics.simd_add(x8, s8))) + intrinsics.unaligned_store((^u32x4)(dst[9:]), _byteswap_u32x4(intrinsics.simd_add(x9, s9))) + intrinsics.unaligned_store((^u32x4)(dst[10:]), _byteswap_u32x4(intrinsics.simd_add(x10, s10))) + intrinsics.unaligned_store((^u32x4)(dst[11:]), _byteswap_u32x4(intrinsics.simd_add(x11, s11))) + intrinsics.unaligned_store((^u32x4)(dst[12:]), _byteswap_u32x4(x12)) + intrinsics.unaligned_store((^u32x4)(dst[13:]), _byteswap_u32x4(intrinsics.simd_add(x13, s13))) + intrinsics.unaligned_store((^u32x4)(dst[14:]), _byteswap_u32x4(intrinsics.simd_add(x14, s14))) + intrinsics.unaligned_store((^u32x4)(dst[15:]), _byteswap_u32x4(intrinsics.simd_add(x15, s15))) + } + + s12 = intrinsics.simd_add(s12, _CTR_INC_4) + + dst = dst[16:] + } + } +} diff --git a/base/runtime/random_generator_chacha8_simd256.odin b/base/runtime/random_generator_chacha8_simd256.odin new file mode 100644 index 000000000..c0985f456 --- /dev/null +++ b/base/runtime/random_generator_chacha8_simd256.odin @@ -0,0 +1,197 @@ +#+build amd64 +package runtime + +import "base:intrinsics" + +#assert(ODIN_ENDIAN == .Little) + +@(private = "file") +u32x8 :: #simd[8]u32 +@(private = "file") +u32x4 :: #simd[4]u32 + +@(private = "file") +S0: u32x8 : { + CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0, + CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0, +} +@(private = "file") +S1: u32x8 : { + CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1, + CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1, +} +@(private = "file") +S2: u32x8 : { + CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2, + CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2, +} +@(private = "file") +S3: u32x8 : { + CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3, + CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3, +} + +@(private = "file") +_ROT_7L: u32x8 : {7, 7, 7, 7, 7, 7, 7, 7} +@(private = "file") +_ROT_7R: u32x8 : {25, 25, 25, 25, 25, 25, 25, 25} +@(private = "file") +_ROT_12L: u32x8 : {12, 12, 12, 12, 12, 12, 12, 12} +@(private = "file") +_ROT_12R: u32x8 : {20, 20, 20, 20, 20, 20, 20, 20} +@(private = "file") +_ROT_8L: u32x8 : {8, 8, 8, 8, 8, 8, 8, 8} +@(private = "file") +_ROT_8R: u32x8 : {24, 24, 24, 24, 24, 24, 24, 24} +@(private = "file") +_ROT_16: u32x8 : {16, 16, 16, 16, 16, 16, 16, 16} +@(private = "file") +_CTR_INC_8: u32x8 : {8, 8, 8, 8, 8, 8, 8, 8} + +// To the best of my knowledge this is only really useful on +// modern x86-64 as most ARM silicon is missing support for SVE2. + +@(private, enable_target_feature = "avx,avx2") +chacha8rand_refill_simd256 :: proc(r: ^Default_Random_State) { + // Initialize the base state. + k: [^]u32 = (^u32)(raw_data(r._buf[RNG_OUTPUT_PER_ITER:])) + s4_ := k[0] + s5_ := k[1] + s6_ := k[2] + s7_ := k[3] + s8_ := k[4] + s9_ := k[5] + s10_ := k[6] + s11_ := k[7] + + // 8-lane ChaCha8. 
+ s4 := u32x8{s4_, s4_, s4_, s4_, s4_, s4_, s4_, s4_} + s5 := u32x8{s5_, s5_, s5_, s5_, s5_, s5_, s5_, s5_} + s6 := u32x8{s6_, s6_, s6_, s6_, s6_, s6_, s6_, s6_} + s7 := u32x8{s7_, s7_, s7_, s7_, s7_, s7_, s7_, s7_} + s8 := u32x8{s8_, s8_, s8_, s8_, s8_, s8_, s8_, s8_} + s9 := u32x8{s9_, s9_, s9_, s9_, s9_, s9_, s9_, s9_} + s10 := u32x8{s10_, s10_, s10_, s10_, s10_, s10_, s10_, s10_} + s11 := u32x8{s11_, s11_, s11_, s11_, s11_, s11_, s11_, s11_} + s12 := u32x8{0, 1, 2, 3, 4, 5, 6, 7} + s13, s14, s15: u32x8 + + u32x4 :: #simd[4]u32 + dst: [^]u32x4 = (^u32x4)(raw_data(r._buf[:])) + + quarter_round := #force_inline proc "contextless" (a, b, c, d: u32x8) -> (u32x8, u32x8, u32x8, u32x8) { + a, b, c, d := a, b, c, d + + a = intrinsics.simd_add(a, b) + d = intrinsics.simd_bit_xor(d, a) + d = intrinsics.simd_bit_xor(intrinsics.simd_shl(d, _ROT_16), intrinsics.simd_shr(d, _ROT_16)) + + c = intrinsics.simd_add(c, d) + b = intrinsics.simd_bit_xor(b, c) + b = intrinsics.simd_bit_xor(intrinsics.simd_shl(b, _ROT_12L), intrinsics.simd_shr(b, _ROT_12R)) + + a = intrinsics.simd_add(a, b) + d = intrinsics.simd_bit_xor(d, a) + d = intrinsics.simd_bit_xor(intrinsics.simd_shl(d, _ROT_8L), intrinsics.simd_shr(d, _ROT_8R)) + + c = intrinsics.simd_add(c, d) + b = intrinsics.simd_bit_xor(b, c) + b = intrinsics.simd_bit_xor(intrinsics.simd_shl(b, _ROT_7L), intrinsics.simd_shr(b, _ROT_7R)) + + return a, b, c, d + } + + for _ in 0..<2 { + x0, x1, x2, x3 := S0, S1, S2, S3 + x4, x5, x6, x7 := s4, s5, s6, s7 + x8, x9, x10, x11 := s8, s9, s10, s11 + x12, x13, x14, x15 := s12, s13, s14, s15 + + for i := CHACHA_ROUNDS; i > 0; i = i - 2 { + x0, x4, x8, x12 = quarter_round(x0, x4, x8, x12) + x1, x5, x9, x13 = quarter_round(x1, x5, x9, x13) + x2, x6, x10, x14 = quarter_round(x2, x6, x10, x14) + x3, x7, x11, x15 = quarter_round(x3, x7, x11, x15) + + x0, x5, x10, x15 = quarter_round(x0, x5, x10, x15) + x1, x6, x11, x12 = quarter_round(x1, x6, x11, x12) + x2, x7, x8, x13 = quarter_round(x2, x7, x8, x13) + x3, x4, x9, x14 = quarter_round(x3, x4, x9, x14) + } + + x4 = intrinsics.simd_add(x4, s4) + x5 = intrinsics.simd_add(x5, s5) + x6 = intrinsics.simd_add(x6, s6) + x7 = intrinsics.simd_add(x7, s7) + x8 = intrinsics.simd_add(x8, s8) + x9 = intrinsics.simd_add(x9, s9) + x10 = intrinsics.simd_add(x10, s10) + x11 = intrinsics.simd_add(x11, s11) + x13 = intrinsics.simd_add(x13, s13) + x14 = intrinsics.simd_add(x14, s14) + x15 = intrinsics.simd_add(x15, s15) + + // Ok, now we have x0->x15 with 8 lanes, but we need to + // output the first 4 blocks, then the second 4 blocks. + // + // LLVM appears not to consider "this instruction is totally + // awful on the given microarchitecture", which leads to + // `VPCOMPRESSD` being generated iff AVX-512 support is + // enabled for `intrinsics.simd_masked_compress_store`. + // On Zen 4, this leads to a 50% performance regression vs + // the 128-bit SIMD code. + // + // The fake intrinsic (because LLVM doesn't appear to have + // an amd64-specific one) doesn't generate `VEXTRACTI128`, + // but instead does cleverness without horrible regressions. 
+ + intrinsics.unaligned_store((^u32x4)(dst[0:]), _mm_mm256_extracti128_si256(x0, 0)) + intrinsics.unaligned_store((^u32x4)(dst[1:]), _mm_mm256_extracti128_si256(x1, 0)) + intrinsics.unaligned_store((^u32x4)(dst[2:]), _mm_mm256_extracti128_si256(x2, 0)) + intrinsics.unaligned_store((^u32x4)(dst[3:]), _mm_mm256_extracti128_si256(x3, 0)) + intrinsics.unaligned_store((^u32x4)(dst[4:]), _mm_mm256_extracti128_si256(x4, 0)) + intrinsics.unaligned_store((^u32x4)(dst[5:]), _mm_mm256_extracti128_si256(x5, 0)) + intrinsics.unaligned_store((^u32x4)(dst[6:]), _mm_mm256_extracti128_si256(x6, 0)) + intrinsics.unaligned_store((^u32x4)(dst[7:]), _mm_mm256_extracti128_si256(x7, 0)) + intrinsics.unaligned_store((^u32x4)(dst[8:]), _mm_mm256_extracti128_si256(x8, 0)) + intrinsics.unaligned_store((^u32x4)(dst[9:]), _mm_mm256_extracti128_si256(x9, 0)) + intrinsics.unaligned_store((^u32x4)(dst[10:]), _mm_mm256_extracti128_si256(x10, 0)) + intrinsics.unaligned_store((^u32x4)(dst[11:]), _mm_mm256_extracti128_si256(x11, 0)) + intrinsics.unaligned_store((^u32x4)(dst[12:]), _mm_mm256_extracti128_si256(x12, 0)) + intrinsics.unaligned_store((^u32x4)(dst[13:]), _mm_mm256_extracti128_si256(x13, 0)) + intrinsics.unaligned_store((^u32x4)(dst[14:]), _mm_mm256_extracti128_si256(x14, 0)) + intrinsics.unaligned_store((^u32x4)(dst[15:]), _mm_mm256_extracti128_si256(x15, 0)) + + intrinsics.unaligned_store((^u32x4)(dst[16:]), _mm_mm256_extracti128_si256(x0, 1)) + intrinsics.unaligned_store((^u32x4)(dst[17:]), _mm_mm256_extracti128_si256(x1, 1)) + intrinsics.unaligned_store((^u32x4)(dst[18:]), _mm_mm256_extracti128_si256(x2, 1)) + intrinsics.unaligned_store((^u32x4)(dst[19:]), _mm_mm256_extracti128_si256(x3, 1)) + intrinsics.unaligned_store((^u32x4)(dst[20:]), _mm_mm256_extracti128_si256(x4, 1)) + intrinsics.unaligned_store((^u32x4)(dst[21:]), _mm_mm256_extracti128_si256(x5, 1)) + intrinsics.unaligned_store((^u32x4)(dst[22:]), _mm_mm256_extracti128_si256(x6, 1)) + intrinsics.unaligned_store((^u32x4)(dst[23:]), _mm_mm256_extracti128_si256(x7, 1)) + intrinsics.unaligned_store((^u32x4)(dst[24:]), _mm_mm256_extracti128_si256(x8, 1)) + intrinsics.unaligned_store((^u32x4)(dst[25:]), _mm_mm256_extracti128_si256(x9, 1)) + intrinsics.unaligned_store((^u32x4)(dst[26:]), _mm_mm256_extracti128_si256(x10, 1)) + intrinsics.unaligned_store((^u32x4)(dst[27:]), _mm_mm256_extracti128_si256(x11, 1)) + intrinsics.unaligned_store((^u32x4)(dst[28:]), _mm_mm256_extracti128_si256(x12, 1)) + intrinsics.unaligned_store((^u32x4)(dst[29:]), _mm_mm256_extracti128_si256(x13, 1)) + intrinsics.unaligned_store((^u32x4)(dst[30:]), _mm_mm256_extracti128_si256(x14, 1)) + intrinsics.unaligned_store((^u32x4)(dst[31:]), _mm_mm256_extracti128_si256(x15, 1)) + + s12 = intrinsics.simd_add(s12, _CTR_INC_8) + + dst = dst[32:] + } +} + +@(private = "file", require_results, enable_target_feature="avx2") +_mm_mm256_extracti128_si256 :: #force_inline proc "c" (a: u32x8, $OFFSET: int) -> u32x4 { + when OFFSET == 0 { + return intrinsics.simd_shuffle(a, a, 0, 1, 2, 3) + } else when OFFSET == 1 { + return intrinsics.simd_shuffle(a, a, 4, 5, 6, 7) + } else { + #panic("chacha8rand: invalid offset") + } +} |
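A back-of-the-envelope check of the layout described in the random_generator_chacha8.odin header comment above: one refill produces 4 groups of 4 interleaved 64-byte ChaCha8 blocks, and the final 32 bytes of that 1024-byte buffer are reserved as the key for the next iteration, leaving 992 bytes of output. A compile-time sketch (standalone, with hypothetical constant names, not part of the commit):

```odin
package example

// Layout arithmetic from the ChaCha8Rand header comment: 16 blocks of
// 64 bytes per refill, with the 32-byte tail feeding the next key.
BLOCK_SIZE        :: 64
BLOCKS_PER_REFILL :: 4 * 4 // 4 groups x 4 blocks
BUF_SIZE          :: BLOCKS_PER_REFILL * BLOCK_SIZE
SEED_SIZE         :: 32
OUTPUT_PER_ITER   :: BUF_SIZE - SEED_SIZE

#assert(BUF_SIZE == 1024)
#assert(OUTPUT_PER_ITER == 992)
```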
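The SIMD quarter rounds above rotate each lane by pairing a left shift with the complementary right shift and combining the halves with `simd_bit_xor`; XOR and OR are interchangeable here because the two shifted values have no overlapping set bits. A sketch of the idiom in isolation (the helper name is hypothetical, not part of the commit):

```odin
package example

import "base:intrinsics"

u32x4 :: #simd[4]u32

// Rotate every 32-bit lane left by r: (v << r) ^ (v >> (32 - r)).
// The shifted halves are bit-disjoint, so XOR behaves like OR.
rotl_lanes :: #force_inline proc "contextless" (v: u32x4, r: u32) -> u32x4 {
	l := u32x4{r, r, r, r}
	s := u32x4{32 - r, 32 - r, 32 - r, 32 - r}
	return intrinsics.simd_bit_xor(intrinsics.simd_shl(v, l), intrinsics.simd_shr(v, s))
}
```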