aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJeroen van Rijn <Kelimion@users.noreply.github.com>2025-11-29 14:38:22 +0000
committerGitHub <noreply@github.com>2025-11-29 14:38:22 +0000
commit5db9afd73b8f54ca38f154ce299dd0256e46f33e (patch)
treeeb740665e295cf1d52eeedc21c7a6f06497aa4f0
parent4db4841413095645e2319afcafc1db7276259f9d (diff)
parent3e8e0bb110d9a12e7c2a812b7909297cf3f6ade8 (diff)
Merge pull request #5963 from Yawning/feature/chacha8rand
runtime: Use chacha8rand as the default RNG (BREAKING)
-rw-r--r--base/runtime/os_specific.odin10
-rw-r--r--base/runtime/os_specific_bsd.odin8
-rw-r--r--base/runtime/os_specific_darwin.odin14
-rw-r--r--base/runtime/os_specific_freestanding.odin2
-rw-r--r--base/runtime/os_specific_haiku.odin8
-rw-r--r--base/runtime/os_specific_js.odin18
-rw-r--r--base/runtime/os_specific_linux.odin49
-rw-r--r--base/runtime/os_specific_orca.odin2
-rw-r--r--base/runtime/os_specific_wasi.odin11
-rw-r--r--base/runtime/os_specific_windows.odin33
-rw-r--r--base/runtime/random_generator.odin85
-rw-r--r--base/runtime/random_generator_chacha8.odin164
-rw-r--r--base/runtime/random_generator_chacha8_ref.odin145
-rw-r--r--base/runtime/random_generator_chacha8_simd128.odin290
-rw-r--r--base/runtime/random_generator_chacha8_simd256.odin197
-rw-r--r--core/crypto/crypto.odin6
-rw-r--r--core/crypto/rand_bsd.odin15
-rw-r--r--core/crypto/rand_darwin.odin17
-rw-r--r--core/crypto/rand_generic.odin16
-rw-r--r--core/crypto/rand_js.odin24
-rw-r--r--core/crypto/rand_linux.odin40
-rw-r--r--core/crypto/rand_wasi.odin13
-rw-r--r--core/crypto/rand_windows.odin26
-rw-r--r--core/math/rand/rand.odin45
-rw-r--r--core/math/rand/rand_pcg.odin107
-rw-r--r--core/math/rand/rand_xoshiro256.odin123
-rw-r--r--core/testing/runner.odin4
-rw-r--r--examples/demo/demo.odin5
-rw-r--r--tests/benchmark/all.odin1
-rw-r--r--tests/benchmark/math/benchmark_rand.odin130
-rw-r--r--tests/core/math/rand/test_core_math_rand.odin120
-rw-r--r--tests/internal/test_chacha8rand.odin151
32 files changed, 1632 insertions, 247 deletions
diff --git a/base/runtime/os_specific.odin b/base/runtime/os_specific.odin
index b6c1288d0..16e7e4751 100644
--- a/base/runtime/os_specific.odin
+++ b/base/runtime/os_specific.odin
@@ -2,10 +2,20 @@ package runtime
_OS_Errno :: distinct int
+HAS_RAND_BYTES :: _HAS_RAND_BYTES
+
stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
return _stderr_write(data)
}
+rand_bytes :: proc "contextless" (dst: []byte) {
+ when HAS_RAND_BYTES {
+ _rand_bytes(dst)
+ } else {
+ panic_contextless("base/runtime: no runtime entropy source")
+ }
+}
+
exit :: proc "contextless" (code: int) -> ! {
_exit(code)
} \ No newline at end of file
diff --git a/base/runtime/os_specific_bsd.odin b/base/runtime/os_specific_bsd.odin
index de300f1e0..ab8eabb6c 100644
--- a/base/runtime/os_specific_bsd.odin
+++ b/base/runtime/os_specific_bsd.odin
@@ -4,6 +4,8 @@ package runtime
foreign import libc "system:c"
+_HAS_RAND_BYTES :: true
+
@(default_calling_convention="c")
foreign libc {
@(link_name="write")
@@ -14,6 +16,8 @@ foreign libc {
} else {
__error :: proc() -> ^i32 ---
}
+
+ arc4random_buf :: proc(buf: [^]byte, nbytes: uint) ---
}
_stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
@@ -25,6 +29,10 @@ _stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
return int(ret), 0
}
+_rand_bytes :: proc "contextless" (dst: []byte) {
+ arc4random_buf(raw_data(dst), len(dst))
+}
+
_exit :: proc "contextless" (code: int) -> ! {
@(default_calling_convention="c")
foreign libc {
diff --git a/base/runtime/os_specific_darwin.odin b/base/runtime/os_specific_darwin.odin
index 37315240f..576725a1c 100644
--- a/base/runtime/os_specific_darwin.odin
+++ b/base/runtime/os_specific_darwin.odin
@@ -4,6 +4,8 @@ package runtime
import "base:intrinsics"
+_HAS_RAND_BYTES :: true
+
_stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
STDERR :: 2
when ODIN_NO_CRT {
@@ -29,6 +31,18 @@ _stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
foreign import libc "system:System"
+_rand_bytes :: proc "contextless" (dst: []byte) {
+ // This process used to use Security/RandomCopyBytes, however
+ // on every version of MacOS (>= 10.12) that we care about,
+ // arc4random is implemented securely.
+
+ @(default_calling_convention="c")
+ foreign libc {
+ arc4random_buf :: proc(buf: [^]byte, nbytes: uint) ---
+ }
+ arc4random_buf(raw_data(dst), len(dst))
+}
+
_exit :: proc "contextless" (code: int) -> ! {
@(default_calling_convention="c")
foreign libc {
diff --git a/base/runtime/os_specific_freestanding.odin b/base/runtime/os_specific_freestanding.odin
index b5a5fb146..3b2b5a714 100644
--- a/base/runtime/os_specific_freestanding.odin
+++ b/base/runtime/os_specific_freestanding.odin
@@ -2,6 +2,8 @@
#+private
package runtime
+_HAS_RAND_BYTES :: false
+
// TODO(bill): reimplement `os.write`
_stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
return 0, -1
diff --git a/base/runtime/os_specific_haiku.odin b/base/runtime/os_specific_haiku.odin
index 74ff58cde..7e53539a1 100644
--- a/base/runtime/os_specific_haiku.odin
+++ b/base/runtime/os_specific_haiku.odin
@@ -4,11 +4,15 @@ package runtime
foreign import libc "system:c"
+_HAS_RAND_BYTES :: true
+
foreign libc {
@(link_name="write")
_unix_write :: proc(fd: i32, buf: rawptr, size: int) -> int ---
_errnop :: proc() -> ^i32 ---
+
+ arc4random_buf :: proc(buf: [^]byte, nbytes: uint) ---
}
_stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
@@ -20,7 +24,9 @@ _stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
return int(ret), 0
}
-
+_rand_bytes :: proc "contextless" (dst: []byte) {
+ arc4random_buf(raw_data(dst), len(dst))
+}
_exit :: proc "contextless" (code: int) -> ! {
trap()
diff --git a/base/runtime/os_specific_js.odin b/base/runtime/os_specific_js.odin
index bd88b1871..8676f3a6e 100644
--- a/base/runtime/os_specific_js.odin
+++ b/base/runtime/os_specific_js.odin
@@ -4,6 +4,8 @@ package runtime
foreign import "odin_env"
+_HAS_RAND_BYTES :: true
+
_stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
foreign odin_env {
write :: proc "contextless" (fd: u32, p: []byte) ---
@@ -12,6 +14,22 @@ _stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
return len(data), 0
}
+_rand_bytes :: proc "contextless" (dst: []byte) {
+ foreign odin_env {
+ @(link_name = "rand_bytes")
+ env_rand_bytes :: proc "contextless" (buf: []byte) ---
+ }
+
+ MAX_PER_CALL_BYTES :: 65536 // 64kiB
+
+ dst := dst
+ for len(dst) > 0 {
+ to_read := min(len(dst), MAX_PER_CALL_BYTES)
+ env_rand_bytes(dst[:to_read])
+
+ dst = dst[to_read:]
+ }
+}
_exit :: proc "contextless" (code: int) -> ! {
trap()
diff --git a/base/runtime/os_specific_linux.odin b/base/runtime/os_specific_linux.odin
index dfe3c8841..1abcc03e5 100644
--- a/base/runtime/os_specific_linux.odin
+++ b/base/runtime/os_specific_linux.odin
@@ -3,6 +3,8 @@ package runtime
import "base:intrinsics"
+_HAS_RAND_BYTES :: true
+
_stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
when ODIN_ARCH == .amd64 {
SYS_write :: uintptr(1)
@@ -25,6 +27,53 @@ _stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
return ret, 0
}
+_rand_bytes :: proc "contextless" (dst: []byte) {
+ when ODIN_ARCH == .amd64 {
+ SYS_getrandom :: uintptr(318)
+ } else when ODIN_ARCH == .arm64 {
+ SYS_getrandom :: uintptr(278)
+ } else when ODIN_ARCH == .i386 {
+ SYS_getrandom :: uintptr(355)
+ } else when ODIN_ARCH == .arm32 {
+ SYS_getrandom :: uintptr(384)
+ } else when ODIN_ARCH == .riscv64 {
+ SYS_getrandom :: uintptr(278)
+ } else {
+ #panic("base/runtime: no SYS_getrandom definition for target")
+ }
+
+ ERR_EINTR :: 4
+ ERR_ENOSYS :: 38
+
+ MAX_PER_CALL_BYTES :: 33554431 // 2^25 - 1
+
+ dst := dst
+ l := len(dst)
+
+ for l > 0 {
+ to_read := min(l, MAX_PER_CALL_BYTES)
+ ret := int(intrinsics.syscall(SYS_getrandom, uintptr(raw_data(dst[:to_read])), uintptr(to_read), uintptr(0)))
+ switch ret {
+ case -ERR_EINTR:
+ // Call interupted by a signal handler, just retry the
+ // request.
+ continue
+ case -ERR_ENOSYS:
+ // The kernel is apparently prehistoric (< 3.17 circa 2014)
+ // and does not support getrandom.
+ panic_contextless("base/runtime: getrandom not available in kernel")
+ case:
+ if ret < 0 {
+ // All other failures are things that should NEVER happen
+ // unless the kernel interface changes (ie: the Linux
+ // developers break userland).
+ panic_contextless("base/runtime: getrandom failed")
+ }
+ }
+ l -= ret
+ dst = dst[ret:]
+ }
+}
_exit :: proc "contextless" (code: int) -> ! {
SYS_exit_group ::
diff --git a/base/runtime/os_specific_orca.odin b/base/runtime/os_specific_orca.odin
index 491edcfa4..f5ce50411 100644
--- a/base/runtime/os_specific_orca.odin
+++ b/base/runtime/os_specific_orca.odin
@@ -4,6 +4,8 @@ package runtime
import "base:intrinsics"
+_HAS_RAND_BYTES :: false
+
// Constants allowing to specify the level of logging verbosity.
log_level :: enum u32 {
// Only errors are logged.
diff --git a/base/runtime/os_specific_wasi.odin b/base/runtime/os_specific_wasi.odin
index 194034865..c5e94653a 100644
--- a/base/runtime/os_specific_wasi.odin
+++ b/base/runtime/os_specific_wasi.odin
@@ -4,6 +4,8 @@ package runtime
foreign import wasi "wasi_snapshot_preview1"
+_HAS_RAND_BYTES :: true
+
@(default_calling_convention="contextless")
foreign wasi {
fd_write :: proc(
@@ -26,6 +28,9 @@ foreign wasi {
@(private="file")
proc_exit :: proc(rval: u32) -> ! ---
+
+ @(private ="file")
+ random_get :: proc(buf: []u8) -> u16 ---
}
_stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
@@ -34,6 +39,12 @@ _stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
return int(n), _OS_Errno(err)
}
+_rand_bytes :: proc "contextless" (dst: []byte) {
+ if errno := random_get(dst); errno != 0 {
+ panic_contextless("base/runtime: wasi.random_get failed")
+ }
+}
+
_wasi_setup_args :: proc() {
num_of_args, size_of_args: uint
if errno := args_sizes_get(&num_of_args, &size_of_args); errno != 0 {
diff --git a/base/runtime/os_specific_windows.odin b/base/runtime/os_specific_windows.odin
index c5ca1e4c5..d938e87ea 100644
--- a/base/runtime/os_specific_windows.odin
+++ b/base/runtime/os_specific_windows.odin
@@ -2,8 +2,11 @@
#+private
package runtime
+foreign import bcrypt "system:Bcrypt.lib"
foreign import kernel32 "system:Kernel32.lib"
+_HAS_RAND_BYTES :: true
+
@(private="file")
@(default_calling_convention="system")
foreign kernel32 {
@@ -18,6 +21,12 @@ foreign kernel32 {
ExitProcess :: proc(code: u32) -> ! ---
}
+@(private="file")
+@(default_calling_convention="system")
+foreign bcrypt {
+ BCryptGenRandom :: proc(hAlgorithm: rawptr, pBuffer: [^]u8, cbBuffer: u32, dwFlags: u32) -> i32 ---
+}
+
_stderr_write :: proc "contextless" (data: []byte) -> (n: int, err: _OS_Errno) #no_bounds_check {
if len(data) == 0 {
return 0, 0
@@ -52,6 +61,30 @@ _stderr_write :: proc "contextless" (data: []byte) -> (n: int, err: _OS_Errno) #
return
}
+_rand_bytes :: proc "contextless" (dst: []byte) {
+ ensure_contextless(u64(len(dst)) <= u64(max(u32)), "base/runtime: oversized rand_bytes request")
+
+ BCRYPT_USE_SYSTEM_PREFERRED_RNG :: 0x00000002
+
+ ERROR_INVALID_HANDLE :: 6
+ ERROR_INVALID_PARAMETER :: 87
+
+ ret := BCryptGenRandom(nil, raw_data(dst), u32(len(dst)), BCRYPT_USE_SYSTEM_PREFERRED_RNG)
+ switch ret {
+ case 0:
+ case ERROR_INVALID_HANDLE:
+ // The handle to the first parameter is invalid.
+ // This should not happen here, since we explicitly pass nil to it
+ panic_contextless("base/runtime: BCryptGenRandom Invalid handle for hAlgorithm")
+ case ERROR_INVALID_PARAMETER:
+ // One of the parameters was invalid
+ panic_contextless("base/runtime: BCryptGenRandom Invalid parameter")
+ case:
+ // Unknown error
+ panic_contextless("base/runtime: BCryptGenRandom failed")
+ }
+}
+
_exit :: proc "contextless" (code: int) -> ! {
ExitProcess(u32(code))
} \ No newline at end of file
diff --git a/base/runtime/random_generator.odin b/base/runtime/random_generator.odin
index ca5c008d0..7d873fe33 100644
--- a/base/runtime/random_generator.odin
+++ b/base/runtime/random_generator.odin
@@ -41,88 +41,3 @@ random_generator_reset_u64 :: proc(rg: Random_Generator, p: u64) {
rg.procedure(rg.data, .Reset, ([^]byte)(&p)[:size_of(p)])
}
}
-
-
-Default_Random_State :: struct {
- state: u64,
- inc: u64,
-}
-
-default_random_generator_proc :: proc(data: rawptr, mode: Random_Generator_Mode, p: []byte) {
- @(require_results)
- read_u64 :: proc "contextless" (r: ^Default_Random_State) -> u64 {
- old_state := r.state
- r.state = old_state * 6364136223846793005 + (r.inc|1)
- xor_shifted := (((old_state >> 59) + 5) ~ old_state) * 12605985483714917081
- rot := (old_state >> 59)
- return (xor_shifted >> rot) | (xor_shifted << ((-rot) & 63))
- }
-
- @(thread_local)
- global_rand_seed: Default_Random_State
-
- init :: proc "contextless" (r: ^Default_Random_State, seed: u64) {
- seed := seed
- if seed == 0 {
- seed = u64(intrinsics.read_cycle_counter())
- }
- r.state = 0
- r.inc = (seed << 1) | 1
- _ = read_u64(r)
- r.state += seed
- _ = read_u64(r)
- }
-
- r: ^Default_Random_State = ---
- if data == nil {
- r = &global_rand_seed
- } else {
- r = cast(^Default_Random_State)data
- }
-
- switch mode {
- case .Read:
- if r.state == 0 && r.inc == 0 {
- init(r, 0)
- }
-
- switch len(p) {
- case size_of(u64):
- // Fast path for a 64-bit destination.
- intrinsics.unaligned_store((^u64)(raw_data(p)), read_u64(r))
- case:
- // All other cases.
- pos := i8(0)
- val := u64(0)
- for &v in p {
- if pos == 0 {
- val = read_u64(r)
- pos = 8
- }
- v = byte(val)
- val >>= 8
- pos -= 1
- }
- }
-
- case .Reset:
- seed: u64
- mem_copy_non_overlapping(&seed, raw_data(p), min(size_of(seed), len(p)))
- init(r, seed)
-
- case .Query_Info:
- if len(p) != size_of(Random_Generator_Query_Info) {
- return
- }
- info := (^Random_Generator_Query_Info)(raw_data(p))
- info^ += {.Uniform, .Resettable}
- }
-}
-
-@(require_results)
-default_random_generator :: proc "contextless" (state: ^Default_Random_State = nil) -> Random_Generator {
- return {
- procedure = default_random_generator_proc,
- data = state,
- }
-} \ No newline at end of file
diff --git a/base/runtime/random_generator_chacha8.odin b/base/runtime/random_generator_chacha8.odin
new file mode 100644
index 000000000..02acb4e9f
--- /dev/null
+++ b/base/runtime/random_generator_chacha8.odin
@@ -0,0 +1,164 @@
+package runtime
+
+import "base:intrinsics"
+
+// This is an implementation of the Chacha8Rand DRBG, as specified
+// in https://github.com/C2SP/C2SP/blob/main/chacha8rand.md
+//
+// There is a tradeoff to be made between state-size and performance,
+// in terms of the amount of rng output buffered.
+//
+// The sensible buffer sizes are:
+// - 256-bytes: 128-bit SIMD with 16x vector registers (SSE2)
+// - 512-bytes: 128-bit SIMD with 32x vector registers (ARMv8),
+// 256-bit SIMD with 16x vector registers (AVX2),
+// - 1024-bytes: AVX-512
+//
+// Notes:
+// - Smaller than 256-bytes is possible but would require redundant
+// calls to the ChaCha8 function, which is prohibitively expensive.
+// - Larger than 1024-bytes is possible but pointless as the construct
+// is defined around 992-bytes of RNG output and 32-bytes of input
+// per iteration.
+//
+// This implementation opts for a 1024-byte buffer for simplicity,
+// under the rationale that modern extremely memory constrained targets
+// provide suitable functionality in hardware, and the language makes
+// supporting the various SIMD flavors easy.
+
+@(private = "file")
+RNG_SEED_SIZE :: 32
+@(private)
+RNG_OUTPUT_PER_ITER :: 1024 - RNG_SEED_SIZE
+
+@(private)
+CHACHA_SIGMA_0: u32 : 0x61707865
+@(private)
+CHACHA_SIGMA_1: u32 : 0x3320646e
+@(private)
+CHACHA_SIGMA_2: u32 : 0x79622d32
+@(private)
+CHACHA_SIGMA_3: u32 : 0x6b206574
+@(private)
+CHACHA_ROUNDS :: 8
+
+Default_Random_State :: struct {
+ _buf: [1024]byte,
+ _off: int,
+ _seeded: bool,
+}
+
+@(require_results)
+default_random_generator :: proc "contextless" (state: ^Default_Random_State = nil) -> Random_Generator {
+ return {
+ procedure = default_random_generator_proc,
+ data = state,
+ }
+}
+
+default_random_generator_proc :: proc(data: rawptr, mode: Random_Generator_Mode, p: []byte) {
+ @(thread_local)
+ state: Default_Random_State
+
+ r: ^Default_Random_State = &state
+ if data != nil {
+ r = cast(^Default_Random_State)data
+ }
+ next_seed := r._buf[RNG_OUTPUT_PER_ITER:]
+
+ switch mode {
+ case .Read:
+ if !r._seeded { // Unlikely.
+ rand_bytes(next_seed)
+ r._off = RNG_OUTPUT_PER_ITER // Force refill.
+ r._seeded = true
+ }
+
+ assert(r._off <= RNG_OUTPUT_PER_ITER, "chacha8rand/BUG: outputed key material")
+ if r._off >= RNG_OUTPUT_PER_ITER { // Unlikely.
+ chacha8rand_refill(r)
+ }
+
+ // We are guaranteed to have at least some RNG output buffered.
+ //
+ // As an invariant each read will consume a multiple of 8-bytes
+ // of output at a time.
+ assert(r._off <= RNG_OUTPUT_PER_ITER - 8, "chacha8rand/BUG: less than 8-bytes of output available")
+ assert(r._off % 8 == 0, "chacha8rand/BUG: buffered output is not a multiple of 8-bytes")
+
+ p_len := len(p)
+ if p_len == size_of(u64) {
+ #no_bounds_check {
+ // Fast path for a 64-bit destination.
+ src := (^u64)(raw_data(r._buf[r._off:]))
+ intrinsics.unaligned_store((^u64)(raw_data(p)), src^)
+ src^ = 0 // Erasure (backtrack resistance)
+ r._off += 8
+ }
+ return
+ }
+
+ p_ := p
+ for remaining := p_len; remaining > 0; {
+ sz := min(remaining, RNG_OUTPUT_PER_ITER - r._off)
+ #no_bounds_check {
+ copy(p_[:sz], r._buf[r._off:])
+ p_ = p_[sz:]
+ remaining -= sz
+ }
+ rounded_sz := ((sz + 7) / 8) * 8
+ new_off := r._off + rounded_sz
+ #no_bounds_check if new_off < RNG_OUTPUT_PER_ITER {
+ // Erasure (backtrack resistance)
+ intrinsics.mem_zero(raw_data(r._buf[r._off:]), rounded_sz)
+ r._off = new_off
+ } else {
+ // Can omit erasure since we are overwriting the entire
+ // buffer.
+ chacha8rand_refill(r)
+ }
+ }
+
+ case .Reset:
+ // If no seed is passed, the next call to .Read will attempt to
+ // reseed from the system entropy source.
+ if len(p) == 0 {
+ r._seeded = false
+ return
+ }
+
+ // The cryptographic security of the output depends entirely
+ // on the quality of the entropy in the seed, we will allow
+ // re-seeding (as it makes testing easier), but callers that
+ // decide to provide arbitrary seeds are on their own as far
+ // as ensuring high-quality entropy.
+ intrinsics.mem_zero(raw_data(next_seed), RNG_SEED_SIZE)
+ copy(next_seed, p)
+ r._seeded = true
+ r._off = RNG_OUTPUT_PER_ITER // Force a refill.
+
+ case .Query_Info:
+ if len(p) != size_of(Random_Generator_Query_Info) {
+ return
+ }
+ info := (^Random_Generator_Query_Info)(raw_data(p))
+ info^ += {.Uniform, .Cryptographic, .Resettable}
+ }
+}
+
+@(private = "file")
+chacha8rand_refill :: proc(r: ^Default_Random_State) {
+ assert(r._seeded == true, "chacha8rand/BUG: unseeded refill")
+
+ // i386 has insufficient vector registers to use the
+ // accelerated path at the moment.
+ when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+ chacha8rand_refill_simd256(r)
+ } else when HAS_HARDWARE_SIMD && ODIN_ARCH != .i386 {
+ chacha8rand_refill_simd128(r)
+ } else {
+ chacha8rand_refill_ref(r)
+ }
+
+ r._off = 0
+}
diff --git a/base/runtime/random_generator_chacha8_ref.odin b/base/runtime/random_generator_chacha8_ref.odin
new file mode 100644
index 000000000..b1e812c3f
--- /dev/null
+++ b/base/runtime/random_generator_chacha8_ref.odin
@@ -0,0 +1,145 @@
+package runtime
+
+import "base:intrinsics"
+
+@(private)
+chacha8rand_refill_ref :: proc(r: ^Default_Random_State) {
+ // Initialize the base state.
+ k: [^]u32 = (^u32)(raw_data(r._buf[RNG_OUTPUT_PER_ITER:]))
+ when ODIN_ENDIAN == .Little {
+ s4 := k[0]
+ s5 := k[1]
+ s6 := k[2]
+ s7 := k[3]
+ s8 := k[4]
+ s9 := k[5]
+ s10 := k[6]
+ s11 := k[7]
+ } else {
+ s4 := intrinsics.byte_swap(k[0])
+ s5 := intrinsics.byte_swap(k[1])
+ s6 := intrinsics.byte_swap(k[2])
+ s7 := intrinsics.byte_swap(k[3])
+ s8 := intrinsics.byte_swap(k[4])
+ s9 := intrinsics.byte_swap(k[5])
+ s10 := intrinsics.byte_swap(k[6])
+ s11 := intrinicss.byte_swap(k[7])
+ }
+ s12: u32 // Counter starts at 0.
+ s13, s14, s15: u32 // IV of all 0s.
+
+ dst: [^]u32 = (^u32)(raw_data(r._buf[:]))
+
+ // At least with LLVM21 force_inline produces identical perf to
+ // manual inlining, yay.
+ quarter_round := #force_inline proc "contextless" (a, b, c, d: u32) -> (u32, u32, u32, u32) {
+ a, b, c, d := a, b, c, d
+
+ a += b
+ d ~= a
+ d = rotl(d, 16)
+
+ c += d
+ b ~= c
+ b = rotl(b, 12)
+
+ a += b
+ d ~= a
+ d = rotl(d, 8)
+
+ c += d
+ b ~= c
+ b = rotl(b, 7)
+
+ return a, b, c, d
+ }
+
+ // Filippo Valsorda made an observation that only one of the column
+ // round depends on the counter (s12), so it is worth precomputing
+ // and reusing across multiple blocks. As far as I know, only Go's
+ // chacha implementation does this.
+
+ p1, p5, p9, p13 := quarter_round(CHACHA_SIGMA_1, s5, s9, s13)
+ p2, p6, p10, p14 := quarter_round(CHACHA_SIGMA_2, s6, s10, s14)
+ p3, p7, p11, p15 := quarter_round(CHACHA_SIGMA_3, s7, s11, s15)
+
+ // 4 groups
+ for g := 0; g < 4; g = g + 1 {
+ // 4 blocks per group
+ for n := 0; n < 4; n = n + 1 {
+ // First column round that depends on the counter
+ p0, p4, p8, p12 := quarter_round(CHACHA_SIGMA_0, s4, s8, s12)
+
+ // First diagonal round
+ x0, x5, x10, x15 := quarter_round(p0, p5, p10, p15)
+ x1, x6, x11, x12 := quarter_round(p1, p6, p11, p12)
+ x2, x7, x8, x13 := quarter_round(p2, p7, p8, p13)
+ x3, x4, x9, x14 := quarter_round(p3, p4, p9, p14)
+
+ for i := CHACHA_ROUNDS - 2; i > 0; i = i - 2 {
+ x0, x4, x8, x12 = quarter_round(x0, x4, x8, x12)
+ x1, x5, x9, x13 = quarter_round(x1, x5, x9, x13)
+ x2, x6, x10, x14 = quarter_round(x2, x6, x10, x14)
+ x3, x7, x11, x15 = quarter_round(x3, x7, x11, x15)
+
+ x0, x5, x10, x15 = quarter_round(x0, x5, x10, x15)
+ x1, x6, x11, x12 = quarter_round(x1, x6, x11, x12)
+ x2, x7, x8, x13 = quarter_round(x2, x7, x8, x13)
+ x3, x4, x9, x14 = quarter_round(x3, x4, x9, x14)
+ }
+
+ // Interleave 4 blocks
+ // NB: The additions of sigma and the counter are omitted
+ STRIDE :: 4
+ d_ := dst[n:]
+ when ODIN_ENDIAN == .Little {
+ d_[STRIDE*0] = x0
+ d_[STRIDE*1] = x1
+ d_[STRIDE*2] = x2
+ d_[STRIDE*3] = x3
+ d_[STRIDE*4] = x4 + s4
+ d_[STRIDE*5] = x5 + s5
+ d_[STRIDE*6] = x6 + s6
+ d_[STRIDE*7] = x7 + s7
+ d_[STRIDE*8] = x8 + s8
+ d_[STRIDE*9] = x9 + s9
+ d_[STRIDE*10] = x10 + s10
+ d_[STRIDE*11] = x11 + s11
+ d_[STRIDE*12] = x12
+ d_[STRIDE*13] = x13 + s13
+ d_[STRIDE*14] = x14 + s14
+ d_[STRIDE*15] = x15 + s15
+ } else {
+ d_[STRIDE*0] = intrinsics.byte_swap(x0)
+ d_[STRIDE*1] = intrinsics.byte_swap(x1)
+ d_[STRIDE*2] = intrinsics.byte_swap(x2)
+ d_[STRIDE*3] = intrinsics.byte_swap(x3)
+ d_[STRIDE*4] = intrinsics.byte_swap(x4 + s4)
+ d_[STRIDE*5] = intrinsics.byte_swap(x5 + s5)
+ d_[STRIDE*6] = intrinsics.byte_swap(x6 + s6)
+ d_[STRIDE*7] = intrinsics.byte_swap(x7 + s7)
+ d_[STRIDE*8] = intrinsics.byte_swap(x8 + s8)
+ d_[STRIDE*9] = intrinsics.byte_swap(x9 + s9)
+ d_[STRIDE*10] = intrinsics.byte_swap(x10 + s10)
+ d_[STRIDE*11] = intrinsics.byte_swap(x11 + s11)
+ d_[STRIDE*12] = intrinsics.byte_swap(x12)
+ d_[STRIDE*13] = intrinsics.byte_swap(x13 + s13)
+ d_[STRIDE*14] = intrinsics.byte_swap(x14 + s14)
+ d_[STRIDE*15] = intrinsics.byte_swap(x15 + s15)
+ }
+
+ s12 = s12 + 1 // Increment the counter
+ }
+
+ dst = dst[16*4:]
+ }
+}
+
+// This replicates `rotate_left32` from `core:math/bits`, under the
+// assumption that this will live in `base:runtime`.
+@(require_results, private = "file")
+rotl :: #force_inline proc "contextless" (x: u32, k: int) -> u32 {
+ n :: 32
+ s := uint(k) & (n-1)
+ return x << s | x >> (n-s)
+}
diff --git a/base/runtime/random_generator_chacha8_simd128.odin b/base/runtime/random_generator_chacha8_simd128.odin
new file mode 100644
index 000000000..d63d92620
--- /dev/null
+++ b/base/runtime/random_generator_chacha8_simd128.odin
@@ -0,0 +1,290 @@
+#+build !i386
+package runtime
+
+import "base:intrinsics"
+
+@(private = "file")
+u32x4 :: #simd[4]u32
+
+@(private = "file")
+S0: u32x4 : {CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0}
+@(private = "file")
+S1: u32x4 : {CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1}
+@(private = "file")
+S2: u32x4 : {CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2}
+@(private = "file")
+S3: u32x4 : {CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3}
+
+@(private = "file")
+_ROT_7L: u32x4 : {7, 7, 7, 7}
+@(private = "file")
+_ROT_7R: u32x4 : {25, 25, 25, 25}
+@(private = "file")
+_ROT_12L: u32x4 : {12, 12, 12, 12}
+@(private = "file")
+_ROT_12R: u32x4 : {20, 20, 20, 20}
+@(private = "file")
+_ROT_8L: u32x4 : {8, 8, 8, 8}
+@(private = "file")
+_ROT_8R: u32x4 : {24, 24, 24, 24}
+@(private = "file")
+_ROT_16: u32x4 : {16, 16, 16, 16}
+@(private = "file")
+_CTR_INC_4: u32x4 : {4, 4, 4, 4}
+@(private = "file")
+_CTR_INC_8: u32x4 : {8, 8, 8, 8}
+
+when ODIN_ENDIAN == .Big {
+ @(private = "file")
+ _byteswap_u32x4 :: #force_inline proc "contextless" (v: u32x4) -> u32x4 {
+ u8x16 :: #simd[16]u8
+ return(
+ transmute(u32x4)simd.shuffle(
+ transmute(u8x16)v,
+ transmute(u8x16)v,
+ 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
+ )
+ )
+ }
+}
+
+@(private)
+chacha8rand_refill_simd128 :: proc(r: ^Default_Random_State) {
+ // Initialize the base state.
+ k: [^]u32 = (^u32)(raw_data(r._buf[RNG_OUTPUT_PER_ITER:]))
+ when ODIN_ENDIAN == .Little {
+ s4_ := k[0]
+ s5_ := k[1]
+ s6_ := k[2]
+ s7_ := k[3]
+ s8_ := k[4]
+ s9_ := k[5]
+ s10_ := k[6]
+ s11_ := k[7]
+ } else {
+ s4_ := intrinsics.byte_swap(k[0])
+ s5_ := intrinsics.byte_swap(k[1])
+ s6_ := intrinsics.byte_swap(k[2])
+ s7_ := intrinsics.byte_swap(k[3])
+ s8_ := intrinsics.byte_swap(k[4])
+ s9_ := intrinsics.byte_swap(k[5])
+ s10_ := intrinsics.byte_swap(k[6])
+ s11_ := intrinicss.byte_swap(k[7])
+ }
+
+ // 4-lane ChaCha8.
+ s4 := u32x4{s4_, s4_, s4_, s4_}
+ s5 := u32x4{s5_, s5_, s5_, s5_}
+ s6 := u32x4{s6_, s6_, s6_, s6_}
+ s7 := u32x4{s7_, s7_, s7_, s7_}
+ s8 := u32x4{s8_, s8_, s8_, s8_}
+ s9 := u32x4{s9_, s9_, s9_, s9_}
+ s10 := u32x4{s10_, s10_, s10_, s10_}
+ s11 := u32x4{s11_, s11_, s11_, s11_}
+ s12 := u32x4{0, 1, 2, 3}
+ s13, s14, s15: u32x4
+
+ dst: [^]u32x4 = (^u32x4)(raw_data(r._buf[:]))
+
+ quarter_round := #force_inline proc "contextless" (a, b, c, d: u32x4) -> (u32x4, u32x4, u32x4, u32x4) {
+ a, b, c, d := a, b, c, d
+
+ a = intrinsics.simd_add(a, b)
+ d = intrinsics.simd_bit_xor(d, a)
+ d = intrinsics.simd_bit_xor(intrinsics.simd_shl(d, _ROT_16), intrinsics.simd_shr(d, _ROT_16))
+
+ c = intrinsics.simd_add(c, d)
+ b = intrinsics.simd_bit_xor(b, c)
+ b = intrinsics.simd_bit_xor(intrinsics.simd_shl(b, _ROT_12L), intrinsics.simd_shr(b, _ROT_12R))
+
+ a = intrinsics.simd_add(a, b)
+ d = intrinsics.simd_bit_xor(d, a)
+ d = intrinsics.simd_bit_xor(intrinsics.simd_shl(d, _ROT_8L), intrinsics.simd_shr(d, _ROT_8R))
+
+ c = intrinsics.simd_add(c, d)
+ b = intrinsics.simd_bit_xor(b, c)
+ b = intrinsics.simd_bit_xor(intrinsics.simd_shl(b, _ROT_7L), intrinsics.simd_shr(b, _ROT_7R))
+
+ return a, b, c, d
+ }
+
+ // 8 blocks at a time.
+ //
+ // Note:
+ // This uses a ton of registers so it is only worth it on targets
+ // that have something like 32 128-bit registers. This is currently
+ // all ARMv8 targets, and RISC-V Zvl128b (`V` application profile)
+ // targets.
+ //
+ // While our current definition of `.arm32` is 32-bit ARMv8, this
+ // may change in the future (ARMv7 is still relevant), and things
+ // like Cortex-A8/A9 does "pretend" 128-bit SIMD 64-bits at a time
+ // thus needs bemchmarking.
+ when ODIN_ARCH == .arm64 || ODIN_ARCH == .riscv64 {
+ for _ in 0..<2 {
+ x0_0, x1_0, x2_0, x3_0 := S0, S1, S2, S3
+ x4_0, x5_0, x6_0, x7_0 := s4, s5, s6, s7
+ x8_0, x9_0, x10_0, x11_0 := s8, s9, s10, s11
+ x12_0, x13_0, x14_0, x15_0 := s12, s13, s14, s15
+
+ x0_1, x1_1, x2_1, x3_1 := S0, S1, S2, S3
+ x4_1, x5_1, x6_1, x7_1 := s4, s5, s6, s7
+ x8_1, x9_1, x10_1, x11_1 := s8, s9, s10, s11
+ x12_1 := intrinsics.simd_add(s12, _CTR_INC_4)
+ x13_1, x14_1, x15_1 := s13, s14, s15
+
+ for i := CHACHA_ROUNDS; i > 0; i = i - 2 {
+ x0_0, x4_0, x8_0, x12_0 = quarter_round(x0_0, x4_0, x8_0, x12_0)
+ x0_1, x4_1, x8_1, x12_1 = quarter_round(x0_1, x4_1, x8_1, x12_1)
+ x1_0, x5_0, x9_0, x13_0 = quarter_round(x1_0, x5_0, x9_0, x13_0)
+ x1_1, x5_1, x9_1, x13_1 = quarter_round(x1_1, x5_1, x9_1, x13_1)
+ x2_0, x6_0, x10_0, x14_0 = quarter_round(x2_0, x6_0, x10_0, x14_0)
+ x2_1, x6_1, x10_1, x14_1 = quarter_round(x2_1, x6_1, x10_1, x14_1)
+ x3_0, x7_0, x11_0, x15_0 = quarter_round(x3_0, x7_0, x11_0, x15_0)
+ x3_1, x7_1, x11_1, x15_1 = quarter_round(x3_1, x7_1, x11_1, x15_1)
+
+ x0_0, x5_0, x10_0, x15_0 = quarter_round(x0_0, x5_0, x10_0, x15_0)
+ x0_1, x5_1, x10_1, x15_1 = quarter_round(x0_1, x5_1, x10_1, x15_1)
+ x1_0, x6_0, x11_0, x12_0 = quarter_round(x1_0, x6_0, x11_0, x12_0)
+ x1_1, x6_1, x11_1, x12_1 = quarter_round(x1_1, x6_1, x11_1, x12_1)
+ x2_0, x7_0, x8_0, x13_0 = quarter_round(x2_0, x7_0, x8_0, x13_0)
+ x2_1, x7_1, x8_1, x13_1 = quarter_round(x2_1, x7_1, x8_1, x13_1)
+ x3_0, x4_0, x9_0, x14_0 = quarter_round(x3_0, x4_0, x9_0, x14_0)
+ x3_1, x4_1, x9_1, x14_1 = quarter_round(x3_1, x4_1, x9_1, x14_1)
+ }
+
+ when ODIN_ENDIAN == .Little {
+ intrinsics.unaligned_store((^u32x4)(dst[0:]), x0_0)
+ intrinsics.unaligned_store((^u32x4)(dst[1:]), x1_0)
+ intrinsics.unaligned_store((^u32x4)(dst[2:]), x2_0)
+ intrinsics.unaligned_store((^u32x4)(dst[3:]), x3_0)
+ intrinsics.unaligned_store((^u32x4)(dst[4:]), intrinsics.simd_add(x4_0, s4))
+ intrinsics.unaligned_store((^u32x4)(dst[5:]), intrinsics.simd_add(x5_0, s5))
+ intrinsics.unaligned_store((^u32x4)(dst[6:]), intrinsics.simd_add(x6_0, s6))
+ intrinsics.unaligned_store((^u32x4)(dst[7:]), intrinsics.simd_add(x7_0, s7))
+ intrinsics.unaligned_store((^u32x4)(dst[8:]), intrinsics.simd_add(x8_0, s8))
+ intrinsics.unaligned_store((^u32x4)(dst[9:]), intrinsics.simd_add(x9_0, s9))
+ intrinsics.unaligned_store((^u32x4)(dst[10:]), intrinsics.simd_add(x10_0, s10))
+ intrinsics.unaligned_store((^u32x4)(dst[11:]), intrinsics.simd_add(x11_0, s11))
+ intrinsics.unaligned_store((^u32x4)(dst[12:]), x12_0)
+ intrinsics.unaligned_store((^u32x4)(dst[13:]), intrinsics.simd_add(x13_0, s13))
+ intrinsics.unaligned_store((^u32x4)(dst[14:]), intrinsics.simd_add(x14_0, s14))
+ intrinsics.unaligned_store((^u32x4)(dst[15:]), intrinsics.simd_add(x15_0, s15))
+
+ intrinsics.unaligned_store((^u32x4)(dst[16:]), x0_1)
+ intrinsics.unaligned_store((^u32x4)(dst[17:]), x1_1)
+ intrinsics.unaligned_store((^u32x4)(dst[18:]), x2_1)
+ intrinsics.unaligned_store((^u32x4)(dst[19:]), x3_1)
+ intrinsics.unaligned_store((^u32x4)(dst[20:]), intrinsics.simd_add(x4_1, s4))
+ intrinsics.unaligned_store((^u32x4)(dst[21:]), intrinsics.simd_add(x5_1, s5))
+ intrinsics.unaligned_store((^u32x4)(dst[22:]), intrinsics.simd_add(x6_1, s6))
+ intrinsics.unaligned_store((^u32x4)(dst[23:]), intrinsics.simd_add(x7_1, s7))
+ intrinsics.unaligned_store((^u32x4)(dst[24:]), intrinsics.simd_add(x8_1, s8))
+ intrinsics.unaligned_store((^u32x4)(dst[25:]), intrinsics.simd_add(x9_1, s9))
+ intrinsics.unaligned_store((^u32x4)(dst[26:]), intrinsics.simd_add(x10_1, s10))
+ intrinsics.unaligned_store((^u32x4)(dst[27:]), intrinsics.simd_add(x11_1, s11))
+ intrinsics.unaligned_store((^u32x4)(dst[28:]), x12_1)
+ intrinsics.unaligned_store((^u32x4)(dst[29:]), intrinsics.simd_add(x13_1, s13))
+ intrinsics.unaligned_store((^u32x4)(dst[30:]), intrinsics.simd_add(x14_1, s14))
+ intrinsics.unaligned_store((^u32x4)(dst[31:]), intrinsics.simd_add(x15_1, s15))
+ } else {
+ intrinsics.unaligned_store((^u32x4)(dst[0:]), _byteswap_u32x4(x0_0))
+ intrinsics.unaligned_store((^u32x4)(dst[1:]), _byteswap_u32x4(x1_0))
+ intrinsics.unaligned_store((^u32x4)(dst[2:]), _byteswap_u32x4(x2_0))
+ intrinsics.unaligned_store((^u32x4)(dst[3:]), _byteswap_u32x4(x3_0))
+ intrinsics.unaligned_store((^u32x4)(dst[4:]), _byteswap_u32x4(intrinsics.simd_add(x4_0, s4)))
+ intrinsics.unaligned_store((^u32x4)(dst[5:]), _byteswap_u32x4(intrinsics.simd_add(x5_0, s5)))
+ intrinsics.unaligned_store((^u32x4)(dst[6:]), _byteswap_u32x4(intrinsics.simd_add(x6_0, s6)))
+ intrinsics.unaligned_store((^u32x4)(dst[7:]), _byteswap_u32x4(intrinsics.simd_add(x7_0, s7)))
+ intrinsics.unaligned_store((^u32x4)(dst[8:]), _byteswap_u32x4(intrinsics.simd_add(x8_0, s8)))
+ intrinsics.unaligned_store((^u32x4)(dst[9:]), _byteswap_u32x4(intrinsics.simd_add(x9_0, s9)))
+ intrinsics.unaligned_store((^u32x4)(dst[10:]), _byteswap_u32x4(intrinsics.simd_add(x10_0, s10)))
+ intrinsics.unaligned_store((^u32x4)(dst[11:]), _byteswap_u32x4(intrinsics.simd_add(x11_0, s11)))
+ intrinsics.unaligned_store((^u32x4)(dst[12:]), _byteswap_u32x4(x12_0))
+ intrinsics.unaligned_store((^u32x4)(dst[13:]), _byteswap_u32x4(intrinsics.simd_add(x13_0, s13)))
+ intrinsics.unaligned_store((^u32x4)(dst[14:]), _byteswap_u32x4(intrinsics.simd_add(x14_0, s14)))
+ intrinsics.unaligned_store((^u32x4)(dst[15:]), _byteswap_u32x4(intrinsics.simd_add(x15_0, s15)))
+
+ intrinsics.unaligned_store((^u32x4)(dst[16:]), _byteswap_u32x4(x0_1))
+ intrinsics.unaligned_store((^u32x4)(dst[17:]), _byteswap_u32x4(x1_1))
+ intrinsics.unaligned_store((^u32x4)(dst[18:]), _byteswap_u32x4(x2_1))
+ intrinsics.unaligned_store((^u32x4)(dst[19:]), _byteswap_u32x4(x3_1))
+ intrinsics.unaligned_store((^u32x4)(dst[20:]), _byteswap_u32x4(intrinsics.simd_add(x4_1, s4)))
+ intrinsics.unaligned_store((^u32x4)(dst[21:]), _byteswap_u32x4(intrinsics.simd_add(x5_1, s5)))
+ intrinsics.unaligned_store((^u32x4)(dst[22:]), _byteswap_u32x4(intrinsics.simd_add(x6_1, s6)))
+ intrinsics.unaligned_store((^u32x4)(dst[23:]), _byteswap_u32x4(intrinsics.simd_add(x7_1, s7)))
+ intrinsics.unaligned_store((^u32x4)(dst[24:]), _byteswap_u32x4(intrinsics.simd_add(x8_1, s8)))
+ intrinsics.unaligned_store((^u32x4)(dst[25:]), _byteswap_u32x4(intrinsics.simd_add(x9_1, s9)))
+ intrinsics.unaligned_store((^u32x4)(dst[26:]), _byteswap_u32x4(intrinsics.simd_add(x10_1, s10)))
+ intrinsics.unaligned_store((^u32x4)(dst[27:]), _byteswap_u32x4(intrinsics.simd_add(x11_1, s11)))
+ intrinsics.unaligned_store((^u32x4)(dst[28:]), _byteswap_u32x4(x12_1))
+ intrinsics.unaligned_store((^u32x4)(dst[29:]), _byteswap_u32x4(intrinsics.simd_add(x13_1, s13)))
+ intrinsics.unaligned_store((^u32x4)(dst[30:]), _byteswap_u32x4(intrinsics.simd_add(x14_1, s14)))
+ intrinsics.unaligned_store((^u32x4)(dst[31:]), _byteswap_u32x4(intrinsics.simd_add(x15_1, s15)))
+ }
+
+ s12 = intrinsics.simd_add(s12, _CTR_INC_8)
+
+ dst = dst[32:]
+ }
+ } else {
+ for _ in 0..<4 {
+ x0, x1, x2, x3 := S0, S1, S2, S3
+ x4, x5, x6, x7 := s4, s5, s6, s7
+ x8, x9, x10, x11 := s8, s9, s10, s11
+ x12, x13, x14, x15 := s12, s13, s14, s15
+
+ for i := CHACHA_ROUNDS; i > 0; i = i - 2 {
+ x0, x4, x8, x12 = quarter_round(x0, x4, x8, x12)
+ x1, x5, x9, x13 = quarter_round(x1, x5, x9, x13)
+ x2, x6, x10, x14 = quarter_round(x2, x6, x10, x14)
+ x3, x7, x11, x15 = quarter_round(x3, x7, x11, x15)
+
+ x0, x5, x10, x15 = quarter_round(x0, x5, x10, x15)
+ x1, x6, x11, x12 = quarter_round(x1, x6, x11, x12)
+ x2, x7, x8, x13 = quarter_round(x2, x7, x8, x13)
+ x3, x4, x9, x14 = quarter_round(x3, x4, x9, x14)
+ }
+
+ when ODIN_ENDIAN == .Little {
+ intrinsics.unaligned_store((^u32x4)(dst[0:]), x0)
+ intrinsics.unaligned_store((^u32x4)(dst[1:]), x1)
+ intrinsics.unaligned_store((^u32x4)(dst[2:]), x2)
+ intrinsics.unaligned_store((^u32x4)(dst[3:]), x3)
+ intrinsics.unaligned_store((^u32x4)(dst[4:]), intrinsics.simd_add(x4, s4))
+ intrinsics.unaligned_store((^u32x4)(dst[5:]), intrinsics.simd_add(x5, s5))
+ intrinsics.unaligned_store((^u32x4)(dst[6:]), intrinsics.simd_add(x6, s6))
+ intrinsics.unaligned_store((^u32x4)(dst[7:]), intrinsics.simd_add(x7, s7))
+ intrinsics.unaligned_store((^u32x4)(dst[8:]), intrinsics.simd_add(x8, s8))
+ intrinsics.unaligned_store((^u32x4)(dst[9:]), intrinsics.simd_add(x9, s9))
+ intrinsics.unaligned_store((^u32x4)(dst[10:]), intrinsics.simd_add(x10, s10))
+ intrinsics.unaligned_store((^u32x4)(dst[11:]), intrinsics.simd_add(x11, s11))
+ intrinsics.unaligned_store((^u32x4)(dst[12:]), x12)
+ intrinsics.unaligned_store((^u32x4)(dst[13:]), intrinsics.simd_add(x13, s13))
+ intrinsics.unaligned_store((^u32x4)(dst[14:]), intrinsics.simd_add(x14, s14))
+ intrinsics.unaligned_store((^u32x4)(dst[15:]), intrinsics.simd_add(x15, s15))
+ } else {
+ intrinsics.unaligned_store((^u32x4)(dst[0:]), _byteswap_u32x4(x0))
+ intrinsics.unaligned_store((^u32x4)(dst[1:]), _byteswap_u32x4(x1))
+ intrinsics.unaligned_store((^u32x4)(dst[2:]), _byteswap_u32x4(x2))
+ intrinsics.unaligned_store((^u32x4)(dst[3:]), _byteswap_u32x4(x3))
+ intrinsics.unaligned_store((^u32x4)(dst[4:]), _byteswap_u32x4(intrinsics.simd_add(x4, s4)))
+ intrinsics.unaligned_store((^u32x4)(dst[5:]), _byteswap_u32x4(intrinsics.simd_add(x5, s5)))
+ intrinsics.unaligned_store((^u32x4)(dst[6:]), _byteswap_u32x4(intrinsics.simd_add(x6, s6)))
+ intrinsics.unaligned_store((^u32x4)(dst[7:]), _byteswap_u32x4(intrinsics.simd_add(x7, s7)))
+ intrinsics.unaligned_store((^u32x4)(dst[8:]), _byteswap_u32x4(intrinsics.simd_add(x8, s8)))
+ intrinsics.unaligned_store((^u32x4)(dst[9:]), _byteswap_u32x4(intrinsics.simd_add(x9, s9)))
+ intrinsics.unaligned_store((^u32x4)(dst[10:]), _byteswap_u32x4(intrinsics.simd_add(x10, s10)))
+ intrinsics.unaligned_store((^u32x4)(dst[11:]), _byteswap_u32x4(intrinsics.simd_add(x11, s11)))
+ intrinsics.unaligned_store((^u32x4)(dst[12:]), _byteswap_u32x4(x12))
+ intrinsics.unaligned_store((^u32x4)(dst[13:]), _byteswap_u32x4(intrinsics.simd_add(x13, s13)))
+ intrinsics.unaligned_store((^u32x4)(dst[14:]), _byteswap_u32x4(intrinsics.simd_add(x14, s14)))
+ intrinsics.unaligned_store((^u32x4)(dst[15:]), _byteswap_u32x4(intrinsics.simd_add(x15, s15)))
+ }
+
+ s12 = intrinsics.simd_add(s12, _CTR_INC_4)
+
+ dst = dst[16:]
+ }
+ }
+}
diff --git a/base/runtime/random_generator_chacha8_simd256.odin b/base/runtime/random_generator_chacha8_simd256.odin
new file mode 100644
index 000000000..c0985f456
--- /dev/null
+++ b/base/runtime/random_generator_chacha8_simd256.odin
@@ -0,0 +1,197 @@
+#+build amd64
+package runtime
+
+import "base:intrinsics"
+
+#assert(ODIN_ENDIAN == .Little)
+
+@(private = "file")
+u32x8 :: #simd[8]u32
+@(private = "file")
+u32x4 :: #simd[4]u32
+
+@(private = "file")
+S0: u32x8 : {
+ CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0,
+ CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0,
+}
+@(private = "file")
+S1: u32x8 : {
+ CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1,
+ CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1,
+}
+@(private = "file")
+S2: u32x8 : {
+ CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2,
+ CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2,
+}
+@(private = "file")
+S3: u32x8 : {
+ CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3,
+ CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3,
+}
+
+@(private = "file")
+_ROT_7L: u32x8 : {7, 7, 7, 7, 7, 7, 7, 7}
+@(private = "file")
+_ROT_7R: u32x8 : {25, 25, 25, 25, 25, 25, 25, 25}
+@(private = "file")
+_ROT_12L: u32x8 : {12, 12, 12, 12, 12, 12, 12, 12}
+@(private = "file")
+_ROT_12R: u32x8 : {20, 20, 20, 20, 20, 20, 20, 20}
+@(private = "file")
+_ROT_8L: u32x8 : {8, 8, 8, 8, 8, 8, 8, 8}
+@(private = "file")
+_ROT_8R: u32x8 : {24, 24, 24, 24, 24, 24, 24, 24}
+@(private = "file")
+_ROT_16: u32x8 : {16, 16, 16, 16, 16, 16, 16, 16}
+@(private = "file")
+_CTR_INC_8: u32x8 : {8, 8, 8, 8, 8, 8, 8, 8}
+
+// To the best of my knowledge this is only really useful on
+// modern x86-64 as most ARM silicon is missing support for SVE2.
+
+@(private, enable_target_feature = "avx,avx2")
+chacha8rand_refill_simd256 :: proc(r: ^Default_Random_State) {
+ // Initialize the base state.
+ k: [^]u32 = (^u32)(raw_data(r._buf[RNG_OUTPUT_PER_ITER:]))
+ s4_ := k[0]
+ s5_ := k[1]
+ s6_ := k[2]
+ s7_ := k[3]
+ s8_ := k[4]
+ s9_ := k[5]
+ s10_ := k[6]
+ s11_ := k[7]
+
+ // 8-lane ChaCha8.
+ s4 := u32x8{s4_, s4_, s4_, s4_, s4_, s4_, s4_, s4_}
+ s5 := u32x8{s5_, s5_, s5_, s5_, s5_, s5_, s5_, s5_}
+ s6 := u32x8{s6_, s6_, s6_, s6_, s6_, s6_, s6_, s6_}
+ s7 := u32x8{s7_, s7_, s7_, s7_, s7_, s7_, s7_, s7_}
+ s8 := u32x8{s8_, s8_, s8_, s8_, s8_, s8_, s8_, s8_}
+ s9 := u32x8{s9_, s9_, s9_, s9_, s9_, s9_, s9_, s9_}
+ s10 := u32x8{s10_, s10_, s10_, s10_, s10_, s10_, s10_, s10_}
+ s11 := u32x8{s11_, s11_, s11_, s11_, s11_, s11_, s11_, s11_}
+ s12 := u32x8{0, 1, 2, 3, 4, 5, 6, 7}
+ s13, s14, s15: u32x8
+
+ u32x4 :: #simd[4]u32
+ dst: [^]u32x4 = (^u32x4)(raw_data(r._buf[:]))
+
+ quarter_round := #force_inline proc "contextless" (a, b, c, d: u32x8) -> (u32x8, u32x8, u32x8, u32x8) {
+ a, b, c, d := a, b, c, d
+
+ a = intrinsics.simd_add(a, b)
+ d = intrinsics.simd_bit_xor(d, a)
+ d = intrinsics.simd_bit_xor(intrinsics.simd_shl(d, _ROT_16), intrinsics.simd_shr(d, _ROT_16))
+
+ c = intrinsics.simd_add(c, d)
+ b = intrinsics.simd_bit_xor(b, c)
+ b = intrinsics.simd_bit_xor(intrinsics.simd_shl(b, _ROT_12L), intrinsics.simd_shr(b, _ROT_12R))
+
+ a = intrinsics.simd_add(a, b)
+ d = intrinsics.simd_bit_xor(d, a)
+ d = intrinsics.simd_bit_xor(intrinsics.simd_shl(d, _ROT_8L), intrinsics.simd_shr(d, _ROT_8R))
+
+ c = intrinsics.simd_add(c, d)
+ b = intrinsics.simd_bit_xor(b, c)
+ b = intrinsics.simd_bit_xor(intrinsics.simd_shl(b, _ROT_7L), intrinsics.simd_shr(b, _ROT_7R))
+
+ return a, b, c, d
+ }
+
+ for _ in 0..<2 {
+ x0, x1, x2, x3 := S0, S1, S2, S3
+ x4, x5, x6, x7 := s4, s5, s6, s7
+ x8, x9, x10, x11 := s8, s9, s10, s11
+ x12, x13, x14, x15 := s12, s13, s14, s15
+
+ for i := CHACHA_ROUNDS; i > 0; i = i - 2 {
+ x0, x4, x8, x12 = quarter_round(x0, x4, x8, x12)
+ x1, x5, x9, x13 = quarter_round(x1, x5, x9, x13)
+ x2, x6, x10, x14 = quarter_round(x2, x6, x10, x14)
+ x3, x7, x11, x15 = quarter_round(x3, x7, x11, x15)
+
+ x0, x5, x10, x15 = quarter_round(x0, x5, x10, x15)
+ x1, x6, x11, x12 = quarter_round(x1, x6, x11, x12)
+ x2, x7, x8, x13 = quarter_round(x2, x7, x8, x13)
+ x3, x4, x9, x14 = quarter_round(x3, x4, x9, x14)
+ }
+
+ x4 = intrinsics.simd_add(x4, s4)
+ x5 = intrinsics.simd_add(x5, s5)
+ x6 = intrinsics.simd_add(x6, s6)
+ x7 = intrinsics.simd_add(x7, s7)
+ x8 = intrinsics.simd_add(x8, s8)
+ x9 = intrinsics.simd_add(x9, s9)
+ x10 = intrinsics.simd_add(x10, s10)
+ x11 = intrinsics.simd_add(x11, s11)
+ x13 = intrinsics.simd_add(x13, s13)
+ x14 = intrinsics.simd_add(x14, s14)
+ x15 = intrinsics.simd_add(x15, s15)
+
+ // Ok, now we have x0->x15 with 8 lanes, but we need to
+ // output the first 4 blocks, then the second 4 blocks.
+ //
+ // LLVM appears not to consider "this instruction is totally
+ // awful on the given microarchitcture", which leads to
+ // `VPCOMPRESSED` being generated iff AVX512 support is
+ // enabled for `intrinsics.simd_masked_compress_store`.
+ // On Zen 4, this leads to a 50% performance regression vs
+ // the 128-bit SIMD code.
+ //
+ // The fake intrinsic (because LLVM doesn't appear to have
+ // an amd64 specific one), doesn't generate `VEXTRACTI128`,
+ // but instead does cleverness without horrible regressions.
+
+ intrinsics.unaligned_store((^u32x4)(dst[0:]), _mm_mm256_extracti128_si256(x0, 0))
+ intrinsics.unaligned_store((^u32x4)(dst[1:]), _mm_mm256_extracti128_si256(x1, 0))
+ intrinsics.unaligned_store((^u32x4)(dst[2:]), _mm_mm256_extracti128_si256(x2, 0))
+ intrinsics.unaligned_store((^u32x4)(dst[3:]), _mm_mm256_extracti128_si256(x3, 0))
+ intrinsics.unaligned_store((^u32x4)(dst[4:]), _mm_mm256_extracti128_si256(x4, 0))
+ intrinsics.unaligned_store((^u32x4)(dst[5:]), _mm_mm256_extracti128_si256(x5, 0))
+ intrinsics.unaligned_store((^u32x4)(dst[6:]), _mm_mm256_extracti128_si256(x6, 0))
+ intrinsics.unaligned_store((^u32x4)(dst[7:]), _mm_mm256_extracti128_si256(x7, 0))
+ intrinsics.unaligned_store((^u32x4)(dst[8:]), _mm_mm256_extracti128_si256(x8, 0))
+ intrinsics.unaligned_store((^u32x4)(dst[9:]), _mm_mm256_extracti128_si256(x9, 0))
+ intrinsics.unaligned_store((^u32x4)(dst[10:]), _mm_mm256_extracti128_si256(x10, 0))
+ intrinsics.unaligned_store((^u32x4)(dst[11:]), _mm_mm256_extracti128_si256(x11, 0))
+ intrinsics.unaligned_store((^u32x4)(dst[12:]), _mm_mm256_extracti128_si256(x12, 0))
+ intrinsics.unaligned_store((^u32x4)(dst[13:]), _mm_mm256_extracti128_si256(x13, 0))
+ intrinsics.unaligned_store((^u32x4)(dst[14:]), _mm_mm256_extracti128_si256(x14, 0))
+ intrinsics.unaligned_store((^u32x4)(dst[15:]), _mm_mm256_extracti128_si256(x15, 0))
+
+ intrinsics.unaligned_store((^u32x4)(dst[16:]), _mm_mm256_extracti128_si256(x0, 1))
+ intrinsics.unaligned_store((^u32x4)(dst[17:]), _mm_mm256_extracti128_si256(x1, 1))
+ intrinsics.unaligned_store((^u32x4)(dst[18:]), _mm_mm256_extracti128_si256(x2, 1))
+ intrinsics.unaligned_store((^u32x4)(dst[19:]), _mm_mm256_extracti128_si256(x3, 1))
+ intrinsics.unaligned_store((^u32x4)(dst[20:]), _mm_mm256_extracti128_si256(x4, 1))
+ intrinsics.unaligned_store((^u32x4)(dst[21:]), _mm_mm256_extracti128_si256(x5, 1))
+ intrinsics.unaligned_store((^u32x4)(dst[22:]), _mm_mm256_extracti128_si256(x6, 1))
+ intrinsics.unaligned_store((^u32x4)(dst[23:]), _mm_mm256_extracti128_si256(x7, 1))
+ intrinsics.unaligned_store((^u32x4)(dst[24:]), _mm_mm256_extracti128_si256(x8, 1))
+ intrinsics.unaligned_store((^u32x4)(dst[25:]), _mm_mm256_extracti128_si256(x9, 1))
+ intrinsics.unaligned_store((^u32x4)(dst[26:]), _mm_mm256_extracti128_si256(x10, 1))
+ intrinsics.unaligned_store((^u32x4)(dst[27:]), _mm_mm256_extracti128_si256(x11, 1))
+ intrinsics.unaligned_store((^u32x4)(dst[28:]), _mm_mm256_extracti128_si256(x12, 1))
+ intrinsics.unaligned_store((^u32x4)(dst[29:]), _mm_mm256_extracti128_si256(x13, 1))
+ intrinsics.unaligned_store((^u32x4)(dst[30:]), _mm_mm256_extracti128_si256(x14, 1))
+ intrinsics.unaligned_store((^u32x4)(dst[31:]), _mm_mm256_extracti128_si256(x15, 1))
+
+ s12 = intrinsics.simd_add(s12, _CTR_INC_8)
+
+ dst = dst[32:]
+ }
+}
+
+@(private = "file", require_results, enable_target_feature="avx2")
+_mm_mm256_extracti128_si256 :: #force_inline proc "c" (a: u32x8, $OFFSET: int) -> u32x4 {
+ when OFFSET == 0 {
+ return intrinsics.simd_shuffle(a, a, 0, 1, 2, 3)
+ } else when OFFSET == 1 {
+ return intrinsics.simd_shuffle(a, a, 4, 5, 6, 7)
+ } else {
+ #panic("chacha8rand: invalid offset")
+ }
+}
diff --git a/core/crypto/crypto.odin b/core/crypto/crypto.odin
index 89e92e35f..7ccf126e6 100644
--- a/core/crypto/crypto.odin
+++ b/core/crypto/crypto.odin
@@ -4,6 +4,10 @@ package crypto
import "base:runtime"
import "core:mem"
+// HAS_RAND_BYTES is true iff the runtime provides a cryptographic
+// entropy source.
+HAS_RAND_BYTES :: runtime.HAS_RAND_BYTES
+
// compare_constant_time returns 1 iff a and b are equal, 0 otherwise.
//
// The execution time of this routine is constant regardless of the contents
@@ -54,7 +58,7 @@ rand_bytes :: proc (dst: []byte) {
// zero-fill the buffer first
mem.zero_explicit(raw_data(dst), len(dst))
- _rand_bytes(dst)
+ runtime.rand_bytes(dst)
}
// random_generator returns a `runtime.Random_Generator` backed by the
diff --git a/core/crypto/rand_bsd.odin b/core/crypto/rand_bsd.odin
deleted file mode 100644
index 78a6fcaaf..000000000
--- a/core/crypto/rand_bsd.odin
+++ /dev/null
@@ -1,15 +0,0 @@
-#+build freebsd, openbsd, netbsd
-package crypto
-
-foreign import libc "system:c"
-
-HAS_RAND_BYTES :: true
-
-foreign libc {
- arc4random_buf :: proc(buf: [^]byte, nbytes: uint) ---
-}
-
-@(private)
-_rand_bytes :: proc(dst: []byte) {
- arc4random_buf(raw_data(dst), len(dst))
-}
diff --git a/core/crypto/rand_darwin.odin b/core/crypto/rand_darwin.odin
deleted file mode 100644
index df474bc4c..000000000
--- a/core/crypto/rand_darwin.odin
+++ /dev/null
@@ -1,17 +0,0 @@
-package crypto
-
-import "core:fmt"
-
-import CF "core:sys/darwin/CoreFoundation"
-import Sec "core:sys/darwin/Security"
-
-HAS_RAND_BYTES :: true
-
-@(private)
-_rand_bytes :: proc(dst: []byte) {
- err := Sec.RandomCopyBytes(count=len(dst), bytes=raw_data(dst))
- if err != .Success {
- msg := CF.StringCopyToOdinString(Sec.CopyErrorMessageString(err))
- fmt.panicf("crypto/rand_bytes: SecRandomCopyBytes returned non-zero result: %v %s", err, msg)
- }
-}
diff --git a/core/crypto/rand_generic.odin b/core/crypto/rand_generic.odin
deleted file mode 100644
index 8266f8ffc..000000000
--- a/core/crypto/rand_generic.odin
+++ /dev/null
@@ -1,16 +0,0 @@
-#+build !linux
-#+build !windows
-#+build !openbsd
-#+build !freebsd
-#+build !netbsd
-#+build !darwin
-#+build !js
-#+build !wasi
-package crypto
-
-HAS_RAND_BYTES :: false
-
-@(private)
-_rand_bytes :: proc(dst: []byte) {
- unimplemented("crypto: rand_bytes not supported on this OS")
-}
diff --git a/core/crypto/rand_js.odin b/core/crypto/rand_js.odin
deleted file mode 100644
index 72093810e..000000000
--- a/core/crypto/rand_js.odin
+++ /dev/null
@@ -1,24 +0,0 @@
-package crypto
-
-foreign import "odin_env"
-foreign odin_env {
- @(link_name = "rand_bytes")
- env_rand_bytes :: proc "contextless" (buf: []byte) ---
-}
-
-HAS_RAND_BYTES :: true
-
-@(private)
-_MAX_PER_CALL_BYTES :: 65536 // 64kiB
-
-@(private)
-_rand_bytes :: proc(dst: []byte) {
- dst := dst
-
- for len(dst) > 0 {
- to_read := min(len(dst), _MAX_PER_CALL_BYTES)
- env_rand_bytes(dst[:to_read])
-
- dst = dst[to_read:]
- }
-}
diff --git a/core/crypto/rand_linux.odin b/core/crypto/rand_linux.odin
deleted file mode 100644
index 7e0edbb7e..000000000
--- a/core/crypto/rand_linux.odin
+++ /dev/null
@@ -1,40 +0,0 @@
-package crypto
-
-import "core:fmt"
-
-import "core:sys/linux"
-
-HAS_RAND_BYTES :: true
-
-@(private)
-_MAX_PER_CALL_BYTES :: 33554431 // 2^25 - 1
-
-@(private)
-_rand_bytes :: proc (dst: []byte) {
- dst := dst
- l := len(dst)
-
- for l > 0 {
- to_read := min(l, _MAX_PER_CALL_BYTES)
- n_read, errno := linux.getrandom(dst[:to_read], {})
- #partial switch errno {
- case .NONE:
- // Do nothing
- case .EINTR:
- // Call interupted by a signal handler, just retry the
- // request.
- continue
- case .ENOSYS:
- // The kernel is apparently prehistoric (< 3.17 circa 2014)
- // and does not support getrandom.
- panic("crypto: getrandom not available in kernel")
- case:
- // All other failures are things that should NEVER happen
- // unless the kernel interface changes (ie: the Linux
- // developers break userland).
- fmt.panicf("crypto: getrandom failed: %v", errno)
- }
- l -= n_read
- dst = dst[n_read:]
- }
-}
diff --git a/core/crypto/rand_wasi.odin b/core/crypto/rand_wasi.odin
deleted file mode 100644
index 9653fb985..000000000
--- a/core/crypto/rand_wasi.odin
+++ /dev/null
@@ -1,13 +0,0 @@
-package crypto
-
-import "core:fmt"
-import "core:sys/wasm/wasi"
-
-HAS_RAND_BYTES :: true
-
-@(private)
-_rand_bytes :: proc(dst: []byte) {
- if err := wasi.random_get(dst); err != nil {
- fmt.panicf("crypto: wasi.random_get failed: %v", err)
- }
-}
diff --git a/core/crypto/rand_windows.odin b/core/crypto/rand_windows.odin
deleted file mode 100644
index 83a976e38..000000000
--- a/core/crypto/rand_windows.odin
+++ /dev/null
@@ -1,26 +0,0 @@
-package crypto
-
-import win32 "core:sys/windows"
-import "core:os"
-import "core:fmt"
-
-HAS_RAND_BYTES :: true
-
-@(private)
-_rand_bytes :: proc(dst: []byte) {
- ret := os.Platform_Error(win32.BCryptGenRandom(nil, raw_data(dst), u32(len(dst)), win32.BCRYPT_USE_SYSTEM_PREFERRED_RNG))
- if ret != nil {
- #partial switch ret {
- case os.ERROR_INVALID_HANDLE:
- // The handle to the first parameter is invalid.
- // This should not happen here, since we explicitly pass nil to it
- panic("crypto: BCryptGenRandom Invalid handle for hAlgorithm")
- case os.ERROR_INVALID_PARAMETER:
- // One of the parameters was invalid
- panic("crypto: BCryptGenRandom Invalid parameter")
- case:
- // Unknown error
- fmt.panicf("crypto: BCryptGenRandom failed: %d\n", ret)
- }
- }
-}
diff --git a/core/math/rand/rand.odin b/core/math/rand/rand.odin
index a9870b9f7..4ffcc595e 100644
--- a/core/math/rand/rand.odin
+++ b/core/math/rand/rand.odin
@@ -11,15 +11,50 @@ Generator :: runtime.Random_Generator
Generator_Query_Info :: runtime.Random_Generator_Query_Info
Default_Random_State :: runtime.Default_Random_State
+
+/*
+Returns an instance of the runtime pseudorandom generator. If no
+initial state is provided, the PRNG will be lazily initialized with
+entropy from the system entropy source on first-use.
+
+The cryptographic security of the returned random number generator
+is directly dependent on the quality of the initialization entropy.
+Calling `reset`/`create` SHOULD be done with no seed/state, or
+32-bytes of high-quality entropy.
+
+WARNING:
+- The lazy initialization will panic if there is no system entropy
+ source available.
+- While the generator is cryptographically secure, developers SHOULD
+ prefer `crypto.random_generator()` for cryptographic use cases such
+ as key generation.
+
+Inputs:
+- state: Optional initial PRNG state.
+
+Returns:
+- A `Generator` instance.
+*/
default_random_generator :: runtime.default_random_generator
@(require_results)
-create :: proc(seed: u64) -> (state: Default_Random_State) {
+create_u64 :: proc(seed: u64) -> (state: Default_Random_State) {
seed := seed
runtime.default_random_generator_proc(&state, .Reset, ([^]byte)(&seed)[:size_of(seed)])
return
}
+@(require_results)
+create_bytes :: proc(seed: []byte) -> (state: Default_Random_State) {
+ runtime.default_random_generator_proc(&state, .Reset, seed)
+ return
+}
+
+create :: proc {
+ create_u64,
+ create_bytes,
+}
+
/*
Reset the seed used by the context.random_generator.
@@ -39,10 +74,14 @@ Possible Output:
10
*/
-reset :: proc(seed: u64, gen := context.random_generator) {
- runtime.random_generator_reset_u64(gen, seed)
+reset :: proc {
+ reset_u64,
+ reset_bytes,
}
+reset_u64 :: proc(seed: u64, gen := context.random_generator) {
+ runtime.random_generator_reset_u64(gen, seed)
+}
reset_bytes :: proc(bytes: []byte, gen := context.random_generator) {
runtime.random_generator_reset_bytes(gen, bytes)
diff --git a/core/math/rand/rand_pcg.odin b/core/math/rand/rand_pcg.odin
new file mode 100644
index 000000000..009e139be
--- /dev/null
+++ b/core/math/rand/rand_pcg.odin
@@ -0,0 +1,107 @@
+package rand
+
+import "base:intrinsics"
+import "base:runtime"
+
+/*
+The state for a PCG64 RXS-M-XS pseudorandom generator.
+*/
+PCG_Random_State :: struct {
+ state: u64,
+ inc: u64,
+}
+
+pcg_random_generator_proc :: proc(data: rawptr, mode: runtime.Random_Generator_Mode, p: []byte) {
+ @(require_results)
+ read_u64 :: proc "contextless" (r: ^PCG_Random_State) -> u64 {
+ old_state := r.state
+ r.state = old_state * 6364136223846793005 + (r.inc|1)
+ xor_shifted := (((old_state >> 59) + 5) ~ old_state) * 12605985483714917081
+ rot := (old_state >> 59)
+ return (xor_shifted >> rot) | (xor_shifted << ((-rot) & 63))
+ }
+
+ @(thread_local)
+ global_rand_seed: PCG_Random_State
+
+ init :: proc "contextless" (r: ^PCG_Random_State, seed: u64) {
+ seed := seed
+ if seed == 0 {
+ seed = u64(intrinsics.read_cycle_counter())
+ }
+ r.state = 0
+ r.inc = (seed << 1) | 1
+ _ = read_u64(r)
+ r.state += seed
+ _ = read_u64(r)
+ }
+
+ r: ^PCG_Random_State = ---
+ if data == nil {
+ r = &global_rand_seed
+ } else {
+ r = cast(^PCG_Random_State)data
+ }
+
+ switch mode {
+ case .Read:
+ if r.state == 0 && r.inc == 0 {
+ init(r, 0)
+ }
+
+ switch len(p) {
+ case size_of(u64):
+ // Fast path for a 64-bit destination.
+ intrinsics.unaligned_store((^u64)(raw_data(p)), read_u64(r))
+ case:
+ // All other cases.
+ pos := i8(0)
+ val := u64(0)
+ for &v in p {
+ if pos == 0 {
+ val = read_u64(r)
+ pos = 8
+ }
+ v = byte(val)
+ val >>= 8
+ pos -= 1
+ }
+ }
+
+ case .Reset:
+ seed: u64
+ runtime.mem_copy_non_overlapping(&seed, raw_data(p), min(size_of(seed), len(p)))
+ init(r, seed)
+
+ case .Query_Info:
+ if len(p) != size_of(Generator_Query_Info) {
+ return
+ }
+ info := (^Generator_Query_Info)(raw_data(p))
+ info^ += {.Uniform, .Resettable}
+ }
+}
+
+/*
+Returns an instance of the PGC64 RXS-M-XS pseudorandom generator. If no
+initial state is provided, the PRNG will be lazily initialized with the
+system timestamp counter on first-use.
+
+WARNING: This random number generator is NOT cryptographically secure,
+and is additionally known to be flawed. It is only included for
+backward compatibility with historical releases of Odin.
+See: https://github.com/odin-lang/Odin/issues/5881
+
+Inputs:
+- state: Optional initial PRNG state.
+
+Returns:
+- A `Generator` instance.
+*/
+@(require_results)
+pcg_random_generator :: proc "contextless" (state: ^PCG_Random_State = nil) -> Generator {
+ return {
+ procedure = pcg_random_generator_proc,
+ data = state,
+ }
+}
diff --git a/core/math/rand/rand_xoshiro256.odin b/core/math/rand/rand_xoshiro256.odin
new file mode 100644
index 000000000..54dd02130
--- /dev/null
+++ b/core/math/rand/rand_xoshiro256.odin
@@ -0,0 +1,123 @@
+package rand
+
+import "base:intrinsics"
+import "base:runtime"
+
+import "core:math/bits"
+
+/*
+The state for a xoshiro256** pseudorandom generator.
+*/
+Xoshiro256_Random_State :: struct {
+ s: [4]u64,
+}
+
+xoshiro256_random_generator_proc :: proc(data: rawptr, mode: runtime.Random_Generator_Mode, p: []byte) {
+ @(require_results)
+ read_u64 :: proc "contextless" (r: ^Xoshiro256_Random_State) -> u64 {
+ // xoshiro256** output function and state transition
+
+ result := bits.rotate_left64(r.s[1] * 5, 7) * 9
+ t := r.s[1] << 17
+
+ r.s[2] = r.s[2] ~ r.s[0]
+ r.s[3] = r.s[3] ~ r.s[1]
+ r.s[1] = r.s[1] ~ r.s[2]
+ r.s[0] = r.s[0] ~ r.s[3]
+ r.s[2] = r.s[2] ~ t
+ r.s[3] = bits.rotate_left64(r.s[3], 45)
+
+ return result
+ }
+
+ @(thread_local)
+ global_rand_seed: Xoshiro256_Random_State
+
+ init :: proc "contextless" (r: ^Xoshiro256_Random_State, seed: u64) {
+ // splitmix64 to expand a 64-bit seed into 256 bits of state
+ sm64_next :: proc "contextless" (s: ^u64) -> u64 {
+ s^ += 0x9E3779B97F4A7C15
+ z := s^
+ z = (z ~ (z >> 30)) * 0xBF58476D1CE4E5B9
+ z = (z ~ (z >> 27)) * 0x94D049BB133111EB
+ return z ~ (z >> 31)
+ }
+
+ local_seed := seed
+ r.s[0] = sm64_next(&local_seed)
+ r.s[1] = sm64_next(&local_seed)
+ r.s[2] = sm64_next(&local_seed)
+ r.s[3] = sm64_next(&local_seed)
+ // Extremely unlikely all zero; ensure non-zero state
+ if (r.s[0] | r.s[1] | r.s[2] | r.s[3]) == 0 {
+ // force a minimal non-zero tweak
+ r.s[0] = 1
+ }
+ }
+
+ r: ^Xoshiro256_Random_State = ---
+ if data == nil {
+ r = &global_rand_seed
+ } else {
+ r = cast(^Xoshiro256_Random_State)data
+ }
+
+ switch mode {
+ case .Read:
+ if (r.s[0] | r.s[1] | r.s[2] | r.s[3]) == 0 {
+ init(r, u64(intrinsics.read_cycle_counter()))
+ }
+
+ switch len(p) {
+ case size_of(u64):
+ // Fast path for a 64-bit destination.
+ intrinsics.unaligned_store((^u64)(raw_data(p)), read_u64(r))
+ case:
+ // All other cases.
+ pos := i8(0)
+ val := u64(0)
+ for &v in p {
+ if pos == 0 {
+ val = read_u64(r)
+ pos = 8
+ }
+ v = byte(val)
+ val >>= 8
+ pos -= 1
+ }
+ }
+
+ case .Reset:
+ seed: u64 = 0
+ runtime.mem_copy_non_overlapping(&seed, raw_data(p), min(size_of(seed), len(p)))
+ init(r, seed)
+
+ case .Query_Info:
+ if len(p) != size_of(Generator_Query_Info) {
+ return
+ }
+ info := (^Generator_Query_Info)(raw_data(p))
+ info^ += {.Uniform, .Resettable}
+ }
+}
+
+/*
+Returns an instance of the xoshiro256** pseudorandom generator. If no
+initial state is provided, the PRNG will be lazily initialized with the
+system timestamp counter on first-use.
+
+WARNING: This random number generator is NOT cryptographically secure.
+
+Inputs:
+- state: Optional initial PRNG state.
+
+Returns:
+- A `Generator` instance.
+*/
+@(require_results)
+xoshiro256_random_generator :: proc "contextless" (state: ^Xoshiro256_Random_State = nil) -> Generator {
+ return {
+ procedure = xoshiro256_random_generator_proc,
+ data = state,
+ }
+}
diff --git a/core/testing/runner.odin b/core/testing/runner.odin
index cdb911d15..e0a37c46d 100644
--- a/core/testing/runner.odin
+++ b/core/testing/runner.odin
@@ -151,9 +151,9 @@ run_test_task :: proc(task: thread.Task) {
options = logger_options,
}
- random_generator_state: runtime.Default_Random_State
+ random_generator_state: rand.Xoshiro256_Random_State
context.random_generator = {
- procedure = runtime.default_random_generator_proc,
+ procedure = rand.xoshiro256_random_generator_proc,
data = &random_generator_state,
}
rand.reset(data.t.seed)
diff --git a/examples/demo/demo.odin b/examples/demo/demo.odin
index 161d48acb..1ea06d096 100644
--- a/examples/demo/demo.odin
+++ b/examples/demo/demo.odin
@@ -11,6 +11,7 @@ import "core:reflect"
import "base:runtime"
import "base:intrinsics"
import "core:math/big"
+import "core:math/rand"
/*
Odin is a general-purpose programming language with distinct typing built
@@ -2258,6 +2259,10 @@ arbitrary_precision_mathematics :: proc() {
a, b, c, d, e, f, res := &big.Int{}, &big.Int{}, &big.Int{}, &big.Int{}, &big.Int{}, &big.Int{}, &big.Int{}
defer big.destroy(a, b, c, d, e, f, res)
+ // Set the context RNG to something that does not require
+ // cryptographic entropy (not supported on all targets).
+ context.random_generator = rand.xoshiro256_random_generator()
+
// How many bits should the random prime be?
bits := 64
// Number of Rabin-Miller trials, -1 for automatic.
diff --git a/tests/benchmark/all.odin b/tests/benchmark/all.odin
index 30640ac87..042b2fa63 100644
--- a/tests/benchmark/all.odin
+++ b/tests/benchmark/all.odin
@@ -3,5 +3,6 @@ package benchmarks
@(require) import "bytes"
@(require) import "crypto"
@(require) import "hash"
+@(require) import "math"
@(require) import "text/regex"
@(require) import "strings" \ No newline at end of file
diff --git a/tests/benchmark/math/benchmark_rand.odin b/tests/benchmark/math/benchmark_rand.odin
new file mode 100644
index 000000000..742c90599
--- /dev/null
+++ b/tests/benchmark/math/benchmark_rand.odin
@@ -0,0 +1,130 @@
+package benchmark_core_math
+
+import "base:runtime"
+
+import "core:fmt"
+import "core:math/rand"
+import "core:log"
+import "core:strings"
+import "core:testing"
+import "core:text/table"
+import "core:time"
+
+@(private = "file")
+ITERS :: 10000000
+@(private = "file")
+ITERS_BULK :: 1000
+
+@(private = "file")
+SAMPLE_SEED : string : "ABCDEFGHIJKLMNOPQRSTUVWXYZ123456"
+
+@(test)
+benchmark_rng :: proc(t: ^testing.T) {
+ runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
+
+ tbl: table.Table
+ table.init(&tbl)
+ defer table.destroy(&tbl)
+
+ table.caption(&tbl, "RNG")
+ table.aligned_header_of_values(&tbl, .Right, "Algorithm", "Size", "Time", "Throughput")
+
+ context.random_generator = rand.default_random_generator()
+ rand.reset_bytes(transmute([]byte)(SAMPLE_SEED))
+ _benchmark_u64(t, &tbl, "chacha8rand")
+ _benchmark_large(t, &tbl, "chacha8rand")
+
+ table.row(&tbl)
+
+ context.random_generator = rand.pcg_random_generator()
+ _benchmark_u64(t, &tbl, "pcg64")
+ _benchmark_large(t, &tbl, "pcg64")
+
+ table.row(&tbl)
+
+ context.random_generator = rand.xoshiro256_random_generator()
+ _benchmark_u64(t, &tbl, "xorshiro256**")
+ _benchmark_large(t, &tbl, "xorshiro256**")
+
+ log_table(&tbl)
+}
+
+@(private = "file")
+_benchmark_u64 :: proc(t: ^testing.T, tbl: ^table.Table, algo_name: string) {
+ options := &time.Benchmark_Options{
+ rounds = ITERS,
+ bytes = 8,
+ setup = nil,
+ bench = proc(options: ^time.Benchmark_Options, allocator: runtime.Allocator) -> (err: time.Benchmark_Error){
+ sum: u64
+ for _ in 0 ..= options.rounds {
+ sum += rand.uint64()
+ }
+ options.hash = u128(sum)
+ options.count = options.rounds
+ options.processed = options.rounds * options.bytes
+ return
+ },
+ teardown = nil,
+ }
+
+ err := time.benchmark(options, context.allocator)
+ testing.expect(t, err == nil)
+
+ time_per_iter := options.duration / ITERS
+ table.aligned_row_of_values(
+ tbl,
+ .Right,
+ algo_name,
+ table.format(tbl, "uint64"),
+ table.format(tbl, "%8M", time_per_iter),
+ table.format(tbl, "%5.3f MiB/s", options.megabytes_per_second),
+ )
+}
+
+@(private = "file")
+_benchmark_large :: proc(t: ^testing.T, tbl: ^table.Table, algo_name: string) {
+ options := &time.Benchmark_Options{
+ rounds = ITERS_BULK,
+ bytes = 1024768,
+ setup = nil,
+ bench = proc(options: ^time.Benchmark_Options, allocator: runtime.Allocator) -> (err: time.Benchmark_Error){
+ n: int
+ for _ in 0 ..= options.rounds {
+ n += rand.read(options.output)
+ }
+ options.hash = u128(n)
+ options.count = options.rounds
+ options.processed = options.rounds * options.bytes
+ return
+ },
+ output = make([]byte, 1024768, context.temp_allocator),
+ teardown = nil,
+ }
+
+ err := time.benchmark(options, context.allocator)
+ testing.expect(t, err == nil)
+
+ time_per_iter := options.duration / ITERS_BULK
+ table.aligned_row_of_values(
+ tbl,
+ .Right,
+ algo_name,
+ table.format(tbl, "1 MiB"),
+ table.format(tbl, "%8M", time_per_iter),
+ table.format(tbl, "%5.3f MiB/s", options.megabytes_per_second),
+ )
+}
+
+@(private)
+log_table :: proc(tbl: ^table.Table) {
+ sb := strings.builder_make()
+ defer strings.builder_destroy(&sb)
+
+ wr := strings.to_writer(&sb)
+
+ fmt.sbprintln(&sb)
+ table.write_plain_table(wr, tbl)
+
+ log.info(strings.to_string(sb))
+}
diff --git a/tests/core/math/rand/test_core_math_rand.odin b/tests/core/math/rand/test_core_math_rand.odin
index 392d3d241..814a1b9f8 100644
--- a/tests/core/math/rand/test_core_math_rand.odin
+++ b/tests/core/math/rand/test_core_math_rand.odin
@@ -1,19 +1,54 @@
package test_core_math_rand
+import "core:math"
import "core:math/rand"
import "core:testing"
-@test
-test_default_rand_determinism :: proc(t: ^testing.T) {
+Generator :: struct {
+ name: string,
+ gen: rand.Generator,
+ biased: bool,
+}
+
+@(test)
+test_prngs :: proc(t: ^testing.T) {
+ gens := []Generator {
+ {
+ "default",
+ rand.default_random_generator(),
+ false,
+ },
+ {
+ "pcg64",
+ rand.pcg_random_generator(), // Deprecated
+ true,
+ },
+ {
+ "xoshiro**",
+ rand.xoshiro256_random_generator(),
+ false,
+ },
+ }
+ for gen in gens {
+ rand_determinism(t, gen)
+ if !gen.biased {
+ rand_issue_5881(t, gen)
+ }
+ }
+}
+
+@(private = "file")
+rand_determinism :: proc(t: ^testing.T, rng: Generator) {
+ context.random_generator = rng.gen
rand.reset(13)
first_value := rand.int127()
rand.reset(13)
second_value := rand.int127()
- testing.expect(t, first_value == second_value, "Context default random number generator is non-deterministic.")
+ testing.expectf(t, first_value == second_value, "rng '%s' is non-deterministic.", rng.name)
}
-@test
+@(test)
test_default_rand_determinism_user_set :: proc(t: ^testing.T) {
rng_state_1 := rand.create(13)
rng_state_2 := rand.create(13)
@@ -33,3 +68,80 @@ test_default_rand_determinism_user_set :: proc(t: ^testing.T) {
testing.expect(t, first_value == second_value, "User-set default random number generator is non-deterministic.")
}
+
+@(private = "file")
+rand_issue_5881 :: proc(t:^testing.T, rng: Generator) {
+ // Tests issue #5881 https://github.com/odin-lang/Odin/issues/5881
+
+ // Bit balance and sign uniformity (modest samples to keep CI fast)
+ expect_u64_bit_balance(t, rng, 200_000)
+ expect_quaternion_sign_uniformity(t, rng, 200_000)
+}
+
+// Helper: compute chi-square statistic for counts vs equal-expected across k bins
+@(private = "file")
+chi_square_equal :: proc(counts: []int) -> f64 {
+ n := 0
+ for c in counts {
+ n += c
+ }
+ if n == 0 {
+ return 0
+ }
+ k := len(counts)
+ exp := f64(n) / f64(k)
+ stat := f64(0)
+ for c in counts {
+ d := f64(c) - exp
+ stat += (d * d) / exp
+ }
+ return stat
+}
+
+// Helper: check bit balance on u64 across many samples
+@(private = "file")
+expect_u64_bit_balance :: proc(t: ^testing.T, rng: Generator, samples: int, sigma_k: f64 = 6) {
+ rand.reset(t.seed, rng.gen)
+
+ ones: [64]int
+ for i := 0; i < samples; i += 1 {
+ v := rand.uint64(rng.gen)
+ for b := 0; b < 64; b += 1 {
+ ones[b] += int((v >> u64(b)) & 1)
+ }
+ }
+ mu := f64(samples) * 0.5
+ sigma := math.sqrt(f64(samples) * 0.25)
+ limit := sigma_k * sigma
+ for b := 0; b < 64; b += 1 {
+ diff := math.abs(f64(ones[b]) - mu)
+ if diff > limit {
+ testing.expectf(t, false, "rng '%s': u64 bit %d imbalance: ones=%d samples=%d diff=%.1f limit=%.1f", rng.name, b, ones[b], samples, diff, limit)
+ return
+ }
+ }
+}
+
+// Helper: Uniformity sanity via 4D sign orthant chi-square with modest sample size.
+@(private = "file")
+expect_quaternion_sign_uniformity :: proc(t: ^testing.T, rng: Generator, iterations: int) {
+ counts: [16]int
+ for _ in 0..<iterations {
+ // Map 4D signs to 0..15 index
+ x := rand.float64_range(-10, 10, rng.gen)
+ y := rand.float64_range(-10, 10, rng.gen)
+ z := rand.float64_range(-10, 10, rng.gen)
+ w := rand.float64_range(-10, 10, rng.gen)
+ idx := 0
+ if x >= 0 { idx |= 1 }
+ if y >= 0 { idx |= 2 }
+ if z >= 0 { idx |= 4 }
+ if w >= 0 { idx |= 8 }
+ counts[idx] += 1
+ }
+ // df = 15. For a modest sample size, use a generous cutoff to reduce flakiness.
+ // Chi-square critical values (df=15): p=0.001 -> ~37.7, p=0.0001 -> ~43.8
+ // We accept < 55 as a conservative stability bound across platforms.
+ chi := chi_square_equal(counts[:])
+ testing.expectf(t, chi < 55.0, "rng '%s': 4D sign chi-square too high: %.3f (counts=%v)", rng.name, chi, counts)
+}
diff --git a/tests/internal/test_chacha8rand.odin b/tests/internal/test_chacha8rand.odin
new file mode 100644
index 000000000..378b398f0
--- /dev/null
+++ b/tests/internal/test_chacha8rand.odin
@@ -0,0 +1,151 @@
+package test_internal
+
+import "base:runtime"
+import "core:bytes"
+import "core:encoding/endian"
+import "core:math/rand"
+import "core:testing"
+
+@(private = "file")
+ITERS :: 10000000
+@(private = "file")
+ITERS_BULK :: 1000
+
+@(private = "file")
+SAMPLE_SEED : string : "ABCDEFGHIJKLMNOPQRSTUVWXYZ123456"
+@(private = "file")
+SAMPLE_OUTPUT := []u64{
+ 0xb773b6063d4616a5, 0x1160af22a66abc3c, 0x8c2599d9418d287c, 0x7ee07e037edc5cd6,
+ 0xcfaa9ee02d1c16ad, 0x0e090eef8febea79, 0x3c82d271128b5b3e, 0x9c5addc11252a34f,
+ 0xdf79bb617d6ceea6, 0x36d553591f9d736a, 0xeef0d14e181ee01f, 0x089bfc760ae58436,
+ 0xd9e52b59cc2ad268, 0xeb2fb4444b1b8aba, 0x4f95c8a692c46661, 0xc3c6323217cae62c,
+ 0x91ebb4367f4e2e7e, 0x784cf2c6a0ec9bc6, 0x5c34ec5c34eabe20, 0x4f0a8f515570daa8,
+ 0xfc35dcb4113d6bf2, 0x5b0da44c645554bc, 0x6d963da3db21d9e1, 0xeeaefc3150e500f3,
+ 0x2d37923dda3750a5, 0x380d7a626d4bc8b0, 0xeeaf68ede3d7ee49, 0xf4356695883b717c,
+ 0x846a9021392495a4, 0x8e8510549630a61b, 0x18dc02545dbae493, 0x0f8f9ff0a65a3d43,
+ 0xccf065f7190ff080, 0xfd76d1aa39673330, 0x95d232936cba6433, 0x6c7456d1070cbd17,
+ 0x462acfdaff8c6562, 0x5bafab866d34fc6a, 0x0c862f78030a2988, 0xd39a83e407c3163d,
+ 0xc00a2b7b45f22ebf, 0x564307c62466b1a9, 0x257e0424b0c072d4, 0x6fb55e99496c28fe,
+ 0xae9873a88f5cd4e0, 0x4657362ac60d3773, 0x1c83f91ecdf23e8e, 0x6fdc0792c15387c0,
+ 0x36dad2a30dfd2b5c, 0xa4b593290595bdb7, 0x4de18934e4cc02c5, 0xcdc0d604f015e3a7,
+ 0xfba0dbf69ad80321, 0x60e8bea3d139de87, 0xd18a4d851ef48756, 0x6366447c2215f34a,
+ 0x05682e97d3d007ee, 0x4c0e8978c6d54ab2, 0xcf1e9f6a6712edc2, 0x061439414c80cfd3,
+ 0xd1a8b6e2745c0ead, 0x31a7918d45c410e8, 0xabcc61ad90216eec, 0x4040d92d2032a71a,
+ 0x3cd2f66ffb40cd68, 0xdcd051c07295857a, 0xeab55cbcd9ab527e, 0x18471dce781bdaac,
+ 0xf7f08cd144dc7252, 0x5804e0b13d7f40d1, 0x5cb1a446e4b2d35b, 0xe6d4a728d2138a06,
+ 0x05223e40ca60dad8, 0x2d61ec3206ac6a68, 0xab692356874c17b8, 0xc30954417676de1c,
+ 0x4f1ace3732225624, 0xfba9510813988338, 0x997f200f52752e11, 0x1116aaafe86221fa,
+ 0x07ce3b5cb2a13519, 0x2956bc72bc458314, 0x4188b7926140eb78, 0x56ca6dbfd4adea4d,
+ 0x7fe3c22349340ce5, 0x35c08f9c37675f8a, 0x11e1c7fbef5ed521, 0x98adc8464ec1bc75,
+ 0xd163b2c73d1203f8, 0x8c761ee043a2f3f3, 0x24b99d6accecd7b7, 0x793e31aa112f0370,
+ 0x8e87dc2a19285139, 0x4247ae04f7096e25, 0x514f3122926fe20f, 0xdc6fb3f045d2a7e9,
+ 0x15cb30cecdd18eba, 0xcbc7fdecf6900274, 0x3fb5c696dc8ba021, 0xd1664417c8d274e6,
+ 0x05f7e445ea457278, 0xf920bbca1b9db657, 0x0c1950b4da22cb99, 0xf875baf1af09e292,
+ 0xbed3d7b84250f838, 0xf198e8080fd74160, 0xc9eda51d9b7ea703, 0xf709ef55439bf8f6,
+ 0xd20c74feebf116fc, 0x305668eb146d7546, 0x829af3ec10d89787, 0x15b8f9697b551dbc,
+ 0xfc823c6c8e64b8c9, 0x345585e8183b40bc, 0x674b4171d6581368, 0x1234d81cd670e9f7,
+ 0x0e505210d8a55e19, 0xe8258d69eeeca0dc, 0x05d4c452e8baf67e, 0xe8dbe30116a45599,
+ 0x1cf08ce1b1176f00, 0xccf7d0a4b81ecb49, 0x303fea136b2c430e, 0x861d6c139c06c871,
+ 0x5f41df72e05e0487, 0x25bd7e1e1ae26b1d, 0xbe9f4004d662a41d, 0x65bf58d483188546,
+ 0xd1b27cff69db13cc, 0x01a6663372c1bb36, 0x578dd7577b727f4d, 0x19c78f066c083cf6,
+ 0xdbe014d4f9c391bb, 0x97fbb2dd1d13ffb3, 0x31c91e0af9ef8d4f, 0x094dfc98402a43ba,
+ 0x069bd61bea37b752, 0x5b72d762e8d986ca, 0x72ee31865904bc85, 0xd1f5fdc5cd36c33e,
+ 0xba9b4980a8947cad, 0xece8f05eac49ab43, 0x65fe1184abae38e7, 0x2d7cb9dea5d31452,
+ 0xcc71489476e467e3, 0x4c03a258a578c68c, 0x00efdf9ecb0fd8fc, 0x9924cad471e2666d,
+ 0x87f8668318f765e9, 0xcb4dc57c1b55f5d8, 0xd373835a86604859, 0xe526568b5540e482,
+ 0x1f39040f08586fec, 0xb764f3f00293f8e6, 0x049443a2f6bd50a8, 0x76fec88697d3941a,
+ 0x3efb70d039bae7a2, 0xe2f4611368eca8a8, 0x7c007a96e01d2425, 0xbbcce5768e69c5bf,
+ 0x784fb4985c42aac3, 0xf72b5091aa223874, 0x3630333fb1e62e07, 0x8e7319ebdebbb8de,
+ 0x2a3982bca959fa00, 0xb2b98b9f964ba9b3, 0xf7e31014adb71951, 0xebd0fca3703acc82,
+ 0xec654e2a2fe6419a, 0xb326132d55a52e2c, 0x2248c57f44502978, 0x32710c2f342daf16,
+ 0x0517b47b5acb2bec, 0x4c7a718fca270937, 0xd69142bed0bcc541, 0xe40ebcb8ff52ce88,
+ 0x3e44a2dbc9f828d4, 0xc74c2f4f8f873f58, 0x3dbf648eb799e45b, 0x33f22475ee0e86f8,
+ 0x1eb4f9ee16d47f65, 0x40f8d2b8712744e3, 0xb886b4da3cb14572, 0x2086326fbdd6f64d,
+ 0xcc3de5907dd882b9, 0xa2e8b49a5ee909df, 0xdbfb8e7823964c10, 0x70dd6089ef0df8d5,
+ 0x30141663cdd9c99f, 0x04b805325c240365, 0x7483d80314ac12d6, 0x2b271cb91aa7f5f9,
+ 0x97e2245362abddf0, 0x5a84f614232a9fab, 0xf71125fcda4b7fa2, 0x1ca5a61d74b27267,
+ 0x38cc6a9b3adbcb45, 0xdde1bb85dc653e39, 0xe9d0c8fa64f89fd4, 0x02c5fb1ecd2b4188,
+ 0xf2bd137bca5756e5, 0xadefe25d121be155, 0x56cd1c3c5d893a8e, 0x4c50d337beb65bb9,
+ 0x918c5151675cf567, 0xaba649ffcfb56a1e, 0x20c74ab26a2247cd, 0x71166bac853c08da,
+ 0xb07befe2e584fc5d, 0xda45ff2a588dbf32, 0xdb98b03c4d75095e, 0x60285ae1aaa65a4c,
+ 0xf93b686a263140b8, 0xde469752ee1c180e, 0xcec232dc04129aae, 0xeb916baa1835ea04,
+ 0xd49c21c8b64388ff, 0x72a82d9658864888, 0x003348ef7eac66a8, 0x7f6f67e655b209eb,
+ 0x532ffb0b7a941b25, 0xd940ade6128deede, 0xdf24f2a1af89fe23, 0x95aa3b4988195ae0,
+ 0x3da649404f94be4a, 0x692dad132c3f7e27, 0x40aee76ecaaa9eb8, 0x1294a01e09655024,
+ 0x6df797abdba4e4f5, 0xea2fb6024c1d7032, 0x5f4e0492295489fc, 0x57972914ea22e06a,
+ 0x9a8137d133aad473, 0xa2e6dd6ae7cdf2f3, 0x9f42644f18086647, 0x16d03301c170bd3e,
+ 0x908c416fa546656d, 0xe081503be22e123e, 0x077cf09116c4cc72, 0xcbd25cd264b7f229,
+ 0x3db2f468ec594031, 0x46c00e734c9badd5, 0xd0ec0ac72075d861, 0x3037cb3cf80b7630,
+ 0x574c3d7b3a2721c6, 0xae99906a0076824b, 0xb175a5418b532e70, 0xd8b3e251ee231ddd,
+ 0xb433eec25dca1966, 0x530f30dc5cff9a93, 0x9ff03d98b53cd335, 0xafc4225076558cdf,
+ 0xef81d3a28284402a, 0x110bdbf51c110a28, 0x9ae1b255d027e8f6, 0x7de3e0aa24688332,
+ 0xe483c3ecd2067ee2, 0xf829328b276137e6, 0xa413ccad57562cad, 0xe6118e8b496acb1f,
+ 0x8288dca6da5ec01f, 0xa53777dc88c17255, 0x8a00f1e0d5716eda, 0x618e6f47b7a720a8,
+ 0x9e3907b0c692a841, 0x978b42ca963f34f3, 0x75e4b0cd98a7d7ef, 0xde4dbd6e0b5f4752,
+ 0x0252e4153f34493f, 0x50f0e7d803734ef9, 0x237766a38ed167ee, 0x4124414001ee39a0,
+ 0xd08df643e535bb21, 0x34f575b5a9a80b74, 0x2c343af87297f755, 0xcd8b6d99d821f7cb,
+ 0xe376fd7256fc48ae, 0xe1b06e7334352885, 0xfa87b26f86c169eb, 0x36c1604665a971de,
+ 0xdba147c2239c8e80, 0x6b208e69fc7f0e24, 0x8795395b6f2b60c3, 0x05dabee9194907f4,
+ 0xb98175142f5ed902, 0x5e1701e2021ddc81, 0x0875aba2755eed08, 0x778d83289251de95,
+ 0x3bfbe46a039ecb31, 0xb24704fce4cbd7f9, 0x6985ffe9a7c91e3d, 0xc8efb13df249dabb,
+ 0xb1037e64b0f4c9f6, 0x55f69fd197d6b7c3, 0x672589d71d68a90c, 0xbebdb8224f50a77e,
+ 0x3f589f80007374a7, 0xd307f4635954182a, 0xcff5850c10d4fd90, 0xc6da02dfb6408e15,
+ 0x93daeef1e2b1a485, 0x65d833208aeea625, 0xe2b13fa13ed3b5fa, 0x67053538130fb68e,
+ 0xc1042f6598218fa9, 0xee5badca749b8a2e, 0x6d22a3f947dae37d, 0xb62c6d1657f4dbaf,
+ 0x6e007de69704c20b, 0x1af2b913fc3841d8, 0xdc0e47348e2e8e22, 0x9b1ddef1cf958b22,
+ 0x632ed6b0233066b8, 0xddd02d3311bed8f2, 0xf147cfe1834656e9, 0x399aaa49d511597a,
+ 0x6b14886979ec0309, 0x64fc4ac36b5afb97, 0xb82f78e07f7cf081, 0x10925c9a323d0e1b,
+ 0xf451c79ee13c63f6, 0x7c2fc180317876c7, 0x35a12bd9eecb7d22, 0x335654a539621f90,
+ 0xcc32a3f35db581f0, 0xc60748a80b2369cb, 0x7c4dd3b08591156b, 0xac1ced4b6de22291,
+ 0xa32cfa2df134def5, 0x627108918dea2a53, 0x0555b1608fcb4ff4, 0x143ee7ac43aaa33c,
+ 0xdae90ce7cf4fc218, 0x4d68fc2582bcf4b5, 0x37094e1849135d71, 0xf7857e09f3d49fd8,
+ 0x007538c503768be7, 0xedf648ba2f6be601, 0xaa347664dd72513e, 0xbe63893c6ef23b86,
+ 0x130b85710605af97, 0xdd765c6b1ef6ab56, 0xf3249a629a97dc6b, 0x2a114f9020fab8e5,
+ 0x5a69e027cfc6ad08, 0x3c4ccb36f1a5e050, 0x2e9e7d596834f0a5, 0x2430be6858fce789,
+ 0xe90b862f2466e597, 0x895e2884f159a9ec, 0x26ab8fa4902fcb57, 0xa6efff5c54e1fa50,
+ 0x333ac4e5811a8255, 0xa58d515f02498611, 0xfe5a09dcb25c6ef4, 0x03898988ab5f5818,
+ 0x289ff6242af6c617, 0x3d9dd59fd381ea23, 0x52d7d93d8a8aae51, 0xc76a123d511f786f,
+ 0xf68901edaf00c46c, 0x8c630871b590de80, 0x05209c308991e091, 0x1f809f99b4788177,
+ 0x11170c2eb6c19fd8, 0x44433c779062ba58, 0xc0acb51af1874c45, 0x9f2e134284809fa1,
+ 0xedb523bd15c619fa, 0x02d97fd53ecc23c0, 0xacaf05a34462374c, 0xddd9c6d34bffa11f,
+}
+
+@(test)
+chacha8rand_u64s :: proc(t: ^testing.T) {
+ st: runtime.Default_Random_State
+ context.random_generator = runtime.default_random_generator(&st)
+ rand.reset_bytes(transmute([]byte)(SAMPLE_SEED))
+
+ for expected, i in SAMPLE_OUTPUT {
+ actual := rand.uint64()
+ testing.expectf(t, expected == actual, "[%d]: got %x (expected %x)", i, actual, expected)
+ }
+}
+
+@(test)
+chacha8rand_bytes :: proc(t: ^testing.T) {
+ st: runtime.Default_Random_State
+ context.random_generator = runtime.default_random_generator(&st)
+ rand.reset_bytes(transmute([]byte)(SAMPLE_SEED))
+
+ // Test a massive bulk read.
+ buf := make([]byte, len(SAMPLE_OUTPUT) * size_of(u64), context.temp_allocator)
+ n := rand.read(buf)
+ testing.expectf(t, n == len(buf), "insufficient output: got %d (expected %d)", n, len(buf))
+
+ for expected, i in SAMPLE_OUTPUT {
+ actual, _ := endian.get_u64(buf[i*8:], .Little)
+ testing.expectf(t, expected == actual, "[%d]: got %x (expected %x)", i, actual, expected)
+ }
+
+ // Test that the internal state always advances by a multiple of
+ // 8-bytes.
+ rand.reset_bytes(transmute([]byte)(SAMPLE_SEED))
+ tmp: [8]byte
+ off: int
+ for i := 1; i < 8; i += 1 {
+ _ = rand.read(tmp[:i])
+ testing.expect(t, bytes.equal(tmp[:i], buf[off:off+i]))
+ off += 8
+ }
+}