package runtime

import "base:intrinsics"

@(private)
chacha8rand_refill_ref :: proc(r: ^Default_Random_State) {
	// Initialize the base state.  The 256-bit key is stored
	// little-endian, immediately past the output region of `_buf`.
	k: [^]u32 = (^u32)(raw_data(r._buf[RNG_OUTPUT_PER_ITER:]))
	when ODIN_ENDIAN == .Little {
		s4 := k[0]
		s5 := k[1]
		s6 := k[2]
		s7 := k[3]
		s8 := k[4]
		s9 := k[5]
		s10 := k[6]
		s11 := k[7]
	} else {
		s4 := intrinsics.byte_swap(k[0])
		s5 := intrinsics.byte_swap(k[1])
		s6 := intrinsics.byte_swap(k[2])
		s7 := intrinsics.byte_swap(k[3])
		s8 := intrinsics.byte_swap(k[4])
		s9 := intrinsics.byte_swap(k[5])
		s10 := intrinsics.byte_swap(k[6])
		s11 := intrinsics.byte_swap(k[7])
	}
	s12: u32           // Counter starts at 0.
	s13, s14, s15: u32 // IV of all 0s.

	dst: [^]u32 = (^u32)(raw_data(r._buf[:]))

	// At least with LLVM 21, #force_inline produces identical perf to
	// manual inlining, yay.
	quarter_round := #force_inline proc "contextless" (a, b, c, d: u32) -> (u32, u32, u32, u32) {
		a, b, c, d := a, b, c, d
		a += b
		d ~= a
		d = rotl(d, 16)
		c += d
		b ~= c
		b = rotl(b, 12)
		a += b
		d ~= a
		d = rotl(d, 8)
		c += d
		b ~= c
		b = rotl(b, 7)
		return a, b, c, d
	}

	// Filippo Valsorda observed that only one quarter round of the first
	// column round depends on the counter (s12), so the other three are
	// worth precomputing and reusing across multiple blocks. As far as I
	// know, only Go's chacha implementation does this.
	p1, p5, p9, p13 := quarter_round(CHACHA_SIGMA_1, s5, s9, s13)
	p2, p6, p10, p14 := quarter_round(CHACHA_SIGMA_2, s6, s10, s14)
	p3, p7, p11, p15 := quarter_round(CHACHA_SIGMA_3, s7, s11, s15)

	// 4 groups
	for g := 0; g < 4; g = g + 1 {
		// 4 blocks per group
		for n := 0; n < 4; n = n + 1 {
			// First column round that depends on the counter
			p0, p4, p8, p12 := quarter_round(CHACHA_SIGMA_0, s4, s8, s12)

			// First diagonal round
			x0, x5, x10, x15 := quarter_round(p0, p5, p10, p15)
			x1, x6, x11, x12 := quarter_round(p1, p6, p11, p12)
			x2, x7, x8, x13 := quarter_round(p2, p7, p8, p13)
			x3, x4, x9, x14 := quarter_round(p3, p4, p9, p14)

			// Remaining double rounds (the first one was done above).
			for i := CHACHA_ROUNDS - 2; i > 0; i = i - 2 {
				// Column round
				x0, x4, x8, x12 = quarter_round(x0, x4, x8, x12)
				x1, x5, x9, x13 = quarter_round(x1, x5, x9, x13)
				x2, x6, x10, x14 = quarter_round(x2, x6, x10, x14)
				x3, x7, x11, x15 = quarter_round(x3, x7, x11, x15)
				// Diagonal round
				x0, x5, x10, x15 = quarter_round(x0, x5, x10, x15)
				x1, x6, x11, x12 = quarter_round(x1, x6, x11, x12)
				x2, x7, x8, x13 = quarter_round(x2, x7, x8, x13)
				x3, x4, x9, x14 = quarter_round(x3, x4, x9, x14)
			}

			// Interleave 4 blocks
			// NB: The additions of sigma and the counter are omitted
			STRIDE :: 4
			d_ := dst[n:]
			when ODIN_ENDIAN == .Little {
				d_[STRIDE*0] = x0
				d_[STRIDE*1] = x1
				d_[STRIDE*2] = x2
				d_[STRIDE*3] = x3
				d_[STRIDE*4] = x4 + s4
				d_[STRIDE*5] = x5 + s5
				d_[STRIDE*6] = x6 + s6
				d_[STRIDE*7] = x7 + s7
				d_[STRIDE*8] = x8 + s8
				d_[STRIDE*9] = x9 + s9
				d_[STRIDE*10] = x10 + s10
				d_[STRIDE*11] = x11 + s11
				d_[STRIDE*12] = x12
				d_[STRIDE*13] = x13 + s13
				d_[STRIDE*14] = x14 + s14
				d_[STRIDE*15] = x15 + s15
			} else {
				d_[STRIDE*0] = intrinsics.byte_swap(x0)
				d_[STRIDE*1] = intrinsics.byte_swap(x1)
				d_[STRIDE*2] = intrinsics.byte_swap(x2)
				d_[STRIDE*3] = intrinsics.byte_swap(x3)
				d_[STRIDE*4] = intrinsics.byte_swap(x4 + s4)
				d_[STRIDE*5] = intrinsics.byte_swap(x5 + s5)
				d_[STRIDE*6] = intrinsics.byte_swap(x6 + s6)
				d_[STRIDE*7] = intrinsics.byte_swap(x7 + s7)
				d_[STRIDE*8] = intrinsics.byte_swap(x8 + s8)
				d_[STRIDE*9] = intrinsics.byte_swap(x9 + s9)
				d_[STRIDE*10] = intrinsics.byte_swap(x10 + s10)
				d_[STRIDE*11] = intrinsics.byte_swap(x11 + s11)
				d_[STRIDE*12] = intrinsics.byte_swap(x12)
				d_[STRIDE*13] = intrinsics.byte_swap(x13 + s13)
				d_[STRIDE*14] = intrinsics.byte_swap(x14 + s14)
				d_[STRIDE*15] = intrinsics.byte_swap(x15 + s15)
			}

			s12 = s12 + 1 // Increment the counter
		}
		dst = dst[16*4:]
	}
}
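/*
	Output layout, for reference: the inner loops above interleave the 4
	blocks of each group with a stride of 4 words, and `dst` advances by
	64 words per group, so word `j` of block `b` (b in 0..15) ends up at
	index `(b/4)*64 + (b%4) + 4*j`. A hypothetical de-interleaving helper,
	shown only to document the layout (not used by the runtime):

	block_word :: proc "contextless" (buf: [^]u32, b, j: int) -> u32 {
		return buf[(b / 4) * 64 + (b % 4) + 4 * j]
	}
*/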
// This replicates `rotate_left32` from `core:math/bits`, under the
// assumption that this will live in `base:runtime`.
@(require_results, private = "file")
rotl :: #force_inline proc "contextless" (x: u32, k: int) -> u32 {
	n :: 32
	s := uint(k) & (n-1)
	return x << s | x >> (n-s)
}
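/*
	A minimal sanity-check sketch for `rotl` against the `core:math/bits`
	procedure it replicates. Hypothetical: since `rotl` is file-private,
	such a check would have to live in this file (or temporarily relax
	the attribute), and it assumes `core:testing` is usable from here:

	import "core:math/bits"
	import "core:testing"

	@(test)
	rotl_matches_rotate_left32 :: proc(t: ^testing.T) {
		for k in 0 ..< 32 {
			testing.expect_value(t, rotl(0xdeadbeef, k), bits.rotate_left32(0xdeadbeef, k))
		}
	}
*/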