package runtime

import "base:intrinsics"

@(private)
chacha8rand_refill_ref :: proc(r: ^Default_Random_State) {
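	// One call generates 16 ChaCha8 blocks (4 groups of 4 interleaved
	// blocks) into r._buf. The 32-byte key sits at byte offset
	// RNG_OUTPUT_PER_ITER, so, as in Go's chacha8rand, the tail of one
	// iteration's output becomes the key for the next.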
	// Initialize the base state.
	k: [^]u32 = (^u32)(raw_data(r._buf[RNG_OUTPUT_PER_ITER:]))
	when ODIN_ENDIAN == .Little {
		s4 := k[0]
		s5 := k[1]
		s6 := k[2]
		s7 := k[3]
		s8 := k[4]
		s9 := k[5]
		s10 := k[6]
		s11 := k[7]
	} else {
		s4 := intrinsics.byte_swap(k[0])
		s5 := intrinsics.byte_swap(k[1])
		s6 := intrinsics.byte_swap(k[2])
		s7 := intrinsics.byte_swap(k[3])
		s8 := intrinsics.byte_swap(k[4])
		s9 := intrinsics.byte_swap(k[5])
		s10 := intrinsics.byte_swap(k[6])
		s11 := intrinsics.byte_swap(k[7])
	}
	s12: u32           // Counter starts at 0.
	s13, s14, s15: u32 // IV of all 0s.
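	// The remaining state words s0..s3 are the ChaCha "expand 32-byte k"
	// constants. They never change, so they are passed directly as
	// CHACHA_SIGMA_0 through CHACHA_SIGMA_3 in the quarter rounds below.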
	dst: [^]u32 = (^u32)(raw_data(r._buf[:]))

	// At least with LLVM 21, #force_inline produces performance
	// identical to manual inlining, yay.
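	// The standard ChaCha quarter round (RFC 8439, section 2.1). Note
	// that `~` is Odin's binary XOR operator.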
	quarter_round := #force_inline proc "contextless" (a, b, c, d: u32) -> (u32, u32, u32, u32) {
		a, b, c, d := a, b, c, d
		a += b
		d ~= a
		d = rotl(d, 16)
		c += d
		b ~= c
		b = rotl(b, 12)
		a += b
		d ~= a
		d = rotl(d, 8)
		c += d
		b ~= c
		b = rotl(b, 7)
		return a, b, c, d
	}
	// Filippo Valsorda observed that only one of the four column
	// quarter-rounds depends on the counter (s12), so the other three
	// are worth precomputing and reusing across multiple blocks. As far
	// as I know, only Go's chacha implementation does this.
	p1, p5, p9, p13 := quarter_round(CHACHA_SIGMA_1, s5, s9, s13)
	p2, p6, p10, p14 := quarter_round(CHACHA_SIGMA_2, s6, s10, s14)
	p3, p7, p11, p15 := quarter_round(CHACHA_SIGMA_3, s7, s11, s15)
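	// The remaining column quarter-round (p0, p4, p8, p12) cannot be
	// hoisted: it depends on s12, which changes for every block, so it
	// is redone inside the loop.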
	// 4 groups
	for g := 0; g < 4; g = g + 1 {
		// 4 blocks per group
		for n := 0; n < 4; n = n + 1 {
			// First column round that depends on the counter
			p0, p4, p8, p12 := quarter_round(CHACHA_SIGMA_0, s4, s8, s12)
			// First diagonal round
			x0, x5, x10, x15 := quarter_round(p0, p5, p10, p15)
			x1, x6, x11, x12 := quarter_round(p1, p6, p11, p12)
			x2, x7, x8, x13 := quarter_round(p2, p7, p8, p13)
			x3, x4, x9, x14 := quarter_round(p3, p4, p9, p14)
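			// Each iteration below is one double round: a column round
			// followed by a diagonal round. With the column + diagonal
			// round above, that totals CHACHA_ROUNDS (8) rounds.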
			for i := CHACHA_ROUNDS - 2; i > 0; i = i - 2 {
				x0, x4, x8, x12 = quarter_round(x0, x4, x8, x12)
				x1, x5, x9, x13 = quarter_round(x1, x5, x9, x13)
				x2, x6, x10, x14 = quarter_round(x2, x6, x10, x14)
				x3, x7, x11, x15 = quarter_round(x3, x7, x11, x15)
				x0, x5, x10, x15 = quarter_round(x0, x5, x10, x15)
				x1, x6, x11, x12 = quarter_round(x1, x6, x11, x12)
				x2, x7, x8, x13 = quarter_round(x2, x7, x8, x13)
				x3, x4, x9, x14 = quarter_round(x3, x4, x9, x14)
			}
			// Interleave the 4 blocks of this group: word j of block n
			// lands at dst[n + STRIDE*j], the layout a 4-wide SIMD
			// implementation produces naturally.
			// NB: Per the ChaCha8Rand spec, the additions of sigma
			// (words 0..3) and the counter (word 12) are omitted.
			STRIDE :: 4
			d_ := dst[n:]
			when ODIN_ENDIAN == .Little {
				d_[STRIDE*0] = x0
				d_[STRIDE*1] = x1
				d_[STRIDE*2] = x2
				d_[STRIDE*3] = x3
				d_[STRIDE*4] = x4 + s4
				d_[STRIDE*5] = x5 + s5
				d_[STRIDE*6] = x6 + s6
				d_[STRIDE*7] = x7 + s7
				d_[STRIDE*8] = x8 + s8
				d_[STRIDE*9] = x9 + s9
				d_[STRIDE*10] = x10 + s10
				d_[STRIDE*11] = x11 + s11
				d_[STRIDE*12] = x12
				d_[STRIDE*13] = x13 + s13
				d_[STRIDE*14] = x14 + s14
				d_[STRIDE*15] = x15 + s15
			} else {
				d_[STRIDE*0] = intrinsics.byte_swap(x0)
				d_[STRIDE*1] = intrinsics.byte_swap(x1)
				d_[STRIDE*2] = intrinsics.byte_swap(x2)
				d_[STRIDE*3] = intrinsics.byte_swap(x3)
				d_[STRIDE*4] = intrinsics.byte_swap(x4 + s4)
				d_[STRIDE*5] = intrinsics.byte_swap(x5 + s5)
				d_[STRIDE*6] = intrinsics.byte_swap(x6 + s6)
				d_[STRIDE*7] = intrinsics.byte_swap(x7 + s7)
				d_[STRIDE*8] = intrinsics.byte_swap(x8 + s8)
				d_[STRIDE*9] = intrinsics.byte_swap(x9 + s9)
				d_[STRIDE*10] = intrinsics.byte_swap(x10 + s10)
				d_[STRIDE*11] = intrinsics.byte_swap(x11 + s11)
				d_[STRIDE*12] = intrinsics.byte_swap(x12)
				d_[STRIDE*13] = intrinsics.byte_swap(x13 + s13)
				d_[STRIDE*14] = intrinsics.byte_swap(x14 + s14)
				d_[STRIDE*15] = intrinsics.byte_swap(x15 + s15)
			}
			s12 = s12 + 1 // Increment the counter
		}
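		// Advance the output pointer to the next group: 4 blocks of 16
		// words each.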
		dst = dst[16*4:]
	}
}

// This replicates `rotate_left32` from `core:math/bits`, under the
// assumption that this will live in `base:runtime`.
@(require_results, private = "file")
rotl :: #force_inline proc "contextless" (x: u32, k: int) -> u32 {
	n :: 32
	s := uint(k) & (n-1)
	return x << s | x >> (n-s)
}
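
To make the buffer layout concrete, here is a minimal seeding sketch. It is hypothetical, not part of the runtime: it assumes only what the listing shows, namely that `Default_Random_State` exposes `_buf` as a byte buffer and that `RNG_OUTPUT_PER_ITER` is the byte offset of the 32-byte key. Since `chacha8rand_refill_ref` is `@(private)`, the sketch would have to live in the same package.

seed_and_refill_example :: proc(seed: [32]byte) {
	r: Default_Random_State
	// Place the initial key where the refill reads it from.
	copy(r._buf[RNG_OUTPUT_PER_ITER:], seed[:])
	chacha8rand_refill_ref(&r)
	// r._buf[:RNG_OUTPUT_PER_ITER] now holds usable output; the last 32
	// bytes written double as the key for the next refill.
}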