package runtime

import "base:intrinsics"

@(private)
chacha8rand_refill_ref :: proc(r: ^Default_Random_State) {
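	// One call generates 16 ChaCha8 blocks (4 groups of 4 interleaved
	// blocks) into r._buf. The 32-byte key sits at byte offset
	// RNG_OUTPUT_PER_ITER, so, as in Go's chacha8rand, the tail of one
	// iteration's output becomes the key for the next.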
	// Initialize the base state.
	k: [^]u32 = (^u32)(raw_data(r._buf[RNG_OUTPUT_PER_ITER:]))
	when ODIN_ENDIAN == .Little {
		s4 := k[0]
		s5 := k[1]
		s6 := k[2]
		s7 := k[3]
		s8 := k[4]
		s9 := k[5]
		s10 := k[6]
		s11 := k[7]
	} else {
		s4 := intrinsics.byte_swap(k[0])
		s5 := intrinsics.byte_swap(k[1])
		s6 := intrinsics.byte_swap(k[2])
		s7 := intrinsics.byte_swap(k[3])
		s8 := intrinsics.byte_swap(k[4])
		s9 := intrinsics.byte_swap(k[5])
		s10 := intrinsics.byte_swap(k[6])
		s11 := intrinsics.byte_swap(k[7])
	}
	s12: u32           // Counter starts at 0.
	s13, s14, s15: u32 // IV of all 0s.
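	// The remaining state words s0..s3 are the ChaCha "expand 32-byte k"
	// constants. They never change, so they are passed directly as
	// CHACHA_SIGMA_0 through CHACHA_SIGMA_3 in the quarter rounds below.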
	dst: [^]u32 = (^u32)(raw_data(r._buf[:]))

	// At least with LLVM 21, #force_inline produces performance
	// identical to manual inlining, yay.
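	// The standard ChaCha quarter round (RFC 8439, section 2.1). Note
	// that `~` is Odin's binary XOR operator.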
	quarter_round := #force_inline proc "contextless" (a, b, c, d: u32) -> (u32, u32, u32, u32) {
		a, b, c, d := a, b, c, d
		a += b
		d ~= a
		d = rotl(d, 16)
		c += d
		b ~= c
		b = rotl(b, 12)
		a += b
		d ~= a
		d = rotl(d, 8)
		c += d
		b ~= c
		b = rotl(b, 7)
		return a, b, c, d
	}
	// Filippo Valsorda observed that only one of the four column
	// quarter-rounds depends on the counter (s12), so the other three
	// are worth precomputing and reusing across multiple blocks. As far
	// as I know, only Go's chacha implementation does this.
	p1, p5, p9, p13 := quarter_round(CHACHA_SIGMA_1, s5, s9, s13)
	p2, p6, p10, p14 := quarter_round(CHACHA_SIGMA_2, s6, s10, s14)
	p3, p7, p11, p15 := quarter_round(CHACHA_SIGMA_3, s7, s11, s15)
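	// The remaining column quarter-round (p0, p4, p8, p12) cannot be
	// hoisted: it depends on s12, which changes for every block, so it
	// is redone inside the loop.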
	// 4 groups
	for g := 0; g < 4; g = g + 1 {
		// 4 blocks per group
		for n := 0; n < 4; n = n + 1 {
			// First column round that depends on the counter
			p0, p4, p8, p12 := quarter_round(CHACHA_SIGMA_0, s4, s8, s12)
			// First diagonal round
			x0, x5, x10, x15 := quarter_round(p0, p5, p10, p15)
			x1, x6, x11, x12 := quarter_round(p1, p6, p11, p12)
			x2, x7, x8, x13 := quarter_round(p2, p7, p8, p13)
			x3, x4, x9, x14 := quarter_round(p3, p4, p9, p14)
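			// Each iteration below is one double round: a column round
			// followed by a diagonal round. With the column + diagonal
			// round above, that totals CHACHA_ROUNDS (8) rounds.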
			for i := CHACHA_ROUNDS - 2; i > 0; i = i - 2 {
				x0, x4, x8, x12 = quarter_round(x0, x4, x8, x12)
				x1, x5, x9, x13 = quarter_round(x1, x5, x9, x13)
				x2, x6, x10, x14 = quarter_round(x2, x6, x10, x14)
				x3, x7, x11, x15 = quarter_round(x3, x7, x11, x15)
				x0, x5, x10, x15 = quarter_round(x0, x5, x10, x15)
				x1, x6, x11, x12 = quarter_round(x1, x6, x11, x12)
				x2, x7, x8, x13 = quarter_round(x2, x7, x8, x13)
				x3, x4, x9, x14 = quarter_round(x3, x4, x9, x14)
			}
			// Interleave the 4 blocks of this group: word j of block n
			// lands at dst[n + STRIDE*j], the layout a 4-wide SIMD
			// implementation produces naturally.
			// NB: Per the ChaCha8Rand spec, the additions of sigma
			// (words 0..3) and the counter (word 12) are omitted.
			STRIDE :: 4
			d_ := dst[n:]
			when ODIN_ENDIAN == .Little {
				d_[STRIDE*0] = x0
				d_[STRIDE*1] = x1
				d_[STRIDE*2] = x2
				d_[STRIDE*3] = x3
				d_[STRIDE*4] = x4 + s4
				d_[STRIDE*5] = x5 + s5
				d_[STRIDE*6] = x6 + s6
				d_[STRIDE*7] = x7 + s7
				d_[STRIDE*8] = x8 + s8
				d_[STRIDE*9] = x9 + s9
				d_[STRIDE*10] = x10 + s10
				d_[STRIDE*11] = x11 + s11
				d_[STRIDE*12] = x12
				d_[STRIDE*13] = x13 + s13
				d_[STRIDE*14] = x14 + s14
				d_[STRIDE*15] = x15 + s15
			} else {
				d_[STRIDE*0] = intrinsics.byte_swap(x0)
				d_[STRIDE*1] = intrinsics.byte_swap(x1)
				d_[STRIDE*2] = intrinsics.byte_swap(x2)
				d_[STRIDE*3] = intrinsics.byte_swap(x3)
				d_[STRIDE*4] = intrinsics.byte_swap(x4 + s4)
				d_[STRIDE*5] = intrinsics.byte_swap(x5 + s5)
				d_[STRIDE*6] = intrinsics.byte_swap(x6 + s6)
				d_[STRIDE*7] = intrinsics.byte_swap(x7 + s7)
				d_[STRIDE*8] = intrinsics.byte_swap(x8 + s8)
				d_[STRIDE*9] = intrinsics.byte_swap(x9 + s9)
				d_[STRIDE*10] = intrinsics.byte_swap(x10 + s10)
				d_[STRIDE*11] = intrinsics.byte_swap(x11 + s11)
				d_[STRIDE*12] = intrinsics.byte_swap(x12)
				d_[STRIDE*13] = intrinsics.byte_swap(x13 + s13)
				d_[STRIDE*14] = intrinsics.byte_swap(x14 + s14)
				d_[STRIDE*15] = intrinsics.byte_swap(x15 + s15)
			}
			s12 = s12 + 1 // Increment the counter
		}
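		// Advance the output pointer to the next group: 4 blocks of 16
		// words each.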
		dst = dst[16*4:]
	}
}

// This replicates `rotate_left32` from `core:math/bits`, under the
// assumption that this will live in `base:runtime`.
@(require_results, private = "file")
rotl :: #force_inline proc "contextless" (x: u32, k: int) -> u32 {
	n :: 32
	s := uint(k) & (n-1)
	return x << s | x >> (n-s)
}
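
To make the buffer layout concrete, here is a minimal seeding sketch. It is hypothetical, not part of the runtime: it assumes only what the listing shows, namely that `Default_Random_State` exposes `_buf` as a byte buffer and that `RNG_OUTPUT_PER_ITER` is the byte offset of the 32-byte key. Since `chacha8rand_refill_ref` is `@(private)`, the sketch would have to live in the same package.

seed_and_refill_example :: proc(seed: [32]byte) {
	r: Default_Random_State
	// Place the initial key where the refill reads it from.
	copy(r._buf[RNG_OUTPUT_PER_ITER:], seed[:])
	chacha8rand_refill_ref(&r)
	// r._buf[:RNG_OUTPUT_PER_ITER] now holds usable output; the last 32
	// bytes written double as the key for the next refill.
}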