author	Yawning Angel <yawning@schwanenlied.me>	2023-12-16 12:24:24 +0900
committer	Yawning Angel <yawning@schwanenlied.me>	2024-08-10 18:32:37 +0900
commit	1f3107e693fe4cf72518a549f9a25aec9ec9c485 (patch)
tree	062f6234b870d9e1c43a64526700bf06f8aeb465 /core/crypto
parent	708f053fe61e729864c82be9d9d7c84c396aa187 (diff)
core/crypto/chacha20: Use 128-bit/256-bit SIMD
Diffstat (limited to 'core/crypto')
-rw-r--r--	core/crypto/_chacha20/chacha20.odin	123
-rw-r--r--	core/crypto/_chacha20/ref/chacha20_ref.odin	360
-rw-r--r--	core/crypto/_chacha20/simd128/chacha20_simd128.odin	481
-rw-r--r--	core/crypto/_chacha20/simd256/chacha20_simd256.odin	319
-rw-r--r--	core/crypto/_chacha20/simd256/chacha20_simd256_stub.odin	17
-rw-r--r--	core/crypto/chacha20/chacha20.odin	503
-rw-r--r--	core/crypto/chacha20/chacha20_impl.odin	52
7 files changed, 1397 insertions, 458 deletions
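Example (editor's sketch, not part of this commit): minimal usage of the reworked package API, assuming the new optional impl parameter on init shown below. Key and nonce are zeroed placeholders here; real callers must supply a secret key and a unique nonce (a 24-byte nonce selects XChaCha20).

package example

import "core:crypto/chacha20"

main :: proc() {
	key: [chacha20.KEY_SIZE]byte     // placeholder; use a real secret key
	nonce: [chacha20.NONCE_SIZE]byte // placeholder; must be unique per key

	ctx: chacha20.Context
	// impl defaults to .Simd256; init falls back to .Simd128 or .Portable
	// when the host lacks the required SIMD features.
	chacha20.init(&ctx, key[:], nonce[:])
	defer chacha20.reset(&ctx)

	msg := "hello, world"
	plaintext := transmute([]byte)msg
	ciphertext := make([]byte, len(plaintext))
	defer delete(ciphertext)

	chacha20.xor_bytes(&ctx, ciphertext, plaintext)
}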
diff --git a/core/crypto/_chacha20/chacha20.odin b/core/crypto/_chacha20/chacha20.odin
new file mode 100644
index 000000000..3ede592b7
--- /dev/null
+++ b/core/crypto/_chacha20/chacha20.odin
@@ -0,0 +1,123 @@
+package _chacha20
+
+import "base:intrinsics"
+import "core:encoding/endian"
+import "core:math/bits"
+import "core:mem"
+
+// KEY_SIZE is the (X)ChaCha20 key size in bytes.
+KEY_SIZE :: 32
+// NONCE_SIZE is the ChaCha20 nonce size in bytes.
+NONCE_SIZE :: 12
+// XNONCE_SIZE is the XChaCha20 nonce size in bytes.
+XNONCE_SIZE :: 24
+
+// MAX_CTR_IETF is the maximum counter value for the IETF flavor of ChaCha20.
+MAX_CTR_IETF :: 0xffffffff
+// BLOCK_SIZE is the (X)ChaCha20 block size in bytes.
+BLOCK_SIZE :: 64
+// STATE_SIZE_U32 is the (X)ChaCha20 state size in u32s.
+STATE_SIZE_U32 :: 16
+// ROUNDS is the (X)ChaCha20 round count.
+ROUNDS :: 20
+
+// SIGMA_0 is sigma[0:4].
+SIGMA_0: u32 : 0x61707865
+// SIGMA_1 is sigma[4:8].
+SIGMA_1: u32 : 0x3320646e
+// SIGMA_2 is sigma[8:12].
+SIGMA_2: u32 : 0x79622d32
+// SIGMA_3 is sigma[12:16].
+SIGMA_3: u32 : 0x6b206574
+
+// Context is a ChaCha20 or XChaCha20 instance.
+Context :: struct {
+ _s: [STATE_SIZE_U32]u32,
+ _buffer: [BLOCK_SIZE]byte,
+ _off: int,
+ _is_ietf_flavor: bool,
+ _is_initialized: bool,
+}
+
+// init initializes a Context for ChaCha20 with the provided key and
+// nonce.
+//
+// WARNING: This ONLY handles ChaCha20. XChaCha20 sub-key and nonce
+// derivation is expected to be handled by the caller, so that the
+// HChaCha20 call can be suitably accelerated.
+init :: proc "contextless" (ctx: ^Context, key, nonce: []byte, is_xchacha: bool) {
+ if len(key) != KEY_SIZE || len(nonce) != NONCE_SIZE {
+ intrinsics.trap()
+ }
+
+ k, n := key, nonce
+
+ ctx._s[0] = SIGMA_0
+ ctx._s[1] = SIGMA_1
+ ctx._s[2] = SIGMA_2
+ ctx._s[3] = SIGMA_3
+ ctx._s[4] = endian.unchecked_get_u32le(k[0:4])
+ ctx._s[5] = endian.unchecked_get_u32le(k[4:8])
+ ctx._s[6] = endian.unchecked_get_u32le(k[8:12])
+ ctx._s[7] = endian.unchecked_get_u32le(k[12:16])
+ ctx._s[8] = endian.unchecked_get_u32le(k[16:20])
+ ctx._s[9] = endian.unchecked_get_u32le(k[20:24])
+ ctx._s[10] = endian.unchecked_get_u32le(k[24:28])
+ ctx._s[11] = endian.unchecked_get_u32le(k[28:32])
+ ctx._s[12] = 0
+ ctx._s[13] = endian.unchecked_get_u32le(n[0:4])
+ ctx._s[14] = endian.unchecked_get_u32le(n[4:8])
+ ctx._s[15] = endian.unchecked_get_u32le(n[8:12])
+
+ ctx._off = BLOCK_SIZE
+ ctx._is_ietf_flavor = !is_xchacha
+ ctx._is_initialized = true
+}
+
+// seek seeks the (X)ChaCha20 stream counter to the specified block.
+seek :: proc(ctx: ^Context, block_nr: u64) {
+ assert(ctx._is_initialized)
+
+ if ctx._is_ietf_flavor {
+ if block_nr > MAX_CTR_IETF {
+ panic("crypto/chacha20: attempted to seek past maximum counter")
+ }
+ } else {
+ ctx._s[13] = u32(block_nr >> 32)
+ }
+ ctx._s[12] = u32(block_nr)
+ ctx._off = BLOCK_SIZE
+}
+
+// reset sanitizes the Context. The Context must be re-initialized to
+// be used again.
+reset :: proc(ctx: ^Context) {
+ mem.zero_explicit(&ctx._s, size_of(ctx._s))
+ mem.zero_explicit(&ctx._buffer, size_of(ctx._buffer))
+
+ ctx._is_initialized = false
+}
+
+check_counter_limit :: proc(ctx: ^Context, nr_blocks: int) {
+ // Enforce the maximum consumed keystream per nonce.
+ //
+ // While all modern "standard" definitions of ChaCha20 use
+ // the IETF 32-bit counter, for XChaCha20 most common
+ // implementations allow for a 64-bit counter.
+ //
+	// Honestly, the answer here is "use an MRAE primitive", but
+ // go with "common" practice in the case of XChaCha20.
+
+ ERR_CTR_EXHAUSTED :: "crypto/chacha20: maximum (X)ChaCha20 keystream per nonce reached"
+
+ if ctx._is_ietf_flavor {
+ if u64(ctx._s[12]) + u64(nr_blocks) > MAX_CTR_IETF {
+ panic(ERR_CTR_EXHAUSTED)
+ }
+ } else {
+ ctr := (u64(ctx._s[13]) << 32) | u64(ctx._s[12])
+ if _, carry := bits.add_u64(ctr, u64(nr_blocks), 0); carry != 0 {
+ panic(ERR_CTR_EXHAUSTED)
+ }
+ }
+}
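Illustration (editor's sketch, not part of this commit): the per-nonce keystream budget that check_counter_limit enforces. With the IETF 32-bit counter the keystream tops out at roughly 2^32 blocks of 64 bytes, i.e. about 256 GiB per nonce; the XChaCha20 path only requires that the 64-bit counter never wraps.

package counter_budget

import "core:fmt"

main :: proc() {
	MAX_CTR_IETF :: 0xffffffff
	BLOCK_SIZE :: 64

	// Approximate upper bound on IETF ChaCha20 keystream per nonce.
	max_bytes := (u64(MAX_CTR_IETF) + 1) * BLOCK_SIZE
	fmt.printf("~%d GiB of keystream per nonce\n", max_bytes / (1 << 30))
}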
diff --git a/core/crypto/_chacha20/ref/chacha20_ref.odin b/core/crypto/_chacha20/ref/chacha20_ref.odin
new file mode 100644
index 000000000..27104b8e4
--- /dev/null
+++ b/core/crypto/_chacha20/ref/chacha20_ref.odin
@@ -0,0 +1,360 @@
+package chacha20_ref
+
+import "core:crypto/_chacha20"
+import "core:encoding/endian"
+import "core:math/bits"
+
+stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) {
+ // Enforce the maximum consumed keystream per nonce.
+ _chacha20.check_counter_limit(ctx, nr_blocks)
+
+ dst, src := dst, src
+ x := &ctx._s
+ for n := 0; n < nr_blocks; n = n + 1 {
+ x0, x1, x2, x3 :=
+ _chacha20.SIGMA_0, _chacha20.SIGMA_1, _chacha20.SIGMA_2, _chacha20.SIGMA_3
+ x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 :=
+ x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
+
+ for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+			// Even when forcing inlining, manually inlining all of
+			// these is decently faster.
+
+ // quarterround(x, 0, 4, 8, 12)
+ x0 += x4
+ x12 ~= x0
+ x12 = bits.rotate_left32(x12, 16)
+ x8 += x12
+ x4 ~= x8
+ x4 = bits.rotate_left32(x4, 12)
+ x0 += x4
+ x12 ~= x0
+ x12 = bits.rotate_left32(x12, 8)
+ x8 += x12
+ x4 ~= x8
+ x4 = bits.rotate_left32(x4, 7)
+
+ // quarterround(x, 1, 5, 9, 13)
+ x1 += x5
+ x13 ~= x1
+ x13 = bits.rotate_left32(x13, 16)
+ x9 += x13
+ x5 ~= x9
+ x5 = bits.rotate_left32(x5, 12)
+ x1 += x5
+ x13 ~= x1
+ x13 = bits.rotate_left32(x13, 8)
+ x9 += x13
+ x5 ~= x9
+ x5 = bits.rotate_left32(x5, 7)
+
+ // quarterround(x, 2, 6, 10, 14)
+ x2 += x6
+ x14 ~= x2
+ x14 = bits.rotate_left32(x14, 16)
+ x10 += x14
+ x6 ~= x10
+ x6 = bits.rotate_left32(x6, 12)
+ x2 += x6
+ x14 ~= x2
+ x14 = bits.rotate_left32(x14, 8)
+ x10 += x14
+ x6 ~= x10
+ x6 = bits.rotate_left32(x6, 7)
+
+ // quarterround(x, 3, 7, 11, 15)
+ x3 += x7
+ x15 ~= x3
+ x15 = bits.rotate_left32(x15, 16)
+ x11 += x15
+ x7 ~= x11
+ x7 = bits.rotate_left32(x7, 12)
+ x3 += x7
+ x15 ~= x3
+ x15 = bits.rotate_left32(x15, 8)
+ x11 += x15
+ x7 ~= x11
+ x7 = bits.rotate_left32(x7, 7)
+
+ // quarterround(x, 0, 5, 10, 15)
+ x0 += x5
+ x15 ~= x0
+ x15 = bits.rotate_left32(x15, 16)
+ x10 += x15
+ x5 ~= x10
+ x5 = bits.rotate_left32(x5, 12)
+ x0 += x5
+ x15 ~= x0
+ x15 = bits.rotate_left32(x15, 8)
+ x10 += x15
+ x5 ~= x10
+ x5 = bits.rotate_left32(x5, 7)
+
+ // quarterround(x, 1, 6, 11, 12)
+ x1 += x6
+ x12 ~= x1
+ x12 = bits.rotate_left32(x12, 16)
+ x11 += x12
+ x6 ~= x11
+ x6 = bits.rotate_left32(x6, 12)
+ x1 += x6
+ x12 ~= x1
+ x12 = bits.rotate_left32(x12, 8)
+ x11 += x12
+ x6 ~= x11
+ x6 = bits.rotate_left32(x6, 7)
+
+ // quarterround(x, 2, 7, 8, 13)
+ x2 += x7
+ x13 ~= x2
+ x13 = bits.rotate_left32(x13, 16)
+ x8 += x13
+ x7 ~= x8
+ x7 = bits.rotate_left32(x7, 12)
+ x2 += x7
+ x13 ~= x2
+ x13 = bits.rotate_left32(x13, 8)
+ x8 += x13
+ x7 ~= x8
+ x7 = bits.rotate_left32(x7, 7)
+
+ // quarterround(x, 3, 4, 9, 14)
+ x3 += x4
+ x14 ~= x3
+ x14 = bits.rotate_left32(x14, 16)
+ x9 += x14
+ x4 ~= x9
+ x4 = bits.rotate_left32(x4, 12)
+ x3 += x4
+ x14 ~= x3
+ x14 = bits.rotate_left32(x14, 8)
+ x9 += x14
+ x4 ~= x9
+ x4 = bits.rotate_left32(x4, 7)
+ }
+
+ x0 += _chacha20.SIGMA_0
+ x1 += _chacha20.SIGMA_1
+ x2 += _chacha20.SIGMA_2
+ x3 += _chacha20.SIGMA_3
+ x4 += x[4]
+ x5 += x[5]
+ x6 += x[6]
+ x7 += x[7]
+ x8 += x[8]
+ x9 += x[9]
+ x10 += x[10]
+ x11 += x[11]
+ x12 += x[12]
+ x13 += x[13]
+ x14 += x[14]
+ x15 += x[15]
+
+		// The unchecked serialization below is fine:
+		// - The caller(s) ensure that src/dst are valid.
+		// - The compiler knows if the target is picky about alignment.
+
+ #no_bounds_check {
+ if src != nil {
+ endian.unchecked_put_u32le(dst[0:4], endian.unchecked_get_u32le(src[0:4]) ~ x0)
+ endian.unchecked_put_u32le(dst[4:8], endian.unchecked_get_u32le(src[4:8]) ~ x1)
+ endian.unchecked_put_u32le(dst[8:12], endian.unchecked_get_u32le(src[8:12]) ~ x2)
+ endian.unchecked_put_u32le(dst[12:16], endian.unchecked_get_u32le(src[12:16]) ~ x3)
+ endian.unchecked_put_u32le(dst[16:20], endian.unchecked_get_u32le(src[16:20]) ~ x4)
+ endian.unchecked_put_u32le(dst[20:24], endian.unchecked_get_u32le(src[20:24]) ~ x5)
+ endian.unchecked_put_u32le(dst[24:28], endian.unchecked_get_u32le(src[24:28]) ~ x6)
+ endian.unchecked_put_u32le(dst[28:32], endian.unchecked_get_u32le(src[28:32]) ~ x7)
+ endian.unchecked_put_u32le(dst[32:36], endian.unchecked_get_u32le(src[32:36]) ~ x8)
+ endian.unchecked_put_u32le(dst[36:40], endian.unchecked_get_u32le(src[36:40]) ~ x9)
+ endian.unchecked_put_u32le(
+ dst[40:44],
+ endian.unchecked_get_u32le(src[40:44]) ~ x10,
+ )
+ endian.unchecked_put_u32le(
+ dst[44:48],
+ endian.unchecked_get_u32le(src[44:48]) ~ x11,
+ )
+ endian.unchecked_put_u32le(
+ dst[48:52],
+ endian.unchecked_get_u32le(src[48:52]) ~ x12,
+ )
+ endian.unchecked_put_u32le(
+ dst[52:56],
+ endian.unchecked_get_u32le(src[52:56]) ~ x13,
+ )
+ endian.unchecked_put_u32le(
+ dst[56:60],
+ endian.unchecked_get_u32le(src[56:60]) ~ x14,
+ )
+ endian.unchecked_put_u32le(
+ dst[60:64],
+ endian.unchecked_get_u32le(src[60:64]) ~ x15,
+ )
+ src = src[_chacha20.BLOCK_SIZE:]
+ } else {
+ endian.unchecked_put_u32le(dst[0:4], x0)
+ endian.unchecked_put_u32le(dst[4:8], x1)
+ endian.unchecked_put_u32le(dst[8:12], x2)
+ endian.unchecked_put_u32le(dst[12:16], x3)
+ endian.unchecked_put_u32le(dst[16:20], x4)
+ endian.unchecked_put_u32le(dst[20:24], x5)
+ endian.unchecked_put_u32le(dst[24:28], x6)
+ endian.unchecked_put_u32le(dst[28:32], x7)
+ endian.unchecked_put_u32le(dst[32:36], x8)
+ endian.unchecked_put_u32le(dst[36:40], x9)
+ endian.unchecked_put_u32le(dst[40:44], x10)
+ endian.unchecked_put_u32le(dst[44:48], x11)
+ endian.unchecked_put_u32le(dst[48:52], x12)
+ endian.unchecked_put_u32le(dst[52:56], x13)
+ endian.unchecked_put_u32le(dst[56:60], x14)
+ endian.unchecked_put_u32le(dst[60:64], x15)
+ }
+ dst = dst[_chacha20.BLOCK_SIZE:]
+ }
+
+ // Increment the counter. Overflow checking is done upon
+ // entry into the routine, so a 64-bit increment safely
+ // covers both cases.
+ new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + 1
+ x[12] = u32(new_ctr)
+ x[13] = u32(new_ctr >> 32)
+ }
+}
+
+hchacha20 :: proc "contextless" (dst, key, nonce: []byte) {
+ x0, x1, x2, x3 := _chacha20.SIGMA_0, _chacha20.SIGMA_1, _chacha20.SIGMA_2, _chacha20.SIGMA_3
+ x4 := endian.unchecked_get_u32le(key[0:4])
+ x5 := endian.unchecked_get_u32le(key[4:8])
+ x6 := endian.unchecked_get_u32le(key[8:12])
+ x7 := endian.unchecked_get_u32le(key[12:16])
+ x8 := endian.unchecked_get_u32le(key[16:20])
+ x9 := endian.unchecked_get_u32le(key[20:24])
+ x10 := endian.unchecked_get_u32le(key[24:28])
+ x11 := endian.unchecked_get_u32le(key[28:32])
+ x12 := endian.unchecked_get_u32le(nonce[0:4])
+ x13 := endian.unchecked_get_u32le(nonce[4:8])
+ x14 := endian.unchecked_get_u32le(nonce[8:12])
+ x15 := endian.unchecked_get_u32le(nonce[12:16])
+
+ for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+ // quarterround(x, 0, 4, 8, 12)
+ x0 += x4
+ x12 ~= x0
+ x12 = bits.rotate_left32(x12, 16)
+ x8 += x12
+ x4 ~= x8
+ x4 = bits.rotate_left32(x4, 12)
+ x0 += x4
+ x12 ~= x0
+ x12 = bits.rotate_left32(x12, 8)
+ x8 += x12
+ x4 ~= x8
+ x4 = bits.rotate_left32(x4, 7)
+
+ // quarterround(x, 1, 5, 9, 13)
+ x1 += x5
+ x13 ~= x1
+ x13 = bits.rotate_left32(x13, 16)
+ x9 += x13
+ x5 ~= x9
+ x5 = bits.rotate_left32(x5, 12)
+ x1 += x5
+ x13 ~= x1
+ x13 = bits.rotate_left32(x13, 8)
+ x9 += x13
+ x5 ~= x9
+ x5 = bits.rotate_left32(x5, 7)
+
+ // quarterround(x, 2, 6, 10, 14)
+ x2 += x6
+ x14 ~= x2
+ x14 = bits.rotate_left32(x14, 16)
+ x10 += x14
+ x6 ~= x10
+ x6 = bits.rotate_left32(x6, 12)
+ x2 += x6
+ x14 ~= x2
+ x14 = bits.rotate_left32(x14, 8)
+ x10 += x14
+ x6 ~= x10
+ x6 = bits.rotate_left32(x6, 7)
+
+ // quarterround(x, 3, 7, 11, 15)
+ x3 += x7
+ x15 ~= x3
+ x15 = bits.rotate_left32(x15, 16)
+ x11 += x15
+ x7 ~= x11
+ x7 = bits.rotate_left32(x7, 12)
+ x3 += x7
+ x15 ~= x3
+ x15 = bits.rotate_left32(x15, 8)
+ x11 += x15
+ x7 ~= x11
+ x7 = bits.rotate_left32(x7, 7)
+
+ // quarterround(x, 0, 5, 10, 15)
+ x0 += x5
+ x15 ~= x0
+ x15 = bits.rotate_left32(x15, 16)
+ x10 += x15
+ x5 ~= x10
+ x5 = bits.rotate_left32(x5, 12)
+ x0 += x5
+ x15 ~= x0
+ x15 = bits.rotate_left32(x15, 8)
+ x10 += x15
+ x5 ~= x10
+ x5 = bits.rotate_left32(x5, 7)
+
+ // quarterround(x, 1, 6, 11, 12)
+ x1 += x6
+ x12 ~= x1
+ x12 = bits.rotate_left32(x12, 16)
+ x11 += x12
+ x6 ~= x11
+ x6 = bits.rotate_left32(x6, 12)
+ x1 += x6
+ x12 ~= x1
+ x12 = bits.rotate_left32(x12, 8)
+ x11 += x12
+ x6 ~= x11
+ x6 = bits.rotate_left32(x6, 7)
+
+ // quarterround(x, 2, 7, 8, 13)
+ x2 += x7
+ x13 ~= x2
+ x13 = bits.rotate_left32(x13, 16)
+ x8 += x13
+ x7 ~= x8
+ x7 = bits.rotate_left32(x7, 12)
+ x2 += x7
+ x13 ~= x2
+ x13 = bits.rotate_left32(x13, 8)
+ x8 += x13
+ x7 ~= x8
+ x7 = bits.rotate_left32(x7, 7)
+
+ // quarterround(x, 3, 4, 9, 14)
+ x3 += x4
+ x14 ~= x3
+ x14 = bits.rotate_left32(x14, 16)
+ x9 += x14
+ x4 ~= x9
+ x4 = bits.rotate_left32(x4, 12)
+ x3 += x4
+ x14 ~= x3
+ x14 = bits.rotate_left32(x14, 8)
+ x9 += x14
+ x4 ~= x9
+ x4 = bits.rotate_left32(x4, 7)
+ }
+
+ endian.unchecked_put_u32le(dst[0:4], x0)
+ endian.unchecked_put_u32le(dst[4:8], x1)
+ endian.unchecked_put_u32le(dst[8:12], x2)
+ endian.unchecked_put_u32le(dst[12:16], x3)
+ endian.unchecked_put_u32le(dst[16:20], x12)
+ endian.unchecked_put_u32le(dst[20:24], x13)
+ endian.unchecked_put_u32le(dst[24:28], x14)
+ endian.unchecked_put_u32le(dst[28:32], x15)
+}
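For reference (editor's sketch, not part of this commit): the manually inlined round bodies above are repeated applications of the ChaCha20 quarterround; the helper the reference implementation deliberately avoids looks roughly like this.

package chacha20_quarterround_sketch

import "core:math/bits"

// Sketch only: one ChaCha20 quarterround over four state words.
quarterround :: proc "contextless" (a, b, c, d: u32) -> (u32, u32, u32, u32) {
	a, b, c, d := a, b, c, d
	a += b; d ~= a; d = bits.rotate_left32(d, 16)
	c += d; b ~= c; b = bits.rotate_left32(b, 12)
	a += b; d ~= a; d = bits.rotate_left32(d, 8)
	c += d; b ~= c; b = bits.rotate_left32(b, 7)
	return a, b, c, d
}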
diff --git a/core/crypto/_chacha20/simd128/chacha20_simd128.odin b/core/crypto/_chacha20/simd128/chacha20_simd128.odin
new file mode 100644
index 000000000..0ec9ef553
--- /dev/null
+++ b/core/crypto/_chacha20/simd128/chacha20_simd128.odin
@@ -0,0 +1,481 @@
+package chacha20_simd128
+
+import "base:intrinsics"
+import "core:crypto/_chacha20"
+import "core:simd"
+import "core:sys/info"
+
+// Portable 128-bit `core:simd` implementation.
+//
+// This is loosely based on Ted Krovetz's public domain C intrinsic
+// implementation.
+//
+// This is written to perform adequately on any target that has "enough"
+// 128-bit vector registers; the current thought is that 4 blocks at a
+// time is reasonable for amd64, though Ted's code is more conservative.
+//
+// See:
+// supercop-20230530/crypto_stream/chacha20/krovetz/vec128
+
+// Ensure the compiler emits SIMD instructions. This is a minimum, and
+// setting the microarchitecture at compile time will allow for better
+// code gen when applicable (e.g. AVX). This is somewhat redundant with
+// the default microarchitecture configurations.
+when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
+ @(private = "file")
+ TARGET_SIMD_FEATURES :: "neon"
+} else when ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 {
+ // Note: LLVM appears to be smart enough to use PSHUFB despite not
+ // explicitly using simd.u8x16 shuffles.
+ @(private = "file")
+ TARGET_SIMD_FEATURES :: "sse2,ssse3"
+} else {
+ @(private = "file")
+ TARGET_SIMD_FEATURES :: ""
+}
+
+@(private = "file")
+_ROT_7L: simd.u32x4 : {7, 7, 7, 7}
+@(private = "file")
+_ROT_7R: simd.u32x4 : {25, 25, 25, 25}
+@(private = "file")
+_ROT_12L: simd.u32x4 : {12, 12, 12, 12}
+@(private = "file")
+_ROT_12R: simd.u32x4 : {20, 20, 20, 20}
+@(private = "file")
+_ROT_8L: simd.u32x4 : {8, 8, 8, 8}
+@(private = "file")
+_ROT_8R: simd.u32x4 : {24, 24, 24, 24}
+@(private = "file")
+_ROT_16: simd.u32x4 : {16, 16, 16, 16}
+
+when ODIN_ENDIAN == .Big {
+ @(private = "file")
+ _increment_counter :: #force_inline proc "contextless" (ctx: ^Context) -> simd.u32x4 {
+ // In the Big Endian case, the low and high portions in the vector
+ // are flipped, so the 64-bit addition can't be done with a simple
+ // vector add.
+ x := &ctx._s
+
+ new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + 1
+ x[12] = u32(new_ctr)
+ x[13] = u32(new_ctr >> 32)
+
+ return intrinsics.unaligned_load(transmute(^simd.u32x4)&x[12])
+ }
+
+	// Convert the endianness of the components of a u32x4 vector, for
+ // the purposes of output.
+ @(private = "file")
+ _byteswap_u32x4 :: #force_inline proc "contextless" (v: simd.u32x4) -> simd.u32x4 {
+ return(
+ transmute(simd.u32x4)simd.shuffle(
+ transmute(simd.u8x16)v,
+ transmute(simd.u8x16)v,
+ 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
+ )
+ )
+ }
+} else {
+ @(private = "file")
+ _VEC_ONE: simd.u64x2 : {1, 0}
+}
+
+@(private = "file")
+_dq_round_simd128 :: #force_inline proc "contextless" (
+ v0, v1, v2, v3: simd.u32x4,
+) -> (
+ simd.u32x4,
+ simd.u32x4,
+ simd.u32x4,
+ simd.u32x4,
+) {
+ v0, v1, v2, v3 := v0, v1, v2, v3
+
+ // a += b; d ^= a; d = ROTW16(d);
+ v0 = simd.add(v0, v1)
+ v3 = simd.bit_xor(v3, v0)
+ v3 = simd.bit_xor(simd.shl(v3, _ROT_16), simd.shr(v3, _ROT_16))
+
+ // c += d; b ^= c; b = ROTW12(b);
+ v2 = simd.add(v2, v3)
+ v1 = simd.bit_xor(v1, v2)
+ v1 = simd.bit_xor(simd.shl(v1, _ROT_12L), simd.shr(v1, _ROT_12R))
+
+ // a += b; d ^= a; d = ROTW8(d);
+ v0 = simd.add(v0, v1)
+ v3 = simd.bit_xor(v3, v0)
+ v3 = simd.bit_xor(simd.shl(v3, _ROT_8L), simd.shr(v3, _ROT_8R))
+
+ // c += d; b ^= c; b = ROTW7(b);
+ v2 = simd.add(v2, v3)
+ v1 = simd.bit_xor(v1, v2)
+ v1 = simd.bit_xor(simd.shl(v1, _ROT_7L), simd.shr(v1, _ROT_7R))
+
+ // b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
+ v1 = simd.shuffle(v1, v1, 1, 2, 3, 0)
+ v2 = simd.shuffle(v2, v2, 2, 3, 0, 1)
+ v3 = simd.shuffle(v3, v3, 3, 0, 1, 2)
+
+ // a += b; d ^= a; d = ROTW16(d);
+ v0 = simd.add(v0, v1)
+ v3 = simd.bit_xor(v3, v0)
+ v3 = simd.bit_xor(simd.shl(v3, _ROT_16), simd.shr(v3, _ROT_16))
+
+ // c += d; b ^= c; b = ROTW12(b);
+ v2 = simd.add(v2, v3)
+ v1 = simd.bit_xor(v1, v2)
+ v1 = simd.bit_xor(simd.shl(v1, _ROT_12L), simd.shr(v1, _ROT_12R))
+
+ // a += b; d ^= a; d = ROTW8(d);
+ v0 = simd.add(v0, v1)
+ v3 = simd.bit_xor(v3, v0)
+ v3 = simd.bit_xor(simd.shl(v3, _ROT_8L), simd.shr(v3, _ROT_8R))
+
+ // c += d; b ^= c; b = ROTW7(b);
+ v2 = simd.add(v2, v3)
+ v1 = simd.bit_xor(v1, v2)
+ v1 = simd.bit_xor(simd.shl(v1, _ROT_7L), simd.shr(v1, _ROT_7R))
+
+ // b = ROTV3(b); c = ROTV2(c); d = ROTV1(d);
+ v1 = simd.shuffle(v1, v1, 3, 0, 1, 2)
+ v2 = simd.shuffle(v2, v2, 2, 3, 0, 1)
+ v3 = simd.shuffle(v3, v3, 1, 2, 3, 0)
+
+ return v0, v1, v2, v3
+}
+
+@(private = "file")
+_add_state_simd128 :: #force_inline proc "contextless" (
+ v0, v1, v2, v3, s0, s1, s2, s3: simd.u32x4,
+) -> (
+ simd.u32x4,
+ simd.u32x4,
+ simd.u32x4,
+ simd.u32x4,
+) {
+ v0, v1, v2, v3 := v0, v1, v2, v3
+
+ v0 = simd.add(v0, s0)
+ v1 = simd.add(v1, s1)
+ v2 = simd.add(v2, s2)
+ v3 = simd.add(v3, s3)
+
+ when ODIN_ENDIAN == .Big {
+ v0 = _byteswap_u32x4(v0)
+ v1 = _byteswap_u32x4(v1)
+ v2 = _byteswap_u32x4(v2)
+ v3 = _byteswap_u32x4(v3)
+ }
+
+ return v0, v1, v2, v3
+}
+
+@(private = "file")
+_xor_simd128 :: #force_inline proc "contextless" (
+ src: [^]simd.u32x4,
+ v0, v1, v2, v3: simd.u32x4,
+) -> (
+ simd.u32x4,
+ simd.u32x4,
+ simd.u32x4,
+ simd.u32x4,
+) {
+ v0, v1, v2, v3 := v0, v1, v2, v3
+
+ v0 = simd.bit_xor(v0, intrinsics.unaligned_load((^simd.u32x4)(src[0:])))
+ v1 = simd.bit_xor(v1, intrinsics.unaligned_load((^simd.u32x4)(src[1:])))
+ v2 = simd.bit_xor(v2, intrinsics.unaligned_load((^simd.u32x4)(src[2:])))
+ v3 = simd.bit_xor(v3, intrinsics.unaligned_load((^simd.u32x4)(src[3:])))
+
+ return v0, v1, v2, v3
+}
+
+@(private = "file")
+_store_simd128 :: #force_inline proc "contextless" (
+ dst: [^]simd.u32x4,
+ v0, v1, v2, v3: simd.u32x4,
+) {
+ intrinsics.unaligned_store((^simd.u32x4)(dst[0:]), v0)
+ intrinsics.unaligned_store((^simd.u32x4)(dst[1:]), v1)
+ intrinsics.unaligned_store((^simd.u32x4)(dst[2:]), v2)
+ intrinsics.unaligned_store((^simd.u32x4)(dst[3:]), v3)
+}
+
+// is_performant returns true iff the target and current host both support
+// "enough" 128-bit SIMD to make this implementation performant.
+is_performant :: proc "contextless" () -> bool {
+ when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 || ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 {
+ when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
+ req_features :: info.CPU_Features{.asimd}
+ } else when ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 {
+ req_features :: info.CPU_Features{.sse2, .ssse3}
+ }
+
+ features, ok := info.cpu_features.?
+ if !ok {
+ return false
+ }
+
+ return features >= req_features
+ } else when ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32 {
+ return intrinsics.has_target_feature("simd128")
+ } else {
+ return false
+ }
+}
+
+@(enable_target_feature = TARGET_SIMD_FEATURES)
+stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) {
+ // Enforce the maximum consumed keystream per nonce.
+ _chacha20.check_counter_limit(ctx, nr_blocks)
+
+ dst_v := ([^]simd.u32x4)(raw_data(dst))
+ src_v := ([^]simd.u32x4)(raw_data(src))
+
+ x := &ctx._s
+ n := nr_blocks
+
+	// The state vector is an array of u32s in native byte order.
+ x_v := ([^]simd.u32x4)(raw_data(x))
+ s0 := intrinsics.unaligned_load((^simd.u32x4)(x_v[0:]))
+ s1 := intrinsics.unaligned_load((^simd.u32x4)(x_v[1:]))
+ s2 := intrinsics.unaligned_load((^simd.u32x4)(x_v[2:]))
+ s3 := intrinsics.unaligned_load((^simd.u32x4)(x_v[3:]))
+
+ // 8 blocks at a time.
+ //
+	// Note: This is only worth it on AArch64.
+ when ODIN_ARCH == .arm64 {
+ for ; n >= 8; n = n - 8 {
+ v0, v1, v2, v3 := s0, s1, s2, s3
+
+ when ODIN_ENDIAN == .Little {
+ s7 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s3, _VEC_ONE)
+ } else {
+ s7 := _increment_counter(ctx)
+ }
+ v4, v5, v6, v7 := s0, s1, s2, s7
+
+ when ODIN_ENDIAN == .Little {
+ s11 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s7, _VEC_ONE)
+ } else {
+ s11 := _increment_counter(ctx)
+ }
+ v8, v9, v10, v11 := s0, s1, s2, s11
+
+ when ODIN_ENDIAN == .Little {
+ s15 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s11, _VEC_ONE)
+ } else {
+ s15 := _increment_counter(ctx)
+ }
+ v12, v13, v14, v15 := s0, s1, s2, s15
+
+ when ODIN_ENDIAN == .Little {
+ s19 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s15, _VEC_ONE)
+ } else {
+ s19 := _increment_counter(ctx)
+ }
+
+ v16, v17, v18, v19 := s0, s1, s2, s19
+ when ODIN_ENDIAN == .Little {
+ s23 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s19, _VEC_ONE)
+ } else {
+ s23 := _increment_counter(ctx)
+ }
+
+ v20, v21, v22, v23 := s0, s1, s2, s23
+ when ODIN_ENDIAN == .Little {
+ s27 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s23, _VEC_ONE)
+ } else {
+ s27 := _increment_counter(ctx)
+ }
+
+ v24, v25, v26, v27 := s0, s1, s2, s27
+ when ODIN_ENDIAN == .Little {
+ s31 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s27, _VEC_ONE)
+ } else {
+ s31 := _increment_counter(ctx)
+ }
+ v28, v29, v30, v31 := s0, s1, s2, s31
+
+ for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+ v0, v1, v2, v3 = _dq_round_simd128(v0, v1, v2, v3)
+ v4, v5, v6, v7 = _dq_round_simd128(v4, v5, v6, v7)
+ v8, v9, v10, v11 = _dq_round_simd128(v8, v9, v10, v11)
+ v12, v13, v14, v15 = _dq_round_simd128(v12, v13, v14, v15)
+ v16, v17, v18, v19 = _dq_round_simd128(v16, v17, v18, v19)
+ v20, v21, v22, v23 = _dq_round_simd128(v20, v21, v22, v23)
+ v24, v25, v26, v27 = _dq_round_simd128(v24, v25, v26, v27)
+ v28, v29, v30, v31 = _dq_round_simd128(v28, v29, v30, v31)
+ }
+
+ v0, v1, v2, v3 = _add_state_simd128(v0, v1, v2, v3, s0, s1, s2, s3)
+ v4, v5, v6, v7 = _add_state_simd128(v4, v5, v6, v7, s0, s1, s2, s7)
+ v8, v9, v10, v11 = _add_state_simd128(v8, v9, v10, v11, s0, s1, s2, s11)
+ v12, v13, v14, v15 = _add_state_simd128(v12, v13, v14, v15, s0, s1, s2, s15)
+ v16, v17, v18, v19 = _add_state_simd128(v16, v17, v18, v19, s0, s1, s2, s19)
+ v20, v21, v22, v23 = _add_state_simd128(v20, v21, v22, v23, s0, s1, s2, s23)
+ v24, v25, v26, v27 = _add_state_simd128(v24, v25, v26, v27, s0, s1, s2, s27)
+ v28, v29, v30, v31 = _add_state_simd128(v28, v29, v30, v31, s0, s1, s2, s31)
+
+ #no_bounds_check {
+ if src != nil {
+ v0, v1, v2, v3 = _xor_simd128(src_v, v0, v1, v2, v3)
+ v4, v5, v6, v7 = _xor_simd128(src_v[4:], v4, v5, v6, v7)
+ v8, v9, v10, v11 = _xor_simd128(src_v[8:], v8, v9, v10, v11)
+ v12, v13, v14, v15 = _xor_simd128(src_v[12:], v12, v13, v14, v15)
+ v16, v17, v18, v19 = _xor_simd128(src_v[16:], v16, v17, v18, v19)
+ v20, v21, v22, v23 = _xor_simd128(src_v[20:], v20, v21, v22, v23)
+ v24, v25, v26, v27 = _xor_simd128(src_v[24:], v24, v25, v26, v27)
+ v28, v29, v30, v31 = _xor_simd128(src_v[28:], v28, v29, v30, v31)
+ src_v = src_v[32:]
+ }
+
+ _store_simd128(dst_v, v0, v1, v2, v3)
+ _store_simd128(dst_v[4:], v4, v5, v6, v7)
+ _store_simd128(dst_v[8:], v8, v9, v10, v11)
+ _store_simd128(dst_v[12:], v12, v13, v14, v15)
+ _store_simd128(dst_v[16:], v16, v17, v18, v19)
+ _store_simd128(dst_v[20:], v20, v21, v22, v23)
+ _store_simd128(dst_v[24:], v24, v25, v26, v27)
+ _store_simd128(dst_v[28:], v28, v29, v30, v31)
+ dst_v = dst_v[32:]
+ }
+
+ when ODIN_ENDIAN == .Little {
+ // s31 holds the most current counter, so `s3 = s31 + 1`.
+ s3 = transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s31, _VEC_ONE)
+ } else {
+ s3 = _increment_counter(ctx)
+ }
+ }
+ }
+
+ // 4 blocks at a time.
+ //
+ // Note: The i386 target lacks the required number of registers
+ // for this to be performant, so it is skipped.
+ when ODIN_ARCH != .i386 {
+ for ; n >= 4; n = n - 4 {
+ v0, v1, v2, v3 := s0, s1, s2, s3
+
+ when ODIN_ENDIAN == .Little {
+ s7 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s3, _VEC_ONE)
+ } else {
+ s7 := _increment_counter(ctx)
+ }
+ v4, v5, v6, v7 := s0, s1, s2, s7
+
+ when ODIN_ENDIAN == .Little {
+ s11 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s7, _VEC_ONE)
+ } else {
+ s11 := _increment_counter(ctx)
+ }
+ v8, v9, v10, v11 := s0, s1, s2, s11
+
+ when ODIN_ENDIAN == .Little {
+ s15 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s11, _VEC_ONE)
+ } else {
+ s15 := _increment_counter(ctx)
+ }
+ v12, v13, v14, v15 := s0, s1, s2, s15
+
+ for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+ v0, v1, v2, v3 = _dq_round_simd128(v0, v1, v2, v3)
+ v4, v5, v6, v7 = _dq_round_simd128(v4, v5, v6, v7)
+ v8, v9, v10, v11 = _dq_round_simd128(v8, v9, v10, v11)
+ v12, v13, v14, v15 = _dq_round_simd128(v12, v13, v14, v15)
+ }
+
+ v0, v1, v2, v3 = _add_state_simd128(v0, v1, v2, v3, s0, s1, s2, s3)
+ v4, v5, v6, v7 = _add_state_simd128(v4, v5, v6, v7, s0, s1, s2, s7)
+ v8, v9, v10, v11 = _add_state_simd128(v8, v9, v10, v11, s0, s1, s2, s11)
+ v12, v13, v14, v15 = _add_state_simd128(v12, v13, v14, v15, s0, s1, s2, s15)
+
+ #no_bounds_check {
+ if src != nil {
+ v0, v1, v2, v3 = _xor_simd128(src_v, v0, v1, v2, v3)
+ v4, v5, v6, v7 = _xor_simd128(src_v[4:], v4, v5, v6, v7)
+ v8, v9, v10, v11 = _xor_simd128(src_v[8:], v8, v9, v10, v11)
+ v12, v13, v14, v15 = _xor_simd128(src_v[12:], v12, v13, v14, v15)
+ src_v = src_v[16:]
+ }
+
+ _store_simd128(dst_v, v0, v1, v2, v3)
+ _store_simd128(dst_v[4:], v4, v5, v6, v7)
+ _store_simd128(dst_v[8:], v8, v9, v10, v11)
+ _store_simd128(dst_v[12:], v12, v13, v14, v15)
+ dst_v = dst_v[16:]
+ }
+
+ when ODIN_ENDIAN == .Little {
+ // s15 holds the most current counter, so `s3 = s15 + 1`.
+ s3 = transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s15, _VEC_ONE)
+ } else {
+ s3 = _increment_counter(ctx)
+ }
+ }
+ }
+
+ // 1 block at a time.
+ for ; n > 0; n = n - 1 {
+ v0, v1, v2, v3 := s0, s1, s2, s3
+
+ for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+ v0, v1, v2, v3 = _dq_round_simd128(v0, v1, v2, v3)
+ }
+ v0, v1, v2, v3 = _add_state_simd128(v0, v1, v2, v3, s0, s1, s2, s3)
+
+ #no_bounds_check {
+ if src != nil {
+ v0, v1, v2, v3 = _xor_simd128(src_v, v0, v1, v2, v3)
+ src_v = src_v[4:]
+ }
+
+ _store_simd128(dst_v, v0, v1, v2, v3)
+ dst_v = dst_v[4:]
+ }
+
+ // Increment the counter. Overflow checking is done upon
+ // entry into the routine, so a 64-bit increment safely
+ // covers both cases.
+ when ODIN_ENDIAN == .Little {
+ s3 = transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s3, _VEC_ONE)
+ } else {
+ s3 = _increment_counter(ctx)
+ }
+ }
+
+ when ODIN_ENDIAN == .Little {
+ // Write back the counter to the state.
+ intrinsics.unaligned_store((^simd.u32x4)(x_v[3:]), s3)
+ }
+}
+
+@(enable_target_feature = TARGET_SIMD_FEATURES)
+hchacha20 :: proc "contextless" (dst, key, nonce: []byte) {
+ v0 := simd.u32x4{_chacha20.SIGMA_0, _chacha20.SIGMA_1, _chacha20.SIGMA_2, _chacha20.SIGMA_3}
+ v1 := intrinsics.unaligned_load((^simd.u32x4)(&key[0]))
+ v2 := intrinsics.unaligned_load((^simd.u32x4)(&key[16]))
+ v3 := intrinsics.unaligned_load((^simd.u32x4)(&nonce[0]))
+
+ when ODIN_ENDIAN == .Big {
+ v1 = _byteswap_u32x4(v1)
+ v2 = _byteswap_u32x4(v2)
+ v3 = _byteswap_u32x4(v3)
+ }
+
+ for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+ v0, v1, v2, v3 = _dq_round_simd128(v0, v1, v2, v3)
+ }
+
+ when ODIN_ENDIAN == .Big {
+ v0 = _byteswap_u32x4(v0)
+ v3 = _byteswap_u32x4(v3)
+ }
+
+ dst_v := ([^]simd.u32x4)(raw_data(dst))
+ intrinsics.unaligned_store((^simd.u32x4)(dst_v[0:]), v0)
+ intrinsics.unaligned_store((^simd.u32x4)(dst_v[1:]), v3)
+}
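Aside (editor's sketch, not part of this commit): the shl/shr/bit_xor triplets in _dq_round_simd128 are the usual SIMD idiom for a lane-wise 32-bit rotate; a standalone version of the rotate-by-12 case:

package rotl_sketch

import "core:simd"

// Sketch only: rotate each u32 lane left by 12 bits. The shifted halves
// share no set bits, so XOR (or OR) recombines them into a rotate.
rotl12_u32x4 :: proc "contextless" (v: simd.u32x4) -> simd.u32x4 {
	l := simd.u32x4{12, 12, 12, 12}
	r := simd.u32x4{20, 20, 20, 20}
	return simd.bit_xor(simd.shl(v, l), simd.shr(v, r))
}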
diff --git a/core/crypto/_chacha20/simd256/chacha20_simd256.odin b/core/crypto/_chacha20/simd256/chacha20_simd256.odin
new file mode 100644
index 000000000..7e84509e1
--- /dev/null
+++ b/core/crypto/_chacha20/simd256/chacha20_simd256.odin
@@ -0,0 +1,319 @@
+//+build amd64
+package chacha20_simd256
+
+import "base:intrinsics"
+import "core:crypto/_chacha20"
+import chacha_simd128 "core:crypto/_chacha20/simd128"
+import "core:simd"
+import "core:sys/info"
+
+// This is loosely based on Ted Krovetz's public domain C intrinsic
+// implementations. While written using `core:simd`, this is currently
+// amd64-specific because we do not have a way to detect ARM SVE.
+//
+// See:
+// supercop-20230530/crypto_stream/chacha20/krovetz/vec128
+// supercop-20230530/crypto_stream/chacha20/krovetz/avx2
+
+#assert(ODIN_ENDIAN == .Little)
+
+@(private = "file")
+_ROT_7L: simd.u32x8 : {7, 7, 7, 7, 7, 7, 7, 7}
+@(private = "file")
+_ROT_7R: simd.u32x8 : {25, 25, 25, 25, 25, 25, 25, 25}
+@(private = "file")
+_ROT_12L: simd.u32x8 : {12, 12, 12, 12, 12, 12, 12, 12}
+@(private = "file")
+_ROT_12R: simd.u32x8 : {20, 20, 20, 20, 20, 20, 20, 20}
+@(private = "file")
+_ROT_8L: simd.u32x8 : {8, 8, 8, 8, 8, 8, 8, 8}
+@(private = "file")
+_ROT_8R: simd.u32x8 : {24, 24, 24, 24, 24, 24, 24, 24}
+@(private = "file")
+_ROT_16: simd.u32x8 : {16, 16, 16, 16, 16, 16, 16, 16}
+@(private = "file")
+_VEC_ZERO_ONE: simd.u64x4 : {0, 0, 1, 0}
+@(private = "file")
+_VEC_TWO: simd.u64x4 : {2, 0, 2, 0}
+
+// is_performant returns true iff the target and current host both support
+// "enough" SIMD to make this implementation performant.
+is_performant :: proc "contextless" () -> bool {
+ req_features :: info.CPU_Features{.avx, .avx2}
+
+ features, ok := info.cpu_features.?
+ if !ok {
+ return false
+ }
+
+ return features >= req_features
+}
+
+@(private = "file")
+_dq_round_simd256 :: #force_inline proc "contextless" (
+ v0, v1, v2, v3: simd.u32x8,
+) -> (
+ simd.u32x8,
+ simd.u32x8,
+ simd.u32x8,
+ simd.u32x8,
+) {
+ v0, v1, v2, v3 := v0, v1, v2, v3
+
+ // a += b; d ^= a; d = ROTW16(d);
+ v0 = simd.add(v0, v1)
+ v3 = simd.bit_xor(v3, v0)
+ v3 = simd.bit_xor(simd.shl(v3, _ROT_16), simd.shr(v3, _ROT_16))
+
+ // c += d; b ^= c; b = ROTW12(b);
+ v2 = simd.add(v2, v3)
+ v1 = simd.bit_xor(v1, v2)
+ v1 = simd.bit_xor(simd.shl(v1, _ROT_12L), simd.shr(v1, _ROT_12R))
+
+ // a += b; d ^= a; d = ROTW8(d);
+ v0 = simd.add(v0, v1)
+ v3 = simd.bit_xor(v3, v0)
+ v3 = simd.bit_xor(simd.shl(v3, _ROT_8L), simd.shr(v3, _ROT_8R))
+
+ // c += d; b ^= c; b = ROTW7(b);
+ v2 = simd.add(v2, v3)
+ v1 = simd.bit_xor(v1, v2)
+ v1 = simd.bit_xor(simd.shl(v1, _ROT_7L), simd.shr(v1, _ROT_7R))
+
+ // b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
+ v1 = simd.shuffle(v1, v1, 1, 2, 3, 0, 5, 6, 7, 4)
+ v2 = simd.shuffle(v2, v2, 2, 3, 0, 1, 6, 7, 4, 5)
+ v3 = simd.shuffle(v3, v3, 3, 0, 1, 2, 7, 4, 5, 6)
+
+ // a += b; d ^= a; d = ROTW16(d);
+ v0 = simd.add(v0, v1)
+ v3 = simd.bit_xor(v3, v0)
+ v3 = simd.bit_xor(simd.shl(v3, _ROT_16), simd.shr(v3, _ROT_16))
+
+ // c += d; b ^= c; b = ROTW12(b);
+ v2 = simd.add(v2, v3)
+ v1 = simd.bit_xor(v1, v2)
+ v1 = simd.bit_xor(simd.shl(v1, _ROT_12L), simd.shr(v1, _ROT_12R))
+
+ // a += b; d ^= a; d = ROTW8(d);
+ v0 = simd.add(v0, v1)
+ v3 = simd.bit_xor(v3, v0)
+ v3 = simd.bit_xor(simd.shl(v3, _ROT_8L), simd.shr(v3, _ROT_8R))
+
+ // c += d; b ^= c; b = ROTW7(b);
+ v2 = simd.add(v2, v3)
+ v1 = simd.bit_xor(v1, v2)
+ v1 = simd.bit_xor(simd.shl(v1, _ROT_7L), simd.shr(v1, _ROT_7R))
+
+ // b = ROTV3(b); c = ROTV2(c); d = ROTV1(d);
+ v1 = simd.shuffle(v1, v1, 3, 0, 1, 2, 7, 4, 5, 6)
+ v2 = simd.shuffle(v2, v2, 2, 3, 0, 1, 6, 7, 4, 5)
+ v3 = simd.shuffle(v3, v3, 1, 2, 3, 0, 5, 6, 7, 4)
+
+ return v0, v1, v2, v3
+}
+
+@(private = "file")
+_add_and_permute_state_simd256 :: #force_inline proc "contextless" (
+ v0, v1, v2, v3, s0, s1, s2, s3: simd.u32x8,
+) -> (
+ simd.u32x8,
+ simd.u32x8,
+ simd.u32x8,
+ simd.u32x8,
+) {
+ t0 := simd.add(v0, s0)
+ t1 := simd.add(v1, s1)
+ t2 := simd.add(v2, s2)
+ t3 := simd.add(v3, s3)
+
+ // Big Endian would byteswap here.
+
+	// Each of v0 .. v3 has 128 bits of keystream for 2 separate blocks.
+	// Permute the state such that (r0, r1) contains block 0, and (r2, r3)
+ // contains block 1.
+ r0 := simd.shuffle(t0, t1, 0, 1, 2, 3, 8, 9, 10, 11)
+ r2 := simd.shuffle(t0, t1, 4, 5, 6, 7, 12, 13, 14, 15)
+ r1 := simd.shuffle(t2, t3, 0, 1, 2, 3, 8, 9, 10, 11)
+ r3 := simd.shuffle(t2, t3, 4, 5, 6, 7, 12, 13, 14, 15)
+
+ return r0, r1, r2, r3
+}
+
+@(private = "file")
+_xor_simd256 :: #force_inline proc "contextless" (
+ src: [^]simd.u32x8,
+ v0, v1, v2, v3: simd.u32x8,
+) -> (
+ simd.u32x8,
+ simd.u32x8,
+ simd.u32x8,
+ simd.u32x8,
+) {
+ v0, v1, v2, v3 := v0, v1, v2, v3
+
+ v0 = simd.bit_xor(v0, intrinsics.unaligned_load((^simd.u32x8)(src[0:])))
+ v1 = simd.bit_xor(v1, intrinsics.unaligned_load((^simd.u32x8)(src[1:])))
+ v2 = simd.bit_xor(v2, intrinsics.unaligned_load((^simd.u32x8)(src[2:])))
+ v3 = simd.bit_xor(v3, intrinsics.unaligned_load((^simd.u32x8)(src[3:])))
+
+ return v0, v1, v2, v3
+}
+
+@(private = "file")
+_xor_simd256_x1 :: #force_inline proc "contextless" (
+ src: [^]simd.u32x8,
+ v0, v1: simd.u32x8,
+) -> (
+ simd.u32x8,
+ simd.u32x8,
+) {
+ v0, v1 := v0, v1
+
+ v0 = simd.bit_xor(v0, intrinsics.unaligned_load((^simd.u32x8)(src[0:])))
+ v1 = simd.bit_xor(v1, intrinsics.unaligned_load((^simd.u32x8)(src[1:])))
+
+ return v0, v1
+}
+
+@(private = "file")
+_store_simd256 :: #force_inline proc "contextless" (
+ dst: [^]simd.u32x8,
+ v0, v1, v2, v3: simd.u32x8,
+) {
+ intrinsics.unaligned_store((^simd.u32x8)(dst[0:]), v0)
+ intrinsics.unaligned_store((^simd.u32x8)(dst[1:]), v1)
+ intrinsics.unaligned_store((^simd.u32x8)(dst[2:]), v2)
+ intrinsics.unaligned_store((^simd.u32x8)(dst[3:]), v3)
+}
+
+@(private = "file")
+_store_simd256_x1 :: #force_inline proc "contextless" (
+ dst: [^]simd.u32x8,
+ v0, v1: simd.u32x8,
+) {
+ intrinsics.unaligned_store((^simd.u32x8)(dst[0:]), v0)
+ intrinsics.unaligned_store((^simd.u32x8)(dst[1:]), v1)
+}
+
+@(enable_target_feature = "sse2,ssse3,avx,avx2")
+stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) {
+ // Enforce the maximum consumed keystream per nonce.
+ _chacha20.check_counter_limit(ctx, nr_blocks)
+
+ dst_v := ([^]simd.u32x8)(raw_data(dst))
+ src_v := ([^]simd.u32x8)(raw_data(src))
+
+ x := &ctx._s
+ n := nr_blocks
+
+	// The state vector is an array of u32s in native byte order.
+	// Set up s0 .. s3 such that each register stores 2 copies of the
+ // state.
+ x_v := ([^]simd.u32x4)(raw_data(x))
+ t0 := intrinsics.unaligned_load((^simd.u32x4)(x_v[0:]))
+ t1 := intrinsics.unaligned_load((^simd.u32x4)(x_v[1:]))
+ t2 := intrinsics.unaligned_load((^simd.u32x4)(x_v[2:]))
+ t3 := intrinsics.unaligned_load((^simd.u32x4)(x_v[3:]))
+ s0 := simd.swizzle(t0, 0, 1, 2, 3, 0, 1, 2, 3)
+ s1 := simd.swizzle(t1, 0, 1, 2, 3, 0, 1, 2, 3)
+ s2 := simd.swizzle(t2, 0, 1, 2, 3, 0, 1, 2, 3)
+ s3 := simd.swizzle(t3, 0, 1, 2, 3, 0, 1, 2, 3)
+
+ // Advance the counter in the 2nd copy of the state by one.
+ s3 = transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s3, _VEC_ZERO_ONE)
+
+ // 8 blocks at a time.
+ for ; n >= 8; n = n - 8 {
+ v0, v1, v2, v3 := s0, s1, s2, s3
+
+ s7 := transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s3, _VEC_TWO)
+ v4, v5, v6, v7 := s0, s1, s2, s7
+
+ s11 := transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s7, _VEC_TWO)
+ v8, v9, v10, v11 := s0, s1, s2, s11
+
+ s15 := transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s11, _VEC_TWO)
+ v12, v13, v14, v15 := s0, s1, s2, s15
+
+ for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+ v0, v1, v2, v3 = _dq_round_simd256(v0, v1, v2, v3)
+ v4, v5, v6, v7 = _dq_round_simd256(v4, v5, v6, v7)
+ v8, v9, v10, v11 = _dq_round_simd256(v8, v9, v10, v11)
+ v12, v13, v14, v15 = _dq_round_simd256(v12, v13, v14, v15)
+ }
+
+ v0, v1, v2, v3 = _add_and_permute_state_simd256(v0, v1, v2, v3, s0, s1, s2, s3)
+ v4, v5, v6, v7 = _add_and_permute_state_simd256(v4, v5, v6, v7, s0, s1, s2, s7)
+ v8, v9, v10, v11 = _add_and_permute_state_simd256(v8, v9, v10, v11, s0, s1, s2, s11)
+ v12, v13, v14, v15 = _add_and_permute_state_simd256(v12, v13, v14, v15, s0, s1, s2, s15)
+
+ #no_bounds_check {
+ if src != nil {
+ v0, v1, v2, v3 = _xor_simd256(src_v, v0, v1, v2, v3)
+ v4, v5, v6, v7 = _xor_simd256(src_v[4:], v4, v5, v6, v7)
+ v8, v9, v10, v11 = _xor_simd256(src_v[8:], v8, v9, v10, v11)
+ v12, v13, v14, v15 = _xor_simd256(src_v[12:], v12, v13, v14, v15)
+ src_v = src_v[16:]
+ }
+
+ _store_simd256(dst_v, v0, v1, v2, v3)
+ _store_simd256(dst_v[4:], v4, v5, v6, v7)
+ _store_simd256(dst_v[8:], v8, v9, v10, v11)
+ _store_simd256(dst_v[12:], v12, v13, v14, v15)
+ dst_v = dst_v[16:]
+ }
+
+ s3 = transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s15, _VEC_TWO)
+ }
+
+
+ // 2 (or 1) block at a time.
+ for ; n > 0; n = n - 2 {
+ v0, v1, v2, v3 := s0, s1, s2, s3
+
+ for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+ v0, v1, v2, v3 = _dq_round_simd256(v0, v1, v2, v3)
+ }
+ v0, v1, v2, v3 = _add_and_permute_state_simd256(v0, v1, v2, v3, s0, s1, s2, s3)
+
+ if n == 1 {
+ // Note: No need to advance src_v, dst_v, or increment the counter
+ // since this is guaranteed to be the final block.
+ #no_bounds_check {
+ if src != nil {
+ v0, v1 = _xor_simd256_x1(src_v, v0, v1)
+ }
+
+ _store_simd256_x1(dst_v, v0, v1)
+ }
+ break
+ }
+
+ #no_bounds_check {
+ if src != nil {
+ v0, v1, v2, v3 = _xor_simd256(src_v, v0, v1, v2, v3)
+ src_v = src_v[4:]
+ }
+
+ _store_simd256(dst_v, v0, v1, v2, v3)
+ dst_v = dst_v[4:]
+ }
+
+ s3 = transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s3, _VEC_TWO)
+ }
+
+	// Write back the counter. Doing it this way saves having to
+ // pull out the correct counter value from s3.
+ new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + u64(nr_blocks)
+ ctx._s[12] = u32(new_ctr)
+ ctx._s[13] = u32(new_ctr >> 32)
+}
+
+@(enable_target_feature = "sse2,ssse3,avx")
+hchacha20 :: proc "contextless" (dst, key, nonce: []byte) {
+	// We can just enable AVX and call the simd128 code, since going
+	// wider has no performance benefit, but VEX-encoded instructions
+	// are nice.
+ #force_inline chacha_simd128.hchacha20(dst, key, nonce)
+}
\ No newline at end of file
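Aside (editor's sketch, not part of this commit): a standalone look at the broadcast-and-offset trick stream_blocks uses above. Each 256-bit register holds two copies of a state row, and only the 64-bit counter in the second copy is advanced, so every double-round produces keystream for two consecutive blocks.

package simd256_layout_sketch

import "core:fmt"
import "core:simd"

main :: proc() {
	// Stand-in for the counter row of the state: {ctr_lo, ctr_hi, n2, n3}.
	row := simd.u32x4{0, 0, 7, 9}

	// Duplicate it into both 128-bit halves of a 256-bit register ...
	wide := simd.swizzle(row, 0, 1, 2, 3, 0, 1, 2, 3)

	// ... then advance the 64-bit counter in the second copy by one,
	// mirroring the _VEC_ZERO_ONE addition in stream_blocks.
	two_blocks := transmute(simd.u32x8)simd.add(
		transmute(simd.u64x4)wide,
		simd.u64x4{0, 0, 1, 0},
	)

	fmt.println(simd.to_array(two_blocks)) // [0, 0, 7, 9, 1, 0, 7, 9]
}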
diff --git a/core/crypto/_chacha20/simd256/chacha20_simd256_stub.odin b/core/crypto/_chacha20/simd256/chacha20_simd256_stub.odin
new file mode 100644
index 000000000..15edf4a68
--- /dev/null
+++ b/core/crypto/_chacha20/simd256/chacha20_simd256_stub.odin
@@ -0,0 +1,17 @@
+//+build !amd64
+package chacha20_simd256
+
+import "base:intrinsics"
+import "core:crypto/_chacha20"
+
+is_performant :: proc "contextless" () -> bool {
+ return false
+}
+
+stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) {
+ panic("crypto/chacha20: simd256 implementation unsupported")
+}
+
+hchacha20 :: proc "contextless" (dst, key, nonce: []byte) {
+ intrinsics.trap()
+}
\ No newline at end of file
diff --git a/core/crypto/chacha20/chacha20.odin b/core/crypto/chacha20/chacha20.odin
index 73d3e1ea2..1fa25f573 100644
--- a/core/crypto/chacha20/chacha20.odin
+++ b/core/crypto/chacha20/chacha20.odin
@@ -8,119 +8,66 @@ See:
package chacha20
import "core:bytes"
-import "core:encoding/endian"
-import "core:math/bits"
+import "core:crypto/_chacha20"
import "core:mem"
// KEY_SIZE is the (X)ChaCha20 key size in bytes.
-KEY_SIZE :: 32
+KEY_SIZE :: _chacha20.KEY_SIZE
// NONCE_SIZE is the ChaCha20 nonce size in bytes.
-NONCE_SIZE :: 12
+NONCE_SIZE :: _chacha20.NONCE_SIZE
// XNONCE_SIZE is the XChaCha20 nonce size in bytes.
-XNONCE_SIZE :: 24
-
-@(private)
-_MAX_CTR_IETF :: 0xffffffff
-
-@(private)
-_BLOCK_SIZE :: 64
-@(private)
-_STATE_SIZE_U32 :: 16
-@(private)
-_ROUNDS :: 20
-
-@(private)
-_SIGMA_0: u32 : 0x61707865
-@(private)
-_SIGMA_1: u32 : 0x3320646e
-@(private)
-_SIGMA_2: u32 : 0x79622d32
-@(private)
-_SIGMA_3: u32 : 0x6b206574
+XNONCE_SIZE :: _chacha20.XNONCE_SIZE
// Context is a ChaCha20 or XChaCha20 instance.
Context :: struct {
- _s: [_STATE_SIZE_U32]u32,
- _buffer: [_BLOCK_SIZE]byte,
- _off: int,
- _is_ietf_flavor: bool,
- _is_initialized: bool,
+ _state: _chacha20.Context,
+ _impl: Implementation,
}
// init initializes a Context for ChaCha20 or XChaCha20 with the provided
// key and nonce.
-init :: proc(ctx: ^Context, key, nonce: []byte) {
+init :: proc(ctx: ^Context, key, nonce: []byte, impl := Implementation.Simd256) {
if len(key) != KEY_SIZE {
- panic("crypto/chacha20: invalid ChaCha20 key size")
+ panic("crypto/chacha20: invalid (X)ChaCha20 key size")
}
- if n_len := len(nonce); n_len != NONCE_SIZE && n_len != XNONCE_SIZE {
+ if l := len(nonce); l != NONCE_SIZE && l != XNONCE_SIZE {
panic("crypto/chacha20: invalid (X)ChaCha20 nonce size")
}
k, n := key, nonce
- // Derive the XChaCha20 subkey and sub-nonce via HChaCha20.
+ init_impl(ctx, impl)
+
is_xchacha := len(nonce) == XNONCE_SIZE
if is_xchacha {
- sub_key := ctx._buffer[:KEY_SIZE]
- _hchacha20(sub_key, k, n)
+ sub_nonce: [NONCE_SIZE]byte
+ sub_key := ctx._state._buffer[:KEY_SIZE]
+ hchacha20(sub_key, k, n, ctx._impl)
k = sub_key
- n = n[16:24]
+ copy(sub_nonce[4:], n[16:])
+ n = sub_nonce[:]
}
- ctx._s[0] = _SIGMA_0
- ctx._s[1] = _SIGMA_1
- ctx._s[2] = _SIGMA_2
- ctx._s[3] = _SIGMA_3
- ctx._s[4] = endian.unchecked_get_u32le(k[0:4])
- ctx._s[5] = endian.unchecked_get_u32le(k[4:8])
- ctx._s[6] = endian.unchecked_get_u32le(k[8:12])
- ctx._s[7] = endian.unchecked_get_u32le(k[12:16])
- ctx._s[8] = endian.unchecked_get_u32le(k[16:20])
- ctx._s[9] = endian.unchecked_get_u32le(k[20:24])
- ctx._s[10] = endian.unchecked_get_u32le(k[24:28])
- ctx._s[11] = endian.unchecked_get_u32le(k[28:32])
- ctx._s[12] = 0
- if !is_xchacha {
- ctx._s[13] = endian.unchecked_get_u32le(n[0:4])
- ctx._s[14] = endian.unchecked_get_u32le(n[4:8])
- ctx._s[15] = endian.unchecked_get_u32le(n[8:12])
- } else {
- ctx._s[13] = 0
- ctx._s[14] = endian.unchecked_get_u32le(n[0:4])
- ctx._s[15] = endian.unchecked_get_u32le(n[4:8])
+ _chacha20.init(&ctx._state, k, n, is_xchacha)
+ if is_xchacha {
// The sub-key is stored in the keystream buffer. While
// this will be overwritten in most circumstances, explicitly
// clear it out early.
- mem.zero_explicit(&ctx._buffer, KEY_SIZE)
+ mem.zero_explicit(&ctx._state._buffer, KEY_SIZE)
}
-
- ctx._off = _BLOCK_SIZE
- ctx._is_ietf_flavor = !is_xchacha
- ctx._is_initialized = true
}
// seek seeks the (X)ChaCha20 stream counter to the specified block.
seek :: proc(ctx: ^Context, block_nr: u64) {
- assert(ctx._is_initialized)
-
- if ctx._is_ietf_flavor {
- if block_nr > _MAX_CTR_IETF {
- panic("crypto/chacha20: attempted to seek past maximum counter")
- }
- } else {
- ctx._s[13] = u32(block_nr >> 32)
- }
- ctx._s[12] = u32(block_nr)
- ctx._off = _BLOCK_SIZE
+ _chacha20.seek(&ctx._state, block_nr)
}
// xor_bytes XORs each byte in src with bytes taken from the (X)ChaCha20
// keystream, and writes the resulting output to dst. Dst and src MUST
// alias exactly or not at all.
xor_bytes :: proc(ctx: ^Context, dst, src: []byte) {
- assert(ctx._is_initialized)
+ assert(ctx._state._is_initialized)
src, dst := src, dst
if dst_len := len(dst); dst_len < len(src) {
@@ -131,12 +78,13 @@ xor_bytes :: proc(ctx: ^Context, dst, src: []byte) {
panic("crypto/chacha20: dst and src alias inexactly")
}
- for remaining := len(src); remaining > 0; {
+ st := &ctx._state
+ #no_bounds_check for remaining := len(src); remaining > 0; {
// Process multiple blocks at once
- if ctx._off == _BLOCK_SIZE {
- if nr_blocks := remaining / _BLOCK_SIZE; nr_blocks > 0 {
- direct_bytes := nr_blocks * _BLOCK_SIZE
- _do_blocks(ctx, dst, src, nr_blocks)
+ if st._off == _chacha20.BLOCK_SIZE {
+ if nr_blocks := remaining / _chacha20.BLOCK_SIZE; nr_blocks > 0 {
+ direct_bytes := nr_blocks * _chacha20.BLOCK_SIZE
+ stream_blocks(ctx, dst, src, nr_blocks)
remaining -= direct_bytes
if remaining == 0 {
return
@@ -147,17 +95,17 @@ xor_bytes :: proc(ctx: ^Context, dst, src: []byte) {
// If there is a partial block, generate and buffer 1 block
// worth of keystream.
- _do_blocks(ctx, ctx._buffer[:], nil, 1)
- ctx._off = 0
+ stream_blocks(ctx, st._buffer[:], nil, 1)
+ st._off = 0
}
// Process partial blocks from the buffered keystream.
- to_xor := min(_BLOCK_SIZE - ctx._off, remaining)
- buffered_keystream := ctx._buffer[ctx._off:]
+ to_xor := min(_chacha20.BLOCK_SIZE - st._off, remaining)
+ buffered_keystream := st._buffer[st._off:]
for i := 0; i < to_xor; i = i + 1 {
dst[i] = buffered_keystream[i] ~ src[i]
}
- ctx._off += to_xor
+ st._off += to_xor
dst = dst[to_xor:]
src = src[to_xor:]
remaining -= to_xor
@@ -166,15 +114,15 @@ xor_bytes :: proc(ctx: ^Context, dst, src: []byte) {
// keystream_bytes fills dst with the raw (X)ChaCha20 keystream output.
keystream_bytes :: proc(ctx: ^Context, dst: []byte) {
- assert(ctx._is_initialized)
+ assert(ctx._state._is_initialized)
- dst := dst
- for remaining := len(dst); remaining > 0; {
+ dst, st := dst, &ctx._state
+ #no_bounds_check for remaining := len(dst); remaining > 0; {
// Process multiple blocks at once
- if ctx._off == _BLOCK_SIZE {
- if nr_blocks := remaining / _BLOCK_SIZE; nr_blocks > 0 {
- direct_bytes := nr_blocks * _BLOCK_SIZE
- _do_blocks(ctx, dst, nil, nr_blocks)
+ if st._off == _chacha20.BLOCK_SIZE {
+ if nr_blocks := remaining / _chacha20.BLOCK_SIZE; nr_blocks > 0 {
+ direct_bytes := nr_blocks * _chacha20.BLOCK_SIZE
+ stream_blocks(ctx, dst, nil, nr_blocks)
remaining -= direct_bytes
if remaining == 0 {
return
@@ -184,15 +132,15 @@ keystream_bytes :: proc(ctx: ^Context, dst: []byte) {
// If there is a partial block, generate and buffer 1 block
// worth of keystream.
- _do_blocks(ctx, ctx._buffer[:], nil, 1)
- ctx._off = 0
+ stream_blocks(ctx, st._buffer[:], nil, 1)
+ st._off = 0
}
// Process partial blocks from the buffered keystream.
- to_copy := min(_BLOCK_SIZE - ctx._off, remaining)
- buffered_keystream := ctx._buffer[ctx._off:]
+ to_copy := min(_chacha20.BLOCK_SIZE - st._off, remaining)
+ buffered_keystream := st._buffer[st._off:]
copy(dst[:to_copy], buffered_keystream[:to_copy])
- ctx._off += to_copy
+ st._off += to_copy
dst = dst[to_copy:]
remaining -= to_copy
}
@@ -201,366 +149,5 @@ keystream_bytes :: proc(ctx: ^Context, dst: []byte) {
// reset sanitizes the Context. The Context must be re-initialized to
// be used again.
reset :: proc(ctx: ^Context) {
- mem.zero_explicit(&ctx._s, size_of(ctx._s))
- mem.zero_explicit(&ctx._buffer, size_of(ctx._buffer))
-
- ctx._is_initialized = false
-}
-
-@(private)
-_do_blocks :: proc(ctx: ^Context, dst, src: []byte, nr_blocks: int) {
- // Enforce the maximum consumed keystream per nonce.
- //
- // While all modern "standard" definitions of ChaCha20 use
- // the IETF 32-bit counter, for XChaCha20 most common
- // implementations allow for a 64-bit counter.
- //
- // Honestly, the answer here is "use a MRAE primitive", but
- // go with common practice in the case of XChaCha20.
- if ctx._is_ietf_flavor {
- if u64(ctx._s[12]) + u64(nr_blocks) > 0xffffffff {
- panic("crypto/chacha20: maximum ChaCha20 keystream per nonce reached")
- }
- } else {
- ctr := (u64(ctx._s[13]) << 32) | u64(ctx._s[12])
- if _, carry := bits.add_u64(ctr, u64(nr_blocks), 0); carry != 0 {
- panic("crypto/chacha20: maximum XChaCha20 keystream per nonce reached")
- }
- }
-
- dst, src := dst, src
- x := &ctx._s
- for n := 0; n < nr_blocks; n = n + 1 {
- x0, x1, x2, x3 := _SIGMA_0, _SIGMA_1, _SIGMA_2, _SIGMA_3
- x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
-
- for i := _ROUNDS; i > 0; i = i - 2 {
- // Even when forcing inlining manually inlining all of
- // these is decently faster.
-
- // quarterround(x, 0, 4, 8, 12)
- x0 += x4
- x12 ~= x0
- x12 = bits.rotate_left32(x12, 16)
- x8 += x12
- x4 ~= x8
- x4 = bits.rotate_left32(x4, 12)
- x0 += x4
- x12 ~= x0
- x12 = bits.rotate_left32(x12, 8)
- x8 += x12
- x4 ~= x8
- x4 = bits.rotate_left32(x4, 7)
-
- // quarterround(x, 1, 5, 9, 13)
- x1 += x5
- x13 ~= x1
- x13 = bits.rotate_left32(x13, 16)
- x9 += x13
- x5 ~= x9
- x5 = bits.rotate_left32(x5, 12)
- x1 += x5
- x13 ~= x1
- x13 = bits.rotate_left32(x13, 8)
- x9 += x13
- x5 ~= x9
- x5 = bits.rotate_left32(x5, 7)
-
- // quarterround(x, 2, 6, 10, 14)
- x2 += x6
- x14 ~= x2
- x14 = bits.rotate_left32(x14, 16)
- x10 += x14
- x6 ~= x10
- x6 = bits.rotate_left32(x6, 12)
- x2 += x6
- x14 ~= x2
- x14 = bits.rotate_left32(x14, 8)
- x10 += x14
- x6 ~= x10
- x6 = bits.rotate_left32(x6, 7)
-
- // quarterround(x, 3, 7, 11, 15)
- x3 += x7
- x15 ~= x3
- x15 = bits.rotate_left32(x15, 16)
- x11 += x15
- x7 ~= x11
- x7 = bits.rotate_left32(x7, 12)
- x3 += x7
- x15 ~= x3
- x15 = bits.rotate_left32(x15, 8)
- x11 += x15
- x7 ~= x11
- x7 = bits.rotate_left32(x7, 7)
-
- // quarterround(x, 0, 5, 10, 15)
- x0 += x5
- x15 ~= x0
- x15 = bits.rotate_left32(x15, 16)
- x10 += x15
- x5 ~= x10
- x5 = bits.rotate_left32(x5, 12)
- x0 += x5
- x15 ~= x0
- x15 = bits.rotate_left32(x15, 8)
- x10 += x15
- x5 ~= x10
- x5 = bits.rotate_left32(x5, 7)
-
- // quarterround(x, 1, 6, 11, 12)
- x1 += x6
- x12 ~= x1
- x12 = bits.rotate_left32(x12, 16)
- x11 += x12
- x6 ~= x11
- x6 = bits.rotate_left32(x6, 12)
- x1 += x6
- x12 ~= x1
- x12 = bits.rotate_left32(x12, 8)
- x11 += x12
- x6 ~= x11
- x6 = bits.rotate_left32(x6, 7)
-
- // quarterround(x, 2, 7, 8, 13)
- x2 += x7
- x13 ~= x2
- x13 = bits.rotate_left32(x13, 16)
- x8 += x13
- x7 ~= x8
- x7 = bits.rotate_left32(x7, 12)
- x2 += x7
- x13 ~= x2
- x13 = bits.rotate_left32(x13, 8)
- x8 += x13
- x7 ~= x8
- x7 = bits.rotate_left32(x7, 7)
-
- // quarterround(x, 3, 4, 9, 14)
- x3 += x4
- x14 ~= x3
- x14 = bits.rotate_left32(x14, 16)
- x9 += x14
- x4 ~= x9
- x4 = bits.rotate_left32(x4, 12)
- x3 += x4
- x14 ~= x3
- x14 = bits.rotate_left32(x14, 8)
- x9 += x14
- x4 ~= x9
- x4 = bits.rotate_left32(x4, 7)
- }
-
- x0 += _SIGMA_0
- x1 += _SIGMA_1
- x2 += _SIGMA_2
- x3 += _SIGMA_3
- x4 += x[4]
- x5 += x[5]
- x6 += x[6]
- x7 += x[7]
- x8 += x[8]
- x9 += x[9]
- x10 += x[10]
- x11 += x[11]
- x12 += x[12]
- x13 += x[13]
- x14 += x[14]
- x15 += x[15]
-
- // While the "correct" answer to getting more performance out of
- // this is "use vector operations", support for that is currently
- // a work in progress/to be designed.
- //
- // In the meantime:
- // - The caller(s) ensure that src/dst are valid.
- // - The compiler knows if the target is picky about alignment.
-
- #no_bounds_check {
- if src != nil {
- endian.unchecked_put_u32le(dst[0:4], endian.unchecked_get_u32le(src[0:4]) ~ x0)
- endian.unchecked_put_u32le(dst[4:8], endian.unchecked_get_u32le(src[4:8]) ~ x1)
- endian.unchecked_put_u32le(dst[8:12], endian.unchecked_get_u32le(src[8:12]) ~ x2)
- endian.unchecked_put_u32le(dst[12:16], endian.unchecked_get_u32le(src[12:16]) ~ x3)
- endian.unchecked_put_u32le(dst[16:20], endian.unchecked_get_u32le(src[16:20]) ~ x4)
- endian.unchecked_put_u32le(dst[20:24], endian.unchecked_get_u32le(src[20:24]) ~ x5)
- endian.unchecked_put_u32le(dst[24:28], endian.unchecked_get_u32le(src[24:28]) ~ x6)
- endian.unchecked_put_u32le(dst[28:32], endian.unchecked_get_u32le(src[28:32]) ~ x7)
- endian.unchecked_put_u32le(dst[32:36], endian.unchecked_get_u32le(src[32:36]) ~ x8)
- endian.unchecked_put_u32le(dst[36:40], endian.unchecked_get_u32le(src[36:40]) ~ x9)
- endian.unchecked_put_u32le(dst[40:44], endian.unchecked_get_u32le(src[40:44]) ~ x10)
- endian.unchecked_put_u32le(dst[44:48], endian.unchecked_get_u32le(src[44:48]) ~ x11)
- endian.unchecked_put_u32le(dst[48:52], endian.unchecked_get_u32le(src[48:52]) ~ x12)
- endian.unchecked_put_u32le(dst[52:56], endian.unchecked_get_u32le(src[52:56]) ~ x13)
- endian.unchecked_put_u32le(dst[56:60], endian.unchecked_get_u32le(src[56:60]) ~ x14)
- endian.unchecked_put_u32le(dst[60:64], endian.unchecked_get_u32le(src[60:64]) ~ x15)
- src = src[_BLOCK_SIZE:]
- } else {
- endian.unchecked_put_u32le(dst[0:4], x0)
- endian.unchecked_put_u32le(dst[4:8], x1)
- endian.unchecked_put_u32le(dst[8:12], x2)
- endian.unchecked_put_u32le(dst[12:16], x3)
- endian.unchecked_put_u32le(dst[16:20], x4)
- endian.unchecked_put_u32le(dst[20:24], x5)
- endian.unchecked_put_u32le(dst[24:28], x6)
- endian.unchecked_put_u32le(dst[28:32], x7)
- endian.unchecked_put_u32le(dst[32:36], x8)
- endian.unchecked_put_u32le(dst[36:40], x9)
- endian.unchecked_put_u32le(dst[40:44], x10)
- endian.unchecked_put_u32le(dst[44:48], x11)
- endian.unchecked_put_u32le(dst[48:52], x12)
- endian.unchecked_put_u32le(dst[52:56], x13)
- endian.unchecked_put_u32le(dst[56:60], x14)
- endian.unchecked_put_u32le(dst[60:64], x15)
- }
- dst = dst[_BLOCK_SIZE:]
- }
-
- // Increment the counter. Overflow checking is done upon
- // entry into the routine, so a 64-bit increment safely
- // covers both cases.
- new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + 1
- x[12] = u32(new_ctr)
- x[13] = u32(new_ctr >> 32)
- }
-}
-
-@(private)
-_hchacha20 :: proc "contextless" (dst, key, nonce: []byte) {
- x0, x1, x2, x3 := _SIGMA_0, _SIGMA_1, _SIGMA_2, _SIGMA_3
- x4 := endian.unchecked_get_u32le(key[0:4])
- x5 := endian.unchecked_get_u32le(key[4:8])
- x6 := endian.unchecked_get_u32le(key[8:12])
- x7 := endian.unchecked_get_u32le(key[12:16])
- x8 := endian.unchecked_get_u32le(key[16:20])
- x9 := endian.unchecked_get_u32le(key[20:24])
- x10 := endian.unchecked_get_u32le(key[24:28])
- x11 := endian.unchecked_get_u32le(key[28:32])
- x12 := endian.unchecked_get_u32le(nonce[0:4])
- x13 := endian.unchecked_get_u32le(nonce[4:8])
- x14 := endian.unchecked_get_u32le(nonce[8:12])
- x15 := endian.unchecked_get_u32le(nonce[12:16])
-
- for i := _ROUNDS; i > 0; i = i - 2 {
- // quarterround(x, 0, 4, 8, 12)
- x0 += x4
- x12 ~= x0
- x12 = bits.rotate_left32(x12, 16)
- x8 += x12
- x4 ~= x8
- x4 = bits.rotate_left32(x4, 12)
- x0 += x4
- x12 ~= x0
- x12 = bits.rotate_left32(x12, 8)
- x8 += x12
- x4 ~= x8
- x4 = bits.rotate_left32(x4, 7)
-
- // quarterround(x, 1, 5, 9, 13)
- x1 += x5
- x13 ~= x1
- x13 = bits.rotate_left32(x13, 16)
- x9 += x13
- x5 ~= x9
- x5 = bits.rotate_left32(x5, 12)
- x1 += x5
- x13 ~= x1
- x13 = bits.rotate_left32(x13, 8)
- x9 += x13
- x5 ~= x9
- x5 = bits.rotate_left32(x5, 7)
-
- // quarterround(x, 2, 6, 10, 14)
- x2 += x6
- x14 ~= x2
- x14 = bits.rotate_left32(x14, 16)
- x10 += x14
- x6 ~= x10
- x6 = bits.rotate_left32(x6, 12)
- x2 += x6
- x14 ~= x2
- x14 = bits.rotate_left32(x14, 8)
- x10 += x14
- x6 ~= x10
- x6 = bits.rotate_left32(x6, 7)
-
- // quarterround(x, 3, 7, 11, 15)
- x3 += x7
- x15 ~= x3
- x15 = bits.rotate_left32(x15, 16)
- x11 += x15
- x7 ~= x11
- x7 = bits.rotate_left32(x7, 12)
- x3 += x7
- x15 ~= x3
- x15 = bits.rotate_left32(x15, 8)
- x11 += x15
- x7 ~= x11
- x7 = bits.rotate_left32(x7, 7)
-
- // quarterround(x, 0, 5, 10, 15)
- x0 += x5
- x15 ~= x0
- x15 = bits.rotate_left32(x15, 16)
- x10 += x15
- x5 ~= x10
- x5 = bits.rotate_left32(x5, 12)
- x0 += x5
- x15 ~= x0
- x15 = bits.rotate_left32(x15, 8)
- x10 += x15
- x5 ~= x10
- x5 = bits.rotate_left32(x5, 7)
-
- // quarterround(x, 1, 6, 11, 12)
- x1 += x6
- x12 ~= x1
- x12 = bits.rotate_left32(x12, 16)
- x11 += x12
- x6 ~= x11
- x6 = bits.rotate_left32(x6, 12)
- x1 += x6
- x12 ~= x1
- x12 = bits.rotate_left32(x12, 8)
- x11 += x12
- x6 ~= x11
- x6 = bits.rotate_left32(x6, 7)
-
- // quarterround(x, 2, 7, 8, 13)
- x2 += x7
- x13 ~= x2
- x13 = bits.rotate_left32(x13, 16)
- x8 += x13
- x7 ~= x8
- x7 = bits.rotate_left32(x7, 12)
- x2 += x7
- x13 ~= x2
- x13 = bits.rotate_left32(x13, 8)
- x8 += x13
- x7 ~= x8
- x7 = bits.rotate_left32(x7, 7)
-
- // quarterround(x, 3, 4, 9, 14)
- x3 += x4
- x14 ~= x3
- x14 = bits.rotate_left32(x14, 16)
- x9 += x14
- x4 ~= x9
- x4 = bits.rotate_left32(x4, 12)
- x3 += x4
- x14 ~= x3
- x14 = bits.rotate_left32(x14, 8)
- x9 += x14
- x4 ~= x9
- x4 = bits.rotate_left32(x4, 7)
- }
-
- endian.unchecked_put_u32le(dst[0:4], x0)
- endian.unchecked_put_u32le(dst[4:8], x1)
- endian.unchecked_put_u32le(dst[8:12], x2)
- endian.unchecked_put_u32le(dst[12:16], x3)
- endian.unchecked_put_u32le(dst[16:20], x12)
- endian.unchecked_put_u32le(dst[20:24], x13)
- endian.unchecked_put_u32le(dst[24:28], x14)
- endian.unchecked_put_u32le(dst[28:32], x15)
+ _chacha20.reset(&ctx._state)
}
diff --git a/core/crypto/chacha20/chacha20_impl.odin b/core/crypto/chacha20/chacha20_impl.odin
new file mode 100644
index 000000000..67b95ca25
--- /dev/null
+++ b/core/crypto/chacha20/chacha20_impl.odin
@@ -0,0 +1,52 @@
+package chacha20
+
+import "base:intrinsics"
+import "core:crypto/_chacha20/ref"
+import "core:crypto/_chacha20/simd128"
+import "core:crypto/_chacha20/simd256"
+
+// Implementation is a ChaCha20 implementation. Most callers will not need
+// to use this, as the package will automatically select the most performant
+// implementation available.
+Implementation :: enum {
+ Portable,
+ Simd128,
+ Simd256,
+}
+
+@(private)
+init_impl :: proc(ctx: ^Context, impl: Implementation) {
+ impl := impl
+ if impl == .Simd256 && !simd256.is_performant() {
+ impl = .Simd128
+ }
+ if impl == .Simd128 && !simd128.is_performant() {
+ impl = .Portable
+ }
+
+ ctx._impl = impl
+}
+
+@(private)
+stream_blocks :: proc(ctx: ^Context, dst, src: []byte, nr_blocks: int) {
+ switch ctx._impl {
+ case .Simd256:
+ simd256.stream_blocks(&ctx._state, dst, src, nr_blocks)
+ case .Simd128:
+ simd128.stream_blocks(&ctx._state, dst, src, nr_blocks)
+ case .Portable:
+ ref.stream_blocks(&ctx._state, dst, src, nr_blocks)
+ }
+}
+
+@(private)
+hchacha20 :: proc "contextless" (dst, key, nonce: []byte, impl: Implementation) {
+ switch impl {
+ case .Simd256:
+ simd256.hchacha20(dst, key, nonce)
+ case .Simd128:
+ simd128.hchacha20(dst, key, nonce)
+ case .Portable:
+ ref.hchacha20(dst, key, nonce)
+ }
+}
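Example (editor's sketch, not part of this commit): the impl parameter can also be pinned explicitly, e.g. to exercise the portable code path in tests or benchmarks; if the requested implementation is not performant on the host, init_impl above silently downgrades it.

package impl_selection_example

import "core:crypto/chacha20"

// Sketch only: build a Context that uses the portable implementation.
init_portable :: proc(key, nonce: []byte) -> (ctx: chacha20.Context) {
	chacha20.init(&ctx, key, nonce, chacha20.Implementation.Portable)
	return
}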