| field | value | date |
|---|---|---|
| author | Yawning Angel <yawning@schwanenlied.me> | 2023-12-16 12:24:24 +0900 |
| committer | Yawning Angel <yawning@schwanenlied.me> | 2024-08-10 18:32:37 +0900 |
| commit | 1f3107e693fe4cf72518a549f9a25aec9ec9c485 (patch) | |
| tree | 062f6234b870d9e1c43a64526700bf06f8aeb465 /core/crypto | |
| parent | 708f053fe61e729864c82be9d9d7c84c396aa187 (diff) | |
core/crypto/chacha20: Use 128-bit/256-bit SIMD
Diffstat (limited to 'core/crypto')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | core/crypto/_chacha20/chacha20.odin | 123 |
| -rw-r--r-- | core/crypto/_chacha20/ref/chacha20_ref.odin | 360 |
| -rw-r--r-- | core/crypto/_chacha20/simd128/chacha20_simd128.odin | 481 |
| -rw-r--r-- | core/crypto/_chacha20/simd256/chacha20_simd256.odin | 319 |
| -rw-r--r-- | core/crypto/_chacha20/simd256/chacha20_simd256_stub.odin | 17 |
| -rw-r--r-- | core/crypto/chacha20/chacha20.odin | 503 |
| -rw-r--r-- | core/crypto/chacha20/chacha20_impl.odin | 52 |
7 files changed, 1397 insertions, 458 deletions
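Before the per-file diffs, a brief usage sketch may help frame the restructuring: the public `core:crypto/chacha20` package now delegates to a shared `_chacha20` state package plus `ref`, `simd128`, and `simd256` backends, and `init` gains an optional `Implementation` argument that is transparently downgraded to `.Simd128` or `.Portable` when the host lacks the required CPU features. The snippet below is an illustrative sketch based on the signatures visible in this diff, not code from the commit; the key, nonce, and buffer contents are placeholders.

```odin
package main

import "core:crypto/chacha20"

main :: proc() {
	// Placeholder key/nonce; real callers must supply a secret key and a
	// unique nonce (12 bytes selects ChaCha20, 24 bytes selects XChaCha20).
	key: [chacha20.KEY_SIZE]byte
	nonce: [chacha20.NONCE_SIZE]byte

	ctx: chacha20.Context
	// Request the 256-bit path; init falls back to Simd128 or Portable
	// when the host does not support AVX2 (or SSE2/SSSE3/NEON/simd128).
	chacha20.init(&ctx, key[:], nonce[:], chacha20.Implementation.Simd256)
	defer chacha20.reset(&ctx)

	plaintext: [32]byte // message bytes to encrypt (placeholder contents)
	ciphertext: [32]byte
	chacha20.xor_bytes(&ctx, ciphertext[:], plaintext[:])
}
```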
diff --git a/core/crypto/_chacha20/chacha20.odin b/core/crypto/_chacha20/chacha20.odin new file mode 100644 index 000000000..3ede592b7 --- /dev/null +++ b/core/crypto/_chacha20/chacha20.odin @@ -0,0 +1,123 @@ +package _chacha20 + +import "base:intrinsics" +import "core:encoding/endian" +import "core:math/bits" +import "core:mem" + +// KEY_SIZE is the (X)ChaCha20 key size in bytes. +KEY_SIZE :: 32 +// NONCE_SIZE is the ChaCha20 nonce size in bytes. +NONCE_SIZE :: 12 +// XNONCE_SIZE is the XChaCha20 nonce size in bytes. +XNONCE_SIZE :: 24 + +// MAX_CTR_IETF is the maximum counter value for the IETF flavor ChaCha20. +MAX_CTR_IETF :: 0xffffffff +// BLOCK_SIZE is the (X)ChaCha20 block size in bytes. +BLOCK_SIZE :: 64 +// STATE_SIZE_U32 is the (X)ChaCha20 state size in u32s. +STATE_SIZE_U32 :: 16 +// Rounds is the (X)ChaCha20 round count. +ROUNDS :: 20 + +// SIGMA_0 is sigma[0:4]. +SIGMA_0: u32 : 0x61707865 +// SIGMA_1 is sigma[4:8]. +SIGMA_1: u32 : 0x3320646e +// SIGMA_2 is sigma[8:12]. +SIGMA_2: u32 : 0x79622d32 +// SIGMA_3 is sigma[12:16]. +SIGMA_3: u32 : 0x6b206574 + +// Context is a ChaCha20 or XChaCha20 instance. +Context :: struct { + _s: [STATE_SIZE_U32]u32, + _buffer: [BLOCK_SIZE]byte, + _off: int, + _is_ietf_flavor: bool, + _is_initialized: bool, +} + +// init inititializes a Context for ChaCha20 with the provided key and +// nonce. +// +// WARNING: This ONLY handles ChaCha20. XChaCha20 sub-key and nonce +// derivation is expected to be handled by the caller, so that the +// HChaCha call can be suitably accelerated. +init :: proc "contextless" (ctx: ^Context, key, nonce: []byte, is_xchacha: bool) { + if len(key) != KEY_SIZE || len(nonce) != NONCE_SIZE { + intrinsics.trap() + } + + k, n := key, nonce + + ctx._s[0] = SIGMA_0 + ctx._s[1] = SIGMA_1 + ctx._s[2] = SIGMA_2 + ctx._s[3] = SIGMA_3 + ctx._s[4] = endian.unchecked_get_u32le(k[0:4]) + ctx._s[5] = endian.unchecked_get_u32le(k[4:8]) + ctx._s[6] = endian.unchecked_get_u32le(k[8:12]) + ctx._s[7] = endian.unchecked_get_u32le(k[12:16]) + ctx._s[8] = endian.unchecked_get_u32le(k[16:20]) + ctx._s[9] = endian.unchecked_get_u32le(k[20:24]) + ctx._s[10] = endian.unchecked_get_u32le(k[24:28]) + ctx._s[11] = endian.unchecked_get_u32le(k[28:32]) + ctx._s[12] = 0 + ctx._s[13] = endian.unchecked_get_u32le(n[0:4]) + ctx._s[14] = endian.unchecked_get_u32le(n[4:8]) + ctx._s[15] = endian.unchecked_get_u32le(n[8:12]) + + ctx._off = BLOCK_SIZE + ctx._is_ietf_flavor = !is_xchacha + ctx._is_initialized = true +} + +// seek seeks the (X)ChaCha20 stream counter to the specified block. +seek :: proc(ctx: ^Context, block_nr: u64) { + assert(ctx._is_initialized) + + if ctx._is_ietf_flavor { + if block_nr > MAX_CTR_IETF { + panic("crypto/chacha20: attempted to seek past maximum counter") + } + } else { + ctx._s[13] = u32(block_nr >> 32) + } + ctx._s[12] = u32(block_nr) + ctx._off = BLOCK_SIZE +} + +// reset sanitizes the Context. The Context must be re-initialized to +// be used again. +reset :: proc(ctx: ^Context) { + mem.zero_explicit(&ctx._s, size_of(ctx._s)) + mem.zero_explicit(&ctx._buffer, size_of(ctx._buffer)) + + ctx._is_initialized = false +} + +check_counter_limit :: proc(ctx: ^Context, nr_blocks: int) { + // Enforce the maximum consumed keystream per nonce. + // + // While all modern "standard" definitions of ChaCha20 use + // the IETF 32-bit counter, for XChaCha20 most common + // implementations allow for a 64-bit counter. + // + // Honestly, the answer here is "use a MRAE primitive", but + // go with "common" practice in the case of XChaCha20. 
+ + ERR_CTR_EXHAUSTED :: "crypto/chacha20: maximum (X)ChaCha20 keystream per nonce reached" + + if ctx._is_ietf_flavor { + if u64(ctx._s[12]) + u64(nr_blocks) > MAX_CTR_IETF { + panic(ERR_CTR_EXHAUSTED) + } + } else { + ctr := (u64(ctx._s[13]) << 32) | u64(ctx._s[12]) + if _, carry := bits.add_u64(ctr, u64(nr_blocks), 0); carry != 0 { + panic(ERR_CTR_EXHAUSTED) + } + } +} diff --git a/core/crypto/_chacha20/ref/chacha20_ref.odin b/core/crypto/_chacha20/ref/chacha20_ref.odin new file mode 100644 index 000000000..27104b8e4 --- /dev/null +++ b/core/crypto/_chacha20/ref/chacha20_ref.odin @@ -0,0 +1,360 @@ +package chacha20_ref + +import "core:crypto/_chacha20" +import "core:encoding/endian" +import "core:math/bits" + +stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) { + // Enforce the maximum consumed keystream per nonce. + _chacha20.check_counter_limit(ctx, nr_blocks) + + dst, src := dst, src + x := &ctx._s + for n := 0; n < nr_blocks; n = n + 1 { + x0, x1, x2, x3 := + _chacha20.SIGMA_0, _chacha20.SIGMA_1, _chacha20.SIGMA_2, _chacha20.SIGMA_3 + x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := + x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15] + + for i := _chacha20.ROUNDS; i > 0; i = i - 2 { + // Even when forcing inlining manually inlining all of + // these is decently faster. + + // quarterround(x, 0, 4, 8, 12) + x0 += x4 + x12 ~= x0 + x12 = bits.rotate_left32(x12, 16) + x8 += x12 + x4 ~= x8 + x4 = bits.rotate_left32(x4, 12) + x0 += x4 + x12 ~= x0 + x12 = bits.rotate_left32(x12, 8) + x8 += x12 + x4 ~= x8 + x4 = bits.rotate_left32(x4, 7) + + // quarterround(x, 1, 5, 9, 13) + x1 += x5 + x13 ~= x1 + x13 = bits.rotate_left32(x13, 16) + x9 += x13 + x5 ~= x9 + x5 = bits.rotate_left32(x5, 12) + x1 += x5 + x13 ~= x1 + x13 = bits.rotate_left32(x13, 8) + x9 += x13 + x5 ~= x9 + x5 = bits.rotate_left32(x5, 7) + + // quarterround(x, 2, 6, 10, 14) + x2 += x6 + x14 ~= x2 + x14 = bits.rotate_left32(x14, 16) + x10 += x14 + x6 ~= x10 + x6 = bits.rotate_left32(x6, 12) + x2 += x6 + x14 ~= x2 + x14 = bits.rotate_left32(x14, 8) + x10 += x14 + x6 ~= x10 + x6 = bits.rotate_left32(x6, 7) + + // quarterround(x, 3, 7, 11, 15) + x3 += x7 + x15 ~= x3 + x15 = bits.rotate_left32(x15, 16) + x11 += x15 + x7 ~= x11 + x7 = bits.rotate_left32(x7, 12) + x3 += x7 + x15 ~= x3 + x15 = bits.rotate_left32(x15, 8) + x11 += x15 + x7 ~= x11 + x7 = bits.rotate_left32(x7, 7) + + // quarterround(x, 0, 5, 10, 15) + x0 += x5 + x15 ~= x0 + x15 = bits.rotate_left32(x15, 16) + x10 += x15 + x5 ~= x10 + x5 = bits.rotate_left32(x5, 12) + x0 += x5 + x15 ~= x0 + x15 = bits.rotate_left32(x15, 8) + x10 += x15 + x5 ~= x10 + x5 = bits.rotate_left32(x5, 7) + + // quarterround(x, 1, 6, 11, 12) + x1 += x6 + x12 ~= x1 + x12 = bits.rotate_left32(x12, 16) + x11 += x12 + x6 ~= x11 + x6 = bits.rotate_left32(x6, 12) + x1 += x6 + x12 ~= x1 + x12 = bits.rotate_left32(x12, 8) + x11 += x12 + x6 ~= x11 + x6 = bits.rotate_left32(x6, 7) + + // quarterround(x, 2, 7, 8, 13) + x2 += x7 + x13 ~= x2 + x13 = bits.rotate_left32(x13, 16) + x8 += x13 + x7 ~= x8 + x7 = bits.rotate_left32(x7, 12) + x2 += x7 + x13 ~= x2 + x13 = bits.rotate_left32(x13, 8) + x8 += x13 + x7 ~= x8 + x7 = bits.rotate_left32(x7, 7) + + // quarterround(x, 3, 4, 9, 14) + x3 += x4 + x14 ~= x3 + x14 = bits.rotate_left32(x14, 16) + x9 += x14 + x4 ~= x9 + x4 = bits.rotate_left32(x4, 12) + x3 += x4 + x14 ~= x3 + x14 = bits.rotate_left32(x14, 8) + x9 += x14 + x4 ~= x9 + x4 = bits.rotate_left32(x4, 7) + } + + x0 += _chacha20.SIGMA_0 + x1 += 
_chacha20.SIGMA_1 + x2 += _chacha20.SIGMA_2 + x3 += _chacha20.SIGMA_3 + x4 += x[4] + x5 += x[5] + x6 += x[6] + x7 += x[7] + x8 += x[8] + x9 += x[9] + x10 += x[10] + x11 += x[11] + x12 += x[12] + x13 += x[13] + x14 += x[14] + x15 += x[15] + + // - The caller(s) ensure that src/dst are valid. + // - The compiler knows if the target is picky about alignment. + + #no_bounds_check { + if src != nil { + endian.unchecked_put_u32le(dst[0:4], endian.unchecked_get_u32le(src[0:4]) ~ x0) + endian.unchecked_put_u32le(dst[4:8], endian.unchecked_get_u32le(src[4:8]) ~ x1) + endian.unchecked_put_u32le(dst[8:12], endian.unchecked_get_u32le(src[8:12]) ~ x2) + endian.unchecked_put_u32le(dst[12:16], endian.unchecked_get_u32le(src[12:16]) ~ x3) + endian.unchecked_put_u32le(dst[16:20], endian.unchecked_get_u32le(src[16:20]) ~ x4) + endian.unchecked_put_u32le(dst[20:24], endian.unchecked_get_u32le(src[20:24]) ~ x5) + endian.unchecked_put_u32le(dst[24:28], endian.unchecked_get_u32le(src[24:28]) ~ x6) + endian.unchecked_put_u32le(dst[28:32], endian.unchecked_get_u32le(src[28:32]) ~ x7) + endian.unchecked_put_u32le(dst[32:36], endian.unchecked_get_u32le(src[32:36]) ~ x8) + endian.unchecked_put_u32le(dst[36:40], endian.unchecked_get_u32le(src[36:40]) ~ x9) + endian.unchecked_put_u32le( + dst[40:44], + endian.unchecked_get_u32le(src[40:44]) ~ x10, + ) + endian.unchecked_put_u32le( + dst[44:48], + endian.unchecked_get_u32le(src[44:48]) ~ x11, + ) + endian.unchecked_put_u32le( + dst[48:52], + endian.unchecked_get_u32le(src[48:52]) ~ x12, + ) + endian.unchecked_put_u32le( + dst[52:56], + endian.unchecked_get_u32le(src[52:56]) ~ x13, + ) + endian.unchecked_put_u32le( + dst[56:60], + endian.unchecked_get_u32le(src[56:60]) ~ x14, + ) + endian.unchecked_put_u32le( + dst[60:64], + endian.unchecked_get_u32le(src[60:64]) ~ x15, + ) + src = src[_chacha20.BLOCK_SIZE:] + } else { + endian.unchecked_put_u32le(dst[0:4], x0) + endian.unchecked_put_u32le(dst[4:8], x1) + endian.unchecked_put_u32le(dst[8:12], x2) + endian.unchecked_put_u32le(dst[12:16], x3) + endian.unchecked_put_u32le(dst[16:20], x4) + endian.unchecked_put_u32le(dst[20:24], x5) + endian.unchecked_put_u32le(dst[24:28], x6) + endian.unchecked_put_u32le(dst[28:32], x7) + endian.unchecked_put_u32le(dst[32:36], x8) + endian.unchecked_put_u32le(dst[36:40], x9) + endian.unchecked_put_u32le(dst[40:44], x10) + endian.unchecked_put_u32le(dst[44:48], x11) + endian.unchecked_put_u32le(dst[48:52], x12) + endian.unchecked_put_u32le(dst[52:56], x13) + endian.unchecked_put_u32le(dst[56:60], x14) + endian.unchecked_put_u32le(dst[60:64], x15) + } + dst = dst[_chacha20.BLOCK_SIZE:] + } + + // Increment the counter. Overflow checking is done upon + // entry into the routine, so a 64-bit increment safely + // covers both cases. 
+ new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + 1 + x[12] = u32(new_ctr) + x[13] = u32(new_ctr >> 32) + } +} + +hchacha20 :: proc "contextless" (dst, key, nonce: []byte) { + x0, x1, x2, x3 := _chacha20.SIGMA_0, _chacha20.SIGMA_1, _chacha20.SIGMA_2, _chacha20.SIGMA_3 + x4 := endian.unchecked_get_u32le(key[0:4]) + x5 := endian.unchecked_get_u32le(key[4:8]) + x6 := endian.unchecked_get_u32le(key[8:12]) + x7 := endian.unchecked_get_u32le(key[12:16]) + x8 := endian.unchecked_get_u32le(key[16:20]) + x9 := endian.unchecked_get_u32le(key[20:24]) + x10 := endian.unchecked_get_u32le(key[24:28]) + x11 := endian.unchecked_get_u32le(key[28:32]) + x12 := endian.unchecked_get_u32le(nonce[0:4]) + x13 := endian.unchecked_get_u32le(nonce[4:8]) + x14 := endian.unchecked_get_u32le(nonce[8:12]) + x15 := endian.unchecked_get_u32le(nonce[12:16]) + + for i := _chacha20.ROUNDS; i > 0; i = i - 2 { + // quarterround(x, 0, 4, 8, 12) + x0 += x4 + x12 ~= x0 + x12 = bits.rotate_left32(x12, 16) + x8 += x12 + x4 ~= x8 + x4 = bits.rotate_left32(x4, 12) + x0 += x4 + x12 ~= x0 + x12 = bits.rotate_left32(x12, 8) + x8 += x12 + x4 ~= x8 + x4 = bits.rotate_left32(x4, 7) + + // quarterround(x, 1, 5, 9, 13) + x1 += x5 + x13 ~= x1 + x13 = bits.rotate_left32(x13, 16) + x9 += x13 + x5 ~= x9 + x5 = bits.rotate_left32(x5, 12) + x1 += x5 + x13 ~= x1 + x13 = bits.rotate_left32(x13, 8) + x9 += x13 + x5 ~= x9 + x5 = bits.rotate_left32(x5, 7) + + // quarterround(x, 2, 6, 10, 14) + x2 += x6 + x14 ~= x2 + x14 = bits.rotate_left32(x14, 16) + x10 += x14 + x6 ~= x10 + x6 = bits.rotate_left32(x6, 12) + x2 += x6 + x14 ~= x2 + x14 = bits.rotate_left32(x14, 8) + x10 += x14 + x6 ~= x10 + x6 = bits.rotate_left32(x6, 7) + + // quarterround(x, 3, 7, 11, 15) + x3 += x7 + x15 ~= x3 + x15 = bits.rotate_left32(x15, 16) + x11 += x15 + x7 ~= x11 + x7 = bits.rotate_left32(x7, 12) + x3 += x7 + x15 ~= x3 + x15 = bits.rotate_left32(x15, 8) + x11 += x15 + x7 ~= x11 + x7 = bits.rotate_left32(x7, 7) + + // quarterround(x, 0, 5, 10, 15) + x0 += x5 + x15 ~= x0 + x15 = bits.rotate_left32(x15, 16) + x10 += x15 + x5 ~= x10 + x5 = bits.rotate_left32(x5, 12) + x0 += x5 + x15 ~= x0 + x15 = bits.rotate_left32(x15, 8) + x10 += x15 + x5 ~= x10 + x5 = bits.rotate_left32(x5, 7) + + // quarterround(x, 1, 6, 11, 12) + x1 += x6 + x12 ~= x1 + x12 = bits.rotate_left32(x12, 16) + x11 += x12 + x6 ~= x11 + x6 = bits.rotate_left32(x6, 12) + x1 += x6 + x12 ~= x1 + x12 = bits.rotate_left32(x12, 8) + x11 += x12 + x6 ~= x11 + x6 = bits.rotate_left32(x6, 7) + + // quarterround(x, 2, 7, 8, 13) + x2 += x7 + x13 ~= x2 + x13 = bits.rotate_left32(x13, 16) + x8 += x13 + x7 ~= x8 + x7 = bits.rotate_left32(x7, 12) + x2 += x7 + x13 ~= x2 + x13 = bits.rotate_left32(x13, 8) + x8 += x13 + x7 ~= x8 + x7 = bits.rotate_left32(x7, 7) + + // quarterround(x, 3, 4, 9, 14) + x3 += x4 + x14 ~= x3 + x14 = bits.rotate_left32(x14, 16) + x9 += x14 + x4 ~= x9 + x4 = bits.rotate_left32(x4, 12) + x3 += x4 + x14 ~= x3 + x14 = bits.rotate_left32(x14, 8) + x9 += x14 + x4 ~= x9 + x4 = bits.rotate_left32(x4, 7) + } + + endian.unchecked_put_u32le(dst[0:4], x0) + endian.unchecked_put_u32le(dst[4:8], x1) + endian.unchecked_put_u32le(dst[8:12], x2) + endian.unchecked_put_u32le(dst[12:16], x3) + endian.unchecked_put_u32le(dst[16:20], x12) + endian.unchecked_put_u32le(dst[20:24], x13) + endian.unchecked_put_u32le(dst[24:28], x14) + endian.unchecked_put_u32le(dst[28:32], x15) +} diff --git a/core/crypto/_chacha20/simd128/chacha20_simd128.odin b/core/crypto/_chacha20/simd128/chacha20_simd128.odin new file mode 100644 
index 000000000..0ec9ef553 --- /dev/null +++ b/core/crypto/_chacha20/simd128/chacha20_simd128.odin @@ -0,0 +1,481 @@ +package chacha20_simd128 + +import "base:intrinsics" +import "core:crypto/_chacha20" +import "core:simd" +import "core:sys/info" + +// Portable 128-bit `core:simd` implementation. +// +// This is loosely based on Ted Krovetz's public domain C intrinsic +// implementation. +// +// This is written to perform adequately on any target that has "enough" +// 128-bit vector registers, the current thought is that 4 blocks at at +// time is reasonable for amd64, though Ted's code is more conservative. +// +// See: +// supercop-20230530/crypto_stream/chacha20/krovetz/vec128 + +// Ensure the compiler emits SIMD instructions. This is a minimum, and +// setting the microarchitecture at compile time will allow for better +// code gen when applicable (eg: AVX). This is somewhat redundant with +// the default microarchitecture configurations. +when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 { + @(private = "file") + TARGET_SIMD_FEATURES :: "neon" +} else when ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 { + // Note: LLVM appears to be smart enough to use PSHUFB despite not + // explicitly using simd.u8x16 shuffles. + @(private = "file") + TARGET_SIMD_FEATURES :: "sse2,ssse3" +} else { + @(private = "file") + TARGET_SIMD_FEATURES :: "" +} + +@(private = "file") +_ROT_7L: simd.u32x4 : {7, 7, 7, 7} +@(private = "file") +_ROT_7R: simd.u32x4 : {25, 25, 25, 25} +@(private = "file") +_ROT_12L: simd.u32x4 : {12, 12, 12, 12} +@(private = "file") +_ROT_12R: simd.u32x4 : {20, 20, 20, 20} +@(private = "file") +_ROT_8L: simd.u32x4 : {8, 8, 8, 8} +@(private = "file") +_ROT_8R: simd.u32x4 : {24, 24, 24, 24} +@(private = "file") +_ROT_16: simd.u32x4 : {16, 16, 16, 16} + +when ODIN_ENDIAN == .Big { + @(private = "file") + _increment_counter :: #force_inline proc "contextless" (ctx: ^Context) -> simd.u32x4 { + // In the Big Endian case, the low and high portions in the vector + // are flipped, so the 64-bit addition can't be done with a simple + // vector add. + x := &ctx._s + + new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + 1 + x[12] = u32(new_ctr) + x[13] = u32(new_ctr >> 32) + + return intrinsics.unaligned_load(transmute(^simd.u32x4)&x[12]) + } + + // Convert the endian-ness of the components of a u32x4 vector, for + // the purposes of output. 
+ @(private = "file") + _byteswap_u32x4 :: #force_inline proc "contextless" (v: simd.u32x4) -> simd.u32x4 { + return( + transmute(simd.u32x4)simd.shuffle( + transmute(simd.u8x16)v, + transmute(simd.u8x16)v, + 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, + ) + ) + } +} else { + @(private = "file") + _VEC_ONE: simd.u64x2 : {1, 0} +} + +@(private = "file") +_dq_round_simd128 :: #force_inline proc "contextless" ( + v0, v1, v2, v3: simd.u32x4, +) -> ( + simd.u32x4, + simd.u32x4, + simd.u32x4, + simd.u32x4, +) { + v0, v1, v2, v3 := v0, v1, v2, v3 + + // a += b; d ^= a; d = ROTW16(d); + v0 = simd.add(v0, v1) + v3 = simd.bit_xor(v3, v0) + v3 = simd.bit_xor(simd.shl(v3, _ROT_16), simd.shr(v3, _ROT_16)) + + // c += d; b ^= c; b = ROTW12(b); + v2 = simd.add(v2, v3) + v1 = simd.bit_xor(v1, v2) + v1 = simd.bit_xor(simd.shl(v1, _ROT_12L), simd.shr(v1, _ROT_12R)) + + // a += b; d ^= a; d = ROTW8(d); + v0 = simd.add(v0, v1) + v3 = simd.bit_xor(v3, v0) + v3 = simd.bit_xor(simd.shl(v3, _ROT_8L), simd.shr(v3, _ROT_8R)) + + // c += d; b ^= c; b = ROTW7(b); + v2 = simd.add(v2, v3) + v1 = simd.bit_xor(v1, v2) + v1 = simd.bit_xor(simd.shl(v1, _ROT_7L), simd.shr(v1, _ROT_7R)) + + // b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + v1 = simd.shuffle(v1, v1, 1, 2, 3, 0) + v2 = simd.shuffle(v2, v2, 2, 3, 0, 1) + v3 = simd.shuffle(v3, v3, 3, 0, 1, 2) + + // a += b; d ^= a; d = ROTW16(d); + v0 = simd.add(v0, v1) + v3 = simd.bit_xor(v3, v0) + v3 = simd.bit_xor(simd.shl(v3, _ROT_16), simd.shr(v3, _ROT_16)) + + // c += d; b ^= c; b = ROTW12(b); + v2 = simd.add(v2, v3) + v1 = simd.bit_xor(v1, v2) + v1 = simd.bit_xor(simd.shl(v1, _ROT_12L), simd.shr(v1, _ROT_12R)) + + // a += b; d ^= a; d = ROTW8(d); + v0 = simd.add(v0, v1) + v3 = simd.bit_xor(v3, v0) + v3 = simd.bit_xor(simd.shl(v3, _ROT_8L), simd.shr(v3, _ROT_8R)) + + // c += d; b ^= c; b = ROTW7(b); + v2 = simd.add(v2, v3) + v1 = simd.bit_xor(v1, v2) + v1 = simd.bit_xor(simd.shl(v1, _ROT_7L), simd.shr(v1, _ROT_7R)) + + // b = ROTV3(b); c = ROTV2(c); d = ROTV1(d); + v1 = simd.shuffle(v1, v1, 3, 0, 1, 2) + v2 = simd.shuffle(v2, v2, 2, 3, 0, 1) + v3 = simd.shuffle(v3, v3, 1, 2, 3, 0) + + return v0, v1, v2, v3 +} + +@(private = "file") +_add_state_simd128 :: #force_inline proc "contextless" ( + v0, v1, v2, v3, s0, s1, s2, s3: simd.u32x4, +) -> ( + simd.u32x4, + simd.u32x4, + simd.u32x4, + simd.u32x4, +) { + v0, v1, v2, v3 := v0, v1, v2, v3 + + v0 = simd.add(v0, s0) + v1 = simd.add(v1, s1) + v2 = simd.add(v2, s2) + v3 = simd.add(v3, s3) + + when ODIN_ENDIAN == .Big { + v0 = _byteswap_u32x4(v0) + v1 = _byteswap_u32x4(v1) + v2 = _byteswap_u32x4(v2) + v3 = _byteswap_u32x4(v3) + } + + return v0, v1, v2, v3 +} + +@(private = "file") +_xor_simd128 :: #force_inline proc "contextless" ( + src: [^]simd.u32x4, + v0, v1, v2, v3: simd.u32x4, +) -> ( + simd.u32x4, + simd.u32x4, + simd.u32x4, + simd.u32x4, +) { + v0, v1, v2, v3 := v0, v1, v2, v3 + + v0 = simd.bit_xor(v0, intrinsics.unaligned_load((^simd.u32x4)(src[0:]))) + v1 = simd.bit_xor(v1, intrinsics.unaligned_load((^simd.u32x4)(src[1:]))) + v2 = simd.bit_xor(v2, intrinsics.unaligned_load((^simd.u32x4)(src[2:]))) + v3 = simd.bit_xor(v3, intrinsics.unaligned_load((^simd.u32x4)(src[3:]))) + + return v0, v1, v2, v3 +} + +@(private = "file") +_store_simd128 :: #force_inline proc "contextless" ( + dst: [^]simd.u32x4, + v0, v1, v2, v3: simd.u32x4, +) { + intrinsics.unaligned_store((^simd.u32x4)(dst[0:]), v0) + intrinsics.unaligned_store((^simd.u32x4)(dst[1:]), v1) + intrinsics.unaligned_store((^simd.u32x4)(dst[2:]), v2) + 
intrinsics.unaligned_store((^simd.u32x4)(dst[3:]), v3) +} + +// is_performant returns true iff the target and current host both support +// "enough" 128-bit SIMD to make this implementation performant. +is_performant :: proc "contextless" () -> bool { + when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 || ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 { + when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 { + req_features :: info.CPU_Features{.asimd} + } else when ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 { + req_features :: info.CPU_Features{.sse2, .ssse3} + } + + features, ok := info.cpu_features.? + if !ok { + return false + } + + return features >= req_features + } else when ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32 { + return intrinsics.has_target_feature("simd128") + } else { + return false + } +} + +@(enable_target_feature = TARGET_SIMD_FEATURES) +stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) { + // Enforce the maximum consumed keystream per nonce. + _chacha20.check_counter_limit(ctx, nr_blocks) + + dst_v := ([^]simd.u32x4)(raw_data(dst)) + src_v := ([^]simd.u32x4)(raw_data(src)) + + x := &ctx._s + n := nr_blocks + + // The state vector is an array of uint32s in native byte-order. + x_v := ([^]simd.u32x4)(raw_data(x)) + s0 := intrinsics.unaligned_load((^simd.u32x4)(x_v[0:])) + s1 := intrinsics.unaligned_load((^simd.u32x4)(x_v[1:])) + s2 := intrinsics.unaligned_load((^simd.u32x4)(x_v[2:])) + s3 := intrinsics.unaligned_load((^simd.u32x4)(x_v[3:])) + + // 8 blocks at a time. + // + // Note: This is only worth it on Aarch64. + when ODIN_ARCH == .arm64 { + for ; n >= 8; n = n - 8 { + v0, v1, v2, v3 := s0, s1, s2, s3 + + when ODIN_ENDIAN == .Little { + s7 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s3, _VEC_ONE) + } else { + s7 := _increment_counter(ctx) + } + v4, v5, v6, v7 := s0, s1, s2, s7 + + when ODIN_ENDIAN == .Little { + s11 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s7, _VEC_ONE) + } else { + s11 := _increment_counter(ctx) + } + v8, v9, v10, v11 := s0, s1, s2, s11 + + when ODIN_ENDIAN == .Little { + s15 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s11, _VEC_ONE) + } else { + s15 := _increment_counter(ctx) + } + v12, v13, v14, v15 := s0, s1, s2, s15 + + when ODIN_ENDIAN == .Little { + s19 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s15, _VEC_ONE) + } else { + s19 := _increment_counter(ctx) + } + + v16, v17, v18, v19 := s0, s1, s2, s19 + when ODIN_ENDIAN == .Little { + s23 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s19, _VEC_ONE) + } else { + s23 := _increment_counter(ctx) + } + + v20, v21, v22, v23 := s0, s1, s2, s23 + when ODIN_ENDIAN == .Little { + s27 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s23, _VEC_ONE) + } else { + s27 := _increment_counter(ctx) + } + + v24, v25, v26, v27 := s0, s1, s2, s27 + when ODIN_ENDIAN == .Little { + s31 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s27, _VEC_ONE) + } else { + s31 := _increment_counter(ctx) + } + v28, v29, v30, v31 := s0, s1, s2, s31 + + for i := _chacha20.ROUNDS; i > 0; i = i - 2 { + v0, v1, v2, v3 = _dq_round_simd128(v0, v1, v2, v3) + v4, v5, v6, v7 = _dq_round_simd128(v4, v5, v6, v7) + v8, v9, v10, v11 = _dq_round_simd128(v8, v9, v10, v11) + v12, v13, v14, v15 = _dq_round_simd128(v12, v13, v14, v15) + v16, v17, v18, v19 = _dq_round_simd128(v16, v17, v18, v19) + v20, v21, v22, v23 = _dq_round_simd128(v20, v21, v22, v23) + v24, v25, v26, v27 = _dq_round_simd128(v24, v25, v26, v27) + v28, v29, v30, v31 = _dq_round_simd128(v28, v29, 
v30, v31) + } + + v0, v1, v2, v3 = _add_state_simd128(v0, v1, v2, v3, s0, s1, s2, s3) + v4, v5, v6, v7 = _add_state_simd128(v4, v5, v6, v7, s0, s1, s2, s7) + v8, v9, v10, v11 = _add_state_simd128(v8, v9, v10, v11, s0, s1, s2, s11) + v12, v13, v14, v15 = _add_state_simd128(v12, v13, v14, v15, s0, s1, s2, s15) + v16, v17, v18, v19 = _add_state_simd128(v16, v17, v18, v19, s0, s1, s2, s19) + v20, v21, v22, v23 = _add_state_simd128(v20, v21, v22, v23, s0, s1, s2, s23) + v24, v25, v26, v27 = _add_state_simd128(v24, v25, v26, v27, s0, s1, s2, s27) + v28, v29, v30, v31 = _add_state_simd128(v28, v29, v30, v31, s0, s1, s2, s31) + + #no_bounds_check { + if src != nil { + v0, v1, v2, v3 = _xor_simd128(src_v, v0, v1, v2, v3) + v4, v5, v6, v7 = _xor_simd128(src_v[4:], v4, v5, v6, v7) + v8, v9, v10, v11 = _xor_simd128(src_v[8:], v8, v9, v10, v11) + v12, v13, v14, v15 = _xor_simd128(src_v[12:], v12, v13, v14, v15) + v16, v17, v18, v19 = _xor_simd128(src_v[16:], v16, v17, v18, v19) + v20, v21, v22, v23 = _xor_simd128(src_v[20:], v20, v21, v22, v23) + v24, v25, v26, v27 = _xor_simd128(src_v[24:], v24, v25, v26, v27) + v28, v29, v30, v31 = _xor_simd128(src_v[28:], v28, v29, v30, v31) + src_v = src_v[32:] + } + + _store_simd128(dst_v, v0, v1, v2, v3) + _store_simd128(dst_v[4:], v4, v5, v6, v7) + _store_simd128(dst_v[8:], v8, v9, v10, v11) + _store_simd128(dst_v[12:], v12, v13, v14, v15) + _store_simd128(dst_v[16:], v16, v17, v18, v19) + _store_simd128(dst_v[20:], v20, v21, v22, v23) + _store_simd128(dst_v[24:], v24, v25, v26, v27) + _store_simd128(dst_v[28:], v28, v29, v30, v31) + dst_v = dst_v[32:] + } + + when ODIN_ENDIAN == .Little { + // s31 holds the most current counter, so `s3 = s31 + 1`. + s3 = transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s31, _VEC_ONE) + } else { + s3 = _increment_counter(ctx) + } + } + } + + // 4 blocks at a time. + // + // Note: The i386 target lacks the required number of registers + // for this to be performant, so it is skipped. 
+ when ODIN_ARCH != .i386 { + for ; n >= 4; n = n - 4 { + v0, v1, v2, v3 := s0, s1, s2, s3 + + when ODIN_ENDIAN == .Little { + s7 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s3, _VEC_ONE) + } else { + s7 := _increment_counter(ctx) + } + v4, v5, v6, v7 := s0, s1, s2, s7 + + when ODIN_ENDIAN == .Little { + s11 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s7, _VEC_ONE) + } else { + s11 := _increment_counter(ctx) + } + v8, v9, v10, v11 := s0, s1, s2, s11 + + when ODIN_ENDIAN == .Little { + s15 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s11, _VEC_ONE) + } else { + s15 := _increment_counter(ctx) + } + v12, v13, v14, v15 := s0, s1, s2, s15 + + for i := _chacha20.ROUNDS; i > 0; i = i - 2 { + v0, v1, v2, v3 = _dq_round_simd128(v0, v1, v2, v3) + v4, v5, v6, v7 = _dq_round_simd128(v4, v5, v6, v7) + v8, v9, v10, v11 = _dq_round_simd128(v8, v9, v10, v11) + v12, v13, v14, v15 = _dq_round_simd128(v12, v13, v14, v15) + } + + v0, v1, v2, v3 = _add_state_simd128(v0, v1, v2, v3, s0, s1, s2, s3) + v4, v5, v6, v7 = _add_state_simd128(v4, v5, v6, v7, s0, s1, s2, s7) + v8, v9, v10, v11 = _add_state_simd128(v8, v9, v10, v11, s0, s1, s2, s11) + v12, v13, v14, v15 = _add_state_simd128(v12, v13, v14, v15, s0, s1, s2, s15) + + #no_bounds_check { + if src != nil { + v0, v1, v2, v3 = _xor_simd128(src_v, v0, v1, v2, v3) + v4, v5, v6, v7 = _xor_simd128(src_v[4:], v4, v5, v6, v7) + v8, v9, v10, v11 = _xor_simd128(src_v[8:], v8, v9, v10, v11) + v12, v13, v14, v15 = _xor_simd128(src_v[12:], v12, v13, v14, v15) + src_v = src_v[16:] + } + + _store_simd128(dst_v, v0, v1, v2, v3) + _store_simd128(dst_v[4:], v4, v5, v6, v7) + _store_simd128(dst_v[8:], v8, v9, v10, v11) + _store_simd128(dst_v[12:], v12, v13, v14, v15) + dst_v = dst_v[16:] + } + + when ODIN_ENDIAN == .Little { + // s15 holds the most current counter, so `s3 = s15 + 1`. + s3 = transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s15, _VEC_ONE) + } else { + s3 = _increment_counter(ctx) + } + } + } + + // 1 block at a time. + for ; n > 0; n = n - 1 { + v0, v1, v2, v3 := s0, s1, s2, s3 + + for i := _chacha20.ROUNDS; i > 0; i = i - 2 { + v0, v1, v2, v3 = _dq_round_simd128(v0, v1, v2, v3) + } + v0, v1, v2, v3 = _add_state_simd128(v0, v1, v2, v3, s0, s1, s2, s3) + + #no_bounds_check { + if src != nil { + v0, v1, v2, v3 = _xor_simd128(src_v, v0, v1, v2, v3) + src_v = src_v[4:] + } + + _store_simd128(dst_v, v0, v1, v2, v3) + dst_v = dst_v[4:] + } + + // Increment the counter. Overflow checking is done upon + // entry into the routine, so a 64-bit increment safely + // covers both cases. + when ODIN_ENDIAN == .Little { + s3 = transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s3, _VEC_ONE) + } else { + s3 = _increment_counter(ctx) + } + } + + when ODIN_ENDIAN == .Little { + // Write back the counter to the state. 
+ intrinsics.unaligned_store((^simd.u32x4)(x_v[3:]), s3) + } +} + +@(enable_target_feature = TARGET_SIMD_FEATURES) +hchacha20 :: proc "contextless" (dst, key, nonce: []byte) { + v0 := simd.u32x4{_chacha20.SIGMA_0, _chacha20.SIGMA_1, _chacha20.SIGMA_2, _chacha20.SIGMA_3} + v1 := intrinsics.unaligned_load((^simd.u32x4)(&key[0])) + v2 := intrinsics.unaligned_load((^simd.u32x4)(&key[16])) + v3 := intrinsics.unaligned_load((^simd.u32x4)(&nonce[0])) + + when ODIN_ENDIAN == .Big { + v1 = _byteswap_u32x4(v1) + v2 = _byteswap_u32x4(v2) + v3 = _byteswap_u32x4(v3) + } + + for i := _chacha20.ROUNDS; i > 0; i = i - 2 { + v0, v1, v2, v3 = _dq_round_simd128(v0, v1, v2, v3) + } + + when ODIN_ENDIAN == .Big { + v0 = _byteswap_u32x4(v0) + v3 = _byteswap_u32x4(v3) + } + + dst_v := ([^]simd.u32x4)(raw_data(dst)) + intrinsics.unaligned_store((^simd.u32x4)(dst_v[0:]), v0) + intrinsics.unaligned_store((^simd.u32x4)(dst_v[1:]), v3) +} diff --git a/core/crypto/_chacha20/simd256/chacha20_simd256.odin b/core/crypto/_chacha20/simd256/chacha20_simd256.odin new file mode 100644 index 000000000..7e84509e1 --- /dev/null +++ b/core/crypto/_chacha20/simd256/chacha20_simd256.odin @@ -0,0 +1,319 @@ +//+build amd64 +package chacha20_simd256 + +import "base:intrinsics" +import "core:crypto/_chacha20" +import chacha_simd128 "core:crypto/_chacha20/simd128" +import "core:simd" +import "core:sys/info" + +// This is loosely based on Ted Krovetz's public domain C intrinsic +// implementations. While written using `core:simd`, this is currently +// amd64 specific because we do not have a way to detect ARM SVE. +// +// See: +// supercop-20230530/crypto_stream/chacha20/krovetz/vec128 +// supercop-20230530/crypto_stream/chacha20/krovetz/avx2 + +#assert(ODIN_ENDIAN == .Little) + +@(private = "file") +_ROT_7L: simd.u32x8 : {7, 7, 7, 7, 7, 7, 7, 7} +@(private = "file") +_ROT_7R: simd.u32x8 : {25, 25, 25, 25, 25, 25, 25, 25} +@(private = "file") +_ROT_12L: simd.u32x8 : {12, 12, 12, 12, 12, 12, 12, 12} +@(private = "file") +_ROT_12R: simd.u32x8 : {20, 20, 20, 20, 20, 20, 20, 20} +@(private = "file") +_ROT_8L: simd.u32x8 : {8, 8, 8, 8, 8, 8, 8, 8} +@(private = "file") +_ROT_8R: simd.u32x8 : {24, 24, 24, 24, 24, 24, 24, 24} +@(private = "file") +_ROT_16: simd.u32x8 : {16, 16, 16, 16, 16, 16, 16, 16} +@(private = "file") +_VEC_ZERO_ONE: simd.u64x4 : {0, 0, 1, 0} +@(private = "file") +_VEC_TWO: simd.u64x4 : {2, 0, 2, 0} + +// is_performant returns true iff the target and current host both support +// "enough" SIMD to make this implementation performant. +is_performant :: proc "contextless" () -> bool { + req_features :: info.CPU_Features{.avx, .avx2} + + features, ok := info.cpu_features.? 
+ if !ok { + return false + } + + return features >= req_features +} + +@(private = "file") +_dq_round_simd256 :: #force_inline proc "contextless" ( + v0, v1, v2, v3: simd.u32x8, +) -> ( + simd.u32x8, + simd.u32x8, + simd.u32x8, + simd.u32x8, +) { + v0, v1, v2, v3 := v0, v1, v2, v3 + + // a += b; d ^= a; d = ROTW16(d); + v0 = simd.add(v0, v1) + v3 = simd.bit_xor(v3, v0) + v3 = simd.bit_xor(simd.shl(v3, _ROT_16), simd.shr(v3, _ROT_16)) + + // c += d; b ^= c; b = ROTW12(b); + v2 = simd.add(v2, v3) + v1 = simd.bit_xor(v1, v2) + v1 = simd.bit_xor(simd.shl(v1, _ROT_12L), simd.shr(v1, _ROT_12R)) + + // a += b; d ^= a; d = ROTW8(d); + v0 = simd.add(v0, v1) + v3 = simd.bit_xor(v3, v0) + v3 = simd.bit_xor(simd.shl(v3, _ROT_8L), simd.shr(v3, _ROT_8R)) + + // c += d; b ^= c; b = ROTW7(b); + v2 = simd.add(v2, v3) + v1 = simd.bit_xor(v1, v2) + v1 = simd.bit_xor(simd.shl(v1, _ROT_7L), simd.shr(v1, _ROT_7R)) + + // b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + v1 = simd.shuffle(v1, v1, 1, 2, 3, 0, 5, 6, 7, 4) + v2 = simd.shuffle(v2, v2, 2, 3, 0, 1, 6, 7, 4, 5) + v3 = simd.shuffle(v3, v3, 3, 0, 1, 2, 7, 4, 5, 6) + + // a += b; d ^= a; d = ROTW16(d); + v0 = simd.add(v0, v1) + v3 = simd.bit_xor(v3, v0) + v3 = simd.bit_xor(simd.shl(v3, _ROT_16), simd.shr(v3, _ROT_16)) + + // c += d; b ^= c; b = ROTW12(b); + v2 = simd.add(v2, v3) + v1 = simd.bit_xor(v1, v2) + v1 = simd.bit_xor(simd.shl(v1, _ROT_12L), simd.shr(v1, _ROT_12R)) + + // a += b; d ^= a; d = ROTW8(d); + v0 = simd.add(v0, v1) + v3 = simd.bit_xor(v3, v0) + v3 = simd.bit_xor(simd.shl(v3, _ROT_8L), simd.shr(v3, _ROT_8R)) + + // c += d; b ^= c; b = ROTW7(b); + v2 = simd.add(v2, v3) + v1 = simd.bit_xor(v1, v2) + v1 = simd.bit_xor(simd.shl(v1, _ROT_7L), simd.shr(v1, _ROT_7R)) + + // b = ROTV3(b); c = ROTV2(c); d = ROTV1(d); + v1 = simd.shuffle(v1, v1, 3, 0, 1, 2, 7, 4, 5, 6) + v2 = simd.shuffle(v2, v2, 2, 3, 0, 1, 6, 7, 4, 5) + v3 = simd.shuffle(v3, v3, 1, 2, 3, 0, 5, 6, 7, 4) + + return v0, v1, v2, v3 +} + +@(private = "file") +_add_and_permute_state_simd256 :: #force_inline proc "contextless" ( + v0, v1, v2, v3, s0, s1, s2, s3: simd.u32x8, +) -> ( + simd.u32x8, + simd.u32x8, + simd.u32x8, + simd.u32x8, +) { + t0 := simd.add(v0, s0) + t1 := simd.add(v1, s1) + t2 := simd.add(v2, s2) + t3 := simd.add(v3, s3) + + // Big Endian would byteswap here. + + // Each of v0 .. v3 has 128-bits of keystream for 2 separate blocks. + // permute the state such that (r0, r1) contains block 0, and (r2, r3) + // contains block 1. 
+ r0 := simd.shuffle(t0, t1, 0, 1, 2, 3, 8, 9, 10, 11) + r2 := simd.shuffle(t0, t1, 4, 5, 6, 7, 12, 13, 14, 15) + r1 := simd.shuffle(t2, t3, 0, 1, 2, 3, 8, 9, 10, 11) + r3 := simd.shuffle(t2, t3, 4, 5, 6, 7, 12, 13, 14, 15) + + return r0, r1, r2, r3 +} + +@(private = "file") +_xor_simd256 :: #force_inline proc "contextless" ( + src: [^]simd.u32x8, + v0, v1, v2, v3: simd.u32x8, +) -> ( + simd.u32x8, + simd.u32x8, + simd.u32x8, + simd.u32x8, +) { + v0, v1, v2, v3 := v0, v1, v2, v3 + + v0 = simd.bit_xor(v0, intrinsics.unaligned_load((^simd.u32x8)(src[0:]))) + v1 = simd.bit_xor(v1, intrinsics.unaligned_load((^simd.u32x8)(src[1:]))) + v2 = simd.bit_xor(v2, intrinsics.unaligned_load((^simd.u32x8)(src[2:]))) + v3 = simd.bit_xor(v3, intrinsics.unaligned_load((^simd.u32x8)(src[3:]))) + + return v0, v1, v2, v3 +} + +@(private = "file") +_xor_simd256_x1 :: #force_inline proc "contextless" ( + src: [^]simd.u32x8, + v0, v1: simd.u32x8, +) -> ( + simd.u32x8, + simd.u32x8, +) { + v0, v1 := v0, v1 + + v0 = simd.bit_xor(v0, intrinsics.unaligned_load((^simd.u32x8)(src[0:]))) + v1 = simd.bit_xor(v1, intrinsics.unaligned_load((^simd.u32x8)(src[1:]))) + + return v0, v1 +} + +@(private = "file") +_store_simd256 :: #force_inline proc "contextless" ( + dst: [^]simd.u32x8, + v0, v1, v2, v3: simd.u32x8, +) { + intrinsics.unaligned_store((^simd.u32x8)(dst[0:]), v0) + intrinsics.unaligned_store((^simd.u32x8)(dst[1:]), v1) + intrinsics.unaligned_store((^simd.u32x8)(dst[2:]), v2) + intrinsics.unaligned_store((^simd.u32x8)(dst[3:]), v3) +} + +@(private = "file") +_store_simd256_x1 :: #force_inline proc "contextless" ( + dst: [^]simd.u32x8, + v0, v1: simd.u32x8, +) { + intrinsics.unaligned_store((^simd.u32x8)(dst[0:]), v0) + intrinsics.unaligned_store((^simd.u32x8)(dst[1:]), v1) +} + +@(enable_target_feature = "sse2,ssse3,avx,avx2") +stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) { + // Enforce the maximum consumed keystream per nonce. + _chacha20.check_counter_limit(ctx, nr_blocks) + + dst_v := ([^]simd.u32x8)(raw_data(dst)) + src_v := ([^]simd.u32x8)(raw_data(src)) + + x := &ctx._s + n := nr_blocks + + // The state vector is an array of uint32s in native byte-order. + // Setup s0 .. s3 such that each register stores 2 copies of the + // state. + x_v := ([^]simd.u32x4)(raw_data(x)) + t0 := intrinsics.unaligned_load((^simd.u32x4)(x_v[0:])) + t1 := intrinsics.unaligned_load((^simd.u32x4)(x_v[1:])) + t2 := intrinsics.unaligned_load((^simd.u32x4)(x_v[2:])) + t3 := intrinsics.unaligned_load((^simd.u32x4)(x_v[3:])) + s0 := simd.swizzle(t0, 0, 1, 2, 3, 0, 1, 2, 3) + s1 := simd.swizzle(t1, 0, 1, 2, 3, 0, 1, 2, 3) + s2 := simd.swizzle(t2, 0, 1, 2, 3, 0, 1, 2, 3) + s3 := simd.swizzle(t3, 0, 1, 2, 3, 0, 1, 2, 3) + + // Advance the counter in the 2nd copy of the state by one. + s3 = transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s3, _VEC_ZERO_ONE) + + // 8 blocks at a time. 
+ for ; n >= 8; n = n - 8 { + v0, v1, v2, v3 := s0, s1, s2, s3 + + s7 := transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s3, _VEC_TWO) + v4, v5, v6, v7 := s0, s1, s2, s7 + + s11 := transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s7, _VEC_TWO) + v8, v9, v10, v11 := s0, s1, s2, s11 + + s15 := transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s11, _VEC_TWO) + v12, v13, v14, v15 := s0, s1, s2, s15 + + for i := _chacha20.ROUNDS; i > 0; i = i - 2 { + v0, v1, v2, v3 = _dq_round_simd256(v0, v1, v2, v3) + v4, v5, v6, v7 = _dq_round_simd256(v4, v5, v6, v7) + v8, v9, v10, v11 = _dq_round_simd256(v8, v9, v10, v11) + v12, v13, v14, v15 = _dq_round_simd256(v12, v13, v14, v15) + } + + v0, v1, v2, v3 = _add_and_permute_state_simd256(v0, v1, v2, v3, s0, s1, s2, s3) + v4, v5, v6, v7 = _add_and_permute_state_simd256(v4, v5, v6, v7, s0, s1, s2, s7) + v8, v9, v10, v11 = _add_and_permute_state_simd256(v8, v9, v10, v11, s0, s1, s2, s11) + v12, v13, v14, v15 = _add_and_permute_state_simd256(v12, v13, v14, v15, s0, s1, s2, s15) + + #no_bounds_check { + if src != nil { + v0, v1, v2, v3 = _xor_simd256(src_v, v0, v1, v2, v3) + v4, v5, v6, v7 = _xor_simd256(src_v[4:], v4, v5, v6, v7) + v8, v9, v10, v11 = _xor_simd256(src_v[8:], v8, v9, v10, v11) + v12, v13, v14, v15 = _xor_simd256(src_v[12:], v12, v13, v14, v15) + src_v = src_v[16:] + } + + _store_simd256(dst_v, v0, v1, v2, v3) + _store_simd256(dst_v[4:], v4, v5, v6, v7) + _store_simd256(dst_v[8:], v8, v9, v10, v11) + _store_simd256(dst_v[12:], v12, v13, v14, v15) + dst_v = dst_v[16:] + } + + s3 = transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s15, _VEC_TWO) + } + + + // 2 (or 1) block at a time. + for ; n > 0; n = n - 2 { + v0, v1, v2, v3 := s0, s1, s2, s3 + + for i := _chacha20.ROUNDS; i > 0; i = i - 2 { + v0, v1, v2, v3 = _dq_round_simd256(v0, v1, v2, v3) + } + v0, v1, v2, v3 = _add_and_permute_state_simd256(v0, v1, v2, v3, s0, s1, s2, s3) + + if n == 1 { + // Note: No need to advance src_v, dst_v, or increment the counter + // since this is guaranteed to be the final block. + #no_bounds_check { + if src != nil { + v0, v1 = _xor_simd256_x1(src_v, v0, v1) + } + + _store_simd256_x1(dst_v, v0, v1) + } + break + } + + #no_bounds_check { + if src != nil { + v0, v1, v2, v3 = _xor_simd256(src_v, v0, v1, v2, v3) + src_v = src_v[4:] + } + + _store_simd256(dst_v, v0, v1, v2, v3) + dst_v = dst_v[4:] + } + + s3 = transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s3, _VEC_TWO) + } + + // Write back the counter. Doing it this way, saves having to + // pull out the correct counter value from s3. + new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + u64(nr_blocks) + ctx._s[12] = u32(new_ctr) + ctx._s[13] = u32(new_ctr >> 32) +} + +@(enable_target_feature = "sse2,ssse3,avx") +hchacha20 :: proc "contextless" (dst, key, nonce: []byte) { + // We can just enable AVX and call the simd128 code as going + // wider has 0 performance benefit, but VEX encoded instructions + // is nice. + #force_inline chacha_simd128.hchacha20(dst, key, nonce) +}
\ No newline at end of file diff --git a/core/crypto/_chacha20/simd256/chacha20_simd256_stub.odin b/core/crypto/_chacha20/simd256/chacha20_simd256_stub.odin new file mode 100644 index 000000000..15edf4a68 --- /dev/null +++ b/core/crypto/_chacha20/simd256/chacha20_simd256_stub.odin @@ -0,0 +1,17 @@ +//+build !amd64 +package chacha20_simd256 + +import "base:intrinsics" +import "core:crypto/_chacha20" + +is_performant :: proc "contextless" () -> bool { + return false +} + +stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) { + panic("crypto/chacha20: simd256 implementation unsupported") +} + +hchacha20 :: proc "contextless" (dst, key, nonce: []byte) { + intrinsics.trap() +}
\ No newline at end of file diff --git a/core/crypto/chacha20/chacha20.odin b/core/crypto/chacha20/chacha20.odin index 73d3e1ea2..1fa25f573 100644 --- a/core/crypto/chacha20/chacha20.odin +++ b/core/crypto/chacha20/chacha20.odin @@ -8,119 +8,66 @@ See: package chacha20 import "core:bytes" -import "core:encoding/endian" -import "core:math/bits" +import "core:crypto/_chacha20" import "core:mem" // KEY_SIZE is the (X)ChaCha20 key size in bytes. -KEY_SIZE :: 32 +KEY_SIZE :: _chacha20.KEY_SIZE // NONCE_SIZE is the ChaCha20 nonce size in bytes. -NONCE_SIZE :: 12 +NONCE_SIZE :: _chacha20.NONCE_SIZE // XNONCE_SIZE is the XChaCha20 nonce size in bytes. -XNONCE_SIZE :: 24 - -@(private) -_MAX_CTR_IETF :: 0xffffffff - -@(private) -_BLOCK_SIZE :: 64 -@(private) -_STATE_SIZE_U32 :: 16 -@(private) -_ROUNDS :: 20 - -@(private) -_SIGMA_0: u32 : 0x61707865 -@(private) -_SIGMA_1: u32 : 0x3320646e -@(private) -_SIGMA_2: u32 : 0x79622d32 -@(private) -_SIGMA_3: u32 : 0x6b206574 +XNONCE_SIZE :: _chacha20.XNONCE_SIZE // Context is a ChaCha20 or XChaCha20 instance. Context :: struct { - _s: [_STATE_SIZE_U32]u32, - _buffer: [_BLOCK_SIZE]byte, - _off: int, - _is_ietf_flavor: bool, - _is_initialized: bool, + _state: _chacha20.Context, + _impl: Implementation, } // init inititializes a Context for ChaCha20 or XChaCha20 with the provided // key and nonce. -init :: proc(ctx: ^Context, key, nonce: []byte) { +init :: proc(ctx: ^Context, key, nonce: []byte, impl := Implementation.Simd256) { if len(key) != KEY_SIZE { - panic("crypto/chacha20: invalid ChaCha20 key size") + panic("crypto/chacha20: invalid (X)ChaCha20 key size") } - if n_len := len(nonce); n_len != NONCE_SIZE && n_len != XNONCE_SIZE { + if l := len(nonce); l != NONCE_SIZE && l != XNONCE_SIZE { panic("crypto/chacha20: invalid (X)ChaCha20 nonce size") } k, n := key, nonce - // Derive the XChaCha20 subkey and sub-nonce via HChaCha20. + init_impl(ctx, impl) + is_xchacha := len(nonce) == XNONCE_SIZE if is_xchacha { - sub_key := ctx._buffer[:KEY_SIZE] - _hchacha20(sub_key, k, n) + sub_nonce: [NONCE_SIZE]byte + sub_key := ctx._state._buffer[:KEY_SIZE] + hchacha20(sub_key, k, n, ctx._impl) k = sub_key - n = n[16:24] + copy(sub_nonce[4:], n[16:]) + n = sub_nonce[:] } - ctx._s[0] = _SIGMA_0 - ctx._s[1] = _SIGMA_1 - ctx._s[2] = _SIGMA_2 - ctx._s[3] = _SIGMA_3 - ctx._s[4] = endian.unchecked_get_u32le(k[0:4]) - ctx._s[5] = endian.unchecked_get_u32le(k[4:8]) - ctx._s[6] = endian.unchecked_get_u32le(k[8:12]) - ctx._s[7] = endian.unchecked_get_u32le(k[12:16]) - ctx._s[8] = endian.unchecked_get_u32le(k[16:20]) - ctx._s[9] = endian.unchecked_get_u32le(k[20:24]) - ctx._s[10] = endian.unchecked_get_u32le(k[24:28]) - ctx._s[11] = endian.unchecked_get_u32le(k[28:32]) - ctx._s[12] = 0 - if !is_xchacha { - ctx._s[13] = endian.unchecked_get_u32le(n[0:4]) - ctx._s[14] = endian.unchecked_get_u32le(n[4:8]) - ctx._s[15] = endian.unchecked_get_u32le(n[8:12]) - } else { - ctx._s[13] = 0 - ctx._s[14] = endian.unchecked_get_u32le(n[0:4]) - ctx._s[15] = endian.unchecked_get_u32le(n[4:8]) + _chacha20.init(&ctx._state, k, n, is_xchacha) + if is_xchacha { // The sub-key is stored in the keystream buffer. While // this will be overwritten in most circumstances, explicitly // clear it out early. - mem.zero_explicit(&ctx._buffer, KEY_SIZE) + mem.zero_explicit(&ctx._state._buffer, KEY_SIZE) } - - ctx._off = _BLOCK_SIZE - ctx._is_ietf_flavor = !is_xchacha - ctx._is_initialized = true } // seek seeks the (X)ChaCha20 stream counter to the specified block. 
seek :: proc(ctx: ^Context, block_nr: u64) { - assert(ctx._is_initialized) - - if ctx._is_ietf_flavor { - if block_nr > _MAX_CTR_IETF { - panic("crypto/chacha20: attempted to seek past maximum counter") - } - } else { - ctx._s[13] = u32(block_nr >> 32) - } - ctx._s[12] = u32(block_nr) - ctx._off = _BLOCK_SIZE + _chacha20.seek(&ctx._state, block_nr) } // xor_bytes XORs each byte in src with bytes taken from the (X)ChaCha20 // keystream, and writes the resulting output to dst. Dst and src MUST // alias exactly or not at all. xor_bytes :: proc(ctx: ^Context, dst, src: []byte) { - assert(ctx._is_initialized) + assert(ctx._state._is_initialized) src, dst := src, dst if dst_len := len(dst); dst_len < len(src) { @@ -131,12 +78,13 @@ xor_bytes :: proc(ctx: ^Context, dst, src: []byte) { panic("crypto/chacha20: dst and src alias inexactly") } - for remaining := len(src); remaining > 0; { + st := &ctx._state + #no_bounds_check for remaining := len(src); remaining > 0; { // Process multiple blocks at once - if ctx._off == _BLOCK_SIZE { - if nr_blocks := remaining / _BLOCK_SIZE; nr_blocks > 0 { - direct_bytes := nr_blocks * _BLOCK_SIZE - _do_blocks(ctx, dst, src, nr_blocks) + if st._off == _chacha20.BLOCK_SIZE { + if nr_blocks := remaining / _chacha20.BLOCK_SIZE; nr_blocks > 0 { + direct_bytes := nr_blocks * _chacha20.BLOCK_SIZE + stream_blocks(ctx, dst, src, nr_blocks) remaining -= direct_bytes if remaining == 0 { return @@ -147,17 +95,17 @@ xor_bytes :: proc(ctx: ^Context, dst, src: []byte) { // If there is a partial block, generate and buffer 1 block // worth of keystream. - _do_blocks(ctx, ctx._buffer[:], nil, 1) - ctx._off = 0 + stream_blocks(ctx, st._buffer[:], nil, 1) + st._off = 0 } // Process partial blocks from the buffered keystream. - to_xor := min(_BLOCK_SIZE - ctx._off, remaining) - buffered_keystream := ctx._buffer[ctx._off:] + to_xor := min(_chacha20.BLOCK_SIZE - st._off, remaining) + buffered_keystream := st._buffer[st._off:] for i := 0; i < to_xor; i = i + 1 { dst[i] = buffered_keystream[i] ~ src[i] } - ctx._off += to_xor + st._off += to_xor dst = dst[to_xor:] src = src[to_xor:] remaining -= to_xor @@ -166,15 +114,15 @@ xor_bytes :: proc(ctx: ^Context, dst, src: []byte) { // keystream_bytes fills dst with the raw (X)ChaCha20 keystream output. keystream_bytes :: proc(ctx: ^Context, dst: []byte) { - assert(ctx._is_initialized) + assert(ctx._state._is_initialized) - dst := dst - for remaining := len(dst); remaining > 0; { + dst, st := dst, &ctx._state + #no_bounds_check for remaining := len(dst); remaining > 0; { // Process multiple blocks at once - if ctx._off == _BLOCK_SIZE { - if nr_blocks := remaining / _BLOCK_SIZE; nr_blocks > 0 { - direct_bytes := nr_blocks * _BLOCK_SIZE - _do_blocks(ctx, dst, nil, nr_blocks) + if st._off == _chacha20.BLOCK_SIZE { + if nr_blocks := remaining / _chacha20.BLOCK_SIZE; nr_blocks > 0 { + direct_bytes := nr_blocks * _chacha20.BLOCK_SIZE + stream_blocks(ctx, dst, nil, nr_blocks) remaining -= direct_bytes if remaining == 0 { return @@ -184,15 +132,15 @@ keystream_bytes :: proc(ctx: ^Context, dst: []byte) { // If there is a partial block, generate and buffer 1 block // worth of keystream. - _do_blocks(ctx, ctx._buffer[:], nil, 1) - ctx._off = 0 + stream_blocks(ctx, st._buffer[:], nil, 1) + st._off = 0 } // Process partial blocks from the buffered keystream. 
- to_copy := min(_BLOCK_SIZE - ctx._off, remaining) - buffered_keystream := ctx._buffer[ctx._off:] + to_copy := min(_chacha20.BLOCK_SIZE - st._off, remaining) + buffered_keystream := st._buffer[st._off:] copy(dst[:to_copy], buffered_keystream[:to_copy]) - ctx._off += to_copy + st._off += to_copy dst = dst[to_copy:] remaining -= to_copy } @@ -201,366 +149,5 @@ keystream_bytes :: proc(ctx: ^Context, dst: []byte) { // reset sanitizes the Context. The Context must be re-initialized to // be used again. reset :: proc(ctx: ^Context) { - mem.zero_explicit(&ctx._s, size_of(ctx._s)) - mem.zero_explicit(&ctx._buffer, size_of(ctx._buffer)) - - ctx._is_initialized = false -} - -@(private) -_do_blocks :: proc(ctx: ^Context, dst, src: []byte, nr_blocks: int) { - // Enforce the maximum consumed keystream per nonce. - // - // While all modern "standard" definitions of ChaCha20 use - // the IETF 32-bit counter, for XChaCha20 most common - // implementations allow for a 64-bit counter. - // - // Honestly, the answer here is "use a MRAE primitive", but - // go with common practice in the case of XChaCha20. - if ctx._is_ietf_flavor { - if u64(ctx._s[12]) + u64(nr_blocks) > 0xffffffff { - panic("crypto/chacha20: maximum ChaCha20 keystream per nonce reached") - } - } else { - ctr := (u64(ctx._s[13]) << 32) | u64(ctx._s[12]) - if _, carry := bits.add_u64(ctr, u64(nr_blocks), 0); carry != 0 { - panic("crypto/chacha20: maximum XChaCha20 keystream per nonce reached") - } - } - - dst, src := dst, src - x := &ctx._s - for n := 0; n < nr_blocks; n = n + 1 { - x0, x1, x2, x3 := _SIGMA_0, _SIGMA_1, _SIGMA_2, _SIGMA_3 - x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15] - - for i := _ROUNDS; i > 0; i = i - 2 { - // Even when forcing inlining manually inlining all of - // these is decently faster. 
- - // quarterround(x, 0, 4, 8, 12) - x0 += x4 - x12 ~= x0 - x12 = bits.rotate_left32(x12, 16) - x8 += x12 - x4 ~= x8 - x4 = bits.rotate_left32(x4, 12) - x0 += x4 - x12 ~= x0 - x12 = bits.rotate_left32(x12, 8) - x8 += x12 - x4 ~= x8 - x4 = bits.rotate_left32(x4, 7) - - // quarterround(x, 1, 5, 9, 13) - x1 += x5 - x13 ~= x1 - x13 = bits.rotate_left32(x13, 16) - x9 += x13 - x5 ~= x9 - x5 = bits.rotate_left32(x5, 12) - x1 += x5 - x13 ~= x1 - x13 = bits.rotate_left32(x13, 8) - x9 += x13 - x5 ~= x9 - x5 = bits.rotate_left32(x5, 7) - - // quarterround(x, 2, 6, 10, 14) - x2 += x6 - x14 ~= x2 - x14 = bits.rotate_left32(x14, 16) - x10 += x14 - x6 ~= x10 - x6 = bits.rotate_left32(x6, 12) - x2 += x6 - x14 ~= x2 - x14 = bits.rotate_left32(x14, 8) - x10 += x14 - x6 ~= x10 - x6 = bits.rotate_left32(x6, 7) - - // quarterround(x, 3, 7, 11, 15) - x3 += x7 - x15 ~= x3 - x15 = bits.rotate_left32(x15, 16) - x11 += x15 - x7 ~= x11 - x7 = bits.rotate_left32(x7, 12) - x3 += x7 - x15 ~= x3 - x15 = bits.rotate_left32(x15, 8) - x11 += x15 - x7 ~= x11 - x7 = bits.rotate_left32(x7, 7) - - // quarterround(x, 0, 5, 10, 15) - x0 += x5 - x15 ~= x0 - x15 = bits.rotate_left32(x15, 16) - x10 += x15 - x5 ~= x10 - x5 = bits.rotate_left32(x5, 12) - x0 += x5 - x15 ~= x0 - x15 = bits.rotate_left32(x15, 8) - x10 += x15 - x5 ~= x10 - x5 = bits.rotate_left32(x5, 7) - - // quarterround(x, 1, 6, 11, 12) - x1 += x6 - x12 ~= x1 - x12 = bits.rotate_left32(x12, 16) - x11 += x12 - x6 ~= x11 - x6 = bits.rotate_left32(x6, 12) - x1 += x6 - x12 ~= x1 - x12 = bits.rotate_left32(x12, 8) - x11 += x12 - x6 ~= x11 - x6 = bits.rotate_left32(x6, 7) - - // quarterround(x, 2, 7, 8, 13) - x2 += x7 - x13 ~= x2 - x13 = bits.rotate_left32(x13, 16) - x8 += x13 - x7 ~= x8 - x7 = bits.rotate_left32(x7, 12) - x2 += x7 - x13 ~= x2 - x13 = bits.rotate_left32(x13, 8) - x8 += x13 - x7 ~= x8 - x7 = bits.rotate_left32(x7, 7) - - // quarterround(x, 3, 4, 9, 14) - x3 += x4 - x14 ~= x3 - x14 = bits.rotate_left32(x14, 16) - x9 += x14 - x4 ~= x9 - x4 = bits.rotate_left32(x4, 12) - x3 += x4 - x14 ~= x3 - x14 = bits.rotate_left32(x14, 8) - x9 += x14 - x4 ~= x9 - x4 = bits.rotate_left32(x4, 7) - } - - x0 += _SIGMA_0 - x1 += _SIGMA_1 - x2 += _SIGMA_2 - x3 += _SIGMA_3 - x4 += x[4] - x5 += x[5] - x6 += x[6] - x7 += x[7] - x8 += x[8] - x9 += x[9] - x10 += x[10] - x11 += x[11] - x12 += x[12] - x13 += x[13] - x14 += x[14] - x15 += x[15] - - // While the "correct" answer to getting more performance out of - // this is "use vector operations", support for that is currently - // a work in progress/to be designed. - // - // In the meantime: - // - The caller(s) ensure that src/dst are valid. - // - The compiler knows if the target is picky about alignment. 
- - #no_bounds_check { - if src != nil { - endian.unchecked_put_u32le(dst[0:4], endian.unchecked_get_u32le(src[0:4]) ~ x0) - endian.unchecked_put_u32le(dst[4:8], endian.unchecked_get_u32le(src[4:8]) ~ x1) - endian.unchecked_put_u32le(dst[8:12], endian.unchecked_get_u32le(src[8:12]) ~ x2) - endian.unchecked_put_u32le(dst[12:16], endian.unchecked_get_u32le(src[12:16]) ~ x3) - endian.unchecked_put_u32le(dst[16:20], endian.unchecked_get_u32le(src[16:20]) ~ x4) - endian.unchecked_put_u32le(dst[20:24], endian.unchecked_get_u32le(src[20:24]) ~ x5) - endian.unchecked_put_u32le(dst[24:28], endian.unchecked_get_u32le(src[24:28]) ~ x6) - endian.unchecked_put_u32le(dst[28:32], endian.unchecked_get_u32le(src[28:32]) ~ x7) - endian.unchecked_put_u32le(dst[32:36], endian.unchecked_get_u32le(src[32:36]) ~ x8) - endian.unchecked_put_u32le(dst[36:40], endian.unchecked_get_u32le(src[36:40]) ~ x9) - endian.unchecked_put_u32le(dst[40:44], endian.unchecked_get_u32le(src[40:44]) ~ x10) - endian.unchecked_put_u32le(dst[44:48], endian.unchecked_get_u32le(src[44:48]) ~ x11) - endian.unchecked_put_u32le(dst[48:52], endian.unchecked_get_u32le(src[48:52]) ~ x12) - endian.unchecked_put_u32le(dst[52:56], endian.unchecked_get_u32le(src[52:56]) ~ x13) - endian.unchecked_put_u32le(dst[56:60], endian.unchecked_get_u32le(src[56:60]) ~ x14) - endian.unchecked_put_u32le(dst[60:64], endian.unchecked_get_u32le(src[60:64]) ~ x15) - src = src[_BLOCK_SIZE:] - } else { - endian.unchecked_put_u32le(dst[0:4], x0) - endian.unchecked_put_u32le(dst[4:8], x1) - endian.unchecked_put_u32le(dst[8:12], x2) - endian.unchecked_put_u32le(dst[12:16], x3) - endian.unchecked_put_u32le(dst[16:20], x4) - endian.unchecked_put_u32le(dst[20:24], x5) - endian.unchecked_put_u32le(dst[24:28], x6) - endian.unchecked_put_u32le(dst[28:32], x7) - endian.unchecked_put_u32le(dst[32:36], x8) - endian.unchecked_put_u32le(dst[36:40], x9) - endian.unchecked_put_u32le(dst[40:44], x10) - endian.unchecked_put_u32le(dst[44:48], x11) - endian.unchecked_put_u32le(dst[48:52], x12) - endian.unchecked_put_u32le(dst[52:56], x13) - endian.unchecked_put_u32le(dst[56:60], x14) - endian.unchecked_put_u32le(dst[60:64], x15) - } - dst = dst[_BLOCK_SIZE:] - } - - // Increment the counter. Overflow checking is done upon - // entry into the routine, so a 64-bit increment safely - // covers both cases. 
- new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + 1 - x[12] = u32(new_ctr) - x[13] = u32(new_ctr >> 32) - } -} - -@(private) -_hchacha20 :: proc "contextless" (dst, key, nonce: []byte) { - x0, x1, x2, x3 := _SIGMA_0, _SIGMA_1, _SIGMA_2, _SIGMA_3 - x4 := endian.unchecked_get_u32le(key[0:4]) - x5 := endian.unchecked_get_u32le(key[4:8]) - x6 := endian.unchecked_get_u32le(key[8:12]) - x7 := endian.unchecked_get_u32le(key[12:16]) - x8 := endian.unchecked_get_u32le(key[16:20]) - x9 := endian.unchecked_get_u32le(key[20:24]) - x10 := endian.unchecked_get_u32le(key[24:28]) - x11 := endian.unchecked_get_u32le(key[28:32]) - x12 := endian.unchecked_get_u32le(nonce[0:4]) - x13 := endian.unchecked_get_u32le(nonce[4:8]) - x14 := endian.unchecked_get_u32le(nonce[8:12]) - x15 := endian.unchecked_get_u32le(nonce[12:16]) - - for i := _ROUNDS; i > 0; i = i - 2 { - // quarterround(x, 0, 4, 8, 12) - x0 += x4 - x12 ~= x0 - x12 = bits.rotate_left32(x12, 16) - x8 += x12 - x4 ~= x8 - x4 = bits.rotate_left32(x4, 12) - x0 += x4 - x12 ~= x0 - x12 = bits.rotate_left32(x12, 8) - x8 += x12 - x4 ~= x8 - x4 = bits.rotate_left32(x4, 7) - - // quarterround(x, 1, 5, 9, 13) - x1 += x5 - x13 ~= x1 - x13 = bits.rotate_left32(x13, 16) - x9 += x13 - x5 ~= x9 - x5 = bits.rotate_left32(x5, 12) - x1 += x5 - x13 ~= x1 - x13 = bits.rotate_left32(x13, 8) - x9 += x13 - x5 ~= x9 - x5 = bits.rotate_left32(x5, 7) - - // quarterround(x, 2, 6, 10, 14) - x2 += x6 - x14 ~= x2 - x14 = bits.rotate_left32(x14, 16) - x10 += x14 - x6 ~= x10 - x6 = bits.rotate_left32(x6, 12) - x2 += x6 - x14 ~= x2 - x14 = bits.rotate_left32(x14, 8) - x10 += x14 - x6 ~= x10 - x6 = bits.rotate_left32(x6, 7) - - // quarterround(x, 3, 7, 11, 15) - x3 += x7 - x15 ~= x3 - x15 = bits.rotate_left32(x15, 16) - x11 += x15 - x7 ~= x11 - x7 = bits.rotate_left32(x7, 12) - x3 += x7 - x15 ~= x3 - x15 = bits.rotate_left32(x15, 8) - x11 += x15 - x7 ~= x11 - x7 = bits.rotate_left32(x7, 7) - - // quarterround(x, 0, 5, 10, 15) - x0 += x5 - x15 ~= x0 - x15 = bits.rotate_left32(x15, 16) - x10 += x15 - x5 ~= x10 - x5 = bits.rotate_left32(x5, 12) - x0 += x5 - x15 ~= x0 - x15 = bits.rotate_left32(x15, 8) - x10 += x15 - x5 ~= x10 - x5 = bits.rotate_left32(x5, 7) - - // quarterround(x, 1, 6, 11, 12) - x1 += x6 - x12 ~= x1 - x12 = bits.rotate_left32(x12, 16) - x11 += x12 - x6 ~= x11 - x6 = bits.rotate_left32(x6, 12) - x1 += x6 - x12 ~= x1 - x12 = bits.rotate_left32(x12, 8) - x11 += x12 - x6 ~= x11 - x6 = bits.rotate_left32(x6, 7) - - // quarterround(x, 2, 7, 8, 13) - x2 += x7 - x13 ~= x2 - x13 = bits.rotate_left32(x13, 16) - x8 += x13 - x7 ~= x8 - x7 = bits.rotate_left32(x7, 12) - x2 += x7 - x13 ~= x2 - x13 = bits.rotate_left32(x13, 8) - x8 += x13 - x7 ~= x8 - x7 = bits.rotate_left32(x7, 7) - - // quarterround(x, 3, 4, 9, 14) - x3 += x4 - x14 ~= x3 - x14 = bits.rotate_left32(x14, 16) - x9 += x14 - x4 ~= x9 - x4 = bits.rotate_left32(x4, 12) - x3 += x4 - x14 ~= x3 - x14 = bits.rotate_left32(x14, 8) - x9 += x14 - x4 ~= x9 - x4 = bits.rotate_left32(x4, 7) - } - - endian.unchecked_put_u32le(dst[0:4], x0) - endian.unchecked_put_u32le(dst[4:8], x1) - endian.unchecked_put_u32le(dst[8:12], x2) - endian.unchecked_put_u32le(dst[12:16], x3) - endian.unchecked_put_u32le(dst[16:20], x12) - endian.unchecked_put_u32le(dst[20:24], x13) - endian.unchecked_put_u32le(dst[24:28], x14) - endian.unchecked_put_u32le(dst[28:32], x15) + _chacha20.reset(&ctx._state) } diff --git a/core/crypto/chacha20/chacha20_impl.odin b/core/crypto/chacha20/chacha20_impl.odin new file mode 100644 index 000000000..67b95ca25 --- 
/dev/null +++ b/core/crypto/chacha20/chacha20_impl.odin @@ -0,0 +1,52 @@ +package chacha20 + +import "base:intrinsics" +import "core:crypto/_chacha20/ref" +import "core:crypto/_chacha20/simd128" +import "core:crypto/_chacha20/simd256" + +// Implementation is a ChaCha20 implementation. Most callers will not need +// to use this as the package will automatically select the most performant +// implementation available. +Implementation :: enum { + Portable, + Simd128, + Simd256, +} + +@(private) +init_impl :: proc(ctx: ^Context, impl: Implementation) { + impl := impl + if impl == .Simd256 && !simd256.is_performant() { + impl = .Simd128 + } + if impl == .Simd128 && !simd128.is_performant() { + impl = .Portable + } + + ctx._impl = impl +} + +@(private) +stream_blocks :: proc(ctx: ^Context, dst, src: []byte, nr_blocks: int) { + switch ctx._impl { + case .Simd256: + simd256.stream_blocks(&ctx._state, dst, src, nr_blocks) + case .Simd128: + simd128.stream_blocks(&ctx._state, dst, src, nr_blocks) + case .Portable: + ref.stream_blocks(&ctx._state, dst, src, nr_blocks) + } +} + +@(private) +hchacha20 :: proc "contextless" (dst, key, nonce: []byte, impl: Implementation) { + switch impl { + case .Simd256: + simd256.hchacha20(dst, key, nonce) + case .Simd128: + simd128.hchacha20(dst, key, nonce) + case .Portable: + ref.hchacha20(dst, key, nonce) + } +} |
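The reference and scalar paths above spell out the same four-step mixing sequence over and over because, per the in-code comment, manual inlining is measurably faster than a separate procedure. For readers following along, the sketch below restates that sequence as a standalone quarter-round procedure; it is an illustrative aid with a hypothetical package and procedure name, not part of the commit.

```odin
package chacha20_notes

import "core:math/bits"

// quarter_round is the ChaCha quarter round over four 32-bit state words.
// Each manually unrolled "quarterround(x, a, b, c, d)" block in the diffs
// above performs exactly this sequence, with the results written back to
// the corresponding state words. Note that `~` is Odin's XOR operator.
quarter_round :: proc "contextless" (a, b, c, d: u32) -> (u32, u32, u32, u32) {
	a, b, c, d := a, b, c, d

	a += b; d ~= a; d = bits.rotate_left32(d, 16)
	c += d; b ~= c; b = bits.rotate_left32(b, 12)
	a += b; d ~= a; d = bits.rotate_left32(d, 8)
	c += d; b ~= c; b = bits.rotate_left32(b, 7)

	return a, b, c, d
}
```

In the simd128 and simd256 backends, the same add/xor/rotate steps operate on whole vectors of state words at once, with the rotations expressed as paired `simd.shl`/`simd.shr` shifts and the diagonalization handled by `simd.shuffle`.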