| field | value | date |
|---|---|---|
| author | Yawning Angel <yawning@schwanenlied.me> | 2024-11-02 11:35:17 +0900 |
| committer | Yawning Angel <yawning@schwanenlied.me> | 2025-03-23 19:14:33 +0900 |
| commit | 982ab11aa1fa1f1b690b1dbe0e6ad0d070a15bbc (patch) | |
| tree | bcb223bc794607f3cd711bd5f1aef33994379c6a /core/crypto/sha2 | |
| parent | f3f5fbd37315f8ebe39c1fa404c2bcf905b8d316 (diff) | |
core/crypto/sha2: Use hardware SHA224/256 when available (AMD64)
Diffstat (limited to 'core/crypto/sha2')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | core/crypto/sha2/sha2.odin | 9 |
| -rw-r--r-- | core/crypto/sha2/sha2_impl_hw_gen.odin | 15 |
| -rw-r--r-- | core/crypto/sha2/sha2_impl_hw_intel.odin | 260 |
3 files changed, 282 insertions, 2 deletions
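
The new dispatch is transparent to users of `core:crypto/sha2`: existing hashing code is unchanged and simply runs faster on CPUs with the SHA extensions. As a minimal sketch (only `is_hardware_accelerated_256` comes from this change; everything else is ordinary Odin), a program can report which transform will be used:

```odin
package main

import "core:crypto/sha2"
import "core:fmt"

main :: proc() {
	// On amd64 this returns true when SSE2, SSSE3, SSE4.1 and the SHA
	// extensions are all present; on every other target the generic stub
	// in sha2_impl_hw_gen.odin returns false.
	if sha2.is_hardware_accelerated_256() {
		fmt.println("SHA-224/SHA-256: using the SHA-NI accelerated transform")
	} else {
		fmt.println("SHA-224/SHA-256: using the portable transform")
	}
}
```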
diff --git a/core/crypto/sha2/sha2.odin b/core/crypto/sha2/sha2.odin
index 1c1ce11b6..bf726c20c 100644
--- a/core/crypto/sha2/sha2.odin
+++ b/core/crypto/sha2/sha2.odin
@@ -15,9 +15,9 @@ package sha2
     zhibog, dotbmp: Initial implementation.
 */
 
-import "core:encoding/endian"
+@(require) import "core:encoding/endian"
 import "core:math/bits"
-import "core:mem"
+@(require) import "core:mem"
 
 // DIGEST_SIZE_224 is the SHA-224 digest size in bytes.
 DIGEST_SIZE_224 :: 28
@@ -397,6 +397,11 @@ SHA512_F4 :: #force_inline proc "contextless" (x: u64) -> u64 {
 @(private)
 sha2_transf :: proc "contextless" (ctx: ^$T, data: []byte) {
 	when T == Context_256 {
+		if is_hardware_accelerated_256() {
+			sha256_transf_hw(ctx, data)
+			return
+		}
+
 		w:      [64]u32
 		wv:     [8]u32
 		t1, t2: u32
diff --git a/core/crypto/sha2/sha2_impl_hw_gen.odin b/core/crypto/sha2/sha2_impl_hw_gen.odin
new file mode 100644
index 000000000..85c7f8b28
--- /dev/null
+++ b/core/crypto/sha2/sha2_impl_hw_gen.odin
@@ -0,0 +1,15 @@
+#+build !amd64
+package sha2
+
+@(private = "file")
+ERR_HW_NOT_SUPPORTED :: "crypto/sha2: hardware implementation unsupported"
+
+// is_hardware_accelerated_256 returns true iff hardware accelerated
+// SHA-224/SHA-256 is supported.
+is_hardware_accelerated_256 :: proc "contextless" () -> bool {
+	return false
+}
+
+sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) {
+	panic_contextless(ERR_HW_NOT_SUPPORTED)
+}
diff --git a/core/crypto/sha2/sha2_impl_hw_intel.odin b/core/crypto/sha2/sha2_impl_hw_intel.odin
new file mode 100644
index 000000000..f16f353df
--- /dev/null
+++ b/core/crypto/sha2/sha2_impl_hw_intel.odin
@@ -0,0 +1,260 @@
+#+build amd64
+package sha2
+
+// Based on the public domain code by Jeffrey Walton, though
+// realistically, there only is one sensible way to write this
+// and Intel's whitepaper covers it.
+//
+// See: https://github.com/noloader/SHA-Intrinsics
+
+import "base:intrinsics"
+import "core:simd"
+import "core:simd/x86"
+import "core:sys/info"
+
+@(private = "file")
+MASK :: x86.__m128i{0x0405060700010203, 0x0c0d0e0f08090a0b}
+
+@(private = "file")
+K_0 :: simd.u64x2{0x71374491428a2f98, 0xe9b5dba5b5c0fbcf}
+@(private = "file")
+K_1 :: simd.u64x2{0x59f111f13956c25b, 0xab1c5ed5923f82a4}
+@(private = "file")
+K_2 :: simd.u64x2{0x12835b01d807aa98, 0x550c7dc3243185be}
+@(private = "file")
+K_3 :: simd.u64x2{0x80deb1fe72be5d74, 0xc19bf1749bdc06a7}
+@(private = "file")
+K_4 :: simd.u64x2{0xefbe4786e49b69c1, 0x240ca1cc0fc19dc6}
+@(private = "file")
+K_5 :: simd.u64x2{0x4a7484aa2de92c6f, 0x76f988da5cb0a9dc}
+@(private = "file")
+K_6 :: simd.u64x2{0xa831c66d983e5152, 0xbf597fc7b00327c8}
+@(private = "file")
+K_7 :: simd.u64x2{0xd5a79147c6e00bf3, 0x1429296706ca6351}
+@(private = "file")
+K_8 :: simd.u64x2{0x2e1b213827b70a85, 0x53380d134d2c6dfc}
+@(private = "file")
+K_9 :: simd.u64x2{0x766a0abb650a7354, 0x92722c8581c2c92e}
+@(private = "file")
+K_10 :: simd.u64x2{0xa81a664ba2bfe8a1, 0xc76c51a3c24b8b70}
+@(private = "file")
+K_11 :: simd.u64x2{0xd6990624d192e819, 0x106aa070f40e3585}
+@(private = "file")
+K_12 :: simd.u64x2{0x1e376c0819a4c116, 0x34b0bcb52748774c}
+@(private = "file")
+K_13 :: simd.u64x2{0x4ed8aa4a391c0cb3, 0x682e6ff35b9cca4f}
+@(private = "file")
+K_14 :: simd.u64x2{0x78a5636f748f82ee, 0x8cc7020884c87814}
+@(private = "file")
+K_15 :: simd.u64x2{0xa4506ceb90befffa, 0xc67178f2bef9a3f7}
+
+
+// is_hardware_accelerated_256 returns true iff hardware accelerated
+// SHA-224/SHA-256 is supported.
+is_hardware_accelerated_256 :: proc "contextless" () -> bool {
+	features, ok := info.cpu_features.?
+	if !ok {
+		return false
+	}
+
+	req_features :: info.CPU_Features{
+		.sse2,
+		.ssse3,
+		.sse41,
+		.sha,
+	}
+	return features >= req_features
+}
+
+@(private, enable_target_feature="sse2,ssse3,sse4.1,sha")
+sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) #no_bounds_check {
+	// Load the state
+	tmp := intrinsics.unaligned_load((^x86.__m128i)(&ctx.h[0]))
+	state_1 := intrinsics.unaligned_load((^x86.__m128i)(&ctx.h[4]))
+
+	tmp = x86._mm_shuffle_epi32(tmp, 0xb1) // CDAB
+	state_1 = x86._mm_shuffle_epi32(state_1, 0x1b) // EFGH
+	state_0 := x86._mm_alignr_epi8(tmp, state_1, 8) // ABEF
+	// state_1 = x86._mm_blend_epi16(state_1, tmp, 0xf0) // CDGH
+	state_1 = kludge_mm_blend_epi16_0xf0(state_1, tmp)
+
+	data := data
+	for len(data) >= BLOCK_SIZE_256 {
+		state_0_save, state_1_save := state_0, state_1
+
+		// Rounds 0-3
+		msg := intrinsics.unaligned_load((^x86.__m128i)(raw_data(data)))
+		msg_0 := x86._mm_shuffle_epi8(msg, MASK)
+		msg = x86._mm_add_epi32(msg_0, x86.__m128i(K_0))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		msg = x86._mm_shuffle_epi32(msg, 0xe)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+
+		// Rounds 4-7
+		msg_1 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(data[16:])))
+		msg_1 = x86._mm_shuffle_epi8(msg_1, MASK)
+		msg = x86._mm_add_epi32(msg_1, x86.__m128i(K_1))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		msg = x86._mm_shuffle_epi32(msg, 0xe)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_0 = x86._mm_sha256msg1_epu32(msg_0, msg_1)
+
+		// Rounds 8-11
+		msg_2 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(data[32:])))
+		msg_2 = x86._mm_shuffle_epi8(msg_2, MASK)
+		msg = x86._mm_add_epi32(msg_2, x86.__m128i(K_2))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		msg = x86._mm_shuffle_epi32(msg, 0xe)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_1 = x86._mm_sha256msg1_epu32(msg_1, msg_2)
+
+		// Rounds 12-15
+		msg_3 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(data[48:])))
+		msg_3 = x86._mm_shuffle_epi8(msg_3, MASK)
+		msg = x86._mm_add_epi32(msg_3, x86.__m128i(K_3))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_3, msg_2, 4)
+		msg_0 = x86._mm_add_epi32(msg_0, tmp)
+		msg_0 = x86._mm_sha256msg2_epu32(msg_0, msg_3)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_2 = x86._mm_sha256msg1_epu32(msg_2, msg_3)
+
+		// Rounds 16-19
+		msg = x86._mm_add_epi32(msg_0, x86.__m128i(K_4))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_0, msg_3, 4)
+		msg_1 = x86._mm_add_epi32(msg_1, tmp)
+		msg_1 = x86._mm_sha256msg2_epu32(msg_1, msg_0)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_3 = x86._mm_sha256msg1_epu32(msg_3, msg_0)
+
+		// Rounds 20-23
+		msg = x86._mm_add_epi32(msg_1, x86.__m128i(K_5))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_1, msg_0, 4)
+		msg_2 = x86._mm_add_epi32(msg_2, tmp)
+		msg_2 = x86._mm_sha256msg2_epu32(msg_2, msg_1)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_0 = x86._mm_sha256msg1_epu32(msg_0, msg_1)
+
+		// Rounds 24-27
+		msg = x86._mm_add_epi32(msg_2, x86.__m128i(K_6))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_2, msg_1, 4)
+		msg_3 = x86._mm_add_epi32(msg_3, tmp)
+		msg_3 = x86._mm_sha256msg2_epu32(msg_3, msg_2)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_1 = x86._mm_sha256msg1_epu32(msg_1, msg_2)
+
+		// Rounds 28-31
+		msg = x86._mm_add_epi32(msg_3, x86.__m128i(K_7))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_3, msg_2, 4)
+		msg_0 = x86._mm_add_epi32(msg_0, tmp)
+		msg_0 = x86._mm_sha256msg2_epu32(msg_0, msg_3)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_2 = x86._mm_sha256msg1_epu32(msg_2, msg_3)
+
+		// Rounds 32-35
+		msg = x86._mm_add_epi32(msg_0, x86.__m128i(K_8))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_0, msg_3, 4)
+		msg_1 = x86._mm_add_epi32(msg_1, tmp)
+		msg_1 = x86._mm_sha256msg2_epu32(msg_1, msg_0)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_3 = x86._mm_sha256msg1_epu32(msg_3, msg_0)
+
+		// Rounds 36-39
+		msg = x86._mm_add_epi32(msg_1, x86.__m128i(K_9))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_1, msg_0, 4)
+		msg_2 = x86._mm_add_epi32(msg_2, tmp)
+		msg_2 = x86._mm_sha256msg2_epu32(msg_2, msg_1)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_0 = x86._mm_sha256msg1_epu32(msg_0, msg_1)
+
+		// Rounds 40-43
+		msg = x86._mm_add_epi32(msg_2, x86.__m128i(K_10))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_2, msg_1, 4)
+		msg_3 = x86._mm_add_epi32(msg_3, tmp)
+		msg_3 = x86._mm_sha256msg2_epu32(msg_3, msg_2)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_1 = x86._mm_sha256msg1_epu32(msg_1, msg_2)
+
+		// Rounds 44-47
+		msg = x86._mm_add_epi32(msg_3, x86.__m128i(K_11))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_3, msg_2, 4)
+		msg_0 = x86._mm_add_epi32(msg_0, tmp)
+		msg_0 = x86._mm_sha256msg2_epu32(msg_0, msg_3)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_2 = x86._mm_sha256msg1_epu32(msg_2, msg_3)
+
+		// Rounds 48-51
+		msg = x86._mm_add_epi32(msg_0, x86.__m128i(K_12))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_0, msg_3, 4)
+		msg_1 = x86._mm_add_epi32(msg_1, tmp)
+		msg_1 = x86._mm_sha256msg2_epu32(msg_1, msg_0)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_3 = x86._mm_sha256msg1_epu32(msg_3, msg_0)
+
+		// Rounds 52-55
+		msg = x86._mm_add_epi32(msg_1, x86.__m128i(K_13))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_1, msg_0, 4)
+		msg_2 = x86._mm_add_epi32(msg_2, tmp)
+		msg_2 = x86._mm_sha256msg2_epu32(msg_2, msg_1)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+
+		/* Rounds 56-59 */
+		msg = x86._mm_add_epi32(msg_2, x86.__m128i(K_14))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_2, msg_1, 4)
+		msg_3 = x86._mm_add_epi32(msg_3, tmp)
+		msg_3 = x86._mm_sha256msg2_epu32(msg_3, msg_2)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+
+		// Rounds 60-63
+		msg = x86._mm_add_epi32(msg_3, x86.__m128i(K_15))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+
+		state_0 = x86._mm_add_epi32(state_0, state_0_save)
+		state_1 = x86._mm_add_epi32(state_1, state_1_save)
+
+		data = data[BLOCK_SIZE_256:]
+	}
+
+	// Write back the updated state
+	tmp = x86._mm_shuffle_epi32(state_0, 0x1b) // FEBA
+	state_1 = x86._mm_shuffle_epi32(state_1, 0xb1) // DCHG
+	// state_0 = x86._mm_blend_epi16(tmp, state_1, 0xf0) // DCBA
+	state_0 = kludge_mm_blend_epi16_0xf0(tmp, state_1)
+	state_1 = x86._mm_alignr_epi8(state_1, tmp, 8) // ABEF
+
+	intrinsics.unaligned_store((^x86.__m128i)(&ctx.h[0]), state_0)
+	intrinsics.unaligned_store((^x86.__m128i)(&ctx.h[4]), state_1)
+}
+
+@(private = "file")
+kludge_mm_blend_epi16_0xf0 :: #force_inline proc "contextless" (a, b: x86.__m128i) -> x86.__m128i {
+	// HACK HACK HACK: LLVM got rid of `llvm.x86.sse41.pblendw`.
+	a_ := simd.to_array(a)
+	b_ := simd.to_array(b)
+	return x86.__m128i{a_[0], b_[1]}
+}
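
Since the hardware and portable transforms must produce identical results, a known-answer check is a natural companion to this change. Below is a hypothetical sketch of such a test, not part of the commit: it assumes it is compiled into package sha2 (so the private `sha2_transf` is reachable) and that `Context_256.h` is the `[8]u32` state array manipulated by the new code. It pushes the single padded block for "abc" through the transform, which takes the SHA-NI path on a capable CPU, and compares the resulting state with the published digest.

```odin
package sha2

import "core:testing"

@(test)
test_sha256_transf_known_answer :: proc(t: ^testing.T) {
	ctx: Context_256
	// SHA-256 initial hash values (FIPS 180-4).
	ctx.h = [8]u32{
		0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
		0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
	}

	// One padded block for the message "abc": the message bytes, the 0x80
	// padding byte, zeros, and the 64-bit big-endian bit length (24).
	block: [BLOCK_SIZE_256]byte
	copy(block[:], "abc")
	block[3] = 0x80
	block[63] = 0x18

	// Dispatches to sha256_transf_hw when is_hardware_accelerated_256()
	// reports support, and to the portable code otherwise.
	sha2_transf(&ctx, block[:])

	// Published SHA-256("abc") digest, as eight state words.
	expected := [8]u32{
		0xba7816bf, 0x8f01cfea, 0x414140de, 0x5dae2223,
		0xb00361a3, 0x96177a9c, 0xb410ff61, 0xf20015ad,
	}
	testing.expect(t, ctx.h == expected, "state mismatch after one block")
}
```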