// Copyright (c) 2017 Thomas Pornin
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS "AS IS" AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#+build amd64
package aes_hw_intel

import "base:intrinsics"
import "core:crypto/_aes"
import "core:simd/x86"

@(private = "file")
GHASH_STRIDE_HW :: 4
@(private = "file")
GHASH_STRIDE_BYTES_HW :: GHASH_STRIDE_HW * _aes.GHASH_BLOCK_SIZE

// GHASH is defined over elements of GF(2^128) with "full little-endian"
// representation: leftmost byte is least significant, and, within each
// byte, leftmost _bit_ is least significant. The natural ordering in
// x86 is "mixed little-endian": bytes are ordered from least to most
// significant, but bits within a byte are in most-to-least significant
// order. Going to full little-endian representation would require
// reversing bits within each byte, which is doable but expensive.
//
// Instead, we go to full big-endian representation, by swapping bytes
// around, which is done with a single _mm_shuffle_epi8() opcode (it
// comes with SSSE3; all CPUs that offer pclmulqdq also have SSSE3). We
// can use a full big-endian representation because in a carryless
// multiplication, we have a nice bit reversal property:
//
//   rev_128(x) * rev_128(y) = rev_255(x * y)
//
// So by using full big-endian, we still get the right result, except
// that it is right-shifted by 1 bit. The left-shift is relatively
// inexpensive, and it can be mutualised.
//
// Since SSE2 opcodes do not have facilities for shifting full 128-bit
// values with bit precision, we have to break down values into 64-bit
// chunks. We number chunks from 0 to 3 in left to right order.

@(private = "file")
_BYTESWAP_INDEX: x86.__m128i : {0x08090a0b0c0d0e0f, 0x0001020304050607}

@(private = "file", require_results, enable_target_feature = "sse2,ssse3")
byteswap :: #force_inline proc "contextless" (x: x86.__m128i) -> x86.__m128i {
	return x86._mm_shuffle_epi8(x, _BYTESWAP_INDEX)
}

// From a 128-bit value kw, compute kx as the XOR of the two 64-bit
// halves of kw (into the right half of kx; left half is unspecified),
// and return kx.
@(private = "file", require_results, enable_target_feature = "sse2")
bk :: #force_inline proc "contextless" (kw: x86.__m128i) -> x86.__m128i {
	return x86._mm_xor_si128(kw, x86._mm_shuffle_epi32(kw, 0x0e))
}
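
// Illustrative sketch only (not used by the code below): a full multiply in
// GF(2^128), in the byteswapped representation described above, follows the
// same pattern as the code in ghash: three carryless multiplies (Karatsuba
// over the 64-bit halves), recombination into four 64-bit chunks, the
// mutualised left-shift by 1, and the final reduction. The helper name
// mul_f128 is hypothetical and introduced purely for exposition; it mirrors
// the single-block path of ghash below.
@(private = "file", require_results, enable_target_feature = "sse2,pclmul")
mul_f128 :: #force_inline proc "contextless" (aw, ax, hw, hx: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
	t1 := x86._mm_clmulepi64_si128(aw, hw, 0x11) // high half * high half
	t3 := x86._mm_clmulepi64_si128(aw, hw, 0x00) // low half * low half
	t2 := x86._mm_clmulepi64_si128(ax, hx, 0x00) // (a_hi ^ a_lo) * (h_hi ^ h_lo)
	t2 = x86._mm_xor_si128(t2, x86._mm_xor_si128(t1, t3)) // Karatsuba middle term
	t0 := x86._mm_shuffle_epi32(t1, 0x0E)
	t1 = x86._mm_xor_si128(t1, x86._mm_shuffle_epi32(t2, 0x0E))
	t2 = x86._mm_xor_si128(t2, x86._mm_shuffle_epi32(t3, 0x0E))
	t0, t1, t2, t3 = sl_256(t0, t1, t2, t3)
	t0, t1 = reduce_f128(t0, t1, t2, t3)
	return pbk(t0, t1)
}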
@(private = "file", enable_target_feature = "sse2") pbk :: #force_inline proc "contextless" (k0, k1: x86.__m128i) -> (x86.__m128i, x86.__m128i) { kw := x86._mm_unpacklo_epi64(k1, k0) kx := x86._mm_xor_si128(k0, k1) return kw, kx } // Left-shift by 1 bit a 256-bit value (in four 64-bit words). @(private = "file", require_results, enable_target_feature = "sse2") sl_256 :: #force_inline proc "contextless" (x0, x1, x2, x3: x86.__m128i) -> (x86.__m128i, x86.__m128i, x86.__m128i, x86.__m128i) { x0, x1, x2, x3 := x0, x1, x2, x3 x0 = x86._mm_or_si128(x86._mm_slli_epi64(x0, 1), x86._mm_srli_epi64(x1, 63)) x1 = x86._mm_or_si128(x86._mm_slli_epi64(x1, 1), x86._mm_srli_epi64(x2, 63)) x2 = x86._mm_or_si128(x86._mm_slli_epi64(x2, 1), x86._mm_srli_epi64(x3, 63)) x3 = x86._mm_slli_epi64(x3, 1) return x0, x1, x2, x3 } // Perform reduction in GF(2^128). @(private = "file", require_results, enable_target_feature = "sse2") reduce_f128 :: #force_inline proc "contextless" (x0, x1, x2, x3: x86.__m128i) -> (x86.__m128i, x86.__m128i) { x0, x1, x2 := x0, x1, x2 x1 = x86._mm_xor_si128( x1, x86._mm_xor_si128( x86._mm_xor_si128( x3, x86._mm_srli_epi64(x3, 1)), x86._mm_xor_si128( x86._mm_srli_epi64(x3, 2), x86._mm_srli_epi64(x3, 7)))) x2 = x86._mm_xor_si128( x86._mm_xor_si128( x2, x86._mm_slli_epi64(x3, 63)), x86._mm_xor_si128( x86._mm_slli_epi64(x3, 62), x86._mm_slli_epi64(x3, 57))) x0 = x86._mm_xor_si128( x0, x86._mm_xor_si128( x86._mm_xor_si128( x2, x86._mm_srli_epi64(x2, 1)), x86._mm_xor_si128( x86._mm_srli_epi64(x2, 2), x86._mm_srli_epi64(x2, 7)))) x1 = x86._mm_xor_si128( x86._mm_xor_si128( x1, x86._mm_slli_epi64(x2, 63)), x86._mm_xor_si128( x86._mm_slli_epi64(x2, 62), x86._mm_slli_epi64(x2, 57))) return x0, x1 } // Square value kw in GF(2^128) into (dw,dx). @(private = "file", require_results, enable_target_feature = "sse2,pclmul") square_f128 :: #force_inline proc "contextless" (kw: x86.__m128i) -> (x86.__m128i, x86.__m128i) { z1 := x86._mm_clmulepi64_si128(kw, kw, 0x11) z3 := x86._mm_clmulepi64_si128(kw, kw, 0x00) z0 := x86._mm_shuffle_epi32(z1, 0x0E) z2 := x86._mm_shuffle_epi32(z3, 0x0E) z0, z1, z2, z3 = sl_256(z0, z1, z2, z3) z0, z1 = reduce_f128(z0, z1, z2, z3) return pbk(z0, z1) } // ghash calculates the GHASH of data, with the key `key`, and input `dst` // and `data`, and stores the resulting digest in `dst`. // // Note: `dst` is both an input and an output, to support easy implementation // of GCM. @(enable_target_feature = "sse2,ssse3,pclmul") ghash :: proc "contextless" (dst, key, data: []byte) #no_bounds_check { if len(dst) != _aes.GHASH_BLOCK_SIZE || len(key) != _aes.GHASH_BLOCK_SIZE { panic_contextless("aes/ghash: invalid dst or key size") } // Note: BearSSL opts to copy the remainder into a zero-filled // 64-byte buffer. We do something slightly more simple. // Load key and dst (h and y). 

	// Load key and dst (h and y).
	yw := intrinsics.unaligned_load((^x86.__m128i)(raw_data(dst)))
	h1w := intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
	yw = byteswap(yw)
	h1w = byteswap(h1w)
	h1x := bk(h1w)

	// Process 4 blocks at a time
	buf := data
	l := len(buf)
	if l >= GHASH_STRIDE_BYTES_HW {
		// Compute h2 = h^2
		h2w, h2x := square_f128(h1w)

		// Compute h3 = h^3 = h*(h^2)
		t1 := x86._mm_clmulepi64_si128(h1w, h2w, 0x11)
		t3 := x86._mm_clmulepi64_si128(h1w, h2w, 0x00)
		t2 := x86._mm_xor_si128(
			x86._mm_clmulepi64_si128(h1x, h2x, 0x00),
			x86._mm_xor_si128(t1, t3),
		)
		t0 := x86._mm_shuffle_epi32(t1, 0x0E)
		t1 = x86._mm_xor_si128(t1, x86._mm_shuffle_epi32(t2, 0x0E))
		t2 = x86._mm_xor_si128(t2, x86._mm_shuffle_epi32(t3, 0x0E))
		t0, t1, t2, t3 = sl_256(t0, t1, t2, t3)
		t0, t1 = reduce_f128(t0, t1, t2, t3)
		h3w, h3x := pbk(t0, t1)

		// Compute h4 = h^4 = (h^2)^2
		h4w, h4x := square_f128(h2w)

		for l >= GHASH_STRIDE_BYTES_HW {
			aw0 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf)))
			aw1 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf[16:])))
			aw2 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf[32:])))
			aw3 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf[48:])))
			aw0 = byteswap(aw0)
			aw1 = byteswap(aw1)
			aw2 = byteswap(aw2)
			aw3 = byteswap(aw3)
			buf, l = buf[GHASH_STRIDE_BYTES_HW:], l - GHASH_STRIDE_BYTES_HW

			// Fold the running digest into the first block (see the
			// recurrence above), then derive the XORed halves.
			aw0 = x86._mm_xor_si128(aw0, yw)
			ax1 := bk(aw1)
			ax2 := bk(aw2)
			ax3 := bk(aw3)
			ax0 := bk(aw0)

			// Karatsuba: t1 = high halves, t3 = low halves, t2 = middle term.
			t1 = x86._mm_xor_si128(
				x86._mm_xor_si128(
					x86._mm_clmulepi64_si128(aw0, h4w, 0x11),
					x86._mm_clmulepi64_si128(aw1, h3w, 0x11),
				),
				x86._mm_xor_si128(
					x86._mm_clmulepi64_si128(aw2, h2w, 0x11),
					x86._mm_clmulepi64_si128(aw3, h1w, 0x11),
				),
			)
			t3 = x86._mm_xor_si128(
				x86._mm_xor_si128(
					x86._mm_clmulepi64_si128(aw0, h4w, 0x00),
					x86._mm_clmulepi64_si128(aw1, h3w, 0x00),
				),
				x86._mm_xor_si128(
					x86._mm_clmulepi64_si128(aw2, h2w, 0x00),
					x86._mm_clmulepi64_si128(aw3, h1w, 0x00),
				),
			)
			t2 = x86._mm_xor_si128(
				x86._mm_xor_si128(
					x86._mm_clmulepi64_si128(ax0, h4x, 0x00),
					x86._mm_clmulepi64_si128(ax1, h3x, 0x00),
				),
				x86._mm_xor_si128(
					x86._mm_clmulepi64_si128(ax2, h2x, 0x00),
					x86._mm_clmulepi64_si128(ax3, h1x, 0x00),
				),
			)
			t2 = x86._mm_xor_si128(t2, x86._mm_xor_si128(t1, t3))
			t0 = x86._mm_shuffle_epi32(t1, 0x0E)
			t1 = x86._mm_xor_si128(t1, x86._mm_shuffle_epi32(t2, 0x0E))
			t2 = x86._mm_xor_si128(t2, x86._mm_shuffle_epi32(t3, 0x0E))
			t0, t1, t2, t3 = sl_256(t0, t1, t2, t3)
			t0, t1 = reduce_f128(t0, t1, t2, t3)
			yw = x86._mm_unpacklo_epi64(t1, t0)
		}
	}

	// Process 1 block at a time
	for l > 0 {
		src: []byte = ---
		if l >= _aes.GHASH_BLOCK_SIZE {
			src = buf
			buf = buf[_aes.GHASH_BLOCK_SIZE:]
			l -= _aes.GHASH_BLOCK_SIZE
		} else {
			// Pad the final partial block with zeros.
			tmp: [_aes.GHASH_BLOCK_SIZE]byte
			copy(tmp[:], buf)
			src = tmp[:]
			l = 0
		}

		aw := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src)))
		aw = byteswap(aw)

		aw = x86._mm_xor_si128(aw, yw)
		ax := bk(aw)

		t1 := x86._mm_clmulepi64_si128(aw, h1w, 0x11)
		t3 := x86._mm_clmulepi64_si128(aw, h1w, 0x00)
		t2 := x86._mm_clmulepi64_si128(ax, h1x, 0x00)
		t2 = x86._mm_xor_si128(t2, x86._mm_xor_si128(t1, t3))
		t0 := x86._mm_shuffle_epi32(t1, 0x0E)
		t1 = x86._mm_xor_si128(t1, x86._mm_shuffle_epi32(t2, 0x0E))
		t2 = x86._mm_xor_si128(t2, x86._mm_shuffle_epi32(t3, 0x0E))
		t0, t1, t2, t3 = sl_256(t0, t1, t2, t3)
		t0, t1 = reduce_f128(t0, t1, t2, t3)
		yw = x86._mm_unpacklo_epi64(t1, t0)
	}

	// Write back the hash (dst, aka y)
	yw = byteswap(yw)
	intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), yw)
}
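
// Usage sketch (illustrative only): since `dst` is both an input and an
// output, a GCM-style caller accumulates the digest by starting from an
// all-zero block and folding in successive buffers, e.g.
//
//	y: [_aes.GHASH_BLOCK_SIZE]byte // running digest, initially all zero
//	ghash(y[:], h[:], aad)         // absorb the additional authenticated data
//	ghash(y[:], h[:], ciphertext)  // absorb the ciphertext
//	ghash(y[:], h[:], lengths[:])  // absorb the 16-byte length block
//
// where `h` is the 16-byte hash key (in GCM, the block cipher applied to an
// all-zero block), and `aad`, `ciphertext` and `lengths` are hypothetical
// caller-side buffers. Note that within a single call, a trailing partial
// block is zero-padded before being absorbed.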