// Copyright (c) 2016 Thomas Pornin // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // 1. Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // // THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY // DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE // GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. package aes_ct64 // Bitsliced AES for 64-bit general purpose (integer) registers. Each // invocation will process up to 4 blocks at a time. This implementation // is derived from the BearSSL ct64 code, and distributed under a 1-clause // BSD license with permission from the original author. // // WARNING: "hic sunt dracones" // // This package also deliberately exposes enough internals to be able to // function as a replacement for `AESENC` and `AESDEC` from AES-NI, to // allow the implementation of non-AES primitives that use the AES round // function such as AEGIS and Deoxys-II. This should ONLY be done when // implementing something other than AES itself. sub_bytes :: proc "contextless" (q: ^[8]u64) { // This S-box implementation is a straightforward translation of // the circuit described by Boyar and Peralta in "A new // combinational logic minimization technique with applications // to cryptology" (https://eprint.iacr.org/2009/191.pdf). // // Note that variables x* (input) and s* (output) are numbered // in "reverse" order (x0 is the high bit, x7 is the low bit). x0 := q[7] x1 := q[6] x2 := q[5] x3 := q[4] x4 := q[3] x5 := q[2] x6 := q[1] x7 := q[0] // Top linear transformation. y14 := x3 ~ x5 y13 := x0 ~ x6 y9 := x0 ~ x3 y8 := x0 ~ x5 t0 := x1 ~ x2 y1 := t0 ~ x7 y4 := y1 ~ x3 y12 := y13 ~ y14 y2 := y1 ~ x0 y5 := y1 ~ x6 y3 := y5 ~ y8 t1 := x4 ~ y12 y15 := t1 ~ x5 y20 := t1 ~ x1 y6 := y15 ~ x7 y10 := y15 ~ t0 y11 := y20 ~ y9 y7 := x7 ~ y11 y17 := y10 ~ y11 y19 := y10 ~ y8 y16 := t0 ~ y11 y21 := y13 ~ y16 y18 := x0 ~ y16 // Non-linear section. t2 := y12 & y15 t3 := y3 & y6 t4 := t3 ~ t2 t5 := y4 & x7 t6 := t5 ~ t2 t7 := y13 & y16 t8 := y5 & y1 t9 := t8 ~ t7 t10 := y2 & y7 t11 := t10 ~ t7 t12 := y9 & y11 t13 := y14 & y17 t14 := t13 ~ t12 t15 := y8 & y10 t16 := t15 ~ t12 t17 := t4 ~ t14 t18 := t6 ~ t16 t19 := t9 ~ t14 t20 := t11 ~ t16 t21 := t17 ~ y20 t22 := t18 ~ y19 t23 := t19 ~ y21 t24 := t20 ~ y18 t25 := t21 ~ t22 t26 := t21 & t23 t27 := t24 ~ t26 t28 := t25 & t27 t29 := t28 ~ t22 t30 := t23 ~ t24 t31 := t22 ~ t26 t32 := t31 & t30 t33 := t32 ~ t24 t34 := t23 ~ t33 t35 := t27 ~ t33 t36 := t24 & t35 t37 := t36 ~ t34 t38 := t27 ~ t36 t39 := t29 & t38 t40 := t25 ~ t39 t41 := t40 ~ t37 t42 := t29 ~ t33 t43 := t29 ~ t40 t44 := t33 ~ t37 t45 := t42 ~ t41 z0 := t44 & y15 z1 := t37 & y6 z2 := t33 & x7 z3 := t43 & y16 z4 := t40 & y1 z5 := t29 & y7 z6 := t42 & y11 z7 := t45 & y17 z8 := t41 & y10 z9 := t44 & y12 z10 := t37 & y3 z11 := t33 & y4 z12 := t43 & y13 z13 := t40 & y5 z14 := t29 & y2 z15 := t42 & y9 z16 := t45 & y14 z17 := t41 & y8 // Bottom linear transformation. t46 := z15 ~ z16 t47 := z10 ~ z11 t48 := z5 ~ z13 t49 := z9 ~ z10 t50 := z2 ~ z12 t51 := z2 ~ z5 t52 := z7 ~ z8 t53 := z0 ~ z3 t54 := z6 ~ z7 t55 := z16 ~ z17 t56 := z12 ~ t48 t57 := t50 ~ t53 t58 := z4 ~ t46 t59 := z3 ~ t54 t60 := t46 ~ t57 t61 := z14 ~ t57 t62 := t52 ~ t58 t63 := t49 ~ t58 t64 := z4 ~ t59 t65 := t61 ~ t62 t66 := z1 ~ t63 s0 := t59 ~ t63 s6 := t56 ~ ~t62 s7 := t48 ~ ~t60 t67 := t64 ~ t65 s3 := t53 ~ t66 s4 := t51 ~ t66 s5 := t47 ~ t65 s1 := t64 ~ ~s3 s2 := t55 ~ ~t67 q[7] = s0 q[6] = s1 q[5] = s2 q[4] = s3 q[3] = s4 q[2] = s5 q[1] = s6 q[0] = s7 } orthogonalize :: proc "contextless" (q: ^[8]u64) { CL2 :: 0x5555555555555555 CH2 :: 0xAAAAAAAAAAAAAAAA q[0], q[1] = (q[0] & CL2) | ((q[1] & CL2) << 1), ((q[0] & CH2) >> 1) | (q[1] & CH2) q[2], q[3] = (q[2] & CL2) | ((q[3] & CL2) << 1), ((q[2] & CH2) >> 1) | (q[3] & CH2) q[4], q[5] = (q[4] & CL2) | ((q[5] & CL2) << 1), ((q[4] & CH2) >> 1) | (q[5] & CH2) q[6], q[7] = (q[6] & CL2) | ((q[7] & CL2) << 1), ((q[6] & CH2) >> 1) | (q[7] & CH2) CL4 :: 0x3333333333333333 CH4 :: 0xCCCCCCCCCCCCCCCC q[0], q[2] = (q[0] & CL4) | ((q[2] & CL4) << 2), ((q[0] & CH4) >> 2) | (q[2] & CH4) q[1], q[3] = (q[1] & CL4) | ((q[3] & CL4) << 2), ((q[1] & CH4) >> 2) | (q[3] & CH4) q[4], q[6] = (q[4] & CL4) | ((q[6] & CL4) << 2), ((q[4] & CH4) >> 2) | (q[6] & CH4) q[5], q[7] = (q[5] & CL4) | ((q[7] & CL4) << 2), ((q[5] & CH4) >> 2) | (q[7] & CH4) CL8 :: 0x0F0F0F0F0F0F0F0F CH8 :: 0xF0F0F0F0F0F0F0F0 q[0], q[4] = (q[0] & CL8) | ((q[4] & CL8) << 4), ((q[0] & CH8) >> 4) | (q[4] & CH8) q[1], q[5] = (q[1] & CL8) | ((q[5] & CL8) << 4), ((q[1] & CH8) >> 4) | (q[5] & CH8) q[2], q[6] = (q[2] & CL8) | ((q[6] & CL8) << 4), ((q[2] & CH8) >> 4) | (q[6] & CH8) q[3], q[7] = (q[3] & CL8) | ((q[7] & CL8) << 4), ((q[3] & CH8) >> 4) | (q[7] & CH8) } @(require_results) interleave_in :: proc "contextless" (w0, w1, w2, w3: u32) -> (q0, q1: u64) #no_bounds_check { x0, x1, x2, x3 := u64(w0), u64(w1), u64(w2), u64(w3) x0 |= (x0 << 16) x1 |= (x1 << 16) x2 |= (x2 << 16) x3 |= (x3 << 16) x0 &= 0x0000FFFF0000FFFF x1 &= 0x0000FFFF0000FFFF x2 &= 0x0000FFFF0000FFFF x3 &= 0x0000FFFF0000FFFF x0 |= (x0 << 8) x1 |= (x1 << 8) x2 |= (x2 << 8) x3 |= (x3 << 8) x0 &= 0x00FF00FF00FF00FF x1 &= 0x00FF00FF00FF00FF x2 &= 0x00FF00FF00FF00FF x3 &= 0x00FF00FF00FF00FF q0 = x0 | (x2 << 8) q1 = x1 | (x3 << 8) return } @(require_results) interleave_out :: proc "contextless" (q0, q1: u64) -> (w0, w1, w2, w3: u32) { x0 := q0 & 0x00FF00FF00FF00FF x1 := q1 & 0x00FF00FF00FF00FF x2 := (q0 >> 8) & 0x00FF00FF00FF00FF x3 := (q1 >> 8) & 0x00FF00FF00FF00FF x0 |= (x0 >> 8) x1 |= (x1 >> 8) x2 |= (x2 >> 8) x3 |= (x3 >> 8) x0 &= 0x0000FFFF0000FFFF x1 &= 0x0000FFFF0000FFFF x2 &= 0x0000FFFF0000FFFF x3 &= 0x0000FFFF0000FFFF w0 = u32(x0) | u32(x0 >> 16) w1 = u32(x1) | u32(x1 >> 16) w2 = u32(x2) | u32(x2 >> 16) w3 = u32(x3) | u32(x3 >> 16) return } @(private) rotr32 :: #force_inline proc "contextless" (x: u64) -> u64 { return (x << 32) | (x >> 32) }