aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorflysand7 <thebumboni@gmail.com>2024-12-01 11:42:24 +1100
committerflysand7 <thebumboni@gmail.com>2024-12-01 11:42:24 +1100
commit596921fb7a8d1b1deff39e61689415e5cb024d2b (patch)
tree1f763de7dbe1ba95963d356fc22ce7f8495d55d4
parent2141299d2b4b11e07bd57b70ce3635764a058498 (diff)
First pass
-rw-r--r--core/simd/simd.odin2226
1 files changed, 2190 insertions, 36 deletions
diff --git a/core/simd/simd.odin b/core/simd/simd.odin
index 01d11dfbe..52d04156f 100644
--- a/core/simd/simd.odin
+++ b/core/simd/simd.odin
@@ -1,182 +1,2273 @@
+/*
+The SIMD support package.
+
+SIMD (Single Instruction Multiple Data) is a CPU hardware feature that
+introduces special registers and instructions which operate on multiple units
+of data at the same time, which enables faster data processing for
+applications with heavy computational workloads.
+
+In Odin, SIMD is exposed via a special kind of array, called a *SIMD
+vector*. The type of a SIMD vector is written as `#simd [N]T`, where N is a
+power of two, and T could be any basic type (integers, floats, etc.). The
+documentation of this package will call *SIMD vectors* just *vectors*.
+
+SIMD vectors consist of elements, called *scalar values*, or
+*scalars*, each occupying a *lane* of the SIMD vector.
+
+This package implements procedures for working with vectors.
+*/
package simd
import "base:builtin"
import "base:intrinsics"
-// IS_EMULATED is true iff the compile-time target lacks hardware support
-// for at least 128-bit SIMD.
+/*
+Check if SIMD is emulated on a target platform.
+
+This value is `false`, if the compile-time target has the hardware support for
+at least 128-bit SIMD. If the compile-time target lacks the hardware support
+for 128-bit SIMD, this value is `true`, and all SIMD operations will be
+emulated.
+*/
IS_EMULATED :: true when (ODIN_ARCH == .amd64 || ODIN_ARCH == .i386) && !intrinsics.has_target_feature("sse2") else
true when (ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32) && !intrinsics.has_target_feature("neon") else
true when (ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32) && !intrinsics.has_target_feature("simd128") else
true when (ODIN_ARCH == .riscv64) && !intrinsics.has_target_feature("v") else
false
-// 128-bit vector aliases
+/*
+Vector of 16 `u8` lanes (128 bits).
+*/
u8x16 :: #simd[16]u8
+
+/*
+Vector of 16 `i8` lanes (128 bits).
+*/
i8x16 :: #simd[16]i8
+
+/*
+Vector of 8 `u16` lanes (128 bits).
+*/
u16x8 :: #simd[8]u16
+
+/*
+Vector of 8 `i16` lanes (128 bits).
+*/
i16x8 :: #simd[8]i16
+
+/*
+Vector of 4 `u32` lanes (128 bits).
+*/
u32x4 :: #simd[4]u32
+
+/*
+Vector of 4 `i32` lanes (128 bits).
+*/
i32x4 :: #simd[4]i32
+
+/*
+Vector of 2 `u64` lanes (128 bits).
+*/
u64x2 :: #simd[2]u64
+
+/*
+Vector of 2 `i64` lanes (128 bits).
+*/
i64x2 :: #simd[2]i64
+
+/*
+Vector of 4 `f32` lanes (128 bits).
+*/
f32x4 :: #simd[4]f32
+
+/*
+Vector of 2 `f64` lanes (128 bits).
+*/
f64x2 :: #simd[2]f64
+/*
+Vector of 16 `bool` lanes (128 bits).
+*/
boolx16 :: #simd[16]bool
+
+/*
+Vector of 16 `b8` lanes (128 bits).
+*/
b8x16 :: #simd[16]b8
+
+/*
+Vector of 8 `b16` lanes (128 bits).
+*/
b16x8 :: #simd[8]b16
+
+/*
+Vector of 4 `b32` lanes (128 bits).
+*/
b32x4 :: #simd[4]b32
+
+/*
+Vector of 2 `b64` lanes (128 bits).
+*/
b64x2 :: #simd[2]b64
-// 256-bit vector aliases
+/*
+Vector of 32 `u8` lanes (256 bits).
+*/
u8x32 :: #simd[32]u8
+
+/*
+Vector of 32 `i8` lanes (256 bits).
+*/
i8x32 :: #simd[32]i8
+
+/*
+Vector of 16 `u16` lanes (256 bits).
+*/
u16x16 :: #simd[16]u16
+
+/*
+Vector of 16 `i16` lanes (256 bits).
+*/
i16x16 :: #simd[16]i16
+
+/*
+Vector of 8 `u32` lanes (256 bits).
+*/
u32x8 :: #simd[8]u32
+
+/*
+Vector of 8 `i32` lanes (256 bits).
+*/
i32x8 :: #simd[8]i32
+
+/*
+Vector of 4 `u64` lanes (256 bits).
+*/
u64x4 :: #simd[4]u64
+
+/*
+Vector of 4 `i64` lanes (256 bits).
+*/
i64x4 :: #simd[4]i64
+
+/*
+Vector of 8 `f32` lanes (256 bits).
+*/
f32x8 :: #simd[8]f32
+
+/*
+Vector of 4 `f64` lanes (256 bits).
+*/
f64x4 :: #simd[4]f64
+/*
+Vector of 32 `bool` lanes (256 bits).
+*/
boolx32 :: #simd[32]bool
+
+/*
+Vector of 32 `b8` lanes (256 bits).
+*/
b8x32 :: #simd[32]b8
+
+/*
+Vector of 16 `b16` lanes (256 bits).
+*/
b16x16 :: #simd[16]b16
+
+/*
+Vector of 8 `b32` lanes (256 bits).
+*/
b32x8 :: #simd[8]b32
+
+/*
+Vector of 4 `b64` lanes (256 bits).
+*/
b64x4 :: #simd[4]b64
-// 512-bit vector aliases
+/*
+Vector of 64 `u8` lanes (512 bits).
+*/
u8x64 :: #simd[64]u8
+
+/*
+Vector of 64 `i8` lanes (512 bits).
+*/
i8x64 :: #simd[64]i8
+
+/*
+Vector of 32 `u16` lanes (512 bits).
+*/
u16x32 :: #simd[32]u16
+
+/*
+Vector of 32 `i16` lanes (512 bits).
+*/
i16x32 :: #simd[32]i16
+
+/*
+Vector of 16 `u32` lanes (512 bits).
+*/
u32x16 :: #simd[16]u32
+
+/*
+Vector of 16 `i32` lanes (512 bits).
+*/
i32x16 :: #simd[16]i32
+
+/*
+Vector of 8 `u64` lanes (512 bits).
+*/
u64x8 :: #simd[8]u64
+
+/*
+Vector of 8 `i64` lanes (512 bits).
+*/
i64x8 :: #simd[8]i64
+
+/*
+Vector of 16 `f32` lanes (512 bits).
+*/
f32x16 :: #simd[16]f32
+
+/*
+Vector of 8 `f64` lanes (512 bits).
+*/
f64x8 :: #simd[8]f64
+/*
+Vector of 64 `bool` lanes (512 bits).
+*/
boolx64 :: #simd[64]bool
+
+/*
+Vector of 64 `b8` lanes (512 bits).
+*/
b8x64 :: #simd[64]b8
+
+/*
+Vector of 32 `b16` lanes (512 bits).
+*/
b16x32 :: #simd[32]b16
+
+/*
+Vector of 16 `b32` lanes (512 bits).
+*/
b32x16 :: #simd[16]b32
+
+/*
+Vector of 8 `b64` lanes (512 bits).
+*/
b64x8 :: #simd[8]b64
+/*
+Add SIMD vectors.
+
+This procedure returns a vector, where each lane holds the sum of the
+corresponding `a` and `b` vectors' lanes.
+
+Inputs:
+- `a`: An integer or a float vector.
+- `b`: An integer or a float vector.
+
+Returns:
+- The sum of two vectors.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ res[i] = a[i] + b[i]
+ }
+ return res
+
+Example:
+ +-----+-----+-----+-----+
+ a: | 0 | 1 | 2 | 3 |
+ +-----+-----+-----+-----+
+ +-----+-----+-----+-----+
+ b: | 0 | 1 | 2 | -1 |
+ +-----+-----+-----+-----+
+ res:
+ +-----+-----+-----+-----+
+ | 0 | 2 | 4 | 2 |
+ +-----+-----+-----+-----+
+*/
add :: intrinsics.simd_add
+
+/*
+Subtract SIMD vectors.
+
+This procedure returns a vector, where each lane holds the difference between
+the corresponding lanes of the vectors `a` and `b`. The lanes from the vector
+`b` are subtracted from the corresponding lanes of the vector `a`.
+
+Inputs:
+- `a`: Integer or a float vector to subtract from.
+- `b`: Integer or a float vector.
+
+Returns:
+- The difference of two vectors.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ res[i] = a[i] - b[i]
+ }
+ return res
+
+Example:
+
+ +-----+-----+-----+-----+
+ a: | 2 | 2 | 2 | 2 |
+ +-----+-----+-----+-----+
+ +-----+-----+-----+-----+
+ b: | 0 | 1 | 2 | 3 |
+ +-----+-----+-----+-----+
+ res:
+ +-----+-----+-----+-----+
+ | 2 | 1 | 0 | -1 |
+ +-----+-----+-----+-----+
+*/
sub :: intrinsics.simd_sub
+
+/*
+Multiply (component-wise) SIMD vectors.
+
+This procedure returns a vector, where each lane holds the product of the
+corresponding lanes of the vectors `a` and `b`.
+
+Inputs:
+- `a`: Integer or a float vector.
+- `b`: Integer or a float vector.
+
+Returns:
+- The product of two vectors.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ res[i] = a[i] * b[i]
+ }
+ return res
+
+Example:
+
+ +-----+-----+-----+-----+
+ a: | 2 | 2 | 2 | 2 |
+ +-----+-----+-----+-----+
+ +-----+-----+-----+-----+
+ b: | 0 | -1 | 2 | -3 |
+ +-----+-----+-----+-----+
+ res:
+ +-----+-----+-----+-----+
+ | 0 | -2 | 4 | -6 |
+ +-----+-----+-----+-----+
+*/
mul :: intrinsics.simd_mul
-div :: intrinsics.simd_div // floats only
-// Keeps Odin's Behaviour
-// (x << y) if y <= mask else 0
+/*
+Divide SIMD vectors.
+
+This procedure returns a vector, where each lane holds the quotient (result
+of division) between the corresponding lanes of the vectors `a` and `b`. Each
+lane of the vector `a` is divided by the corresponding lane of the vector `b`.
+
+This operation performs a standard floating-point division for each lane.
+
+Inputs:
+- `a`: Float vector.
+- `b`: Float vector to divide by.
+
+Returns:
+- The quotient of two vectors.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ res[i] = a[i] / b[i]
+ }
+ return res
+
+Example:
+
+ +-----+-----+-----+-----+
+ a: | 2 | 2 | 2 | 2 |
+ +-----+-----+-----+-----+
+ +-----+-----+-----+-----+
+ b: | 0 | -1 | 2 | -3 |
+ +-----+-----+-----+-----+
+ res:
+ +-----+-----+-----+-------+
+ | +∞ | -2 | 1 | -0.66 |
+ +-----+-----+-----+-------+
+*/
+div :: intrinsics.simd_div
+
+/*
+Shift left lanes of a vector.
+
+This procedure returns a vector, such that each lane holds the result of a
+shift-left (aka shift-up) operation, of lane from the vector `a` by the shift
+amount from the corresponding lane of the vector `b`.
+
+If the shift amount is greater than or equal to the bit-width of a lane, the
+result is `0` in the corresponding positions of the result.
+
+Inputs:
+- `a`: An integer vector of values to shift.
+- `b`: An unsigned integer vector of the shift amounts.
+
+Returns:
+- Shifted vector.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ if b[i] < 8*size_of(a[i]) {
+ res[i] = a[i] << b[i]
+ } else {
+ res[i] = 0
+ }
+ }
+ return res
+
+Example:
+
+ +-------+-------+-------+-------+
+ a: | 0x11 | 0x55 | 0x03 | 0xff |
+ +-------+-------+-------+-------+
+ +-------+-------+-------+-------+
+ b: | 2 | 1 | 33 | 1 |
+ +-------+-------+-------+-------+
+ res:
+ +-------+-------+-------+--------+
+ | 0x44 | 0xaa | 0 | 0xfe |
+ +-------+-------+-------+--------+
+*/
shl :: intrinsics.simd_shl
+
+/*
+Shift right lanes of a vector.
+
+This procedure returns a vector, such that each lane holds the result of a
+shift-right (aka shift-down) operation, of lane from the vector `a` by the shift
+amount from the corresponding lane of the vector `b`.
+
+If the shift amount is greater than or equal to the bit-width of a lane, the
+result is `0` in the corresponding positions of the result.
+
+If the first vector is a vector of signed integers, the arithmetic shift
+operation is performed. Otherwise, if the first vector is a vector of unsigned
+integers, a logical shift is performed.
+
+Inputs:
+- `a`: An integer vector of values to shift.
+- `b`: An unsigned integer vector of the shift amounts.
+
+Returns:
+- Shifted vector.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ if b[i] < 8*size_of(a[i]) {
+ res[i] = a[i] >> b[i]
+ } else {
+ res[i] = 0
+ }
+ }
+ return res
+
+Example:
+
+This example assumes that the `a` vector is of a signed type.
+
+ +-------+-------+-------+-------+
+ a: | 0x11 | 0x55 | 0x03 | 0xff |
+ +-------+-------+-------+-------+
+ +-------+-------+-------+-------+
+ b: | 2 | 1 | 33 | 1 |
+ +-------+-------+-------+-------+
+ res:
+ +-------+-------+-------+--------+
+ | 0x04 | 0x2a | 0 | 0xff |
+ +-------+-------+-------+--------+
+*/
shr :: intrinsics.simd_shr
-// Similar to C's Behaviour
-// x << (y & mask)
+/*
+Shift left lanes of a vector (masked).
+
+This procedure returns a vector, such that each lane holds the result of a
+shift-left (aka shift-up) operation, of lane from the vector `a` by the shift
+amount from the corresponding lane of the vector `b`.
+
+The shift amount is masked, i.e. reduced modulo the bit-width of the lane.
+
+Inputs:
+- `a`: An integer vector of values to shift.
+- `b`: An unsigned integer vector of the shift amounts.
+
+Returns:
+- Shifted vector.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ mask := 8*size_of(a[i]) - 1
+ res[i] = a[i] << (b[i] & mask)
+ }
+ return res
+
+Example:
+
+ +-------+-------+-------+-------+
+ a: | 0x11 | 0x55 | 0x03 | 0xff |
+ +-------+-------+-------+-------+
+ +-------+-------+-------+-------+
+ b: | 2 | 1 | 33 | 1 |
+ +-------+-------+-------+-------+
+ res:
+ +-------+-------+-------+--------+
+ | 0x44 | 0xaa | 0x06 | 0xfe |
+ +-------+-------+-------+--------+
+*/
shl_masked :: intrinsics.simd_shl_masked
+
+/*
+Shift right lanes of a vector (masked).
+
+This procedure returns a vector, such that each lane holds the result of a
+shift-right (aka shift-down) operation, of lane from the vector `a` by the shift
+amount from the corresponding lane of the vector `b`.
+
+The shift amount is masked, i.e. reduced modulo the bit-width of the lane.
+
+If the first vector is a vector of signed integers, the arithmetic shift
+operation is performed. Otherwise, if the first vector is a vector of unsigned
+integers, a logical shift is performed.
+
+Inputs:
+- `a`: An integer vector of values to shift.
+- `b`: An unsigned integer vector of the shift amounts.
+
+Returns:
+- Shifted vector.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ mask := 8*size_of(a[i]) - 1
+ res[i] = a[i] >> (b[i] & mask)
+ }
+ return res
+
+Example:
+
+This example assumes that the `a` vector is of a signed type.
+
+ +-------+-------+-------+-------+
+ a: | 0x11 | 0x55 | 0x03 | 0xff |
+ +-------+-------+-------+-------+
+ +-------+-------+-------+-------+
+ b: | 2 | 1 | 33 | 1 |
+ +-------+-------+-------+-------+
+ res:
+ +-------+-------+-------+--------+
+ | 0x04 | 0x2a | 0x01 | 0xff |
+ +-------+-------+-------+--------+
+*/
shr_masked :: intrinsics.simd_shr_masked
-// Saturation Arithmetic
+/*
+Saturated addition of SIMD vectors.
+
+The *saturated sum* is a sum, that upon overflow or underflow, instead of
+wrapping around, keeps the value clamped between the minimum and the maximum
+values of the lane type.
+
+This procedure returns a vector where each lane is the saturated sum of the
+corresponding lanes of vectors `a` and `b`.
+
+Inputs:
+- `a`: An integer vector.
+- `b`: An integer vector.
+
+Returns:
+- Saturated sum of the two vectors.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ switch {
+ case b[i] >= max(type_of(a[i])) - a[i]: // (overflow of a[i])
+ res[i] = max(type_of(a[i]))
+ case b[i] <= min(type_of(a[i])) - a[i]: // (underflow of a[i])
+ res[i] = min(type_of(a[i]))
+ case:
+ res[i] = a[i] + b[i]
+ }
+ }
+ return res
+
+Example:
+
+Assuming unsigned bytes as the type of the element in a lane:
+
+ +-----+-----+-----+-----+
+ a: | 0 | 255 | 2 | 3 |
+ +-----+-----+-----+-----+
+ +-----+-----+-----+-----+
+ b: | 1 | 3 | 2 | 1 |
+ +-----+-----+-----+-----+
+ res:
+ +-----+-----+-----+-----+
+ | 1 | 255 | 4 | 4 |
+ +-----+-----+-----+-----+
+*/
saturating_add :: intrinsics.simd_saturating_add
+
+/*
+Saturated subtraction of SIMD vectors.
+
+The *saturated difference* is a difference, that upon overflow or underflow,
+instead of wrapping around, keeps the value clamped between the minimum and the
+maximum values of the lane type.
+
+This procedure returns a vector where each lane is the saturated difference of
+the corresponding lanes of vectors `a` and `b`.
+
+Inputs:
+- `a`: An integer vector to subtract from.
+- `b`: An integer vector.
+
+Returns:
+- Saturated difference of the two vectors.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ switch {
+ case b[i] <= a[i] - max(type_of(a[i])): // (overflow of a[i])
+ res[i] = max(type_of(a[i]))
+ case b[i] >= a[i] - min(type_of(a[i])): // (underflow of a[i])
+ res[i] = min(type_of(a[i]))
+ case:
+ res[i] = a[i] - b[i]
+ }
+ }
+ return res
+
+Example:
+
+Assuming unsigned bytes as the type of the element in a lane:
+
+ +-----+-----+-----+-----+
+ a: | 0 | 255 | 2 | 3 |
+ +-----+-----+-----+-----+
+ +-----+-----+-----+-----+
+ b: | 3 | 3 | 2 | 1 |
+ +-----+-----+-----+-----+
+ res:
+ +-----+-----+-----+-----+
+ | 0 | 252 | 0 | 2 |
+ +-----+-----+-----+-----+
+*/
saturating_sub :: intrinsics.simd_saturating_sub
+/*
+Bitwise AND of the lanes of SIMD vectors.
+
+This procedure returns a vector, such that each lane has the result of a bitwise
+AND operation between the corresponding lanes of the vectors `a` and `b`.
+
+Inputs:
+- `a`: An integer or boolean vector.
+- `b`: An integer or boolean vector.
+
+Returns:
+- Result of the bitwise AND operation between two vectors.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ res[i] = a[i] & b[i]
+ }
+ return res
+
+Example:
+
+ +------+------+------+------+
+ a: | 0x11 | 0x33 | 0x55 | 0xaa |
+ +------+------+------+------+
+ +------+------+------+------+
+ b: | 0xff | 0xf0 | 0x0f | 0x00 |
+ +------+------+------+------+
+ res:
+ +------+------+------+------+
+ | 0x11 | 0x30 | 0x05 | 0x00 |
+ +------+------+------+------+
+*/
bit_and :: intrinsics.simd_bit_and
+
+/*
+Bitwise OR of the lanes of SIMD vectors.
+
+This procedure returns a vector, such that each lane has the result of a bitwise
+OR operation between the corresponding lanes of the vectors `a` and `b`.
+
+Inputs:
+- `a`: An integer or boolean vector.
+- `b`: An integer or boolean vector.
+
+Returns:
+- Result of the bitwise OR operation between two vectors.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ res[i] = a[i] | b[i]
+ }
+ return res
+
+Example:
+
+ +------+------+------+------+
+ a: | 0x11 | 0x33 | 0x55 | 0xaa |
+ +------+------+------+------+
+ +------+------+------+------+
+ b: | 0xff | 0xf0 | 0x0f | 0x00 |
+ +------+------+------+------+
+ res:
+ +------+------+------+------+
+ | 0xff | 0xf3 | 0x5f | 0xaa |
+ +------+------+------+------+
+*/
bit_or :: intrinsics.simd_bit_or
+
+/*
+Bitwise XOR of the lanes of SIMD vectors.
+
+This procedure returns a vector, such that each lane has the result of a bitwise
+XOR operation between the corresponding lanes of the vectors `a` and `b`.
+
+Inputs:
+- `a`: An integer or boolean vector.
+- `b`: An integer or boolean vector.
+
+Returns:
+- Result of the bitwise XOR operation between two vectors.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ res[i] = a[i] ~ b[i]
+ }
+ return res
+
+Example:
+
+ +------+------+------+------+
+ a: | 0x11 | 0x33 | 0x55 | 0xaa |
+ +------+------+------+------+
+ +------+------+------+------+
+ b: | 0xff | 0xf0 | 0x0f | 0x00 |
+ +------+------+------+------+
+ res:
+ +------+------+------+------+
+ | 0xee | 0xc3 | 0x5a | 0xaa |
+ +------+------+------+------+
+*/
bit_xor :: intrinsics.simd_bit_xor
+
+/*
+Bitwise AND NOT of the lanes of SIMD vectors.
+
+This procedure returns a vector, such that each lane has the result of a bitwise
+AND NOT operation between the corresponding lanes of the vectors `a` and `b`.
+
+Inputs:
+- `a`: An integer or boolean vector.
+- `b`: An integer or boolean vector.
+
+Returns:
+- Result of the bitwise AND NOT operation between two vectors.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ res[i] = a[i] &~ b[i]
+ }
+ return res
+
+Example:
+
+ +------+------+------+------+
+ a: | 0x11 | 0x33 | 0x55 | 0xaa |
+ +------+------+------+------+
+ +------+------+------+------+
+ b: | 0xff | 0xf0 | 0x0f | 0x00 |
+ +------+------+------+------+
+ res:
+ +------+------+------+------+
+ | 0x00 | 0x03 | 0x50 | 0xaa |
+ +------+------+------+------+
+*/
bit_and_not :: intrinsics.simd_bit_and_not
+/*
+Negation of a SIMD vector.
+
+This procedure returns a vector where each lane is the negation of the
+corresponding lane in the vector `a`.
+
+Inputs:
+- `a`: An integer or a float vector to negate
+
+Returns:
+- Negated vector.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ res[i] = -a[i]
+ }
+ return res
+
+Example:
+
+ +------+------+------+------+
+ a: | 0 | 1 | 2 | 3 |
+ +------+------+------+------+
+ res:
+ +------+------+------+------+
+ | 0 | -1 | -2 | -3 |
+ +------+------+------+------+
+*/
neg :: intrinsics.simd_neg
+/*
+Absolute value of a SIMD vector.
+
+This procedure returns a vector where each lane has the absolute value of the
+corresponding lane in the vector `a`.
+
+Inputs:
+- `a`: An integer or a float vector to take the absolute value of.
+
+Returns:
+- Absolute value of a vector.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ switch {
+ case a[i] < 0: res[i] = -a[i]
+ case a[i] > 0: res[i] = a[i]
+ case a[i] == 0: res[i] = 0
+ }
+ }
+ return res
+
+Example:
+
+ +------+------+------+------+
+ a: | 0 | -1 | 2 | -3 |
+ +------+------+------+------+
+ res:
+ +------+------+------+------+
+ | 0 | 1 | 2 | 3 |
+ +------+------+------+------+
+*/
abs :: intrinsics.simd_abs
+/*
+Minimum of each lane of SIMD vectors.
+
+This procedure returns a vector, such that each lane holds the minimum value
+between the corresponding lanes in vectors `a` and `b`.
+
+Inputs:
+- `a`: An integer or a float vector.
+- `b`: An integer or a float vector.
+
+Returns:
+- Vector with minimum values of each lane.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ if a[i] < b[i] {
+ res[i] = a[i]
+ } else {
+ res[i] = b[i]
+ }
+ }
+ return res
+
+Example:
+
+ +-----+-----+-----+-----+
+ a: | 0 | 1 | 2 | 3 |
+ +-----+-----+-----+-----+
+ +-----+-----+-----+-----+
+ b: | 0 | 2 | 1 | -1 |
+ +-----+-----+-----+-----+
+ res:
+ +-----+-----+-----+-----+
+ | 0 | 1 | 1 | -1 |
+ +-----+-----+-----+-----+
+*/
min :: intrinsics.simd_min
+
+/*
+Maximum of each lane of SIMD vectors.
+
+This procedure returns a vector, such that each lane holds the maximum value
+between the corresponding lanes in vectors `a` and `b`.
+
+Inputs:
+- `a`: An integer or a float vector.
+- `b`: An integer or a float vector.
+
+Returns:
+- Vector with maximum values of each lane.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ if a[i] > b[i] {
+ res[i] = a[i]
+ } else {
+ res[i] = b[i]
+ }
+ }
+ return res
+
+Example:
+
+ +-----+-----+-----+-----+
+ a: | 0 | 1 | 2 | 3 |
+ +-----+-----+-----+-----+
+ +-----+-----+-----+-----+
+ b: | 0 | 2 | 1 | -1 |
+ +-----+-----+-----+-----+
+ res:
+ +-----+-----+-----+-----+
+ | 0 | 2 | 2 | 3 |
+ +-----+-----+-----+-----+
+*/
max :: intrinsics.simd_max
+
+/*
+Clamp lanes of SIMD vector.
+
+This procedure returns a vector, where each lane is the result of the
+clamping of the lane from the vector `v` between the values in the corresponding
+lanes of vectors `min` and `max`.
+
+Inputs:
+- `v`: An integer or a float vector with values to be clamped.
+- `min`: An integer or a float vector with minimum bounds.
+- `max`: An integer or a float vector with maximum bounds.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ val := v[i]
+ switch {
+ case val < min[i]: val = min[i]
+ case val > max[i]: val = max[i]
+ }
+ res[i] = val
+ }
+ return res
+
+Example:
+
+ +-------+-------+-------+-------+
+ v: | -1 | 0.3 | 1.2 | 1 |
+ +-------+-------+-------+-------+
+ +-------+-------+-------+-------+
+ min: | 0 | 0 | 0 | 0 |
+ +-------+-------+-------+-------+
+ +-------+-------+-------+-------+
+ max: | 1 | 1 | 1 | 1 |
+ +-------+-------+-------+-------+
+ res:
+ +-------+-------+-------+-------+
+ | 0 | 0.3 | 1 | 1 |
+ +-------+-------+-------+-------+
+*/
clamp :: intrinsics.simd_clamp
-// Return an unsigned integer of the same size as the input type
-// NOT A BOOLEAN
-// element-wise:
-// false => 0x00...00
-// true => 0xff...ff
+/*
+Check if lanes of SIMD vectors are equal.
+
+This procedure checks each pair of lanes from vectors `a` and `b` for whether
+they are equal, and if they are, the corresponding lane of the result vector
+will have a value with all bits set (`0xff..ff`). Otherwise the lane of the
+result vector will have the value `0`.
+
+Inputs:
+- `a`: An integer, a float or a boolean vector.
+- `b`: An integer, a float or a boolean vector.
+
+Returns:
+- A vector of unsigned integers of the same size as the input vector's lanes,
+containing comparison results for each lane.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ if a[i] == b[i] {
+ res[i] = unsigned(-1)
+ } else {
+ res[i] = 0
+ }
+ }
+ return res
+
+Example:
+
+ +-------+-------+-------+-------+
+ a: | 0 | 1 | 2 | 3 |
+ +-------+-------+-------+-------+
+ +-------+-------+-------+-------+
+ b: | 0 | 2 | 2 | 2 |
+ +-------+-------+-------+-------+
+ res:
+ +-------+-------+-------+-------+
+ | 0xff | 0x00 | 0xff | 0x00 |
+ +-------+-------+-------+-------+
+*/
lanes_eq :: intrinsics.simd_lanes_eq
+
+/*
+Check if lanes of SIMD vectors are not equal.
+
+This procedure checks each pair of lanes from vectors `a` and `b` for whether
+they are not equal, and if they are not, the corresponding lane of the result
+vector will have a value with all bits set (`0xff..ff`). Otherwise the lane of
+the result vector will have the value `0`.
+
+Inputs:
+- `a`: An integer, a float or a boolean vector.
+- `b`: An integer, a float or a boolean vector.
+
+Returns:
+- A vector of unsigned integers of the same size as the input vector's lanes,
+containing comparison results for each lane.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ if a[i] != b[i] {
+ res[i] = unsigned(-1)
+ } else {
+ res[i] = 0
+ }
+ }
+ return res
+
+Example:
+
+ +-------+-------+-------+-------+
+ a: | 0 | 1 | 2 | 3 |
+ +-------+-------+-------+-------+
+ +-------+-------+-------+-------+
+ b: | 0 | 2 | 2 | 2 |
+ +-------+-------+-------+-------+
+ res:
+ +-------+-------+-------+-------+
+ | 0x00 | 0xff | 0x00 | 0xff |
+ +-------+-------+-------+-------+
+*/
lanes_ne :: intrinsics.simd_lanes_ne
+
+/*
+Check if values of SIMD vector are less than the values of another SIMD vector.
+
+This procedure checks each pair of lanes from vectors `a` and `b` for whether
+the lane of `a` is less than the lane of `b`, and if so, the corresponding lane
+of the result vector will have a value with all bits set (`0xff..ff`). Otherwise
+the lane of the result vector will have the value `0`.
+
+Inputs:
+- `a`: An integer or a float vector.
+- `b`: An integer or a float vector.
+
+Returns:
+- A vector of unsigned integers of the same size as the input vector's lanes,
+containing comparison results for each lane.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ if a[i] < b[i] {
+ res[i] = unsigned(-1)
+ } else {
+ res[i] = 0
+ }
+ }
+ return res
+
+Example:
+
+ +-------+-------+-------+-------+
+ a: | 0 | 1 | 2 | 3 |
+ +-------+-------+-------+-------+
+ +-------+-------+-------+-------+
+ b: | 0 | 2 | 2 | 2 |
+ +-------+-------+-------+-------+
+ res:
+ +-------+-------+-------+-------+
+ | 0x00 | 0xff | 0x00 | 0x00 |
+ +-------+-------+-------+-------+
+*/
lanes_lt :: intrinsics.simd_lanes_lt
+
+/*
+Check if values of SIMD vector are less than or equal the values of another
+SIMD vector.
+
+This procedure checks each pair of lanes from vectors `a` and `b` for whether
+lane of `a` is less than or equal to the lane of `b`, and if so, the
+corresponding lane of the result vector will have a value with all bits set
+(`0xff..ff`). Otherwise the lane of the result vector will have the value `0`.
+
+Inputs:
+- `a`: An integer or a float vector.
+- `b`: An integer or a float vector.
+
+Returns:
+- A vector of unsigned integers of the same size as the input vector's lanes,
+containing comparison results for each lane.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ if a[i] <= b[i] {
+ res[i] = unsigned(-1)
+ } else {
+ res[i] = 0
+ }
+ }
+ return res
+
+Example:
+
+ +-------+-------+-------+-------+
+ a: | 0 | 1 | 2 | 3 |
+ +-------+-------+-------+-------+
+ +-------+-------+-------+-------+
+ b: | 0 | 2 | 2 | 2 |
+ +-------+-------+-------+-------+
+ res:
+ +-------+-------+-------+-------+
+ | 0xff | 0xff | 0xff | 0x00 |
+ +-------+-------+-------+-------+
+*/
lanes_le :: intrinsics.simd_lanes_le
+
+/*
+Check if values of SIMD vector are greater than the values of another SIMD
+vector.
+
+This procedure checks each pair of lanes from vectors `a` and `b` for whether
+lane of `a` is greater than the lane of `b`, and if so, the corresponding
+lane of the result vector will have a value with all bits set (`0xff..ff`).
+Otherwise the lane of the result vector will have the value `0`.
+
+Inputs:
+- `a`: An integer or a float vector.
+- `b`: An integer or a float vector.
+
+Returns:
+- A vector of unsigned integers of the same size as the input vector's lanes,
+containing comparison results for each lane.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ if a[i] > b[i] {
+ res[i] = unsigned(-1)
+ } else {
+ res[i] = 0
+ }
+ }
+ return res
+
+Example:
+
+ +-------+-------+-------+-------+
+ a: | 0 | 1 | 2 | 3 |
+ +-------+-------+-------+-------+
+ +-------+-------+-------+-------+
+ b: | 0 | 2 | 2 | 2 |
+ +-------+-------+-------+-------+
+ res:
+ +-------+-------+-------+-------+
+ | 0x00 | 0x00 | 0x00 | 0xff |
+ +-------+-------+-------+-------+
+*/
lanes_gt :: intrinsics.simd_lanes_gt
+
+/*
+Check if values of SIMD vector are greater than or equal the values of another
+SIMD vector.
+
+This procedure checks each pair of lanes from vectors `a` and `b` for whether
+lane of `a` is greater than or equal to the lane of `b`, and if so, the
+corresponding lane of the result vector will have a value with all bits set
+(`0xff..ff`). Otherwise the lane of the result vector will have the value `0`.
+
+Inputs:
+- `a`: An integer or a float vector.
+- `b`: An integer or a float vector.
+
+Returns:
+- A vector of unsigned integers of the same size as the input vector's lanes,
+containing comparison results for each lane.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ if a[i] >= b[i] {
+ res[i] = unsigned(-1)
+ } else {
+ res[i] = 0
+ }
+ }
+ return res
+
+Example:
+
+ +-------+-------+-------+-------+
+ a: | 0 | 1 | 2 | 3 |
+ +-------+-------+-------+-------+
+ +-------+-------+-------+-------+
+ b: | 0 | 2 | 2 | 2 |
+ +-------+-------+-------+-------+
+ res:
+ +-------+-------+-------+-------+
+ | 0xff | 0x00 | 0xff | 0xff |
+ +-------+-------+-------+-------+
+*/
lanes_ge :: intrinsics.simd_lanes_ge
+/*
+Perform a gather load into a SIMD vector.
+
+A *gather* operation is a memory load operation that loads values from a vector
+of addresses into a single value vector. This can be used to achieve the
+following results:
-// Gather and Scatter intrinsics
+- Accessing every N'th element of an array (strided access)
+- Access of elements according to some computed offsets (indexed access).
+- Access of elements in a different order (shuffling access).
+
+Gather can be used alongside other SIMD procedures, which compute the
+addresses and the mask for the `ptr` and `mask` parameters.
+
+Inputs:
+- `ptr`: A vector of memory locations. Each pointer points to a single value,
+ of a SIMD vector's lane type, that will be loaded into the vector. Pointer
+ in this vector can be `nil` or any other invalid value, if the corresponding
+ value in the `mask` parameter is zero.
+- `val`: A vector of values that will be used at corresponding positions
+ of the result vector, if the corresponding memory location has been
+ masked out.
+- `mask`: A vector of booleans or unsigned integers that determines which memory
+ locations to read from. If the value at an index has the value true
+ (lowest bit set), the value at that index will be loaded into the result
+ vector from the corresponding memory location in the `ptr` vector. Otherwise
+ the value will be loaded from the `val` vector.
+
+Returns:
+
+This procedure returns a vector in which every lane whose `mask` value is set
+is loaded from the pointer vector `ptr`, and every other lane is taken
+from the value vector `val`.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ if mask[i]&1 == 1 {
+ res[i] = ptr[i]^
+ } else {
+ res[i] = val[i]
+ }
+ }
+ return res
+
+Example:
+
+The example below loads 2 values from 2 arrays of floats, `v1` and
+`v2`. From each of these arrays we're loading the second value, into the first
+and the third position of the result vector.
+
+Therefore the `ptrs` argument is initialized such that the first and the third
+value are the addresses of the values that we want to load into the result
+vector, and we'll fill in `nil` for the rest of them. To prevent CPU from
+dereferencing those `nil` addresses we provide the mask that only allows us
+to load valid positions of the `ptrs` array, and the array of defaults which
+will have `127` in each position as the default value.
+
+ v1 := [4] f32 {1, 2, 3, 4};
+ v2 := [4] f32 {9, 10,11,12};
+ ptrs := #simd [4]rawptr { &v1[1], nil, &v2[1], nil }
+ mask := #simd [4]bool { true, false, true, false }
+ defaults := #simd [4]f32 { 0x7f, 0x7f, 0x7f, 0x7f }
+ res := simd.gather(ptrs, defaults, mask)
+ fmt.println(res)
+
+The code would print `<2, 127, 10, 127>`. The first and the third lanes were
+loaded through the pointers in `ptrs`, and the other 2 lanes came from `defaults`.
+Graphic below shows how the values of the result are decided based on the mask:
+
+ +-------------------------------+
+ mask: | 1 | 0 | 1 | 0 |
+ +-------------------------------+
+ | | | `----------------------------.
+ | | | |
+ | `---- | ------------------------. |
+ v v v v
+ +-------------------------------+ +-------------------+
+ ptrs: | &m0 | nil | &m2 | nil | vals: | d0 | d1 | d2 | d3 |
+ +-------------------------------+ +-------------------+
+ | | | |
+ | .--- | -------------------------' |
+ | | | ,-------------------------'
+ v v v v
+ +-------------------------------+
+ result: | m0 | d1 | m2 | d3 |
+ +-------------------------------+
+*/
gather :: intrinsics.simd_gather
+
+/*
+Perform a scatter store from a SIMD vector.
+
+A *scatter* operation is a memory store operation that stores values from a
+vector into multiple memory locations. This operation is effectively the
+opposite from the *gather* operation.
+
+Inputs:
+- `ptr`: A vector of memory locations. Each masked location will be written
+ to with a value from the `val` vector. Pointer in this vector can be `nil`
+ or any other invalid value, if the corresponding value in the `mask`
+ parameter is zero.
+- `val`: A vector of values to write to the memory locations.
+- `mask`: A vector of booleans or unsigned integers, that decides which lanes
+ get written to memory. If the value of the mask is `true` (the lowest bit
+ set), the corresponding lane is written into memory. Otherwise it's not
+ written into memory.
+
+**Operation**:
+
+ for i in 0 ..< len(ptr) {
+ if mask[i]&1 == 1 {
+ ptr[i]^ = val[i]
+ }
+ }
+
+Example:
+
+Example below writes value `127` to the second element of two different
+arrays. The addresses of store destinations are written to the first and the
+third lanes of the `ptrs` vector, and the `mask` is set accordingly.
+
+ v1 := [4] f32 {1, 2, 3, 4};
+ v2 := [4] f32 {5, 6, 7, 8};
+ ptrs := #simd [4]rawptr { &v1[1], nil, &v2[1], nil }
+ mask := #simd [4]bool { true, false, true, false }
+ vals := #simd [4]f32 { 0x7f, 0x7f, 0x7f, 0x7f }
+ simd.scatter(ptrs, vals, mask)
+ fmt.println(v1)
+ fmt.println(v2)
+
+This code prints the values of the two vectors, after modification by `scatter`:
+
+ [1, 127, 3, 4]
+ [5, 127, 7, 8]
+
+Graphic below shows how the data gets written into memory.
+
+
+ +-------------------+
+ mask: | 1 | 0 | 1 | 0 |
+ +-------------------+
+ | | | |
+ v X v X
+ +-------------------+
+ vals: | d0 | d1 | d2 | d3 |
+ +-------------------+
+ | \
+ v v
+ +-----------------------+
+ ptrs: | &m0 | nil | &m2 | nil |
+ +-----------------------+
+*/
scatter :: intrinsics.simd_scatter
+
+/*
+Perform a masked load into the SIMD vector.
+
+This procedure performs a masked load from memory, into the vector. The `ptr`
+argument specifies the base address from which the values of SIMD vector
+will be loaded. The mask selects the source for the result vector's lanes. If
+the mask for the corresponding lane has the value `true` (lowest bit set), the
+result lane is loaded from memory. Otherwise the result lane is loaded from the
+corresponding lane of the `val` vector.
+
+Inputs:
+- `ptr`: The address of the vector values to load. Masked-off values are not
+ accessed.
+- `val`: The vector of values that will be loaded into the masked slots of the
+ result vector.
+- `mask`: The mask that selects where to load the values from.
+
+Returns:
+- The loaded vector. The lanes for which the mask was set, are loaded from
+memory, and the other lanes are loaded from the `val` vector.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ if mask[i]&1 == 1 {
+ res[i] = ptr[i]
+ } else {
+ res[i] = val[i]
+ }
+ }
+ return res
+
+Example:
+
+The following code loads two values from the `src` vector, the first and the
+third value (selected by the mask). The masked-off values are given the value
+of 127 (`0x7f`).
+
+ src := [4] f32 {1, 2, 3, 4};
+ mask := #simd [4]bool { true, false, true, false }
+ vals := #simd [4]f32 { 0x7f, 0x7f, 0x7f, 0x7f }
+ res := simd.masked_load(&src, vals, mask)
+ fmt.println(res)
+
+The above code prints the following:
+
+ <1, 127, 3, 127>
+
+Graphic below demonstrates the flow of lanes.
+
+ +-------------------------------+
+ mask: | 1 | 0 | 1 | 0 |
+ +-------------------------------+
+ | | | `----------------------------.
+ | | | |
+ | `---- | ------------------------. |
+ ptr v v v v
+ +---->+-------------------------------+ +-------------------+
+ | v1 | v2 | v3 | v4 | vals: | d0 | d1 | d2 | d3 |
+ +-------------------------------+ +-------------------+
+ | | | |
+ | .--- | -------------------------' |
+ | | | ,-------------------------'
+ v v v v
+ +-------------------------------+
+ result: | v1 | d1 | v3 | d3 |
+ +-------------------------------+
+*/
masked_load :: intrinsics.simd_masked_load
+
+/*
+Perform a masked store to memory.
+
+This procedure performs a masked store, from a vector `val`, into memory at
+address `ptr`, with the `mask` deciding which lanes are going to be stored,
+and which aren't. If the mask at a corresponding lane has the value `true`
+(lowest bit set), the lane is stored into memory. Otherwise the lane is not
+stored into memory.
+
+Inputs:
+- `ptr`: The base address of the store.
+- `val`: The vector to store.
+- `mask`: The mask, selecting which lanes of the vector to store into memory.
+
+**Operation**:
+
+ for i in 0 ..< len(val) {
+ if mask[i]&1 == 1 {
+ ptr[i] = val[i]
+ }
+ }
+
+Example:
+
+Example below stores the value 127 into the first and the third slot of the
+vector `v`.
+
+ v := [4] f32 {1, 2, 3, 4};
+ mask := #simd [4]bool { true, false, true, false }
+ vals := #simd [4]f32 { 0x7f, 0x7f, 0x7f, 0x7f }
+ simd.masked_store(&v, vals, mask)
+ fmt.println(v)
+
+After the masked store the printed result is:
+
+ [127, 2, 127, 4]
+
+Graphic below shows the flow of lanes:
+
+ +-------------------+
+ mask: | 1 | 0 | 1 | 0 |
+ +-------------------+
+ | | | |
+ v X v X
+ +-------------------+
+ vals: | v0 | v1 | v2 | v3 |
+ +-------------------+
+ | \
+ ptr v v
+ +--->+-----------------------+
+ | v0 | ... | v2 | ... |
+ +-----------------------+
+*/
masked_store :: intrinsics.simd_masked_store
+
+/*
+Load consecutive scalar values and expand into a SIMD vector.
+
+This procedure loads a number of consecutive scalar values from an address,
+specified by the `ptr` parameter, and stores them in a result vector, according
+to the mask. The number of values read from memory is the number of set bits
+in the mask. The lanes, for which the mask has the value `true` get the next
+consecutive value from the memory, otherwise if the mask is `false` for the
+lane, its value is filled from the corresponding lane of the `val` parameter.
+
+This procedure acts like `masked_load`, except the values from the memory are
+read consecutively, and not according to the lanes. The memory values are read
+and assigned to the result vector's masked lanes in order of increasing
+addresses.
+
+Inputs:
+- `ptr`: The pointer to the memory to read from.
+- `val`: The default values for masked-off entries.
+- `mask`: The mask, that determines which lanes get consecutive memory values.
+
+Returns:
+- The result vector, holding consecutive memory values in the lanes selected by
+ the mask, and the default values from `val` in all the other lanes.
+
+**Operation**:
+
+ mem_idx := 0
+ for i in 0 ..< len(mask) {
+ if mask[i]&1 == 1 {
+ res[i] = ptr[mem_idx]
+ mem_idx += 1
+ } else {
+ res[i] = val[i]
+ }
+ }
+ return res
+
+Example:
+
+The example below loads two values from memory of the vector `v`. Two values in
+the mask are set to `true`, meaning only two memory items will be loaded into
+the result vector. The mask is set to `true` in the first and the third
+position, which specifies that the first memory item will be read into the
+first lane of the result vector, and the second memory item will be read into
+the third lane of the result vector. All the other lanes of the result vector
+will be initialized to the default value `127`.
+
+ v := [2] f64 {1, 2};
+ mask := #simd [4]bool { true, false, true, false }
+ vals := #simd [4]f64 { 0x7f, 0x7f, 0x7f, 0x7f }
+ res := simd.masked_expand_load(&v, vals, mask)
+ fmt.println(res)
+
+The above code prints the following:
+
+ <1, 127, 2, 127>
+
+Graphical representation of the operation:
+
+
+ ptr --->+-----------+-----
+ | m0 | m1 | ...
+ +-----------+-----
+ | `--.
+ v v
+ +-------------------+ +-------------------+
+ mask: | 1 | 0 | 1 | 0 | vals: | v0 | v1 | v2 | v3 |
+ +-------------------+ +-------------------+
+ | | | |
+ | .-- | -----------------------' |
+ | | | ,----------------------------'
+ v v v v
+ +-------------------+
+ result: | m0 | v1 | m1 | v3 |
+ +-------------------+
+*/
masked_expand_load :: intrinsics.simd_masked_expand_load
+
+/*
+Store masked values to consecutive memory locations.
+
+This procedure stores values from masked lanes of a vector `val` consecutively
+into memory. This operation is the opposite of `masked_expand_load`. The number
+of items stored into memory is the number of set bits in the mask. If the value
+in a lane of a mask is `true`, that lane is stored into memory. Otherwise
+nothing is stored.
+
+Inputs:
+- `ptr`: The pointer to the memory of a store.
+- `val`: The vector to store into memory.
+- `mask`: The mask that selects which values to store into memory.
+
+**Operation**:
+
+ mem_idx := 0
+ for i in 0 ..< len(mask) {
+ if mask[i]&1 == 1 {
+ ptr[mem_idx] = val[i]
+ mem_idx += 1
+ }
+ }
+
+Example:
+
+The code below fills the vector `v` with two values from a 4-element SIMD
+vector, the first and the third value. The items in the mask are set to `true`
+in those lanes.
+
+ v := [2] f64 { };
+ mask := #simd [4]bool { true, false, true, false }
+ vals := #simd [4]f64 { 1, 2, 3, 4 }
+ simd.masked_compress_store(&v, vals, mask)
+ fmt.println(v)
+
+The code above prints the following:
+
+ [1, 3]
+
+Graphical representation of the operation:
+
+ +-------------------+
+ mask: | 1 | 0 | 1 | 0 |
+ +-------------------+
+ | |
+ v v
+ +-------------------+
+ vals: | v0 | v1 | v2 | v3 |
+ +-------------------+
+ | ,--'
+ ptr v v
+ +--->+-----------------
+ | v0 | v2 | ...
+ +-----------------
+*/
masked_compress_store :: intrinsics.simd_masked_compress_store
-// extract :: proc(a: #simd[N]T, idx: uint) -> T
+/*
+Extract scalar from a SIMD vector's lane.
+
+This procedure returns the scalar, from the lane at the specified index of the
+vector.
+
+Inputs:
+- `a`: The vector to extract from.
+- `idx`: Lane index.
+
+Returns:
+- The value of the lane at the specified index.
+
+**Operation**:
+
+ return a[idx]
+*/
extract :: intrinsics.simd_extract
-// replace :: proc(a: #simd[N]T, idx: uint, elem: T) -> #simd[N]T
+
+/*
+Replace the value in a SIMD vector's lane.
+
+This procedure places a scalar value at the lane at the specified index of
+the vector.
+
+Inputs:
+- `a`: The vector to replace a lane.
+- `idx`: Lane index.
+- `elem`: Scalar to place.
+
+Returns:
+- Vector with the specified lane replaced.
+
+**Operation**:
+
+ a[idx] = elem
+*/
replace :: intrinsics.simd_replace
+/*
+Reduce SIMD vector to a scalar by adding all the lanes.
+
+This procedure returns a scalar, that is the ordered sum of all SIMD lanes. The
+ordered sum may be important for accounting for precision errors in
+floating-point computation, as floating-point addition is not associative,
+that is `(a+b)+c` may not equal to `a+(b+c)`.
+
+Inputs:
+- `a`: Vector to reduce
+
+Result:
+- Sum of all lanes, as a scalar.
+
+**Operation**:
+
+ res := 0
+ for i in 0 ..< len(a) {
+ res += a[i]
+ }
+*/
reduce_add_ordered :: intrinsics.simd_reduce_add_ordered
+
+/*
+Reduce SIMD vector to a scalar by multiplying all the lanes.
+
+This procedure returns a scalar, that is the ordered product of all SIMD lanes.
+The ordered product may be important for accounting for precision errors in
+floating-point computation, as floating-point multiplication is not associative,
+that is `(a*b)*c` may not equal to `a*(b*c)`.
+
+Inputs:
+- `a`: Vector to reduce
+
+Result:
+- Product of all lanes, as a scalar.
+
+**Operation**:
+
+ res := 1
+ for i in 0 ..< len(a) {
+ res *= a[i]
+ }
+*/
reduce_mul_ordered :: intrinsics.simd_reduce_mul_ordered
-reduce_min :: intrinsics.simd_reduce_min
-reduce_max :: intrinsics.simd_reduce_max
-reduce_and :: intrinsics.simd_reduce_and
-reduce_or :: intrinsics.simd_reduce_or
-reduce_xor :: intrinsics.simd_reduce_xor
-reduce_any :: intrinsics.simd_reduce_any
-reduce_all :: intrinsics.simd_reduce_all
+/*
+Reduce SIMD vector to a scalar by finding the minimum value between all of the lanes.
+
+This procedure returns a scalar, that is the minimum value of all the lanes
+in a vector.
+
+Inputs:
+- `a`: Vector to reduce
+
+Result:
+- Minimum value of all lanes, as a scalar.
+
+**Operation**:
+
+ res := a[0]
+ for i in 0 ..< len(a) {
+ res = min(res, a[i])
+ }
+*/
+reduce_min :: intrinsics.simd_reduce_min
-// swizzle :: proc(a: #simd[N]T, indices: ..int) -> #simd[len(indices)]T
+/*
+Reduce SIMD vector to a scalar by finding the maximum value between all of the lanes.
+
+This procedure returns a scalar, that is the maximum value of all the lanes
+in a vector.
+
+Inputs:
+- `a`: Vector to reduce
+
+Result:
+- Maximum value of all lanes, as a scalar.
+
+**Operation**:
+
+ res := a[0]
+ for i in 0 ..< len(a) {
+ res = max(res, a[i])
+ }
+*/
+reduce_max :: intrinsics.simd_reduce_max
+
+/*
+Reduce SIMD vector to a scalar by performing bitwise AND of all of the lanes.
+
+This procedure returns a scalar, that is the result of the bitwise AND operation
+between all of the lanes in a vector.
+
+Inputs:
+- `a`: Vector to reduce
+
+Result:
+- Bitwise AND of all lanes, as a scalar.
+
+**Operation**:
+
+ res := a[0]
+ for i in 0 ..< len(a) {
+ res &= a[i]
+ }
+*/
+reduce_and :: intrinsics.simd_reduce_and
+
+/*
+Reduce SIMD vector to a scalar by performing bitwise OR of all of the lanes.
+
+This procedure returns a scalar, that is the result of the bitwise OR operation
+between all of the lanes in a vector.
+
+Inputs:
+- `a`: Vector to reduce
+
+Result:
+- Bitwise OR of all lanes, as a scalar.
+
+**Operation**:
+
+ res := 0
+ for i in 0 ..< len(a) {
+ res |= a[i]
+ }
+*/
+reduce_or :: intrinsics.simd_reduce_or
+
+/*
+Reduce SIMD vector to a scalar by performing bitwise XOR of all of the lanes.
+
+This procedure returns a scalar, that is the result of the bitwise XOR operation
+between all of the lanes in a vector.
+
+Inputs:
+- `a`: Vector to reduce
+
+Result:
+- Bitwise XOR of all lanes, as a scalar.
+
+**Operation**:
+
+ res := 0
+ for i in 0 ..< len(a) {
+ res ~= a[i]
+ }
+*/
+reduce_xor :: intrinsics.simd_reduce_xor
+
+/*
+Reduce SIMD vector to a scalar by checking whether any of the lanes is set.
+
+This procedure returns a scalar, that is the result of the bitwise OR operation
+between all of the lanes in a vector.
+
+Inputs:
+- `a`: Vector to reduce
+
+Result:
+- Bitwise OR of all lanes, as a scalar.
+
+**Operation**:
+
+ res := 0
+ for i in 0 ..< len(a) {
+ res |= a[i]
+ }
+*/
+reduce_any :: intrinsics.simd_reduce_any
+
+/*
+Reduce SIMD vector to a scalar by checking whether all of the lanes are set.
+
+This procedure returns a scalar, that is the result of the bitwise AND operation
+between all of the lanes in a vector.
+
+Inputs:
+- `a`: Vector to reduce
+
+Result:
+- Bitwise AND of all lanes, as a scalar.
+
+**Operation**:
+
+ res := a[0]
+ for i in 0 ..< len(a) {
+ res &= a[i]
+ }
+*/
+reduce_all :: intrinsics.simd_reduce_all
+
+/*
+Reorder the lanes of a SIMD vector.
+
+This procedure reorders the lanes of a vector, according to the provided
+indices. The number of indices corresponds to the number of lanes in the
+result vector and must be the same as the number of lanes of the input vector.
+Each index specifies the lane of the scalar from the input vector, which
+will be written at the corresponding position of the result vector.
+
+Inputs:
+- `x`: The input vector.
+- `indices`: The indices of lanes to write to the result vector.
+
+Result:
+- Swizzled input vector.
+
+**Operation**:
+
+ res = {}
+ for i in 0 ..< len(indices) {
+ res[i] = x[indices[i]]
+ }
+ return res
+
+**Example**
+
+The example below shows how the indices are used to determine which lanes of the
+input vector get written into the result vector.
+
+ x := #simd [4]f32 { 1.5, 2.5, 3.5, 4.5 }
+ res := simd.swizzle(x, 0, 3, 1, 1)
+ fmt.println(res)
+
+The above code will print the following to the console:
+
+ [ 1.5, 4.5, 2.5, 2.5 ]
+
+The graphical representation of the operation is as follows. The `idx` vector in
+the picture represents the `indices` parameter:
+
+ 0 1 2 3
+ +-----+-----+-----+-----+
+ x: | 1.5 | 2.5 | 3.5 | 4.5 |
+ +-----+-----+-----+-----+
+ ^ ^ ^
+ | | |
+ | '----. |
+ | .---- | ---'
+ | | |
+ | | +------.
+ +-----+-----+-----+-----+
+ idx: | 0 | 3 | 1 | 1 |
+ +-----+-----+-----+-----+
+ ^ ^ ^ ^
+ | | | |
+ +-----+-----+-----+-----+
+ res: | 1.5 | 4.5 | 2.5 | 2.5 |
+ +-----+-----+-----+-----+
+*/
swizzle :: builtin.swizzle
-// shuffle :: proc(a, b: #simd[N]T, indices: #simd[max 2*N]u32) -> #simd[len(indices)]T
+
+/*
+Reorder the lanes of two SIMD vectors.
+
+This procedure returns a vector, containing the scalars from the lanes of two
+vectors, according to the provided indices vector. Each index in the indices
+vector specifies the lane of the scalar from one of the two input vectors,
+which will be written at the corresponding position of the result vector. If
+the index is within bounds `0 ..< len(a)`, it corresponds to the indices of the
+first input vector. Otherwise the index corresponds to the indices of the second
+input vector.
+
+Inputs:
+- `a`: The first input vector.
+- `b`: The second input vector.
+- `indices`: The indices.
+
+Result:
+- Input vectors, shuffled according to the indices.
+
+**Operation**:
+
+ res = {}
+ for i in 0 ..< len(indices) {
+ idx = indices[i];
+ if idx < len(a) {
+ res[i] = a[idx]
+ } else {
+ res[i] = b[idx - len(a)]
+ }
+ }
+ return res
+
+**Example**
+
+The example below shows how the indices are used to determine lanes of the
+input vector that are shuffled into the result vector.
+
+ a := #simd [4]f32 { 1, 2, 3, 4 }
+ b := #simd [4]f32 { 5, 6, 7, 8 }
+ res := simd.shuffle(a, b, 0, 4, 2, 5)
+ fmt.println(res)
+
+The above code will print the following to the console:
+
+ [ 1, 5, 3, 6 ]
+
+The graphical representation of the operation is as follows. The `idx` vector in
+the picture represents the `indices` parameter:
+
+ 0 1 2 3 4 5 6 7
+ +-----+-----+-----+-----+ +-----+-----+-----+-----+
+ a: | 1 | 2 | 3 | 4 | b: | 5 | 6 | 7 | 8 |
+ +-----+-----+-----+-----+ +-----+-----+-----+-----+
+ ^ ^ ^ ^
+ | | | |
+ | | | |
+ | .--- | ----------------' |
+ | | | .-----------------'
+ +-----+-----+-----+-----+
+ idx: | 0 | 4 | 2 | 5 |
+ +-----+-----+-----+-----+
+ ^ ^ ^ ^
+ | | | |
+ +-----+-----+-----+-----+
+ res: | 1 | 5 | 3 | 6 |
+ +-----+-----+-----+-----+
+*/
shuffle :: intrinsics.simd_shuffle
-// select :: proc(cond: #simd[N]boolean_or_integer, true, false: #simd[N]T) -> #simd[N]T
-select :: intrinsics.simd_select
+/*
+Select values from one of the two vectors.
+
+This procedure returns a vector, which has, on each lane a value from one of the
+corresponding lanes in one of the two input vectors based on the `cond`
+parameter. On each lane, if the value of the `cond` parameter is `true` (or
+non-zero), the result lane will have a value from the `true` input vector,
+otherwise the result lane will have a value from the `false` input vector.
+
+Inputs:
+- `cond`: The condition vector.
+- `true`: The first input vector.
+- `false`: The second input vector.
+
+Result:
+- The result of selecting values from the two input vectors.
+
+**Operation**:
+ res = {}
+ for i in 0 ..< len(cond) {
+ if cond[i] {
+ res[i] = true[i]
+ } else {
+ res[i] = false[i]
+ }
+ }
+ return res
+
+**Example**:
+
+The following example selects values from the two input vectors, `a` and `b`
+into a single vector.
+
+ a := #simd [4] f64 { 1,2,3,4 };
+ b := #simd [4] f64 { 5,6,7,8 };
+ cond := #simd[4] int { 1, 0, 1, 0 }
+ fmt.println(simd.select(cond,a,b))
+
+Output:
+
+ [ 1, 6, 3, 8 ]
+
+Graphically, the operation looks as follows. The `t` and `f` represent the
+`true` and `false` vectors respectively:
+
+ 0 1 2 3 0 1 2 3
+ +-----+-----+-----+-----+ +-----+-----+-----+-----+
+ t: | 1 | 2 | 3 | 4 | f: | 5 | 6 | 7 | 8 |
+ +-----+-----+-----+-----+ +-----+-----+-----+-----+
+ ^ ^ ^ ^
+ | | | |
+ | | | |
+ | .--- | ----------------------' |
+ | | | .-----------------------------'
+ +-----+-----+-----+-----+
+ cond: | 1 | 0 | 1 | 0 |
+ +-----+-----+-----+-----+
+ ^ ^ ^ ^
+ | | | |
+ +-----+-----+-----+-----+
+ res: | 1 | 6 | 3 | 8 |
+ +-----+-----+-----+-----+
+*/
+select :: intrinsics.simd_select
+/*
+Compute the square root of each lane in a SIMD vector.
+*/
sqrt :: intrinsics.sqrt
+
+/*
+Ceil each lane in a SIMD vector.
+*/
ceil :: intrinsics.simd_ceil
+
+/*
+Floor each lane in a SIMD vector.
+*/
floor :: intrinsics.simd_floor
+
+/*
+Truncate each lane in a SIMD vector.
+*/
trunc :: intrinsics.simd_trunc
+
+/*
+Compute the nearest integer of each lane in a SIMD vector.
+*/
nearest :: intrinsics.simd_nearest
+/*
+Transmute a SIMD vector into an integer vector.
+*/
to_bits :: intrinsics.simd_to_bits
+/*
+Reverse the lanes of a SIMD vector.
+
+This procedure reverses the lanes of a SIMD vector, putting last lane in the
+first spot, etc. This procedure is equivalent to the following call (for
+4-element vectors):
+
+ swizzle(a, 3, 2, 1, 0)
+*/
lanes_reverse :: intrinsics.simd_lanes_reverse
-lanes_rotate_left :: intrinsics.simd_lanes_rotate_left
+/*
+Rotate the lanes of a SIMD vector left.
+
+This procedure rotates the lanes of a SIMD vector, putting the first lane in the
+last spot, second lane in the first spot, third lane in the second spot, etc.
+For 4-element vectors, this procedure is equivalent to the following:
+
+ swizzle(a, 1, 2, 3, 0)
+*/
+lanes_rotate_left :: intrinsics.simd_lanes_rotate_left
+
+/*
+Rotate the lanes of a SIMD vector right.
+
+This procedure rotates the lanes of a SIMD vector, putting the first lane in the
+second spot, second lane in the third spot, etc. For 4-element vectors, this
+procedure is equivalent to the following:
+
+ swizzle(a, 3, 0, 1, 2)
+*/
lanes_rotate_right :: intrinsics.simd_lanes_rotate_right
-count_ones :: intrinsics.count_ones
-count_zeros :: intrinsics.count_zeros
+/*
+Count the number of set bits in each lane of a SIMD vector.
+*/
+count_ones :: intrinsics.count_ones
+
+/*
+Count the number of unset bits in each lane of a SIMD vector.
+*/
+count_zeros :: intrinsics.count_zeros
+
+/*
+Count the number of trailing unset bits in each lane of a SIMD vector.
+*/
count_trailing_zeros :: intrinsics.count_trailing_zeros
-count_leading_zeros :: intrinsics.count_leading_zeros
-reverse_bits :: intrinsics.reverse_bits
+/*
+Count the number of leading unset bits in each lane of a SIMD vector.
+*/
+count_leading_zeros :: intrinsics.count_leading_zeros
+
+/*
+Reverse the bit pattern of a SIMD vector.
+*/
+reverse_bits :: intrinsics.reverse_bits
+
+/*
+Perform a FMA (Fused multiply-add) operation on each lane of SIMD vectors.
+
+A fused multiply-add is a ternary operation that for three operands, `a`, `b`
+and `c` performs the operation `a*b+c`. This operation is a hardware feature
+that allows to minimize floating-point error and allow for faster computation.
+
+This procedure performs a FMA operation on each lane of the SIMD vectors.
+
+Inputs:
+- `a`: The multiplier
+- `b`: The multiplicand
+- `c`: The addend
+
+Returns:
+- `a*b+c`
+
+**Operation**
+
+ res := 0
+ for i in 0 ..< len(a) {
+ res[i] = fma(a[i], b[i], c[i])
+ }
+ return res
+*/
fused_mul_add :: intrinsics.fused_mul_add
-fma :: intrinsics.fused_mul_add
+/*
+Perform a FMA (Fused multiply-add) operation on each lane of SIMD vectors.
+
+A fused multiply-add is a ternary operation that for three operands, `a`, `b`
+and `c` performs the operation `a*b+c`. This operation is a hardware feature
+that allows to minimize floating-point error and allow for faster computation.
+
+This procedure performs a FMA operation on each lane of the SIMD vectors.
+
+Inputs:
+- `a`: The multiplier
+- `b`: The multiplicand
+- `c`: The addend
+
+Returns:
+- `a*b+c`
+
+**Operation**
+
+ res := 0
+ for i in 0 ..< len(a) {
+ res[i] = fma(a[i], b[i], c[i])
+ }
+ return res
+*/
+fma :: intrinsics.fused_mul_add
+
+/*
+Convert pointer to SIMD vector to an array pointer.
+*/
to_array_ptr :: #force_inline proc "contextless" (v: ^#simd[$LANES]$E) -> ^[LANES]E {
	// Pointer-to-pointer conversion only: the vector behind `v` is viewed as a
	// fixed array with the same lane count and element type.
	return cast(^[LANES]E)v
}
+
+/*
+Convert SIMD vector to an array.
+*/
to_array :: #force_inline proc "contextless" (v: #simd[$LANES]$E) -> [LANES]E {
	// Reinterpret the vector's bits as a fixed array of the same element type.
	array := transmute([LANES]E)v
	return array
}
+
+/*
+Convert array to SIMD vector.
+*/
from_array :: #force_inline proc "contextless" (v: $A/[$LANES]$E) -> #simd[LANES]E {
	// Reinterpret the array's bits as a SIMD vector with one lane per element.
	vector := transmute(#simd[LANES]E)v
	return vector
}
+/*
+Convert slice to SIMD vector.
+*/
from_slice :: proc($T: typeid/#simd[$LANES]$E, slice: []E) -> T {
assert(len(slice) >= LANES, "slice length must be a least the number of lanes")
array: [LANES]E
@@ -186,10 +2277,36 @@ from_slice :: proc($T: typeid/#simd[$LANES]$E, slice: []E) -> T {
return transmute(T)array
}
+/*
+Perform binary not operation on a SIMD vector.
+
+This procedure returns a vector where each lane is the result of the binary
+NOT operation of the corresponding lane in the vector `a`.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ res[i] = ~a[i]
+ }
+ return res
+
+Example:
+
+ +------+------+------+------+
+ a: | 0x00 | 0x50 | 0x80 | 0xff |
+ +------+------+------+------+
+ res:
+ +------+------+------+------+
+ | 0xff | 0xaf | 0x7f | 0x00 |
+ +------+------+------+------+
+*/
bit_not :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_integer(E) {
	// A vector whose lanes all hold `~E(0)`, i.e. every bit set.
	all_ones := T(~E(0))
	// XOR against all-ones flips every bit of every lane of `v`.
	return xor(v, all_ones)
}
+/*
+Copy the signs from lanes of one SIMD vector into another SIMD vector.
+*/
copysign :: #force_inline proc "contextless" (v, sign: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
neg_zero := to_bits(T(-0.0))
sign_bit := to_bits(sign) & neg_zero
@@ -197,11 +2314,48 @@ copysign :: #force_inline proc "contextless" (v, sign: $T/#simd[$LANES]$E) -> T
return transmute(T)(sign_bit|magnitude)
}
+/*
+Return signs of SIMD lanes.
+
+This procedure returns a vector, each lane of which contains either +1.0 or
+-1.0 depending on the sign of the value in the corresponding lane of the
+input vector. If the lane of the input vector has NaN, then the result vector
+will contain this NaN value as-is.
+*/
signum :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
	// A lane that compares unequal to itself holds NaN.
	nan_lanes := lanes_ne(v, v)
	// 1.0 in every lane, carrying the sign of the matching lane of `v`.
	signed_one := copysign(T(1), v)
	// NaN lanes pass through unchanged; all other lanes become the signed one.
	return select(nan_lanes, v, signed_one)
}
+/*
+Calculate reciprocals of SIMD lanes.
+
+This procedure returns a vector where each lane is the reciprocal of the
+corresponding lane in the vector `v`.
+
+Inputs:
+- `v`: A float vector to compute the reciprocals of.
+
+Returns:
+- Vector of reciprocals.
+
+**Operation**:
+
+ for i in 0 ..< len(res) {
+ res[i] = 1.0 / v[i]
+ }
+ return res
+
+Example:
+
+ +------+------+------+------+
+ v: | 2 | 1 | 3 | 5 |
+ +------+------+------+------+
+ res:
+ +------+------+------+------+
+ | 0.5 | 1 | 0.33 | 0.2 |
+ +------+------+------+------+
+*/
recip :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
	// A vector holding 1 in every lane; dividing it by `v` lane-wise gives 1/v.
	one := T(1)
	return one / v
}