author	gingerBill <gingerBill@users.noreply.github.com>	2022-05-31 11:52:24 +0100
committer	GitHub <noreply@github.com>	2022-05-31 11:52:24 +0100
commit	a1f15c2c69b557be5a95882d18137d1f74d980ee (patch)
tree	3f484753712a6d9d9cf1074f56bc91af6d6432c1
parent	a6c779b50ecf5c8c0cb86c9d49768ab34508b1d2 (diff)
parent	516f6647b46c69a67139154c02c74b436cd4b999 (diff)
Merge pull request #1807 from odin-lang/simd-dev
Generic #simd type and intrinsics
-rw-r--r--  core/intrinsics/intrinsics.odin  99
-rw-r--r--  core/mem/raw.odin  1
-rw-r--r--  core/runtime/core_builtin.odin  6
-rw-r--r--  core/simd/simd.odin  188
-rw-r--r--  core/simd/x86/abm.odin  24
-rw-r--r--  core/simd/x86/adx.odin  56
-rw-r--r--  core/simd/x86/cmpxchg16b.odin  8
-rw-r--r--  core/simd/x86/cpu.odin  94
-rw-r--r--  core/simd/x86/fxsr.odin  36
-rw-r--r--  core/simd/x86/pclmulqdq.odin  13
-rw-r--r--  core/simd/x86/rdtsc.odin  20
-rw-r--r--  core/simd/x86/sha.odin  49
-rw-r--r--  core/simd/x86/sse.odin  618
-rw-r--r--  core/simd/x86/sse2.odin  1191
-rw-r--r--  core/simd/x86/sse3.odin  68
-rw-r--r--  core/simd/x86/sse41.odin  352
-rw-r--r--  core/simd/x86/sse42.odin  149
-rw-r--r--  core/simd/x86/ssse3.odin  140
-rw-r--r--  core/simd/x86/types.odin  57
-rw-r--r--  core/sys/cpu/cpu.odin  33
-rw-r--r--  core/sys/cpu/cpu_x86.odin  67
-rw-r--r--  examples/all/all_main.odin  2
-rw-r--r--  src/build_settings.cpp  118
-rw-r--r--  src/check_builtin.cpp  877
-rw-r--r--  src/check_decl.cpp  50
-rw-r--r--  src/check_expr.cpp  241
-rw-r--r--  src/check_stmt.cpp  56
-rw-r--r--  src/check_type.cpp  20
-rw-r--r--  src/checker.cpp  16
-rw-r--r--  src/checker.hpp  4
-rw-r--r--  src/checker_builtin_procs.hpp  190
-rw-r--r--  src/common.cpp  7
-rw-r--r--  src/entity.cpp  8
-rw-r--r--  src/llvm_backend.cpp  4
-rw-r--r--  src/llvm_backend_const.cpp  111
-rw-r--r--  src/llvm_backend_expr.cpp  208
-rw-r--r--  src/llvm_backend_proc.cpp  552
-rw-r--r--  src/llvm_backend_utility.cpp  11
-rw-r--r--  src/main.cpp  4
-rw-r--r--  src/parser.cpp  14
-rw-r--r--  src/parser.hpp  1
-rw-r--r--  src/string.cpp  9
-rw-r--r--  src/types.cpp  24
43 files changed, 5432 insertions, 364 deletions
diff --git a/core/intrinsics/intrinsics.odin b/core/intrinsics/intrinsics.odin
index d71522936..9994a1914 100644
--- a/core/intrinsics/intrinsics.odin
+++ b/core/intrinsics/intrinsics.odin
@@ -6,12 +6,14 @@ package intrinsics
is_package_imported :: proc(package_name: string) -> bool ---
// Types
-simd_vector :: proc($N: int, $T: typeid) -> type/#simd[N]T
soa_struct :: proc($N: int, $T: typeid) -> type/#soa[N]T
// Volatile
volatile_load :: proc(dst: ^$T) -> T ---
-volatile_store :: proc(dst: ^$T, val: T) -> T ---
+volatile_store :: proc(dst: ^$T, val: T) ---
+
+non_temporal_load :: proc(dst: ^$T) -> T ---
+non_temporal_store :: proc(dst: ^$T, val: T) ---
// Trapping
debug_trap :: proc() ---
@@ -23,18 +25,20 @@ alloca :: proc(size, align: int) -> [^]u8 ---
cpu_relax :: proc() ---
read_cycle_counter :: proc() -> i64 ---
-count_ones :: proc(x: $T) -> T where type_is_integer(T) ---
-count_zeros :: proc(x: $T) -> T where type_is_integer(T) ---
-count_trailing_zeros :: proc(x: $T) -> T where type_is_integer(T) ---
-count_leading_zeros :: proc(x: $T) -> T where type_is_integer(T) ---
-reverse_bits :: proc(x: $T) -> T where type_is_integer(T) ---
+count_ones :: proc(x: $T) -> T where type_is_integer(T) || type_is_simd_vector(T) ---
+count_zeros :: proc(x: $T) -> T where type_is_integer(T) || type_is_simd_vector(T) ---
+count_trailing_zeros :: proc(x: $T) -> T where type_is_integer(T) || type_is_simd_vector(T) ---
+count_leading_zeros :: proc(x: $T) -> T where type_is_integer(T) || type_is_simd_vector(T) ---
+reverse_bits :: proc(x: $T) -> T where type_is_integer(T) || type_is_simd_vector(T) ---
byte_swap :: proc(x: $T) -> T where type_is_integer(T) || type_is_float(T) ---
overflow_add :: proc(lhs, rhs: $T) -> (T, bool) #optional_ok ---
overflow_sub :: proc(lhs, rhs: $T) -> (T, bool) #optional_ok ---
overflow_mul :: proc(lhs, rhs: $T) -> (T, bool) #optional_ok ---
-sqrt :: proc(x: $T) -> T where type_is_float(T) ---
+sqrt :: proc(x: $T) -> T where type_is_float(T) || (type_is_simd_vector(T) && type_is_float(type_elem_type(T))) ---
+
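+// Computes a*b + c as a single fused operation with one rounding step.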
+fused_mul_add :: proc(a, b, c: $T) -> T where type_is_float(T) || (type_is_simd_vector(T) && type_is_float(type_elem_type(T))) ---
mem_copy :: proc(dst, src: rawptr, len: int) ---
mem_copy_non_overlapping :: proc(dst, src: rawptr, len: int) ---
@@ -186,6 +190,81 @@ type_hasher_proc :: proc($T: typeid) -> (hasher: proc "contextless" (data: rawpt
constant_utf16_cstring :: proc($literal: string) -> [^]u16 ---
+// SIMD related
+simd_add :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+simd_sub :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+simd_mul :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+simd_div :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+simd_rem :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+
+// Keeps Odin's Behaviour
+// (x << y) if y <= mask else 0
+simd_shl :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T ---
+simd_shr :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T ---
+
+// Similar to C's Behaviour
+// x << (y & mask)
+simd_shl_masked :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T ---
+simd_shr_masked :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T ---
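+
+// Illustrative example for u16 lanes (mask = 15), with a shift amount of 17:
+//   simd_shl(x, 17)        == 0 in every lane (17 > 15, Odin semantics)
+//   simd_shl_masked(x, 17) == x << 1 in every lane (17 & 15 == 1, C semantics)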
+
+simd_add_sat :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+simd_sub_sat :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+
+simd_and :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+simd_or :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+simd_xor :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+simd_and_not :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+
+simd_neg :: proc(a: #simd[N]T) -> #simd[N]T ---
+
+simd_abs :: proc(a: #simd[N]T) -> #simd[N]T ---
+
+simd_min :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+simd_max :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+simd_clamp :: proc(v, min, max: #simd[N]T) -> #simd[N]T ---
+
+// Return an unsigned integer of the same size as the input type
+// NOT A BOOLEAN
+// element-wise:
+// false => 0x00...00
+// true => 0xff...ff
+simd_lanes_eq :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
+simd_lanes_ne :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
+simd_lanes_lt :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
+simd_lanes_le :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
+simd_lanes_gt :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
+simd_lanes_ge :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
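+
+// Illustrative example: with a, b: #simd[4]i32 where a = {1, 2, 3, 4} and
+// b = {4, 3, 2, 1}, simd_lanes_lt(a, b) yields the unsigned mask
+// {0xffff_ffff, 0xffff_ffff, 0, 0}.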
+
+simd_extract :: proc(a: #simd[N]T, idx: uint) -> T ---
+simd_replace :: proc(a: #simd[N]T, idx: uint, elem: T) -> #simd[N]T ---
+
+simd_reduce_add_ordered :: proc(a: #simd[N]T) -> T ---
+simd_reduce_mul_ordered :: proc(a: #simd[N]T) -> T ---
+simd_reduce_min :: proc(a: #simd[N]T) -> T ---
+simd_reduce_max :: proc(a: #simd[N]T) -> T ---
+simd_reduce_and :: proc(a: #simd[N]T) -> T ---
+simd_reduce_or :: proc(a: #simd[N]T) -> T ---
+simd_reduce_xor :: proc(a: #simd[N]T) -> T ---
+
+simd_shuffle :: proc(a, b: #simd[N]T, indices: ..int) -> #simd[len(indices)]T ---
+simd_select :: proc(cond: #simd[N]boolean_or_integer, true, false: #simd[N]T) -> #simd[N]T ---
+
+// Lane-wise operations
+simd_ceil :: proc(a: #simd[N]any_float) -> #simd[N]any_float ---
+simd_floor :: proc(a: #simd[N]any_float) -> #simd[N]any_float ---
+simd_trunc :: proc(a: #simd[N]any_float) -> #simd[N]any_float ---
+// rounding to the nearest integral value; if two values are equally near, rounds to the even one
+simd_nearest :: proc(a: #simd[N]any_float) -> #simd[N]any_float ---
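+// e.g. simd_nearest rounds 0.5 to 0.0 and 1.5 to 2.0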
+
+simd_to_bits :: proc(v: #simd[N]T) -> #simd[N]Integer where size_of(T) == size_of(Integer), type_is_unsigned(Integer) ---
+
+// equivalent to a swizzle with descending indices, e.g. simd_lanes_reverse(a) for N == 4 is swizzle(a, 3, 2, 1, 0)
+simd_lanes_reverse :: proc(a: #simd[N]T) -> #simd[N]T ---
+
+simd_lanes_rotate_left :: proc(a: #simd[N]T, $offset: int) -> #simd[N]T ---
+simd_lanes_rotate_right :: proc(a: #simd[N]T, $offset: int) -> #simd[N]T ---
+
+
// WASM targets only
wasm_memory_grow :: proc(index, delta: uintptr) -> int ---
wasm_memory_size :: proc(index: uintptr) -> int ---
@@ -199,6 +278,10 @@ wasm_memory_size :: proc(index: uintptr) -> int ---
wasm_memory_atomic_wait32 :: proc(ptr: ^u32, expected: u32, timeout_ns: i64) -> u32 ---
wasm_memory_atomic_notify32 :: proc(ptr: ^u32, waiters: u32) -> (waiters_woken_up: u32) ---
+// x86 Targets (i386, amd64)
+x86_cpuid :: proc(ax, cx: u32) -> (eax, ebx, ecx, edx: u32) ---
+x86_xgetbv :: proc(cx: u32) -> (eax, edx: u32) ---
+
// Darwin targets only
objc_object :: struct{}
diff --git a/core/mem/raw.odin b/core/mem/raw.odin
index 0a0780dfd..2bce2d7aa 100644
--- a/core/mem/raw.odin
+++ b/core/mem/raw.odin
@@ -21,6 +21,7 @@ make_any :: proc "contextless" (data: rawptr, id: typeid) -> any {
}
raw_array_data :: runtime.raw_array_data
+raw_simd_data :: runtime.raw_simd_data
raw_string_data :: runtime.raw_string_data
raw_slice_data :: runtime.raw_slice_data
raw_dynamic_array_data :: runtime.raw_dynamic_array_data
diff --git a/core/runtime/core_builtin.odin b/core/runtime/core_builtin.odin
index 4ddc3928a..7cb5287c0 100644
--- a/core/runtime/core_builtin.odin
+++ b/core/runtime/core_builtin.odin
@@ -604,6 +604,10 @@ raw_array_data :: proc "contextless" (a: $P/^($T/[$N]$E)) -> [^]E {
return ([^]E)(a)
}
@builtin
+raw_simd_data :: proc "contextless" (a: $P/^($T/#simd[$N]$E)) -> [^]E {
+ return ([^]E)(a)
+}
+@builtin
raw_slice_data :: proc "contextless" (s: $S/[]$E) -> [^]E {
ptr := (transmute(Raw_Slice)s).data
return ([^]E)(ptr)
@@ -619,7 +623,7 @@ raw_string_data :: proc "contextless" (s: $S/string) -> [^]u8 {
}
@builtin
-raw_data :: proc{raw_array_data, raw_slice_data, raw_dynamic_array_data, raw_string_data}
+raw_data :: proc{raw_array_data, raw_slice_data, raw_dynamic_array_data, raw_string_data, raw_simd_data}
diff --git a/core/simd/simd.odin b/core/simd/simd.odin
new file mode 100644
index 000000000..390ff377a
--- /dev/null
+++ b/core/simd/simd.odin
@@ -0,0 +1,188 @@
+package simd
+
+import "core:builtin"
+import "core:intrinsics"
+
+// 128-bit vector aliases
+u8x16 :: #simd[16]u8
+i8x16 :: #simd[16]i8
+u16x8 :: #simd[8]u16
+i16x8 :: #simd[8]i16
+u32x4 :: #simd[4]u32
+i32x4 :: #simd[4]i32
+u64x2 :: #simd[2]u64
+i64x2 :: #simd[2]i64
+f32x4 :: #simd[4]f32
+f64x2 :: #simd[2]f64
+
+boolx16 :: #simd[16]bool
+b8x16 :: #simd[16]b8
+b16x8 :: #simd[8]b16
+b32x4 :: #simd[4]b32
+b64x2 :: #simd[2]b64
+
+// 256-bit vector aliases
+u8x32 :: #simd[32]u8
+i8x32 :: #simd[32]i8
+u16x16 :: #simd[16]u16
+i16x16 :: #simd[16]i16
+u32x8 :: #simd[8]u32
+i32x8 :: #simd[8]i32
+u64x4 :: #simd[4]u64
+i64x4 :: #simd[4]i64
+f32x8 :: #simd[8]f32
+f64x4 :: #simd[4]f64
+
+boolx32 :: #simd[32]bool
+b8x32 :: #simd[32]b8
+b16x16 :: #simd[16]b16
+b32x8 :: #simd[8]b32
+b64x4 :: #simd[4]b64
+
+// 512-bit vector aliases
+u8x64 :: #simd[64]u8
+i8x64 :: #simd[64]i8
+u16x32 :: #simd[32]u16
+i16x32 :: #simd[32]i16
+u32x16 :: #simd[16]u32
+i32x16 :: #simd[16]i32
+u64x8 :: #simd[8]u64
+i64x8 :: #simd[8]i64
+f32x16 :: #simd[16]f32
+f64x8 :: #simd[8]f64
+
+boolx64 :: #simd[64]bool
+b8x64 :: #simd[64]b8
+b16x32 :: #simd[32]b16
+b32x16 :: #simd[16]b32
+b64x8 :: #simd[8]b64
+
+
+add :: intrinsics.simd_add
+sub :: intrinsics.simd_sub
+mul :: intrinsics.simd_mul
+div :: intrinsics.simd_div
+rem :: intrinsics.simd_rem // integers only
+
+// Keeps Odin's Behaviour
+// (x << y) if y <= mask else 0
+shl :: intrinsics.simd_shl
+shr :: intrinsics.simd_shr
+
+// Similar to C's Behaviour
+// x << (y & mask)
+shl_masked :: intrinsics.simd_shl_masked
+shr_masked :: intrinsics.simd_shr_masked
+
+// Saturation Arithmetic
+add_sat :: intrinsics.simd_add_sat
+sub_sat :: intrinsics.simd_sub_sat
+
+and :: intrinsics.simd_and
+or :: intrinsics.simd_or
+xor :: intrinsics.simd_xor
+and_not :: intrinsics.simd_and_not
+
+neg :: intrinsics.simd_neg
+
+abs :: intrinsics.simd_abs
+
+min :: intrinsics.simd_min
+max :: intrinsics.simd_max
+clamp :: intrinsics.simd_clamp
+
+// Return an unsigned integer of the same size as the input type
+// NOT A BOOLEAN
+// element-wise:
+// false => 0x00...00
+// true => 0xff...ff
+lanes_eq :: intrinsics.simd_lanes_eq
+lanes_ne :: intrinsics.simd_lanes_ne
+lanes_lt :: intrinsics.simd_lanes_lt
+lanes_le :: intrinsics.simd_lanes_le
+lanes_gt :: intrinsics.simd_lanes_gt
+lanes_ge :: intrinsics.simd_lanes_ge
+
+// extract :: proc(a: #simd[N]T, idx: uint) -> T
+extract :: intrinsics.simd_extract
+// replace :: proc(a: #simd[N]T, idx: uint, elem: T) -> #simd[N]T
+replace :: intrinsics.simd_replace
+
+reduce_add_ordered :: intrinsics.simd_reduce_add_ordered
+reduce_mul_ordered :: intrinsics.simd_reduce_mul_ordered
+reduce_min :: intrinsics.simd_reduce_min
+reduce_max :: intrinsics.simd_reduce_max
+reduce_and :: intrinsics.simd_reduce_and
+reduce_or :: intrinsics.simd_reduce_or
+reduce_xor :: intrinsics.simd_reduce_xor
+
+// swizzle :: proc(a: #simd[N]T, indices: ..int) -> #simd[len(indices)]T
+swizzle :: builtin.swizzle
+
+// shuffle :: proc(a, b: #simd[N]T, indices: ..int) -> #simd[len(indices)]T
+// each index selects one of the 2*N concatenated lanes of a and b
+shuffle :: intrinsics.simd_shuffle
+
+// select :: proc(cond: #simd[N]boolean_or_integer, true, false: #simd[N]T) -> #simd[N]T
+select :: intrinsics.simd_select
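+
+// Illustrative example: select(lanes_lt(a, b), a, b) is a lane-wise minimum:
+// each lane comes from a where a < b and from b otherwise.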
+
+
+sqrt :: intrinsics.sqrt
+ceil :: intrinsics.simd_ceil
+floor :: intrinsics.simd_floor
+trunc :: intrinsics.simd_trunc
+nearest :: intrinsics.simd_nearest
+
+to_bits :: intrinsics.simd_to_bits
+
+lanes_reverse :: intrinsics.simd_lanes_reverse
+
+lanes_rotate_left :: intrinsics.simd_lanes_rotate_left
+lanes_rotate_right :: intrinsics.simd_lanes_rotate_right
+
+count_ones :: intrinsics.count_ones
+count_zeros :: intrinsics.count_zeros
+count_trailing_zeros :: intrinsics.count_trailing_zeros
+count_leading_zeros :: intrinsics.count_leading_zeros
+reverse_bits :: intrinsics.reverse_bits
+
+fused_mul_add :: intrinsics.fused_mul_add
+fma :: intrinsics.fused_mul_add
+
+to_array_ptr :: #force_inline proc "contextless" (v: ^#simd[$LANES]$E) -> ^[LANES]E {
+ return (^[LANES]E)(v)
+}
+to_array :: #force_inline proc "contextless" (v: #simd[$LANES]$E) -> [LANES]E {
+ return transmute([LANES]E)(v)
+}
+from_array :: #force_inline proc "contextless" (v: $A/[$LANES]$E) -> #simd[LANES]E {
+ return transmute(#simd[LANES]E)v
+}
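+
+// Illustrative round trip: from_array([4]f32{1, 2, 3, 4}) yields a
+// #simd[4]f32 whose lanes can be recovered with to_array.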
+
+from_slice :: proc($T: typeid/#simd[$LANES]$E, slice: []E) -> T {
+ assert(len(slice) >= LANES, "slice length must be at least the number of lanes")
+ array: [LANES]E
+ #no_bounds_check for i in 0..<LANES {
+ array[i] = slice[i]
+ }
+ return transmute(T)array
+}
+
+bit_not :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_integer(E) {
+ return xor(v, T(~E(0)))
+}
+
+copysign :: #force_inline proc "contextless" (v, sign: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
+ neg_zero := to_bits(T(-0.0))
+ sign_bit := to_bits(sign) & neg_zero
+ magnitude := to_bits(v) &~ neg_zero
+ return transmute(T)(sign_bit|magnitude)
+}
+
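+// lanes_ne(v, v) is true only for NaN lanes, so signum propagates NaN
+// unchanged and maps every other lane (including signed zeros) to ±1.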
+signum :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
+ is_nan := lanes_ne(v, v)
+ return select(is_nan, v, copysign(T(1), v))
+}
+
+recip :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
+ return T(1) / v
+}
diff --git a/core/simd/x86/abm.odin b/core/simd/x86/abm.odin
new file mode 100644
index 000000000..79b806242
--- /dev/null
+++ b/core/simd/x86/abm.odin
@@ -0,0 +1,24 @@
+//+build i386, amd64
+package simd_x86
+
+import "core:intrinsics"
+
+@(require_results, enable_target_feature="lzcnt")
+_lzcnt_u32 :: #force_inline proc "c" (x: u32) -> u32 {
+ return intrinsics.count_leading_zeros(x)
+}
+@(require_results, enable_target_feature="popcnt")
+_popcnt32 :: #force_inline proc "c" (x: u32) -> i32 {
+ return i32(intrinsics.count_ones(x))
+}
+
+when ODIN_ARCH == .amd64 {
+ @(require_results, enable_target_feature="lzcnt")
+ _lzcnt_u64 :: #force_inline proc "c" (x: u64) -> u64 {
+ return intrinsics.count_leading_zeros(x)
+ }
+ @(require_results, enable_target_feature="popcnt")
+ _popcnt64 :: #force_inline proc "c" (x: u64) -> i32 {
+ return i32(intrinsics.count_ones(x))
+ }
+}
\ No newline at end of file
diff --git a/core/simd/x86/adx.odin b/core/simd/x86/adx.odin
new file mode 100644
index 000000000..d03cffcff
--- /dev/null
+++ b/core/simd/x86/adx.odin
@@ -0,0 +1,56 @@
+//+build i386, amd64
+package simd_x86
+
+@(require_results)
+_addcarry_u32 :: #force_inline proc "c" (c_in: u8, a: u32, b: u32, out: ^u32) -> u8 {
+ x, y := llvm_addcarry_u32(c_in, a, b)
+ out^ = y
+ return x
+}
+@(require_results)
+_addcarryx_u32 :: #force_inline proc "c" (c_in: u8, a: u32, b: u32, out: ^u32) -> u8 {
+ return llvm_addcarryx_u32(c_in, a, b, out)
+}
+@(require_results)
+_subborrow_u32 :: #force_inline proc "c" (c_in: u8, a: u32, b: u32, out: ^u32) -> u8 {
+ x, y := llvm_subborrow_u32(c_in, a, b)
+ out^ = y
+ return x
+}
+
+when ODIN_ARCH == .amd64 {
+ @(require_results)
+ _addcarry_u64 :: #force_inline proc "c" (c_in: u8, a: u64, b: u64, out: ^u64) -> u8 {
+ x, y := llvm_addcarry_u64(c_in, a, b)
+ out^ = y
+ return x
+ }
+ @(require_results)
+ _addcarryx_u64 :: #force_inline proc "c" (c_in: u8, a: u64, b: u64, out: ^u64) -> u8 {
+ return llvm_addcarryx_u64(c_in, a, b, out)
+ }
+ @(require_results)
+ _subborrow_u64 :: #force_inline proc "c" (c_in: u8, a: u64, b: u64, out: ^u64) -> u8 {
+ x, y := llvm_subborrow_u64(c_in, a, b)
+ out^ = y
+ return x
+ }
+}
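+
+// Illustrative example (amd64), where a_lo/a_hi and b_lo/b_hi are the u64
+// limbs of two 128-bit inputs, added by chaining the carry:
+//
+// lo, hi: u64
+// c := _addcarry_u64(0, a_lo, b_lo, &lo)
+// _ = _addcarry_u64(c, a_hi, b_hi, &hi)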
+
+@(private, default_calling_convention="c")
+foreign _ {
+ @(link_name="llvm.x86.addcarry.32")
+ llvm_addcarry_u32 :: proc(a: u8, b: u32, c: u32) -> (u8, u32) ---
+ @(link_name="llvm.x86.addcarryx.u32")
+ llvm_addcarryx_u32 :: proc(a: u8, b: u32, c: u32, d: rawptr) -> u8 ---
+ @(link_name="llvm.x86.subborrow.32")
+ llvm_subborrow_u32 :: proc(a: u8, b: u32, c: u32) -> (u8, u32) ---
+
+ // amd64 only
+ @(link_name="llvm.x86.addcarry.64")
+ llvm_addcarry_u64 :: proc(a: u8, b: u64, c: u64) -> (u8, u64) ---
+ @(link_name="llvm.x86.addcarryx.u64")
+ llvm_addcarryx_u64 :: proc(a: u8, b: u64, c: u64, d: rawptr) -> u8 ---
+ @(link_name="llvm.x86.subborrow.64")
+ llvm_subborrow_u64 :: proc(a: u8, b: u64, c: u64) -> (u8, u64) ---
+}
diff --git a/core/simd/x86/cmpxchg16b.odin b/core/simd/x86/cmpxchg16b.odin
new file mode 100644
index 000000000..d575dd9df
--- /dev/null
+++ b/core/simd/x86/cmpxchg16b.odin
@@ -0,0 +1,8 @@
+//+build amd64
+package simd_x86
+
+import "core:intrinsics"
+
+cmpxchg16b :: #force_inline proc "c" (dst: ^u128, old, new: u128, $success, $failure: intrinsics.Atomic_Memory_Order) -> (val: u128) {
+ return intrinsics.atomic_compare_exchange_strong_explicit(dst, old, new, success, failure)
+}
\ No newline at end of file
diff --git a/core/simd/x86/cpu.odin b/core/simd/x86/cpu.odin
new file mode 100644
index 000000000..14e90c0f0
--- /dev/null
+++ b/core/simd/x86/cpu.odin
@@ -0,0 +1,94 @@
+//+build i386, amd64
+package simd_x86
+
+import "core:intrinsics"
+
+// cpuid :: proc(ax, cx: u32) -> (eax, ebx, ecx, edx: u32) ---
+cpuid :: intrinsics.x86_cpuid
+
+// xgetbv :: proc(cx: u32) -> (eax, edx: u32) ---
+xgetbv :: intrinsics.x86_xgetbv
+
+
+CPU_Feature :: enum u64 {
+ aes, // AES hardware implementation (AES NI)
+ adx, // Multi-precision add-carry instruction extensions
+ avx, // Advanced vector extension
+ avx2, // Advanced vector extension 2
+ bmi1, // Bit manipulation instruction set 1
+ bmi2, // Bit manipulation instruction set 2
+ erms, // Enhanced REP for MOVSB and STOSB
+ fma, // Fused-multiply-add instructions
+ os_xsave, // OS supports XSAVE/XRESTOR for saving/restoring XMM registers.
+ pclmulqdq, // PCLMULQDQ instruction - most often used for AES-GCM
+ popcnt, // Hamming weight instruction POPCNT.
+ rdrand, // RDRAND instruction (on-chip random number generator)
+ rdseed, // RDSEED instruction (on-chip random number generator)
+ sse2, // Streaming SIMD extension 2 (always available on amd64)
+ sse3, // Streaming SIMD extension 3
+ ssse3, // Supplemental streaming SIMD extension 3
+ sse41, // Streaming SIMD extension 4 and 4.1
+ sse42, // Streaming SIMD extension 4 and 4.2
+}
+
+CPU_Features :: distinct bit_set[CPU_Feature; u64]
+
+cpu_features: Maybe(CPU_Features)
+
+@(init, private)
+init_cpu_features :: proc "c" () {
+ is_set :: #force_inline proc "c" (bit: u32, value: u32) -> bool {
+ return value & (1 << bit) != 0
+ }
+ try_set :: #force_inline proc "c" (set: ^CPU_Features, feature: CPU_Feature, bit: u32, value: u32) {
+ if is_set(bit, value) {
+ set^ += {feature}
+ }
+ }
+
+ max_id, _, _, _ := cpuid(0, 0)
+ if max_id < 1 {
+ return
+ }
+
+ set: CPU_Features
+
+ _, _, ecx1, edx1 := cpuid(1, 0)
+
+ try_set(&set, .sse2, 26, edx1)
+ try_set(&set, .sse3, 0, ecx1)
+ try_set(&set, .pclmulqdq, 1, ecx1)
+ try_set(&set, .ssse3, 9, ecx1)
+ try_set(&set, .fma, 12, ecx1)
+ try_set(&set, .sse41, 19, ecx1)
+ try_set(&set, .sse42, 20, ecx1)
+ try_set(&set, .popcnt, 23, ecx1)
+ try_set(&set, .aes, 25, ecx1)
+ try_set(&set, .os_xsave, 27, ecx1)
+ try_set(&set, .rdrand, 30, ecx1)
+
+ os_supports_avx := false
+ if .os_xsave in set {
+ eax, _ := xgetbv(0)
+ os_supports_avx = is_set(1, eax) && is_set(2, eax)
+ }
+ if os_supports_avx {
+ try_set(&set, .avx, 28, ecx1)
+ }
+
+ if max_id < 7 {
+ return
+ }
+
+ _, ebx7, _, _ := cpuid(7, 0)
+ try_set(&set, .bmi1, 3, ebx7)
+ if os_supports_avx {
+ try_set(&set, .avx2, 5, ebx7)
+ }
+ try_set(&set, .bmi2, 8, ebx7)
+ try_set(&set, .erms, 9, ebx7)
+ try_set(&set, .rdseed, 18, ebx7)
+ try_set(&set, .adx, 19, ebx7)
+
+ cpu_features = set
+}
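+
+// Illustrative usage: gate a code path on a detected feature.
+//
+// if features, ok := cpu_features.?; ok && .avx2 in features {
+//     // dispatch to an AVX2 implementation
+// }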
diff --git a/core/simd/x86/fxsr.odin b/core/simd/x86/fxsr.odin
new file mode 100644
index 000000000..cd78de7d4
--- /dev/null
+++ b/core/simd/x86/fxsr.odin
@@ -0,0 +1,36 @@
+//+build i386, amd64
+package simd_x86
+
+@(enable_target_feature="fxsr")
+_fxsave :: #force_inline proc "c" (mem_addr: rawptr) {
+ fxsave(mem_addr)
+}
+@(enable_target_feature="fxsr")
+_fxrstor :: #force_inline proc "c" (mem_addr: rawptr) {
+ fxrstor(mem_addr)
+}
+
+when ODIN_ARCH == .amd64 {
+ @(enable_target_feature="fxsr")
+ _fxsave64 :: #force_inline proc "c" (mem_addr: rawptr) {
+ fxsave64(mem_addr)
+ }
+ @(enable_target_feature="fxsr")
+ _fxrstor64 :: #force_inline proc "c" (mem_addr: rawptr) {
+ fxrstor64(mem_addr)
+ }
+}
+
+@(private, default_calling_convention="c")
+foreign _ {
+ @(link_name="llvm.x86.fxsave")
+ fxsave :: proc(p: rawptr) ---
+ @(link_name="llvm.x86.fxrstor")
+ fxrstor :: proc(p: rawptr) ---
+
+ // amd64 only
+ @(link_name="llvm.x86.fxsave64")
+ fxsave64 :: proc(p: rawptr) ---
+ @(link_name="llvm.x86.fxrstor64")
+ fxrstor64 :: proc(p: rawptr) ---
+}
\ No newline at end of file
diff --git a/core/simd/x86/pclmulqdq.odin b/core/simd/x86/pclmulqdq.odin
new file mode 100644
index 000000000..692fb7ce1
--- /dev/null
+++ b/core/simd/x86/pclmulqdq.odin
@@ -0,0 +1,13 @@
+//+build i386, amd64
+package simd_x86
+
+@(require_results, enable_target_feature="pclmulqdq")
+_mm_clmulepi64_si128 :: #force_inline proc "c" (a, b: __m128i, $IMM8: u8) -> __m128i {
+ return pclmulqdq(a, b, u8(IMM8))
+}
+
+@(private, default_calling_convention="c")
+foreign _ {
+ @(link_name="llvm.x86.pclmulqdq")
+ pclmulqdq :: proc(a, round_key: __m128i, #const imm8: u8) -> __m128i ---
+}
\ No newline at end of file
diff --git a/core/simd/x86/rdtsc.odin b/core/simd/x86/rdtsc.odin
new file mode 100644
index 000000000..54024c3f2
--- /dev/null
+++ b/core/simd/x86/rdtsc.odin
@@ -0,0 +1,20 @@
+//+build i386, amd64
+package simd_x86
+
+@(require_results)
+_rdtsc :: #force_inline proc "c" () -> u64 {
+ return rdtsc()
+}
+
+@(require_results)
+__rdtscp :: #force_inline proc "c" (aux: ^u32) -> u64 {
+ return rdtscp(aux)
+}
+
+@(private, default_calling_convention="c")
+foreign _ {
+ @(link_name="llvm.x86.rdtsc")
+ rdtsc :: proc() -> u64 ---
+ @(link_name="llvm.x86.rdtscp")
+ rdtscp :: proc(aux: rawptr) -> u64 ---
+}
\ No newline at end of file
diff --git a/core/simd/x86/sha.odin b/core/simd/x86/sha.odin
new file mode 100644
index 000000000..f015f4b8a
--- /dev/null
+++ b/core/simd/x86/sha.odin
@@ -0,0 +1,49 @@
+//+build i386, amd64
+package simd_x86
+
+@(require_results, enable_target_feature="sha")
+_mm_sha1msg1_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)sha1msg1(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sha")
+_mm_sha1msg2_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)sha1msg2(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sha")
+_mm_sha1nexte_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)sha1nexte(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sha")
+_mm_sha1rnds4_epu32 :: #force_inline proc "c" (a, b: __m128i, $FUNC: u32) -> __m128i where 0 <= FUNC, FUNC <= 3 {
+ return transmute(__m128i)sha1rnds4(transmute(i32x4)a, transmute(i32x4)b, u8(FUNC & 0xff))
+}
+@(require_results, enable_target_feature="sha")
+_mm_sha256msg1_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)sha256msg1(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sha")
+_mm_sha256msg2_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)sha256msg2(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sha")
+_mm_sha256rnds2_epu32 :: #force_inline proc "c" (a, b, k: __m128i) -> __m128i {
+ return transmute(__m128i)sha256rnds2(transmute(i32x4)a, transmute(i32x4)b, transmute(i32x4)k)
+}
+
+@(private, default_calling_convention="c")
+foreign _ {
+ @(link_name="llvm.x86.sha1msg1")
+ sha1msg1 :: proc(a, b: i32x4) -> i32x4 ---
+ @(link_name="llvm.x86.sha1msg2")
+ sha1msg2 :: proc(a, b: i32x4) -> i32x4 ---
+ @(link_name="llvm.x86.sha1nexte")
+ sha1nexte :: proc(a, b: i32x4) -> i32x4 ---
+ @(link_name="llvm.x86.sha1rnds4")
+ sha1rnds4 :: proc(a, b: i32x4, #const c: u8) -> i32x4 ---
+ @(link_name="llvm.x86.sha256msg1")
+ sha256msg1 :: proc(a, b: i32x4) -> i32x4 ---
+ @(link_name="llvm.x86.sha256msg2")
+ sha256msg2 :: proc(a, b: i32x4) -> i32x4 ---
+ @(link_name="llvm.x86.sha256rnds2")
+ sha256rnds2 :: proc(a, b, k: i32x4) -> i32x4 ---
+}
\ No newline at end of file
diff --git a/core/simd/x86/sse.odin b/core/simd/x86/sse.odin
new file mode 100644
index 000000000..3efdeccba
--- /dev/null
+++ b/core/simd/x86/sse.odin
@@ -0,0 +1,618 @@
+//+build i386, amd64
+package simd_x86
+
+import "core:intrinsics"
+import "core:simd"
+
+// _MM_SHUFFLE(z, y, x, w) -> (z<<6 | y<<4 | x<<2 | w)
+_MM_SHUFFLE :: intrinsics.simd_x86__MM_SHUFFLE
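+// e.g. _MM_SHUFFLE(3, 2, 1, 0) == 0b11_10_01_00 == 0xE4, the identity shuffle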
+
+_MM_HINT_T0 :: 3
+_MM_HINT_T1 :: 2
+_MM_HINT_T2 :: 1
+_MM_HINT_NTA :: 0
+_MM_HINT_ET0 :: 7
+_MM_HINT_ET1 :: 6
+
+
+_MM_EXCEPT_INVALID :: 0x0001
+_MM_EXCEPT_DENORM :: 0x0002
+_MM_EXCEPT_DIV_ZERO :: 0x0004
+_MM_EXCEPT_OVERFLOW :: 0x0008
+_MM_EXCEPT_UNDERFLOW :: 0x0010
+_MM_EXCEPT_INEXACT :: 0x0020
+_MM_EXCEPT_MASK :: 0x003f
+
+_MM_MASK_INVALID :: 0x0080
+_MM_MASK_DENORM :: 0x0100
+_MM_MASK_DIV_ZERO :: 0x0200
+_MM_MASK_OVERFLOW :: 0x0400
+_MM_MASK_UNDERFLOW :: 0x0800
+_MM_MASK_INEXACT :: 0x1000
+_MM_MASK_MASK :: 0x1f80
+
+_MM_ROUND_NEAREST :: 0x0000
+_MM_ROUND_DOWN :: 0x2000
+_MM_ROUND_UP :: 0x4000
+_MM_ROUND_TOWARD_ZERO :: 0x6000
+
+_MM_ROUND_MASK :: 0x6000
+
+_MM_FLUSH_ZERO_MASK :: 0x8000
+_MM_FLUSH_ZERO_ON :: 0x8000
+_MM_FLUSH_ZERO_OFF :: 0x0000
+
+
+@(require_results, enable_target_feature="sse")
+_mm_add_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return addss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_add_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return simd.add(a, b)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_sub_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return subss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_sub_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return simd.sub(a, b)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_mul_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return mulss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_mul_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return simd.mul(a, b)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_div_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return divss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_div_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return simd.div(a, b)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_sqrt_ss :: #force_inline proc "c" (a: __m128) -> __m128 {
+ return sqrtss(a)
+}
+@(require_results, enable_target_feature="sse")
+_mm_sqrt_ps :: #force_inline proc "c" (a: __m128) -> __m128 {
+ return sqrtps(a)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_rcp_ss :: #force_inline proc "c" (a: __m128) -> __m128 {
+ return rcpss(a)
+}
+@(require_results, enable_target_feature="sse")
+_mm_rcp_ps :: #force_inline proc "c" (a: __m128) -> __m128 {
+ return rcpps(a)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_rsqrt_ss :: #force_inline proc "c" (a: __m128) -> __m128 {
+ return rsqrtss(a)
+}
+@(require_results, enable_target_feature="sse")
+_mm_rsqrt_ps :: #force_inline proc "c" (a: __m128) -> __m128 {
+ return rsqrtps(a)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_min_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return minss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_min_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return minps(a, b)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_max_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return maxss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_max_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return maxps(a, b)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_and_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return transmute(__m128)simd.and(transmute(__m128i)a, transmute(__m128i)b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_andnot_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return transmute(__m128)simd.and_not(transmute(__m128i)a, transmute(__m128i)b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_or_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return transmute(__m128)simd.or(transmute(__m128i)a, transmute(__m128i)b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_xor_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return transmute(__m128)simd.xor(transmute(__m128i)a, transmute(__m128i)b)
+}
+
+
+@(require_results, enable_target_feature="sse")
+_mm_cmpeq_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return cmpss(a, b, 0)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmplt_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return cmpss(a, b, 1)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmple_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return cmpss(a, b, 2)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpgt_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return simd.shuffle(a, cmpss(b, a, 1), 4, 1, 2, 3)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpge_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return simd.shuffle(a, cmpss(b, a, 2), 4, 1, 2, 3)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpneq_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return cmpss(a, b, 4)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpnlt_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return cmpss(a, b, 5)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpnle_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return cmpss(a, b, 6)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpngt_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return simd.shuffle(a, cmpss(b, a, 5), 4, 1, 2, 3)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpnge_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return simd.shuffle(a, cmpss(b, a, 6), 4, 1, 2, 3)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpord_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return cmpss(a, b, 7)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpunord_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return cmpss(a, b, 3)
+}
+
+
+@(require_results, enable_target_feature="sse")
+_mm_cmpeq_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return cmpps(a, b, 0)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmplt_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return cmpps(a, b, 1)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmple_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return cmpps(a, b, 2)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpgt_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return cmpps(b, a, 1)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpge_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return cmpps(b, a, 2)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpneq_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return cmpps(a, b, 4)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpnlt_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return cmpps(a, b, 5)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpnle_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return cmpps(a, b, 6)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpngt_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return cmpps(b, a, 5)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpnge_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return cmpps(b, a, 6)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpord_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return cmpps(b, a, 7)
+}
+@(require_results, enable_target_feature="sse")
+_mm_cmpunord_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return cmpps(b, a, 3)
+}
+
+
+@(require_results, enable_target_feature="sse")
+_mm_comieq_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+ return comieq_ss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_comilt_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+ return comilt_ss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_comile_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+ return comile_ss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_comigt_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+ return comigt_ss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_comige_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+ return comige_ss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_comineq_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+ return comineq_ss(a, b)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_ucomieq_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+ return ucomieq_ss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_ucomilt_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+ return ucomilt_ss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_ucomile_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+ return ucomile_ss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_ucomigt_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+ return ucomigt_ss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_ucomige_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+ return ucomige_ss(a, b)
+}
+@(require_results, enable_target_feature="sse")
+_mm_ucomineq_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
+ return ucomineq_ss(a, b)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_cvtss_si32 :: #force_inline proc "c" (a: __m128) -> i32 {
+ return cvtss2si(a)
+}
+_mm_cvt_ss2si :: _mm_cvtss_si32
+
+@(require_results, enable_target_feature="sse")
+_mm_cvttss_si32 :: #force_inline proc "c" (a: __m128) -> i32 {
+ return cvttss2si(a)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_cvtss_f32 :: #force_inline proc "c" (a: __m128) -> f32 {
+ return simd.extract(a, 0)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_cvtsi32_ss :: #force_inline proc "c" (a: __m128, b: i32) -> __m128 {
+ return cvtsi2ss(a, b)
+}
+_mm_cvt_si2ss :: _mm_cvtsi32_ss
+
+
+@(require_results, enable_target_feature="sse")
+_mm_set_ss :: #force_inline proc "c" (a: f32) -> __m128 {
+ return __m128{a, 0, 0, 0}
+}
+@(require_results, enable_target_feature="sse")
+_mm_set1_ps :: #force_inline proc "c" (a: f32) -> __m128 {
+ return __m128(a)
+}
+_mm_set_ps1 :: _mm_set1_ps
+
+@(require_results, enable_target_feature="sse")
+_mm_set_ps :: #force_inline proc "c" (a, b, c, d: f32) -> __m128 {
+ return __m128{d, c, b, a}
+}
+@(require_results, enable_target_feature="sse")
+_mm_setr_ps :: #force_inline proc "c" (a, b, c, d: f32) -> __m128 {
+ return __m128{a, b, c, d}
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_setzero_ps :: #force_inline proc "c" () -> __m128 {
+ return __m128{0, 0, 0, 0}
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_shuffle_ps :: #force_inline proc "c" (a, b: __m128, $MASK: u32) -> __m128 {
+ return simd.shuffle(
+ a, b,
+ u32(MASK) & 0b11,
+ (u32(MASK)>>2) & 0b11,
+ ((u32(MASK)>>4) & 0b11)+4,
+ ((u32(MASK)>>6) & 0b11)+4)
+}
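+
+// The low two index pairs of MASK select lanes of a, the high two select
+// lanes of b; e.g. _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0)) takes
+// lanes 0 and 1 from a and lanes 2 and 3 from b.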
+
+
+@(require_results, enable_target_feature="sse")
+_mm_unpackhi_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return simd.shuffle(a, b, 2, 6, 3, 7)
+}
+@(require_results, enable_target_feature="sse")
+_mm_unpacklo_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return simd.shuffle(a, b, 0, 4, 1, 5)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_movehl_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return simd.shuffle(a, b, 6, 7, 2, 3)
+}
+@(require_results, enable_target_feature="sse")
+_mm_movelh_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return simd.shuffle(a, b, 0, 1, 4, 5)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_movemask_ps :: #force_inline proc "c" (a: __m128) -> u32 {
+ return movmskps(a)
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_load_ss :: #force_inline proc "c" (p: ^f32) -> __m128 {
+ return __m128{p^, 0, 0, 0}
+}
+@(require_results, enable_target_feature="sse")
+_mm_load1_ps :: #force_inline proc "c" (p: ^f32) -> __m128 {
+ a := p^
+ return __m128(a)
+}
+_mm_load_ps1 :: _mm_load1_ps
+
+@(require_results, enable_target_feature="sse")
+_mm_load_ps :: #force_inline proc "c" (p: [^]f32) -> __m128 {
+ return (^__m128)(p)^
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_loadu_ps :: #force_inline proc "c" (p: [^]f32) -> __m128 {
+ dst := _mm_undefined_ps()
+ intrinsics.mem_copy_non_overlapping(&dst, p, size_of(__m128))
+ return dst
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_loadr_ps :: #force_inline proc "c" (p: [^]f32) -> __m128 {
+ return simd.lanes_reverse(_mm_load_ps(p))
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_loadu_si64 :: #force_inline proc "c" (mem_addr: rawptr) -> __m128i {
+ a := intrinsics.unaligned_load((^i64)(mem_addr))
+ return __m128i{a, 0}
+}
+
+@(enable_target_feature="sse")
+_mm_store_ss :: #force_inline proc "c" (p: ^f32, a: __m128) {
+ p^ = simd.extract(a, 0)
+}
+
+@(enable_target_feature="sse")
+_mm_store1_ps :: #force_inline proc "c" (p: [^]f32, a: __m128) {
+ b := simd.swizzle(a, 0, 0, 0, 0)
+ (^__m128)(p)^ = b
+}
+_mm_store_ps1 :: _mm_store1_ps
+
+
+@(enable_target_feature="sse")
+_mm_store_ps :: #force_inline proc "c" (p: [^]f32, a: __m128) {
+ (^__m128)(p)^ = a
+}
+@(enable_target_feature="sse")
+_mm_storeu_ps :: #force_inline proc "c" (p: [^]f32, a: __m128) {
+ b := a
+ intrinsics.mem_copy_non_overlapping(p, &b, size_of(__m128))
+}
+@(enable_target_feature="sse")
+_mm_storer_ps :: #force_inline proc "c" (p: [^]f32, a: __m128) {
+ (^__m128)(p)^ = simd.lanes_reverse(a)
+}
+
+
+@(require_results, enable_target_feature="sse")
+_mm_move_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return simd.shuffle(a, b, 4, 1, 2, 3)
+}
+
+@(enable_target_feature="sse")
+_mm_sfence :: #force_inline proc "c" () {
+ sfence()
+}
+
+@(require_results, enable_target_feature="sse")
+_mm_getcsr :: #force_inline proc "c" () -> (result: u32) {
+ stmxcsr(&result)
+ return result
+}
+
+@(enable_target_feature="sse")
+_mm_setcsr :: #force_inline proc "c" (val: u32) {
+ val := val
+ ldmxcsr(&val)
+}
+
+@(require_results, enable_target_feature="sse")
+_MM_GET_EXCEPTION_MASK :: #force_inline proc "c" () -> u32 {
+ return _mm_getcsr() & _MM_MASK_MASK
+}
+@(require_results, enable_target_feature="sse")
+_MM_GET_EXCEPTION_STATE :: #force_inline proc "c" () -> u32 {
+ return _mm_getcsr() & _MM_EXCEPT_MASK
+}
+@(require_results, enable_target_feature="sse")
+_MM_GET_FLUSH_ZERO_MODE :: #force_inline proc "c" () -> u32 {
+ return _mm_getcsr() & _MM_FLUSH_ZERO_MASK
+}
+@(require_results, enable_target_feature="sse")
+_MM_GET_ROUNDING_MODE :: #force_inline proc "c" () -> u32 {
+ return _mm_getcsr() & _MM_ROUND_MASK
+}
+
+@(enable_target_feature="sse")
+_MM_SET_EXCEPTION_MASK :: #force_inline proc "c" (x: u32) {
+ _mm_setcsr((_mm_getcsr() &~ _MM_MASK_MASK) | x)
+}
+@(enable_target_feature="sse")
+_MM_SET_EXCEPTION_STATE :: #force_inline proc "c" (x: u32) {
+ _mm_setcsr((_mm_getcsr() &~ _MM_EXCEPT_MASK) | x)
+}
+@(enable_target_feature="sse")
+_MM_SET_FLUSH_ZERO_MODE :: #force_inline proc "c" (x: u32) {
+ _mm_setcsr((_mm_getcsr() &~ _MM_FLUSH_ZERO_MASK) | x)
+}
+@(enable_target_feature="sse")
+_MM_SET_ROUNDING_MODE :: #force_inline proc "c" (x: u32) {
+ _mm_setcsr((_mm_getcsr() &~ _MM_ROUND_MASK) | x)
+}
+
+@(enable_target_feature="sse")
+_mm_prefetch :: #force_inline proc "c" (p: rawptr, $STRATEGY: u32) {
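+ // bit 2 of STRATEGY distinguishes the write (ET) hints from the read
+ // hints, the low two bits pick the locality level, and the trailing 1
+ // selects the data cache rather than the instruction cache.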
+ prefetch(p, (STRATEGY>>2)&1, STRATEGY&3, 1)
+}
+
+
+@(require_results, enable_target_feature="sse")
+_mm_undefined_ps :: #force_inline proc "c" () -> __m128 {
+ return _mm_set1_ps(0)
+}
+
+@(enable_target_feature="sse")
+_MM_TRANSPOSE4_PS :: #force_inline proc "c" (row0, row1, row2, row3: ^__m128) {
+ tmp0 := _mm_unpacklo_ps(row0^, row1^)
+ tmp2 := _mm_unpacklo_ps(row2^, row3^)
+ tmp1 := _mm_unpackhi_ps(row0^, row1^)
+ tmp3 := _mm_unpackhi_ps(row2^, row3^)
+
+ row0^ = _mm_movelh_ps(tmp0, tmp2)
+ row1^ = _mm_movehl_ps(tmp2, tmp0)
+ row2^ = _mm_movelh_ps(tmp1, tmp3)
+ row3^ = _mm_movehl_ps(tmp3, tmp1)
+}
+
+@(enable_target_feature="sse")
+_mm_stream_ps :: #force_inline proc "c" (addr: [^]f32, a: __m128) {
+ intrinsics.non_temporal_store((^__m128)(addr), a)
+}
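+
+// _mm_stream_ps expects a 16-byte aligned address; the non-temporal hint
+// bypasses the cache hierarchy, so pair it with _mm_sfence before other
+// cores read the stored data.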
+
+when ODIN_ARCH == .amd64 {
+ @(require_results, enable_target_feature="sse")
+ _mm_cvtss_si64 :: #force_inline proc "c"(a: __m128) -> i64 {
+ return cvtss2si64(a)
+ }
+ @(require_results, enable_target_feature="sse")
+ _mm_cvttss_si64 :: #force_inline proc "c"(a: __m128) -> i64 {
+ return cvttss2si64(a)
+ }
+ @(require_results, enable_target_feature="sse")
+ _mm_cvtsi64_ss :: #force_inline proc "c"(a: __m128, b: i64) -> __m128 {
+ return cvtsi642ss(a, b)
+ }
+}
+
+
+@(private, default_calling_convention="c")
+foreign _ {
+ @(link_name="llvm.x86.sse.add.ss")
+ addss :: proc(a, b: __m128) -> __m128 ---
+ @(link_name="llvm.x86.sse.sub.ss")
+ subss :: proc(a, b: __m128) -> __m128 ---
+ @(link_name="llvm.x86.sse.mul.ss")
+ mulss :: proc(a, b: __m128) -> __m128 ---
+ @(link_name="llvm.x86.sse.div.ss")
+ divss :: proc(a, b: __m128) -> __m128 ---
+ @(link_name="llvm.x86.sse.sqrt.ss")
+ sqrtss :: proc(a: __m128) -> __m128 ---
+ @(link_name="llvm.x86.sse.sqrt.ps")
+ sqrtps :: proc(a: __m128) -> __m128 ---
+ @(link_name="llvm.x86.sse.rcp.ss")
+ rcpss :: proc(a: __m128) -> __m128 ---
+ @(link_name="llvm.x86.sse.rcp.ps")
+ rcpps :: proc(a: __m128) -> __m128 ---
+ @(link_name="llvm.x86.sse.rsqrt.ss")
+ rsqrtss :: proc(a: __m128) -> __m128 ---
+ @(link_name="llvm.x86.sse.rsqrt.ps")
+ rsqrtps :: proc(a: __m128) -> __m128 ---
+ @(link_name="llvm.x86.sse.min.ss")
+ minss :: proc(a, b: __m128) -> __m128 ---
+ @(link_name="llvm.x86.sse.min.ps")
+ minps :: proc(a, b: __m128) -> __m128 ---
+ @(link_name="llvm.x86.sse.max.ss")
+ maxss :: proc(a, b: __m128) -> __m128 ---
+ @(link_name="llvm.x86.sse.max.ps")
+ maxps :: proc(a, b: __m128) -> __m128 ---
+ @(link_name="llvm.x86.sse.movmsk.ps")
+ movmskps :: proc(a: __m128) -> u32 ---
+ @(link_name="llvm.x86.sse.cmp.ps")
+ cmpps :: proc(a, b: __m128, #const imm8: u8) -> __m128 ---
+ @(link_name="llvm.x86.sse.comieq.ss")
+ comieq_ss :: proc(a, b: __m128) -> b32 ---
+ @(link_name="llvm.x86.sse.comilt.ss")
+ comilt_ss :: proc(a, b: __m128) -> b32 ---
+ @(link_name="llvm.x86.sse.comile.ss")
+ comile_ss :: proc(a, b: __m128) -> b32 ---
+ @(link_name="llvm.x86.sse.comigt.ss")
+ comigt_ss :: proc(a, b: __m128) -> b32 ---
+ @(link_name="llvm.x86.sse.comige.ss")
+ comige_ss :: proc(a, b: __m128) -> b32 ---
+ @(link_name="llvm.x86.sse.comineq.ss")
+ comineq_ss :: proc(a, b: __m128) -> b32 ---
+ @(link_name="llvm.x86.sse.ucomieq.ss")
+ ucomieq_ss :: proc(a, b: __m128) -> b32 ---
+ @(link_name="llvm.x86.sse.ucomilt.ss")
+ ucomilt_ss :: proc(a, b: __m128) -> b32 ---
+ @(link_name="llvm.x86.sse.ucomile.ss")
+ ucomile_ss :: proc(a, b: __m128) -> b32 ---
+ @(link_name="llvm.x86.sse.ucomigt.ss")
+ ucomigt_ss :: proc(a, b: __m128) -> b32 ---
+ @(link_name="llvm.x86.sse.ucomige.ss")
+ ucomige_ss :: proc(a, b: __m128) -> b32 ---
+ @(link_name="llvm.x86.sse.ucomineq.ss")
+ ucomineq_ss :: proc(a, b: __m128) -> b32 ---
+ @(link_name="llvm.x86.sse.cvtss2si")
+ cvtss2si :: proc(a: __m128) -> i32 ---
+ @(link_name="llvm.x86.sse.cvttss2si")
+ cvttss2si :: proc(a: __m128) -> i32 ---
+ @(link_name="llvm.x86.sse.cvtsi2ss")
+ cvtsi2ss :: proc(a: __m128, b: i32) -> __m128 ---
+ @(link_name="llvm.x86.sse.sfence")
+ sfence :: proc() ---
+ @(link_name="llvm.x86.sse.stmxcsr")
+ stmxcsr :: proc(p: rawptr) ---
+ @(link_name="llvm.x86.sse.ldmxcsr")
+ ldmxcsr :: proc(p: rawptr) ---
+ @(link_name="llvm.prefetch")
+ prefetch :: proc(p: rawptr, #const rw, loc, ty: u32) ---
+ @(link_name="llvm.x86.sse.cmp.ss")
+ cmpss :: proc(a, b: __m128, #const imm8: u8) -> __m128 ---
+
+
+ // amd64 only
+ @(link_name="llvm.x86.sse.cvtss2si64")
+ cvtss2si64 :: proc(a: __m128) -> i64 ---
+ @(link_name="llvm.x86.sse.cvttss2si64")
+ cvttss2si64 :: proc(a: __m128) -> i64 ---
+ @(link_name="llvm.x86.sse.cvtsi642ss")
+ cvtsi642ss :: proc(a: __m128, b: i64) -> __m128 ---
+}
diff --git a/core/simd/x86/sse2.odin b/core/simd/x86/sse2.odin
new file mode 100644
index 000000000..f33bd2195
--- /dev/null
+++ b/core/simd/x86/sse2.odin
@@ -0,0 +1,1191 @@
+//+build i386, amd64
+package simd_x86
+
+import "core:intrinsics"
+import "core:simd"
+
+@(enable_target_feature="sse2")
+_mm_pause :: #force_inline proc "c" () {
+ pause()
+}
+@(enable_target_feature="sse2")
+_mm_clflush :: #force_inline proc "c" (p: rawptr) {
+ clflush(p)
+}
+@(enable_target_feature="sse2")
+_mm_lfence :: #force_inline proc "c" () {
+ lfence()
+}
+@(enable_target_feature="sse2")
+_mm_mfence :: #force_inline proc "c" () {
+ mfence()
+}
+
+@(require_results, enable_target_feature="sse2")
+_mm_add_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.add(transmute(i8x16)a, transmute(i8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_add_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.add(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_add_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.add(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_add_epi64 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.add(transmute(i64x2)a, transmute(i64x2)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_adds_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.add_sat(transmute(i8x16)a, transmute(i8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_adds_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.add_sat(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_adds_epu8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.add_sat(transmute(u8x16)a, transmute(u8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_adds_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.add_sat(transmute(u16x8)a, transmute(u16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_avg_epu8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)pavgb(transmute(u8x16)a, transmute(u8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_avg_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)pavgw(transmute(u16x8)a, transmute(u16x8)b)
+}
+
+@(require_results, enable_target_feature="sse2")
+_mm_madd_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)pmaddwd(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_max_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)pmaxsw(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_max_epu8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)pmaxub(transmute(u8x16)a, transmute(u8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_min_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)pminsw(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_min_epu8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)pminub(transmute(u8x16)a, transmute(u8x16)b)
+}
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_mulhi_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)pmulhw(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_mulhi_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)pmulhuw(transmute(u16x8)a, transmute(u16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_mullo_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.mul(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_mul_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)pmuludq(transmute(u32x4)a, transmute(u32x4)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sad_epu8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)psadbw(transmute(u8x16)a, transmute(u8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sub_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.sub(transmute(i8x16)a, transmute(i8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sub_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.sub(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sub_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.sub(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sub_epi64 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.sub(transmute(i64x2)a, transmute(i64x2)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_subs_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.sub_sat(transmute(i8x16)a, transmute(i8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_subs_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.sub_sat(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_subs_epu8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.sub_sat(transmute(u8x16)a, transmute(u8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_subs_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.sub_sat(transmute(u16x8)a, transmute(u16x8)b)
+}
+
+
+
+@(private)
+@(require_results, enable_target_feature="sse2")
+_mm_slli_si128_impl :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+ shift :: IMM8 & 0xff
+
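+ // The constant byte shift is a shuffle over the zero vector concatenated
+ // with a: indices 0..=15 read zero bytes, 16..=31 read bytes of a, and a
+ // shift greater than 15 selects only zeros.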
+ return transmute(__m128i)simd.shuffle(
+ i8x16(0),
+ transmute(i8x16)a,
+ 0 when shift > 15 else (16 - shift + 0),
+ 1 when shift > 15 else (16 - shift + 1),
+ 2 when shift > 15 else (16 - shift + 2),
+ 3 when shift > 15 else (16 - shift + 3),
+ 4 when shift > 15 else (16 - shift + 4),
+ 5 when shift > 15 else (16 - shift + 5),
+ 6 when shift > 15 else (16 - shift + 6),
+ 7 when shift > 15 else (16 - shift + 7),
+ 8 when shift > 15 else (16 - shift + 8),
+ 9 when shift > 15 else (16 - shift + 9),
+ 10 when shift > 15 else (16 - shift + 10),
+ 11 when shift > 15 else (16 - shift + 11),
+ 12 when shift > 15 else (16 - shift + 12),
+ 13 when shift > 15 else (16 - shift + 13),
+ 14 when shift > 15 else (16 - shift + 14),
+ 15 when shift > 15 else (16 - shift + 15),
+ )
+}
+
+@(private)
+@(require_results, enable_target_feature="sse2")
+_mm_srli_si128_impl :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+ shift :: IMM8
+ return transmute(__m128i)simd.shuffle(
+ transmute(i8x16)a,
+ i8x16(0),
+ 0 + 16 when shift > 15 else (shift + 0),
+ 1 + 16 when shift > 15 else (shift + 1),
+ 2 + 16 when shift > 15 else (shift + 2),
+ 3 + 16 when shift > 15 else (shift + 3),
+ 4 + 16 when shift > 15 else (shift + 4),
+ 5 + 16 when shift > 15 else (shift + 5),
+ 6 + 16 when shift > 15 else (shift + 6),
+ 7 + 16 when shift > 15 else (shift + 7),
+ 8 + 16 when shift > 15 else (shift + 8),
+ 9 + 16 when shift > 15 else (shift + 9),
+ 10 + 16 when shift > 15 else (shift + 10),
+ 11 + 16 when shift > 15 else (shift + 11),
+ 12 + 16 when shift > 15 else (shift + 12),
+ 13 + 16 when shift > 15 else (shift + 13),
+ 14 + 16 when shift > 15 else (shift + 14),
+ 15 + 16 when shift > 15 else (shift + 15),
+ )
+}
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_slli_si128 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+ return _mm_slli_si128_impl(a, IMM8)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_bslli_si128 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+ return _mm_slli_si128_impl(a, IMM8)
+}
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_bsrli_si128 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+ return _mm_srli_si128_impl(a, IMM8)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_slli_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+ return transmute(__m128i)pslliw(transmute(i16x8)a, IMM8)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sll_epi16 :: #force_inline proc "c" (a, count: __m128i) -> __m128i {
+ return transmute(__m128i)psllw(transmute(i16x8)a, transmute(i16x8)count)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_slli_epi32 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+ return transmute(__m128i)psllid(transmute(i32x4)a, IMM8)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sll_epi32 :: #force_inline proc "c" (a, count: __m128i) -> __m128i {
+ return transmute(__m128i)pslld(transmute(i32x4)a, transmute(i32x4)count)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_slli_epi64 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+ return transmute(__m128i)pslliq(transmute(i64x2)a, IMM8)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sll_epi64 :: #force_inline proc "c" (a, count: __m128i) -> __m128i {
+ return transmute(__m128i)psllq(transmute(i64x2)a, transmute(i64x2)count)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_srai_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+ return transmute(__m128i)psraiw(transmute(i16x8)a, IMM8)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sra_epi16 :: #force_inline proc "c" (a, count: __m128i) -> __m128i {
+ return transmute(__m128i)psraw(transmute(i16x8)a, transmute(i16x8)count)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_srai_epi32 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+ return transmute(__m128i)psraid(transmute(i32x4)a, IMM8)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sra_epi32 :: #force_inline proc "c" (a, count: __m128i) -> __m128i {
+ return transmute(__m128i)psrad(transmute(i32x4)a, transmute(i32x4)count)
+}
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_srli_si128 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+ return _mm_srli_si128_impl(a, IMM8)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_srli_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+	return transmute(__m128i)psrliw(transmute(i16x8)a, IMM8)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_srl_epi16 :: #force_inline proc "c" (a, count: __m128i) -> __m128i {
+ return transmute(__m128i)psrlw(transmute(i16x8)a, transmute(i16x8)count)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_srli_epi32 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+ return transmute(__m128i)psrlid(transmute(i32x4)a, IMM8)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_srl_epi32 :: #force_inline proc "c" (a, count: __m128i) -> __m128i {
+ return transmute(__m128i)psrld(transmute(i32x4)a, transmute(i32x4)count)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_srli_epi64 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+ return transmute(__m128i)psrliq(transmute(i64x2)a, IMM8)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_srl_epi64 :: #force_inline proc "c" (a, count: __m128i) -> __m128i {
+ return transmute(__m128i)psrlq(transmute(i64x2)a, transmute(i64x2)count)
+}
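+
+// Two shift families: the `_mm_slli_*`/`_mm_srli_*`/`_mm_srai_*` procedures
+// above take a compile-time immediate and shift every lane by that constant,
+// while `_mm_sll_*`/`_mm_srl_*`/`_mm_sra_*` read the shift amount at runtime
+// from the low 64 bits of `count`. Illustrative equivalence:
+//
+//	x := _mm_slli_epi32(v, 3)
+//	y := _mm_sll_epi32(v, _mm_cvtsi32_si128(3)) // same result as x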
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_and_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return simd.and(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_andnot_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return simd.and_not(b, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_or_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return simd.or(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_xor_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return simd.xor(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpeq_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.lanes_eq(transmute(i8x16)a, transmute(i8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpeq_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.lanes_eq(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpeq_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.lanes_eq(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpgt_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.lanes_gt(transmute(i8x16)a, transmute(i8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpgt_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.lanes_gt(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpgt_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.lanes_gt(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmplt_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.lanes_lt(transmute(i8x16)a, transmute(i8x16)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmplt_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.lanes_lt(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmplt_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.lanes_lt(transmute(i32x4)a, transmute(i32x4)b)
+}
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_cvtepi32_pd :: #force_inline proc "c" (a: __m128i) -> __m128d {
+ v := transmute(i32x4)a
+ return cast(__m128d)simd.shuffle(v, v, 0, 1)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvtsi32_sd :: #force_inline proc "c" (a: __m128d, b: i32) -> __m128d {
+ return simd.replace(a, 0, f64(b))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvtepi32_ps :: #force_inline proc "c" (a: __m128i) -> __m128 {
+ return cvtdq2ps(transmute(i32x4)a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvtps_epi32 :: #force_inline proc "c" (a: __m128) -> __m128i {
+ return transmute(__m128i)cvtps2dq(a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvtsi32_si128 :: #force_inline proc "c" (a: i32) -> __m128i {
+ return transmute(__m128i)i32x4{a, 0, 0, 0}
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvtsi128_si32 :: #force_inline proc "c" (a: __m128i) -> i32 {
+ return simd.extract(transmute(i32x4)a, 0)
+}
+
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_set_epi64x :: #force_inline proc "c" (e1, e0: i64) -> __m128i {
+ return transmute(__m128i)i64x2{e0, e1}
+}
+@(require_results, enable_target_feature="sse2")
+_mm_set_epi32 :: #force_inline proc "c" (e3, e2, e1, e0: i32) -> __m128i {
+ return transmute(__m128i)i32x4{e0, e1, e2, e3}
+}
+@(require_results, enable_target_feature="sse2")
+_mm_set_epi16 :: #force_inline proc "c" (e7, e6, e5, e4, e3, e2, e1, e0: i16) -> __m128i {
+ return transmute(__m128i)i16x8{e0, e1, e2, e3, e4, e5, e6, e7}
+}
+@(require_results, enable_target_feature="sse2")
+_mm_set_epi8 :: #force_inline proc "c" (e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0: i8) -> __m128i {
+ return transmute(__m128i)i8x16{e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15}
+}
+@(require_results, enable_target_feature="sse2")
+_mm_set1_epi64x :: #force_inline proc "c" (a: i64) -> __m128i {
+ return _mm_set_epi64x(a, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_set1_epi32 :: #force_inline proc "c" (a: i32) -> __m128i {
+ return _mm_set_epi32(a, a, a, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_set1_epi16 :: #force_inline proc "c" (a: i16) -> __m128i {
+ return _mm_set_epi16(a, a, a, a, a, a, a, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_set1_epi8 :: #force_inline proc "c" (a: i8) -> __m128i {
+ return _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_setr_epi32 :: #force_inline proc "c" (e3, e2, e1, e0: i32) -> __m128i {
+ return _mm_set_epi32(e0, e1, e2, e3)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_setr_epi16 :: #force_inline proc "c" (e7, e6, e5, e4, e3, e2, e1, e0: i16) -> __m128i {
+ return _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_setr_epi8 :: #force_inline proc "c" (e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0: i8) -> __m128i {
+ return _mm_set_epi8(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_setzero_si128 :: #force_inline proc "c" () -> __m128i {
+ return _mm_set1_epi64x(0)
+}
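+
+// Argument order: `_mm_set_*` lists elements from the highest lane down to
+// lane 0, whereas `_mm_setr_*` lists them in lane (memory) order. For example,
+// both of these construct the vector whose lane 0 is 0:
+//
+//	v := _mm_set_epi32(3, 2, 1, 0)
+//	w := _mm_setr_epi32(0, 1, 2, 3) // same vector as v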
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_loadl_epi64 :: #force_inline proc "c" (mem_addr: ^__m128i) -> __m128i {
+ return _mm_set_epi64x(0, intrinsics.unaligned_load((^i64)(mem_addr)))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_load_si128 :: #force_inline proc "c" (mem_addr: ^__m128i) -> __m128i {
+ return mem_addr^
+}
+@(require_results, enable_target_feature="sse2")
+_mm_loadu_si128 :: #force_inline proc "c" (mem_addr: ^__m128i) -> __m128i {
+ dst := _mm_undefined_si128()
+ intrinsics.mem_copy_non_overlapping(&dst, mem_addr, size_of(__m128i))
+ return dst
+}
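+
+// `_mm_load_si128` dereferences its pointer directly and therefore requires a
+// 16-byte-aligned address, whereas `_mm_loadu_si128` copies the bytes and is
+// safe for any alignment; the same split applies to the store procedures below.
+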
+@(enable_target_feature="sse2")
+_mm_maskmoveu_si128 :: #force_inline proc "c" (a, mask: __m128i, mem_addr: rawptr) {
+ maskmovdqu(transmute(i8x16)a, transmute(i8x16)mask, mem_addr)
+}
+@(enable_target_feature="sse2")
+_mm_store_si128 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) {
+ mem_addr^ = a
+}
+@(enable_target_feature="sse2")
+_mm_storeu_si128 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) {
+ storeudq(mem_addr, a)
+}
+@(enable_target_feature="sse2")
+_mm_storel_epi64 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) {
+ a := a
+ intrinsics.mem_copy_non_overlapping(mem_addr, &a, 8)
+}
+@(enable_target_feature="sse2")
+_mm_stream_si128 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) {
+ intrinsics.non_temporal_store(mem_addr, a)
+}
+@(enable_target_feature="sse2")
+_mm_stream_si32 :: #force_inline proc "c" (mem_addr: ^i32, a: i32) {
+ intrinsics.non_temporal_store(mem_addr, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_move_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+ zero := _mm_setzero_si128()
+ return transmute(__m128i)simd.shuffle(transmute(i64x2)a, transmute(i64x2)zero, 0, 2)
+}
+
+
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_packs_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)packsswb(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_packs_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)packssdw(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_packus_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)packuswb(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_extract_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> i32 {
+ return i32(simd.extract(transmute(u16x8)a, IMM8))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_insert_epi16 :: #force_inline proc "c" (a: __m128i, i: i32, $IMM8: u32) -> __m128i {
+	return transmute(__m128i)simd.replace(transmute(i16x8)a, IMM8, i16(i))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_movemask_epi8 :: #force_inline proc "c" (a: __m128i) -> i32 {
+ return pmovmskb(transmute(i8x16)a)
+}
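+
+// `_mm_movemask_epi8` gathers the sign bit of each of the 16 bytes into the
+// low 16 bits of the result. A common idiom (illustrative; `haystack` and
+// `needle` are placeholders) for locating a byte:
+//
+//	eq   := _mm_cmpeq_epi8(haystack, _mm_set1_epi8(needle))
+//	mask := _mm_movemask_epi8(eq) // bit i set => byte i matched
+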
+@(require_results, enable_target_feature="sse2")
+_mm_shuffle_epi32 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+ v := transmute(i32x4)a
+ return transmute(__m128i)simd.shuffle(
+ v,
+ v,
+ IMM8 & 0b11,
+ (IMM8 >> 2) & 0b11,
+ (IMM8 >> 4) & 0b11,
+ (IMM8 >> 6) & 0b11,
+ )
+}
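+
+// IMM8 packs four 2-bit source-lane indices, lowest bits first: result lane i
+// comes from lane (IMM8 >> (2*i)) & 3 of `a`. For example, IMM8 = 0x1B
+// (0b00_01_10_11) reverses the four 32-bit lanes.
+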
+@(require_results, enable_target_feature="sse2")
+_mm_shufflehi_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+ v := transmute(i16x8)a
+ return transmute(__m128i)simd.shuffle(
+ v,
+ v,
+ 0,
+ 1,
+ 2,
+ 3,
+ (IMM8 & 0b11) + 4,
+ ((IMM8 >> 2) & 0b11) + 4,
+ ((IMM8 >> 4) & 0b11) + 4,
+ ((IMM8 >> 6) & 0b11) + 4,
+ )
+}
+@(require_results, enable_target_feature="sse2")
+_mm_shufflelo_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
+ v := transmute(i16x8)a
+ return transmute(__m128i)simd.shuffle(
+ v,
+ v,
+ IMM8 & 0b11,
+ (IMM8 >> 2) & 0b11,
+ (IMM8 >> 4) & 0b11,
+ (IMM8 >> 6) & 0b11,
+ 4,
+ 5,
+ 6,
+ 7,
+ )
+}
+@(require_results, enable_target_feature="sse2")
+_mm_unpackhi_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.shuffle(
+ transmute(i8x16)a,
+ transmute(i8x16)b,
+ 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
+ )
+}
+@(require_results, enable_target_feature="sse2")
+_mm_unpackhi_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.shuffle(transmute(i16x8)a, transmute(i16x8)b, 4, 12, 5, 13, 6, 14, 7, 15)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_unpackhi_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.shuffle(transmute(i32x4)a, transmute(i32x4)b, 2, 6, 3, 7)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_unpackhi_epi64 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.shuffle(transmute(i64x2)a, transmute(i64x2)b, 1, 3)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_unpacklo_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.shuffle(
+ transmute(i8x16)a,
+ transmute(i8x16)b,
+ 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
+ )
+}
+@(require_results, enable_target_feature="sse2")
+_mm_unpacklo_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.shuffle(transmute(i16x8)a, transmute(i16x8)b, 0, 8, 1, 9, 2, 10, 3, 11)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_unpacklo_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.shuffle(transmute(i32x4)a, transmute(i32x4)b, 0, 4, 1, 5)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_unpacklo_epi64 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.shuffle(transmute(i64x2)a, transmute(i64x2)b, 0, 2)
+}
+
+
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_add_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return simd.replace(a, 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_add_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return simd.add(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_div_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return simd.replace(a, 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_div_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return simd.div(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_max_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return maxsd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_max_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return maxpd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_min_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return minsd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_min_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return minpd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_mul_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return simd.replace(a, 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_mul_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return simd.mul(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sqrt_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return simd.replace(a, 0, _mm_cvtsd_f64(sqrtsd(b)))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sqrt_pd :: #force_inline proc "c" (a: __m128d) -> __m128d {
+ return simd.sqrt(a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sub_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return simd.replace(a, 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_sub_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return simd.sub(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_and_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return transmute(__m128d)_mm_and_si128(transmute(__m128i)a, transmute(__m128i)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_andnot_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return transmute(__m128d)_mm_andnot_si128(transmute(__m128i)a, transmute(__m128i)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_or_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return transmute(__m128d)_mm_or_si128(transmute(__m128i)a, transmute(__m128i)b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_xor_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return transmute(__m128d)_mm_xor_si128(transmute(__m128i)a, transmute(__m128i)b)
+}
+
+
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_cmpeq_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return cmpsd(a, b, 0)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmplt_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return cmpsd(a, b, 1)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmple_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return cmpsd(a, b, 2)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpgt_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return simd.replace(_mm_cmplt_sd(b, a), 1, simd.extract(a, 1))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpge_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return simd.replace(_mm_cmple_sd(b, a), 1, simd.extract(a, 1))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpord_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return cmpsd(a, b, 7)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpunord_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return cmpsd(a, b, 3)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpneq_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return cmpsd(a, b, 4)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpnlt_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return cmpsd(a, b, 5)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpnle_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return cmpsd(a, b, 6)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpngt_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return simd.replace(_mm_cmpnlt_sd(b, a), 1, simd.extract(a, 1))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpnge_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return simd.replace(_mm_cmpnle_sd(b, a), 1, simd.extract(a, 1))
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpeq_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return cmppd(a, b, 0)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmplt_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return cmppd(a, b, 1)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmple_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return cmppd(a, b, 2)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpgt_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return _mm_cmplt_pd(b, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpge_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return _mm_cmple_pd(b, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpord_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return cmppd(a, b, 7)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpunord_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return cmppd(a, b, 3)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpneq_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return cmppd(a, b, 4)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpnlt_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return cmppd(a, b, 5)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpnle_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return cmppd(a, b, 6)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpngt_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return _mm_cmpnlt_pd(b, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cmpnge_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return _mm_cmpnle_pd(b, a)
+}
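+
+// The cmpsd/cmppd immediate selects the predicate:
+// 0 = eq, 1 = lt, 2 = le, 3 = unord, 4 = neq, 5 = nlt, 6 = nle, 7 = ord.
+// The gt/ge (and ngt/nge) wrappers above are derived by swapping the operands
+// of the corresponding lt/le (nlt/nle) comparisons.
+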
+@(require_results, enable_target_feature="sse2")
+_mm_comieq_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+ return comieqsd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_comilt_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+ return comiltsd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_comile_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+ return comilesd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_comigt_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+ return comigtsd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_comige_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+ return comigesd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_comineq_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+ return comineqsd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_ucomieq_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+ return ucomieqsd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_ucomilt_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+ return ucomiltsd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_ucomile_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+ return ucomilesd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_ucomigt_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+ return ucomigtsd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_ucomige_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+ return ucomigesd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_ucomineq_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+ return ucomineqsd(a, b)
+}
+
+
+
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_cvtpd_ps :: #force_inline proc "c" (a: __m128d) -> __m128 {
+ return cvtpd2ps(a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvtps_pd :: #force_inline proc "c" (a: __m128) -> __m128d {
+ return cvtps2pd(a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvtpd_epi32 :: #force_inline proc "c" (a: __m128d) -> __m128i {
+ return transmute(__m128i)cvtpd2dq(a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvtsd_si32 :: #force_inline proc "c" (a: __m128d) -> i32 {
+ return cvtsd2si(a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvtsd_ss :: #force_inline proc "c" (a, b: __m128d) -> __m128 {
+ return cvtsd2ss(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvtsd_f64 :: #force_inline proc "c" (a: __m128d) -> f64 {
+ return simd.extract(a, 0)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvtss_sd :: #force_inline proc "c" (a, b: __m128) -> __m128d {
+ return cvtss2sd(a, b)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvttpd_epi32 :: #force_inline proc "c" (a: __m128d) -> __m128i {
+ return transmute(__m128i)cvttpd2dq(a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvttsd_si32 :: #force_inline proc "c" (a: __m128d) -> i32 {
+ return cvttsd2si(a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_cvttps_epi32 :: #force_inline proc "c" (a: __m128) -> __m128i {
+ return transmute(__m128i)cvttps2dq(a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_set_sd :: #force_inline proc "c" (a: f64) -> __m128d {
+ return _mm_set_pd(0.0, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_set1_pd :: #force_inline proc "c" (a: f64) -> __m128d {
+ return _mm_set_pd(a, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_set_pd1 :: #force_inline proc "c" (a: f64) -> __m128d {
+ return _mm_set_pd(a, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_set_pd :: #force_inline proc "c" (a: f64, b: f64) -> __m128d {
+ return __m128d{b, a}
+}
+@(require_results, enable_target_feature="sse2")
+_mm_setr_pd :: #force_inline proc "c" (a: f64, b: f64) -> __m128d {
+ return _mm_set_pd(b, a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_setzero_pd :: #force_inline proc "c" () -> __m128d {
+ return _mm_set_pd(0.0, 0.0)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_movemask_pd :: #force_inline proc "c" (a: __m128d) -> i32 {
+ return movmskpd(a)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_load_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d {
+ return (^__m128d)(mem_addr)^
+}
+@(require_results, enable_target_feature="sse2")
+_mm_load_sd :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d {
+ return _mm_setr_pd(mem_addr^, 0.)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_loadh_pd :: #force_inline proc "c" (a: __m128d, mem_addr: ^f64) -> __m128d {
+ return _mm_setr_pd(simd.extract(a, 0), mem_addr^)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_loadl_pd :: #force_inline proc "c" (a: __m128d, mem_addr: ^f64) -> __m128d {
+ return _mm_setr_pd(mem_addr^, simd.extract(a, 1))
+}
+@(enable_target_feature="sse2")
+_mm_stream_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) {
+ intrinsics.non_temporal_store((^__m128d)(mem_addr), a)
+}
+@(enable_target_feature="sse2")
+_mm_store_sd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) {
+ mem_addr^ = simd.extract(a, 0)
+}
+@(enable_target_feature="sse2")
+_mm_store_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) {
+ (^__m128d)(mem_addr)^ = a
+}
+@(enable_target_feature="sse2")
+_mm_storeu_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) {
+ storeupd(mem_addr, a)
+}
+@(enable_target_feature="sse2")
+_mm_store1_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) {
+ (^__m128d)(mem_addr)^ = simd.shuffle(a, a, 0, 0)
+}
+@(enable_target_feature="sse2")
+_mm_store_pd1 :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) {
+ (^__m128d)(mem_addr)^ = simd.shuffle(a, a, 0, 0)
+}
+@(enable_target_feature="sse2")
+_mm_storer_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) {
+ (^__m128d)(mem_addr)^ = simd.shuffle(a, a, 1, 0)
+}
+@(enable_target_feature="sse2")
+_mm_storeh_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) {
+ mem_addr^ = simd.extract(a, 1)
+}
+@(enable_target_feature="sse2")
+_mm_storel_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) {
+ mem_addr^ = simd.extract(a, 0)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_load1_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d {
+ d := mem_addr^
+ return _mm_setr_pd(d, d)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_load_pd1 :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d {
+ return _mm_load1_pd(mem_addr)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_loadr_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d {
+ a := _mm_load_pd(mem_addr)
+ return simd.shuffle(a, a, 1, 0)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_loadu_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d {
+ dst := _mm_undefined_pd()
+ intrinsics.mem_copy_non_overlapping(&dst, mem_addr, size_of(__m128d))
+ return dst
+}
+@(require_results, enable_target_feature="sse2")
+_mm_shuffle_pd :: #force_inline proc "c" (a, b: __m128d, $MASK: u32) -> __m128d {
+ return simd.shuffle(a, b, MASK&0b1, ((MASK>>1)&0b1) + 2)
+}
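+
+// In `_mm_shuffle_pd`, bit 0 of MASK picks the low result lane from `a` and
+// bit 1 picks the high result lane from `b`; e.g. MASK = 0b01 gives {a[1], b[0]}.
+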
+@(require_results, enable_target_feature="sse2")
+_mm_move_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return _mm_setr_pd(simd.extract(b, 0), simd.extract(a, 1))
+}
+
+
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_castpd_ps :: #force_inline proc "c" (a: __m128d) -> __m128 {
+ return transmute(__m128)a
+}
+@(require_results, enable_target_feature="sse2")
+_mm_castpd_si128 :: #force_inline proc "c" (a: __m128d) -> __m128i {
+ return transmute(__m128i)a
+}
+@(require_results, enable_target_feature="sse2")
+_mm_castps_pd :: #force_inline proc "c" (a: __m128) -> __m128d {
+ return transmute(__m128d)a
+}
+@(require_results, enable_target_feature="sse2")
+_mm_castps_si128 :: #force_inline proc "c" (a: __m128) -> __m128i {
+ return transmute(__m128i)a
+}
+@(require_results, enable_target_feature="sse2")
+_mm_castsi128_pd :: #force_inline proc "c" (a: __m128i) -> __m128d {
+ return transmute(__m128d)a
+}
+@(require_results, enable_target_feature="sse2")
+_mm_castsi128_ps :: #force_inline proc "c" (a: __m128i) -> __m128 {
+ return transmute(__m128)a
+}
+
+
+@(require_results, enable_target_feature="sse2")
+_mm_undefined_pd :: #force_inline proc "c" () -> __m128d {
+ return __m128d{0, 0}
+}
+@(require_results, enable_target_feature="sse2")
+_mm_undefined_si128 :: #force_inline proc "c" () -> __m128i {
+ return __m128i{0, 0}
+}
+@(require_results, enable_target_feature="sse2")
+_mm_unpackhi_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return simd.shuffle(a, b, 1, 3)
+}
+@(require_results, enable_target_feature="sse2")
+_mm_unpacklo_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return simd.shuffle(a, b, 0, 2)
+}
+
+
+when ODIN_ARCH == .amd64 {
+ @(require_results, enable_target_feature="sse2")
+ _mm_cvtsd_si64 :: #force_inline proc "c" (a: __m128d) -> i64 {
+ return cvtsd2si64(a)
+ }
+ @(require_results, enable_target_feature="sse2")
+ _mm_cvtsd_si64x :: #force_inline proc "c" (a: __m128d) -> i64 {
+ return _mm_cvtsd_si64(a)
+ }
+ @(require_results, enable_target_feature="sse2")
+ _mm_cvttsd_si64 :: #force_inline proc "c" (a: __m128d) -> i64 {
+ return cvttsd2si64(a)
+ }
+ @(require_results, enable_target_feature="sse2")
+ _mm_cvttsd_si64x :: #force_inline proc "c" (a: __m128d) -> i64 {
+ return _mm_cvttsd_si64(a)
+ }
+ @(enable_target_feature="sse2")
+ _mm_stream_si64 :: #force_inline proc "c" (mem_addr: ^i64, a: i64) {
+ intrinsics.non_temporal_store(mem_addr, a)
+ }
+ @(require_results, enable_target_feature="sse2")
+ _mm_cvtsi64_si128 :: #force_inline proc "c" (a: i64) -> __m128i {
+ return _mm_set_epi64x(0, a)
+ }
+ @(require_results, enable_target_feature="sse2")
+ _mm_cvtsi64x_si128 :: #force_inline proc "c" (a: i64) -> __m128i {
+ return _mm_cvtsi64_si128(a)
+ }
+ @(require_results, enable_target_feature="sse2")
+ _mm_cvtsi128_si64 :: #force_inline proc "c" (a: __m128i) -> i64 {
+ return simd.extract(transmute(i64x2)a, 0)
+ }
+ @(require_results, enable_target_feature="sse2")
+ _mm_cvtsi128_si64x :: #force_inline proc "c" (a: __m128i) -> i64 {
+ return _mm_cvtsi128_si64(a)
+ }
+ @(require_results, enable_target_feature="sse2")
+ _mm_cvtsi64_sd :: #force_inline proc "c" (a: __m128d, b: i64) -> __m128d {
+ return simd.replace(a, 0, f64(b))
+ }
+ @(require_results, enable_target_feature="sse2")
+ _mm_cvtsi64x_sd :: #force_inline proc "c" (a: __m128d, b: i64) -> __m128d {
+ return _mm_cvtsi64_sd(a, b)
+ }
+}
+
+
+@(private, default_calling_convention="c")
+foreign _ {
+ @(link_name="llvm.x86.sse2.pause")
+ pause :: proc() ---
+ @(link_name="llvm.x86.sse2.clflush")
+ clflush :: proc(p: rawptr) ---
+ @(link_name="llvm.x86.sse2.lfence")
+ lfence :: proc() ---
+ @(link_name="llvm.x86.sse2.mfence")
+ mfence :: proc() ---
+ @(link_name="llvm.x86.sse2.pavg.b")
+ pavgb :: proc(a, b: u8x16) -> u8x16 ---
+ @(link_name="llvm.x86.sse2.pavg.w")
+ pavgw :: proc(a, b: u16x8) -> u16x8 ---
+ @(link_name="llvm.x86.sse2.pmadd.wd")
+ pmaddwd :: proc(a, b: i16x8) -> i32x4 ---
+ @(link_name="llvm.x86.sse2.pmaxs.w")
+ pmaxsw :: proc(a, b: i16x8) -> i16x8 ---
+ @(link_name="llvm.x86.sse2.pmaxu.b")
+ pmaxub :: proc(a, b: u8x16) -> u8x16 ---
+ @(link_name="llvm.x86.sse2.pmins.w")
+ pminsw :: proc(a, b: i16x8) -> i16x8 ---
+ @(link_name="llvm.x86.sse2.pminu.b")
+ pminub :: proc(a, b: u8x16) -> u8x16 ---
+ @(link_name="llvm.x86.sse2.pmulh.w")
+ pmulhw :: proc(a, b: i16x8) -> i16x8 ---
+ @(link_name="llvm.x86.sse2.pmulhu.w")
+ pmulhuw :: proc(a, b: u16x8) -> u16x8 ---
+ @(link_name="llvm.x86.sse2.pmulu.dq")
+ pmuludq :: proc(a, b: u32x4) -> u64x2 ---
+ @(link_name="llvm.x86.sse2.psad.bw")
+ psadbw :: proc(a, b: u8x16) -> u64x2 ---
+ @(link_name="llvm.x86.sse2.pslli.w")
+ pslliw :: proc(a: i16x8, #const imm8: u32) -> i16x8 ---
+ @(link_name="llvm.x86.sse2.psll.w")
+ psllw :: proc(a: i16x8, count: i16x8) -> i16x8 ---
+ @(link_name="llvm.x86.sse2.pslli.d")
+ psllid :: proc(a: i32x4, #const imm8: u32) -> i32x4 ---
+ @(link_name="llvm.x86.sse2.psll.d")
+ pslld :: proc(a: i32x4, count: i32x4) -> i32x4 ---
+ @(link_name="llvm.x86.sse2.pslli.q")
+ pslliq :: proc(a: i64x2, #const imm8: u32) -> i64x2 ---
+ @(link_name="llvm.x86.sse2.psll.q")
+ psllq :: proc(a: i64x2, count: i64x2) -> i64x2 ---
+ @(link_name="llvm.x86.sse2.psrai.w")
+ psraiw :: proc(a: i16x8, #const imm8: u32) -> i16x8 ---
+ @(link_name="llvm.x86.sse2.psra.w")
+ psraw :: proc(a: i16x8, count: i16x8) -> i16x8 ---
+ @(link_name="llvm.x86.sse2.psrai.d")
+ psraid :: proc(a: i32x4, #const imm8: u32) -> i32x4 ---
+ @(link_name="llvm.x86.sse2.psra.d")
+ psrad :: proc(a: i32x4, count: i32x4) -> i32x4 ---
+ @(link_name="llvm.x86.sse2.psrli.w")
+ psrliw :: proc(a: i16x8, #const imm8: u32) -> i16x8 ---
+ @(link_name="llvm.x86.sse2.psrl.w")
+ psrlw :: proc(a: i16x8, count: i16x8) -> i16x8 ---
+ @(link_name="llvm.x86.sse2.psrli.d")
+ psrlid :: proc(a: i32x4, #const imm8: u32) -> i32x4 ---
+ @(link_name="llvm.x86.sse2.psrl.d")
+ psrld :: proc(a: i32x4, count: i32x4) -> i32x4 ---
+ @(link_name="llvm.x86.sse2.psrli.q")
+ psrliq :: proc(a: i64x2, #const imm8: u32) -> i64x2 ---
+ @(link_name="llvm.x86.sse2.psrl.q")
+ psrlq :: proc(a: i64x2, count: i64x2) -> i64x2 ---
+ @(link_name="llvm.x86.sse2.cvtdq2ps")
+ cvtdq2ps :: proc(a: i32x4) -> __m128 ---
+ @(link_name="llvm.x86.sse2.cvtps2dq")
+ cvtps2dq :: proc(a: __m128) -> i32x4 ---
+ @(link_name="llvm.x86.sse2.maskmov.dqu")
+ maskmovdqu :: proc(a: i8x16, mask: i8x16, mem_addr: rawptr) ---
+ @(link_name="llvm.x86.sse2.packsswb.128")
+ packsswb :: proc(a, b: i16x8) -> i8x16 ---
+ @(link_name="llvm.x86.sse2.packssdw.128")
+ packssdw :: proc(a, b: i32x4) -> i16x8 ---
+ @(link_name="llvm.x86.sse2.packuswb.128")
+ packuswb :: proc(a, b: i16x8) -> u8x16 ---
+ @(link_name="llvm.x86.sse2.pmovmskb.128")
+ pmovmskb :: proc(a: i8x16) -> i32 ---
+ @(link_name="llvm.x86.sse2.max.sd")
+ maxsd :: proc(a, b: __m128d) -> __m128d ---
+ @(link_name="llvm.x86.sse2.max.pd")
+ maxpd :: proc(a, b: __m128d) -> __m128d ---
+ @(link_name="llvm.x86.sse2.min.sd")
+ minsd :: proc(a, b: __m128d) -> __m128d ---
+ @(link_name="llvm.x86.sse2.min.pd")
+ minpd :: proc(a, b: __m128d) -> __m128d ---
+ @(link_name="llvm.x86.sse2.sqrt.sd")
+ sqrtsd :: proc(a: __m128d) -> __m128d ---
+ @(link_name="llvm.x86.sse2.sqrt.pd")
+ sqrtpd :: proc(a: __m128d) -> __m128d ---
+ @(link_name="llvm.x86.sse2.cmp.sd")
+ cmpsd :: proc(a, b: __m128d, imm8: i8) -> __m128d ---
+ @(link_name="llvm.x86.sse2.cmp.pd")
+ cmppd :: proc(a, b: __m128d, imm8: i8) -> __m128d ---
+ @(link_name="llvm.x86.sse2.comieq.sd")
+ comieqsd :: proc(a, b: __m128d) -> i32 ---
+ @(link_name="llvm.x86.sse2.comilt.sd")
+ comiltsd :: proc(a, b: __m128d) -> i32 ---
+ @(link_name="llvm.x86.sse2.comile.sd")
+ comilesd :: proc(a, b: __m128d) -> i32 ---
+ @(link_name="llvm.x86.sse2.comigt.sd")
+ comigtsd :: proc(a, b: __m128d) -> i32 ---
+ @(link_name="llvm.x86.sse2.comige.sd")
+ comigesd :: proc(a, b: __m128d) -> i32 ---
+ @(link_name="llvm.x86.sse2.comineq.sd")
+ comineqsd :: proc(a, b: __m128d) -> i32 ---
+ @(link_name="llvm.x86.sse2.ucomieq.sd")
+ ucomieqsd :: proc(a, b: __m128d) -> i32 ---
+ @(link_name="llvm.x86.sse2.ucomilt.sd")
+ ucomiltsd :: proc(a, b: __m128d) -> i32 ---
+ @(link_name="llvm.x86.sse2.ucomile.sd")
+ ucomilesd :: proc(a, b: __m128d) -> i32 ---
+ @(link_name="llvm.x86.sse2.ucomigt.sd")
+ ucomigtsd :: proc(a, b: __m128d) -> i32 ---
+ @(link_name="llvm.x86.sse2.ucomige.sd")
+ ucomigesd :: proc(a, b: __m128d) -> i32 ---
+ @(link_name="llvm.x86.sse2.ucomineq.sd")
+ ucomineqsd :: proc(a, b: __m128d) -> i32 ---
+ @(link_name="llvm.x86.sse2.movmsk.pd")
+ movmskpd :: proc(a: __m128d) -> i32 ---
+ @(link_name="llvm.x86.sse2.cvtpd2ps")
+ cvtpd2ps :: proc(a: __m128d) -> __m128 ---
+ @(link_name="llvm.x86.sse2.cvtps2pd")
+ cvtps2pd :: proc(a: __m128) -> __m128d ---
+ @(link_name="llvm.x86.sse2.cvtpd2dq")
+ cvtpd2dq :: proc(a: __m128d) -> i32x4 ---
+ @(link_name="llvm.x86.sse2.cvtsd2si")
+ cvtsd2si :: proc(a: __m128d) -> i32 ---
+ @(link_name="llvm.x86.sse2.cvtsd2ss")
+ cvtsd2ss :: proc(a, b: __m128d) -> __m128 ---
+ @(link_name="llvm.x86.sse2.cvtss2sd")
+ cvtss2sd :: proc(a, b: __m128) -> __m128d ---
+ @(link_name="llvm.x86.sse2.cvttpd2dq")
+ cvttpd2dq :: proc(a: __m128d) -> i32x4 ---
+ @(link_name="llvm.x86.sse2.cvttsd2si")
+ cvttsd2si :: proc(a: __m128d) -> i32 ---
+ @(link_name="llvm.x86.sse2.cvttps2dq")
+ cvttps2dq :: proc(a: __m128) -> i32x4 ---
+ @(link_name="llvm.x86.sse2.storeu.dq")
+ storeudq :: proc(mem_addr: rawptr, a: __m128i) ---
+ @(link_name="llvm.x86.sse2.storeu.pd")
+ storeupd :: proc(mem_addr: rawptr, a: __m128d) ---
+
+ // amd64 only
+ @(link_name="llvm.x86.sse2.cvtsd2si64")
+ cvtsd2si64 :: proc(a: __m128d) -> i64 ---
+ @(link_name="llvm.x86.sse2.cvttsd2si64")
+ cvttsd2si64 :: proc(a: __m128d) -> i64 ---
+}
diff --git a/core/simd/x86/sse3.odin b/core/simd/x86/sse3.odin
new file mode 100644
index 000000000..7a3073c18
--- /dev/null
+++ b/core/simd/x86/sse3.odin
@@ -0,0 +1,68 @@
+//+build i386, amd64
+package simd_x86
+
+import "core:simd"
+
+@(require_results, enable_target_feature="sse3")
+_mm_addsub_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return addsubps(a, b)
+}
+@(require_results, enable_target_feature="sse3")
+_mm_addsub_pd :: #force_inline proc "c" (a: __m128d, b: __m128d) -> __m128d {
+ return addsubpd(a, b)
+}
+@(require_results, enable_target_feature="sse3")
+_mm_hadd_pd :: #force_inline proc "c" (a: __m128d, b: __m128d) -> __m128d {
+ return haddpd(a, b)
+}
+@(require_results, enable_target_feature="sse3")
+_mm_hadd_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return haddps(a, b)
+}
+@(require_results, enable_target_feature="sse3")
+_mm_hsub_pd :: #force_inline proc "c" (a: __m128d, b: __m128d) -> __m128d {
+ return hsubpd(a, b)
+}
+@(require_results, enable_target_feature="sse3")
+_mm_hsub_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return hsubps(a, b)
+}
+@(require_results, enable_target_feature="sse3")
+_mm_lddqu_si128 :: #force_inline proc "c" (mem_addr: ^__m128i) -> __m128i {
+ return transmute(__m128i)lddqu(mem_addr)
+}
+@(require_results, enable_target_feature="sse3")
+_mm_movedup_pd :: #force_inline proc "c" (a: __m128d) -> __m128d {
+ return simd.shuffle(a, a, 0, 0)
+}
+@(require_results, enable_target_feature="sse3")
+_mm_loaddup_pd :: #force_inline proc "c" (mem_addr: [^]f64) -> __m128d {
+ return _mm_load1_pd(mem_addr)
+}
+@(require_results, enable_target_feature="sse3")
+_mm_movehdup_ps :: #force_inline proc "c" (a: __m128) -> __m128 {
+ return simd.shuffle(a, a, 1, 1, 3, 3)
+}
+@(require_results, enable_target_feature="sse3")
+_mm_moveldup_ps :: #force_inline proc "c" (a: __m128) -> __m128 {
+ return simd.shuffle(a, a, 0, 0, 2, 2)
+}
+
+@(private, default_calling_convention="c")
+foreign _ {
+ @(link_name = "llvm.x86.sse3.addsub.ps")
+ addsubps :: proc(a, b: __m128) -> __m128 ---
+ @(link_name = "llvm.x86.sse3.addsub.pd")
+ addsubpd :: proc(a: __m128d, b: __m128d) -> __m128d ---
+ @(link_name = "llvm.x86.sse3.hadd.pd")
+ haddpd :: proc(a: __m128d, b: __m128d) -> __m128d ---
+ @(link_name = "llvm.x86.sse3.hadd.ps")
+ haddps :: proc(a, b: __m128) -> __m128 ---
+ @(link_name = "llvm.x86.sse3.hsub.pd")
+ hsubpd :: proc(a: __m128d, b: __m128d) -> __m128d ---
+ @(link_name = "llvm.x86.sse3.hsub.ps")
+ hsubps :: proc(a, b: __m128) -> __m128 ---
+ @(link_name = "llvm.x86.sse3.ldu.dq")
+ lddqu :: proc(mem_addr: rawptr) -> i8x16 ---
+} \ No newline at end of file
diff --git a/core/simd/x86/sse41.odin b/core/simd/x86/sse41.odin
new file mode 100644
index 000000000..b35be33f2
--- /dev/null
+++ b/core/simd/x86/sse41.odin
@@ -0,0 +1,352 @@
+//+build i386, amd64
+package simd_x86
+
+import "core:simd"
+
+// SSE4 rounding constants
+_MM_FROUND_TO_NEAREST_INT :: 0x00
+_MM_FROUND_TO_NEG_INF :: 0x01
+_MM_FROUND_TO_POS_INF :: 0x02
+_MM_FROUND_TO_ZERO :: 0x03
+_MM_FROUND_CUR_DIRECTION :: 0x04
+_MM_FROUND_RAISE_EXC :: 0x00
+_MM_FROUND_NO_EXC :: 0x08
+_MM_FROUND_NINT :: 0x00
+_MM_FROUND_FLOOR :: _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF
+_MM_FROUND_CEIL :: _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF
+_MM_FROUND_TRUNC :: _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO
+_MM_FROUND_RINT :: _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION
+_MM_FROUND_NEARBYINT :: _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION
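+
+// The rounding immediates compose an exception-control flag with a rounding
+// mode for the `_mm_round_*` procedures below. Illustrative use, rounding to
+// nearest without raising precision exceptions:
+//
+//	r := _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)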
+
+
+
+@(require_results, enable_target_feature="sse4.1")
+_mm_blendv_epi8 :: #force_inline proc "c" (a, b, mask: __m128i) -> __m128i {
+ return transmute(__m128i)pblendvb(transmute(i8x16)a, transmute(i8x16)b, transmute(i8x16)mask)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_blend_epi16 :: #force_inline proc "c" (a, b: __m128i, $IMM8: u8) -> __m128i {
+ return transmute(__m128i)pblendw(transmute(i16x8)a, transmute(i16x8)b, IMM8)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_blendv_pd :: #force_inline proc "c" (a, b, mask: __m128d) -> __m128d {
+ return blendvpd(a, b, mask)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_blendv_ps :: #force_inline proc "c" (a, b, mask: __m128) -> __m128 {
+ return blendvps(a, b, mask)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_blend_pd :: #force_inline proc "c" (a, b: __m128d, $IMM2: u8) -> __m128d {
+ return blendpd(a, b, IMM2)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_blend_ps :: #force_inline proc "c" (a, b: __m128, $IMM4: u8) -> __m128 {
+ return blendps(a, b, IMM4)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_extract_ps :: #force_inline proc "c" (a: __m128, $IMM8: u32) -> i32 {
+ return transmute(i32)simd.extract(a, IMM8)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_extract_epi8 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> i32 {
+ return i32(simd.extract(transmute(u8x16)a, IMM8))
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_extract_epi32 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> i32 {
+ return simd.extract(transmute(i32x4)a, IMM8)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_insert_ps :: #force_inline proc "c" (a, b: __m128, $IMM8: u8) -> __m128 {
+ return insertps(a, b, IMM8)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_insert_epi8 :: #force_inline proc "c" (a: __m128i, i: i32, $IMM8: u32) -> __m128i {
+ return transmute(__m128i)simd.replace(transmute(i8x16)a, IMM8, i8(i))
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_insert_epi32 :: #force_inline proc "c" (a: __m128i, i: i32, $IMM8: u32) -> __m128i {
+ return transmute(__m128i)simd.replace(transmute(i32x4)a, IMM8, i)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_max_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)pmaxsb(transmute(i8x16)a, transmute(i8x16)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_max_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)pmaxuw(transmute(u16x8)a, transmute(u16x8)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_max_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)pmaxsd(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_max_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)pmaxud(transmute(u32x4)a, transmute(u32x4)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_min_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)pminsb(transmute(i8x16)a, transmute(i8x16)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_min_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)pminuw(transmute(u16x8)a, transmute(u16x8)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_min_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)pminsd(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_min_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)pminud(transmute(u32x4)a, transmute(u32x4)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_packus_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)packusdw(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cmpeq_epi64 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.lanes_eq(transmute(i64x2)a, transmute(i64x2)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepi8_epi16 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+ x := transmute(i8x16)a
+ y := simd.shuffle(x, x, 0, 1, 2, 3, 4, 5, 6, 7)
+ return transmute(__m128i)i16x8(y)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepi8_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+ x := transmute(i8x16)a
+ y := simd.shuffle(x, x, 0, 1, 2, 3)
+ return transmute(__m128i)i32x4(y)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepi8_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+ x := transmute(i8x16)a
+ y := simd.shuffle(x, x, 0, 1)
+ return transmute(__m128i)i64x2(y)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepi16_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+ x := transmute(i16x8)a
+ y := simd.shuffle(x, x, 0, 1, 2, 3)
+ return transmute(__m128i)i32x4(y)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepi16_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+ x := transmute(i16x8)a
+ y := simd.shuffle(x, x, 0, 1)
+ return transmute(__m128i)i64x2(y)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepi32_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+ x := transmute(i32x4)a
+ y := simd.shuffle(x, x, 0, 1)
+ return transmute(__m128i)i64x2(y)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepu8_epi16 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+ x := transmute(u8x16)a
+ y := simd.shuffle(x, x, 0, 1, 2, 3, 4, 5, 6, 7)
+ return transmute(__m128i)i16x8(y)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepu8_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+ x := transmute(u8x16)a
+ y := simd.shuffle(x, x, 0, 1, 2, 3)
+ return transmute(__m128i)i32x4(y)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepu8_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+ x := transmute(u8x16)a
+ y := simd.shuffle(x, x, 0, 1)
+ return transmute(__m128i)i64x2(y)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepu16_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+ x := transmute(u16x8)a
+ y := simd.shuffle(x, x, 0, 1, 2, 3)
+ return transmute(__m128i)i32x4(y)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepu16_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+ x := transmute(u16x8)a
+ y := simd.shuffle(x, x, 0, 1)
+ return transmute(__m128i)i64x2(y)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_cvtepu32_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+ x := transmute(u32x4)a
+ y := simd.shuffle(x, x, 0, 1)
+ return transmute(__m128i)i64x2(y)
+}
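+
+// Each widening conversion above is a shuffle keeping the low lanes followed
+// by an element-wise cast: the `epi` variants start from a signed vector and
+// so sign-extend, while the `epu` variants start unsigned and zero-extend.
+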
+@(require_results, enable_target_feature="sse4.1")
+_mm_dp_pd :: #force_inline proc "c" (a, b: __m128d, $IMM8: u8) -> __m128d {
+ return dppd(a, b, IMM8)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_dp_ps :: #force_inline proc "c" (a, b: __m128, $IMM8: u8) -> __m128 {
+ return dpps(a, b, IMM8)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_floor_pd :: #force_inline proc "c" (a: __m128d) -> __m128d {
+ return simd.floor(a)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_floor_ps :: #force_inline proc "c" (a: __m128) -> __m128 {
+ return simd.floor(a)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_floor_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return roundsd(a, b, _MM_FROUND_FLOOR)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_floor_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return roundss(a, b, _MM_FROUND_FLOOR)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_ceil_pd :: #force_inline proc "c" (a: __m128d) -> __m128d {
+ return simd.ceil(a)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_ceil_ps :: #force_inline proc "c" (a: __m128) -> __m128 {
+ return simd.ceil(a)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_ceil_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
+ return roundsd(a, b, _MM_FROUND_CEIL)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_ceil_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
+ return roundss(a, b, _MM_FROUND_CEIL)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_round_pd :: #force_inline proc "c" (a: __m128d, $ROUNDING: i32) -> __m128d {
+ return roundpd(a, ROUNDING)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_round_ps :: #force_inline proc "c" (a: __m128, $ROUNDING: i32) -> __m128 {
+ return roundps(a, ROUNDING)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_round_sd :: #force_inline proc "c" (a, b: __m128d, $ROUNDING: i32) -> __m128d {
+ return roundsd(a, b, ROUNDING)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_round_ss :: #force_inline proc "c" (a, b: __m128, $ROUNDING: i32) -> __m128 {
+ return roundss(a, b, ROUNDING)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_minpos_epu16 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+ return transmute(__m128i)phminposuw(transmute(u16x8)a)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_mul_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)pmuldq(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_mullo_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.mul(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_mpsadbw_epu8 :: #force_inline proc "c" (a, b: __m128i, $IMM8: u8) -> __m128i {
+ return transmute(__m128i)mpsadbw(transmute(u8x16)a, transmute(u8x16)b, IMM8)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_testz_si128 :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 {
+ return ptestz(transmute(i64x2)a, transmute(i64x2)mask)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_testc_si128 :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 {
+ return ptestc(transmute(i64x2)a, transmute(i64x2)mask)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_testnzc_si128 :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 {
+ return ptestnzc(transmute(i64x2)a, transmute(i64x2)mask)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_test_all_zeros :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 {
+ return _mm_testz_si128(a, mask)
+}
+@(require_results, enable_target_feature="sse4.1")
+_mm_test_all_ones :: #force_inline proc "c" (a: __m128i) -> i32 {
+ return _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
+}
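+
+// ptestz returns 1 when (a & mask) is all zeros and ptestc returns 1 when
+// (~a & mask) is all zeros; `_mm_test_all_ones` uses the latter with an
+// all-ones mask, so it is 1 exactly when every bit of `a` is set.
+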
+@(require_results, enable_target_feature="sse4.1")
+_mm_test_mix_ones_zeros :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 {
+ return _mm_testnzc_si128(a, mask)
+}
+
+
+when ODIN_ARCH == .amd64 {
+ @(require_results, enable_target_feature="sse4.1")
+ _mm_extract_epi64 :: #force_inline proc "c" (a: __m128i, $IMM1: u32) -> i64 {
+ return simd.extract(transmute(i64x2)a, IMM1)
+ }
+
+ @(require_results, enable_target_feature="sse4.1")
+ _mm_insert_epi64 :: #force_inline proc "c" (a: __m128i, i: i64, $IMM1: u32) -> __m128i {
+ return transmute(__m128i)simd.replace(transmute(i64x2)a, IMM1, i)
+ }
+}
+
+
+@(private, default_calling_convention="c")
+foreign _ {
+ @(link_name = "llvm.x86.sse41.pblendvb")
+ pblendvb :: proc(a, b: i8x16, mask: i8x16) -> i8x16 ---
+ @(link_name = "llvm.x86.sse41.blendvpd")
+ blendvpd :: proc(a, b, mask: __m128d) -> __m128d ---
+ @(link_name = "llvm.x86.sse41.blendvps")
+ blendvps :: proc(a, b, mask: __m128) -> __m128 ---
+ @(link_name = "llvm.x86.sse41.blendpd")
+ blendpd :: proc(a, b: __m128d, #const imm2: u8) -> __m128d ---
+ @(link_name = "llvm.x86.sse41.blendps")
+ blendps :: proc(a, b: __m128, #const imm4: u8) -> __m128 ---
+ @(link_name = "llvm.x86.sse41.pblendw")
+ pblendw :: proc(a: i16x8, b: i16x8, #const imm8: u8) -> i16x8 ---
+ @(link_name = "llvm.x86.sse41.insertps")
+ insertps :: proc(a, b: __m128, #const imm8: u8) -> __m128 ---
+ @(link_name = "llvm.x86.sse41.pmaxsb")
+ pmaxsb :: proc(a, b: i8x16) -> i8x16 ---
+ @(link_name = "llvm.x86.sse41.pmaxuw")
+ pmaxuw :: proc(a, b: u16x8) -> u16x8 ---
+ @(link_name = "llvm.x86.sse41.pmaxsd")
+ pmaxsd :: proc(a, b: i32x4) -> i32x4 ---
+ @(link_name = "llvm.x86.sse41.pmaxud")
+ pmaxud :: proc(a, b: u32x4) -> u32x4 ---
+ @(link_name = "llvm.x86.sse41.pminsb")
+ pminsb :: proc(a, b: i8x16) -> i8x16 ---
+ @(link_name = "llvm.x86.sse41.pminuw")
+ pminuw :: proc(a, b: u16x8) -> u16x8 ---
+ @(link_name = "llvm.x86.sse41.pminsd")
+ pminsd :: proc(a, b: i32x4) -> i32x4 ---
+ @(link_name = "llvm.x86.sse41.pminud")
+ pminud :: proc(a, b: u32x4) -> u32x4 ---
+ @(link_name = "llvm.x86.sse41.packusdw")
+ packusdw :: proc(a, b: i32x4) -> u16x8 ---
+ @(link_name = "llvm.x86.sse41.dppd")
+ dppd :: proc(a, b: __m128d, #const imm8: u8) -> __m128d ---
+ @(link_name = "llvm.x86.sse41.dpps")
+ dpps :: proc(a, b: __m128, #const imm8: u8) -> __m128 ---
+ @(link_name = "llvm.x86.sse41.round.pd")
+ roundpd :: proc(a: __m128d, rounding: i32) -> __m128d ---
+ @(link_name = "llvm.x86.sse41.round.ps")
+ roundps :: proc(a: __m128, rounding: i32) -> __m128 ---
+ @(link_name = "llvm.x86.sse41.round.sd")
+ roundsd :: proc(a, b: __m128d, rounding: i32) -> __m128d ---
+ @(link_name = "llvm.x86.sse41.round.ss")
+ roundss :: proc(a, b: __m128, rounding: i32) -> __m128 ---
+ @(link_name = "llvm.x86.sse41.phminposuw")
+ phminposuw :: proc(a: u16x8) -> u16x8 ---
+ @(link_name = "llvm.x86.sse41.pmuldq")
+ pmuldq :: proc(a, b: i32x4) -> i64x2 ---
+ @(link_name = "llvm.x86.sse41.mpsadbw")
+ mpsadbw :: proc(a, b: u8x16, #const imm8: u8) -> u16x8 ---
+ @(link_name = "llvm.x86.sse41.ptestz")
+ ptestz :: proc(a, mask: i64x2) -> i32 ---
+ @(link_name = "llvm.x86.sse41.ptestc")
+ ptestc :: proc(a, mask: i64x2) -> i32 ---
+ @(link_name = "llvm.x86.sse41.ptestnzc")
+ ptestnzc :: proc(a, mask: i64x2) -> i32 ---
+} \ No newline at end of file
diff --git a/core/simd/x86/sse42.odin b/core/simd/x86/sse42.odin
new file mode 100644
index 000000000..62b4f0478
--- /dev/null
+++ b/core/simd/x86/sse42.odin
@@ -0,0 +1,149 @@
+//+build i386, amd64
+package simd_x86
+
+import "core:simd"
+
+_SIDD_UBYTE_OPS :: 0b0000_0000
+_SIDD_UWORD_OPS :: 0b0000_0001
+_SIDD_SBYTE_OPS :: 0b0000_0010
+_SIDD_SWORD_OPS :: 0b0000_0011
+
+_SIDD_CMP_EQUAL_ANY :: 0b0000_0000
+_SIDD_CMP_RANGES :: 0b0000_0100
+_SIDD_CMP_EQUAL_EACH :: 0b0000_1000
+_SIDD_CMP_EQUAL_ORDERED :: 0b0000_1100
+
+_SIDD_POSITIVE_POLARITY :: 0b0000_0000
+_SIDD_NEGATIVE_POLARITY :: 0b0001_0000
+_SIDD_MASKED_POSITIVE_POLARITY :: 0b0010_0000
+_SIDD_MASKED_NEGATIVE_POLARITY :: 0b0011_0000
+
+_SIDD_LEAST_SIGNIFICANT :: 0b0000_0000
+_SIDD_MOST_SIGNIFICANT :: 0b0100_0000
+
+_SIDD_BIT_MASK :: 0b0000_0000
+_SIDD_UNIT_MASK :: 0b0100_0000
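+
+// These flags OR together into the IMM8 control of the string comparisons
+// below. Illustrative: index of the first byte where `a` and `b` differ:
+//
+//	i := _mm_cmpistri(a, b, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY)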
+
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpistrm :: #force_inline proc "c" (a: __m128i, b: __m128i, $IMM8: i8) -> __m128i {
+ return transmute(__m128i)pcmpistrm128(transmute(i8x16)a, transmute(i8x16)b, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpistri :: #force_inline proc "c" (a: __m128i, b: __m128i, $IMM8: i8) -> i32 {
+ return pcmpistri128(transmute(i8x16)a, transmute(i8x16)b, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpistrz :: #force_inline proc "c" (a: __m128i, b: __m128i, $IMM8: i8) -> i32 {
+ return pcmpistriz128(transmute(i8x16)a, transmute(i8x16)b, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpistrc :: #force_inline proc "c" (a: __m128i, b: __m128i, $IMM8: i8) -> i32 {
+ return pcmpistric128(transmute(i8x16)a, transmute(i8x16)b, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpistrs :: #force_inline proc "c" (a: __m128i, b: __m128i, $IMM8: i8) -> i32 {
+ return pcmpistris128(transmute(i8x16)a, transmute(i8x16)b, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpistro :: #force_inline proc "c" (a: __m128i, b: __m128i, $IMM8: i8) -> i32 {
+ return pcmpistrio128(transmute(i8x16)a, transmute(i8x16)b, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpistra :: #force_inline proc "c" (a: __m128i, b: __m128i, $IMM8: i8) -> i32 {
+ return pcmpistria128(transmute(i8x16)a, transmute(i8x16)b, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpestrm :: #force_inline proc "c" (a: __m128i, la: i32, b: __m128i, lb: i32, $IMM8: i8) -> __m128i {
+ return transmute(__m128i)pcmpestrm128(transmute(i8x16)a, la, transmute(i8x16)b, lb, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpestri :: #force_inline proc "c" (a: __m128i, la: i32, b: __m128i, lb: i32, $IMM8: i8) -> i32 {
+ return pcmpestri128(transmute(i8x16)a, la, transmute(i8x16)b, lb, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpestrz :: #force_inline proc "c" (a: __m128i, la: i32, b: __m128i, lb: i32, $IMM8: i8) -> i32 {
+ return pcmpestriz128(transmute(i8x16)a, la, transmute(i8x16)b, lb, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpestrc :: #force_inline proc "c" (a: __m128i, la: i32, b: __m128i, lb: i32, $IMM8: i8) -> i32 {
+ return pcmpestric128(transmute(i8x16)a, la, transmute(i8x16)b, lb, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpestrs :: #force_inline proc "c" (a: __m128i, la: i32, b: __m128i, lb: i32, $IMM8: i8) -> i32 {
+ return pcmpestris128(transmute(i8x16)a, la, transmute(i8x16)b, lb, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpestro :: #force_inline proc "c" (a: __m128i, la: i32, b: __m128i, lb: i32, $IMM8: i8) -> i32 {
+ return pcmpestrio128(transmute(i8x16)a, la, transmute(i8x16)b, lb, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpestra :: #force_inline proc "c" (a: __m128i, la: i32, b: __m128i, lb: i32, $IMM8: i8) -> i32 {
+ return pcmpestria128(transmute(i8x16)a, la, transmute(i8x16)b, lb, IMM8)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_crc32_u8 :: #force_inline proc "c" (crc: u32, v: u8) -> u32 {
+ return crc32_32_8(crc, v)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_crc32_u16 :: #force_inline proc "c" (crc: u32, v: u16) -> u32 {
+ return crc32_32_16(crc, v)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_crc32_u32 :: #force_inline proc "c" (crc: u32, v: u32) -> u32 {
+ return crc32_32_32(crc, v)
+}
+@(require_results, enable_target_feature="sse4.2")
+_mm_cmpgt_epi64 :: #force_inline proc "c" (a: __m128i, b: __m128i) -> __m128i {
+ return transmute(__m128i)simd.lanes_gt(transmute(i64x2)a, transmute(i64x2)b)
+}
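+// Illustrative note on _mm_cmpgt_epi64 above (not part of the original
+// patch): each 64-bit lane of the result is all ones where a > b (signed
+// compare) and all zeros otherwise.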
+
+when ODIN_ARCH == .amd64 {
+ @(require_results, enable_target_feature="sse4.2")
+ _mm_crc32_u64 :: #force_inline proc "c" (crc: u64, v: u64) -> u64 {
+ return crc32_64_64(crc, v)
+ }
+}
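+
+// Usage sketch for the CRC wrappers above (illustrative, not part of the
+// original patch): they compute CRC-32C (Castagnoli polynomial), not the
+// zlib/PNG CRC-32. One common convention starts from all ones and inverts
+// the final value:
+//
+//	data: []u8 = ...
+//	crc := u32(0xffff_ffff)
+//	for b in data {
+//		crc = _mm_crc32_u8(crc, b)
+//	}
+//	crc = crc ~ 0xffff_ffff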
+
+@(private, default_calling_convention="c")
+foreign _ {
+ // SSE 4.2 string and text comparison ops
+ @(link_name="llvm.x86.sse42.pcmpestrm128")
+ pcmpestrm128 :: proc(a: i8x16, la: i32, b: i8x16, lb: i32, #const imm8: i8) -> u8x16 ---
+ @(link_name="llvm.x86.sse42.pcmpestri128")
+ pcmpestri128 :: proc(a: i8x16, la: i32, b: i8x16, lb: i32, #const imm8: i8) -> i32 ---
+ @(link_name="llvm.x86.sse42.pcmpestriz128")
+ pcmpestriz128 :: proc(a: i8x16, la: i32, b: i8x16, lb: i32, #const imm8: i8) -> i32 ---
+ @(link_name="llvm.x86.sse42.pcmpestric128")
+ pcmpestric128 :: proc(a: i8x16, la: i32, b: i8x16, lb: i32, #const imm8: i8) -> i32 ---
+ @(link_name="llvm.x86.sse42.pcmpestris128")
+ pcmpestris128 :: proc(a: i8x16, la: i32, b: i8x16, lb: i32, #const imm8: i8) -> i32 ---
+ @(link_name="llvm.x86.sse42.pcmpestrio128")
+ pcmpestrio128 :: proc(a: i8x16, la: i32, b: i8x16, lb: i32, #const imm8: i8) -> i32 ---
+ @(link_name="llvm.x86.sse42.pcmpestria128")
+ pcmpestria128 :: proc(a: i8x16, la: i32, b: i8x16, lb: i32, #const imm8: i8) -> i32 ---
+ @(link_name="llvm.x86.sse42.pcmpistrm128")
+ pcmpistrm128 :: proc(a, b: i8x16, #const imm8: i8) -> i8x16 ---
+ @(link_name="llvm.x86.sse42.pcmpistri128")
+ pcmpistri128 :: proc(a, b: i8x16, #const imm8: i8) -> i32 ---
+ @(link_name="llvm.x86.sse42.pcmpistriz128")
+ pcmpistriz128 :: proc(a, b: i8x16, #const imm8: i8) -> i32 ---
+ @(link_name="llvm.x86.sse42.pcmpistric128")
+ pcmpistric128 :: proc(a, b: i8x16, #const imm8: i8) -> i32 ---
+ @(link_name="llvm.x86.sse42.pcmpistris128")
+ pcmpistris128 :: proc(a, b: i8x16, #const imm8: i8) -> i32 ---
+ @(link_name="llvm.x86.sse42.pcmpistrio128")
+ pcmpistrio128 :: proc(a, b: i8x16, #const imm8: i8) -> i32 ---
+ @(link_name="llvm.x86.sse42.pcmpistria128")
+ pcmpistria128 :: proc(a, b: i8x16, #const imm8: i8) -> i32 ---
+ // SSE 4.2 CRC instructions
+ @(link_name="llvm.x86.sse42.crc32.32.8")
+ crc32_32_8 :: proc(crc: u32, v: u8) -> u32 ---
+ @(link_name="llvm.x86.sse42.crc32.32.16")
+ crc32_32_16 :: proc(crc: u32, v: u16) -> u32 ---
+ @(link_name="llvm.x86.sse42.crc32.32.32")
+ crc32_32_32 :: proc(crc: u32, v: u32) -> u32 ---
+
+ // AMD64 Only
+ @(link_name="llvm.x86.sse42.crc32.64.64")
+ crc32_64_64 :: proc(crc: u64, v: u64) -> u64 ---
+}
diff --git a/core/simd/x86/ssse3.odin b/core/simd/x86/ssse3.odin
new file mode 100644
index 000000000..f11ef6774
--- /dev/null
+++ b/core/simd/x86/ssse3.odin
@@ -0,0 +1,140 @@
+//+build i386, amd64
+package simd_x86
+
+import "core:intrinsics"
+import "core:simd"
+_ :: simd
+
+@(require_results, enable_target_feature="ssse3")
+_mm_abs_epi8 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+ return transmute(__m128i)pabsb128(transmute(i8x16)a)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_abs_epi16 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+ return transmute(__m128i)pabsw128(transmute(i16x8)a)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_abs_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i {
+ return transmute(__m128i)pabsd128(transmute(i32x4)a)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_shuffle_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)pshufb128(transmute(u8x16)a, transmute(u8x16)b)
+}
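+// Illustrative note on _mm_shuffle_epi8 above (not part of the original
+// patch): each byte of b selects a byte of a by its low 4 bits, and a byte
+// of b with its high bit set zeroes the corresponding result byte; e.g.
+// b = {15, 14, ..., 1, 0} reverses the bytes of a.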
+@(require_results, enable_target_feature="ssse3")
+_mm_alignr_epi8 :: #force_inline proc "c" (a, b: __m128i, $IMM8: u32) -> __m128i {
+ shift :: IMM8
+
+ // If palignr is shifting the pair of vectors by more than their combined
+ // 32-byte width, emit zero.
+ if shift > 32 {
+ return _mm_set1_epi8(0)
+ }
+ a, b := a, b
+ if shift > 16 {
+ a, b = _mm_set1_epi8(0), a
+ }
+
+ return transmute(__m128i)simd.shuffle(
+ transmute(i8x16)b,
+ transmute(i8x16)a,
+ 0 when shift > 32 else shift - 16 + 0 when shift > 16 else shift + 0,
+ 1 when shift > 32 else shift - 16 + 1 when shift > 16 else shift + 1,
+ 2 when shift > 32 else shift - 16 + 2 when shift > 16 else shift + 2,
+ 3 when shift > 32 else shift - 16 + 3 when shift > 16 else shift + 3,
+ 4 when shift > 32 else shift - 16 + 4 when shift > 16 else shift + 4,
+ 5 when shift > 32 else shift - 16 + 5 when shift > 16 else shift + 5,
+ 6 when shift > 32 else shift - 16 + 6 when shift > 16 else shift + 6,
+ 7 when shift > 32 else shift - 16 + 7 when shift > 16 else shift + 7,
+ 8 when shift > 32 else shift - 16 + 8 when shift > 16 else shift + 8,
+ 9 when shift > 32 else shift - 16 + 9 when shift > 16 else shift + 9,
+ 10 when shift > 32 else shift - 16 + 10 when shift > 16 else shift + 10,
+ 11 when shift > 32 else shift - 16 + 11 when shift > 16 else shift + 11,
+ 12 when shift > 32 else shift - 16 + 12 when shift > 16 else shift + 12,
+ 13 when shift > 32 else shift - 16 + 13 when shift > 16 else shift + 13,
+ 14 when shift > 32 else shift - 16 + 14 when shift > 16 else shift + 14,
+ 15 when shift > 32 else shift - 16 + 15 when shift > 16 else shift + 15,
+ )
+}
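+// For example (illustrative): _mm_alignr_epi8 with IMM8 == 4 returns bytes
+// 4..19 of the 32-byte concatenation [b, a], i.e. the upper 12 bytes of b
+// followed by the low 4 bytes of a.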
+
+
+@(require_results, enable_target_feature="ssse3")
+_mm_hadd_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)phaddw128(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_hadds_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)phaddsw128(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_hadd_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)phaddd128(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_hsub_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)phsubw128(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_hsubs_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)phsubsw128(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_hsub_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)phsubd128(transmute(i32x4)a, transmute(i32x4)b)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_maddubs_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)pmaddubsw128(transmute(u8x16)a, transmute(i8x16)b)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_mulhrs_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)pmulhrsw128(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_sign_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)psignb128(transmute(i8x16)a, transmute(i8x16)b)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_sign_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)psignw128(transmute(i16x8)a, transmute(i16x8)b)
+}
+@(require_results, enable_target_feature="ssse3")
+_mm_sign_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+ return transmute(__m128i)psignd128(transmute(i32x4)a, transmute(i32x4)b)
+}
+
+
+
+@(private, default_calling_convention="c")
+foreign _ {
+ @(link_name = "llvm.x86.ssse3.pabs.b.128")
+ pabsb128 :: proc(a: i8x16) -> u8x16 ---
+ @(link_name = "llvm.x86.ssse3.pabs.w.128")
+ pabsw128 :: proc(a: i16x8) -> u16x8 ---
+ @(link_name = "llvm.x86.ssse3.pabs.d.128")
+ pabsd128 :: proc(a: i32x4) -> u32x4 ---
+ @(link_name = "llvm.x86.ssse3.pshuf.b.128")
+ pshufb128 :: proc(a, b: u8x16) -> u8x16 ---
+ @(link_name = "llvm.x86.ssse3.phadd.w.128")
+ phaddw128 :: proc(a, b: i16x8) -> i16x8 ---
+ @(link_name = "llvm.x86.ssse3.phadd.sw.128")
+ phaddsw128 :: proc(a, b: i16x8) -> i16x8 ---
+ @(link_name = "llvm.x86.ssse3.phadd.d.128")
+ phaddd128 :: proc(a, b: i32x4) -> i32x4 ---
+ @(link_name = "llvm.x86.ssse3.phsub.w.128")
+ phsubw128 :: proc(a, b: i16x8) -> i16x8 ---
+ @(link_name = "llvm.x86.ssse3.phsub.sw.128")
+ phsubsw128 :: proc(a, b: i16x8) -> i16x8 ---
+ @(link_name = "llvm.x86.ssse3.phsub.d.128")
+ phsubd128 :: proc(a, b: i32x4) -> i32x4 ---
+ @(link_name = "llvm.x86.ssse3.pmadd.ub.sw.128")
+ pmaddubsw128 :: proc(a: u8x16, b: i8x16) -> i16x8 ---
+ @(link_name = "llvm.x86.ssse3.pmul.hr.sw.128")
+ pmulhrsw128 :: proc(a, b: i16x8) -> i16x8 ---
+ @(link_name = "llvm.x86.ssse3.psign.b.128")
+ psignb128 :: proc(a, b: i8x16) -> i8x16 ---
+ @(link_name = "llvm.x86.ssse3.psign.w.128")
+ psignw128 :: proc(a, b: i16x8) -> i16x8 ---
+ @(link_name = "llvm.x86.ssse3.psign.d.128")
+ psignd128 :: proc(a, b: i32x4) -> i32x4 ---
+}
\ No newline at end of file
diff --git a/core/simd/x86/types.odin b/core/simd/x86/types.odin
new file mode 100644
index 000000000..06a2cd41e
--- /dev/null
+++ b/core/simd/x86/types.odin
@@ -0,0 +1,57 @@
+//+build i386, amd64
+package simd_x86
+
+import "core:simd"
+
+bf16 :: u16
+
+__m128i :: #simd[2]i64
+__m128 :: #simd[4]f32
+__m128d :: #simd[2]f64
+
+__m256i :: #simd[4]i64
+__m256 :: #simd[8]f32
+__m256d :: #simd[4]f64
+
+__m512i :: #simd[8]i64
+__m512 :: #simd[16]f32
+__m512d :: #simd[8]f64
+
+__m128bh :: #simd[8]bf16
+__m256bh :: #simd[16]bf16
+__m512bh :: #simd[32]bf16
+
+
+/// The `__mmask64` type used in AVX-512 intrinsics, a 64-bit integer
+__mmask64 :: u64
+
+/// The `__mmask32` type used in AVX-512 intrinsics, a 32-bit integer
+__mmask32 :: u32
+
+/// The `__mmask16` type used in AVX-512 intrinsics, a 16-bit integer
+__mmask16 :: u16
+
+/// The `__mmask8` type used in AVX-512 intrinsics, an 8-bit integer
+__mmask8 :: u8
+
+/// The `_MM_CMPINT_ENUM` type used to specify comparison operations in AVX-512 intrinsics.
+_MM_CMPINT_ENUM :: i32
+
+/// The `_MM_MANTISSA_NORM_ENUM` type used to specify mantissa normalization operations in AVX-512 intrinsics.
+_MM_MANTISSA_NORM_ENUM :: i32
+
+/// The `_MM_MANTISSA_SIGN_ENUM` type used to specify mantissa sign operations in AVX-512 intrinsics.
+_MM_MANTISSA_SIGN_ENUM :: i32
+
+_MM_PERM_ENUM :: i32
+
+@(private) u8x16 :: simd.u8x16
+@(private) i8x16 :: simd.i8x16
+@(private) u16x8 :: simd.u16x8
+@(private) i16x8 :: simd.i16x8
+@(private) u32x4 :: simd.u32x4
+@(private) i32x4 :: simd.i32x4
+@(private) u64x2 :: simd.u64x2
+@(private) i64x2 :: simd.i64x2
+@(private) f32x4 :: simd.f32x4
+@(private) f64x2 :: simd.f64x2
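+
+// Usage sketch (illustrative, not part of the original patch): the __m128i
+// style types are lane-agnostic bit containers; the wrappers in this package
+// reinterpret them through transmute, e.g.
+//
+//	v: __m128i
+//	bytes := transmute(u8x16)v // same 128 bits viewed as 16 x u8
+//	v = transmute(__m128i)bytes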
diff --git a/core/sys/cpu/cpu.odin b/core/sys/cpu/cpu.odin
deleted file mode 100644
index b99fe01d8..000000000
--- a/core/sys/cpu/cpu.odin
+++ /dev/null
@@ -1,33 +0,0 @@
-package sys_cpu
-
-Cache_Line_Pad :: struct {_: [_cache_line_size]byte};
-
-initialized: bool;
-
-x86: struct {
- _: Cache_Line_Pad,
- has_aes: bool, // AES hardware implementation (AES NI)
- has_adx: bool, // Multi-precision add-carry instruction extensions
- has_avx: bool, // Advanced vector extension
- has_avx2: bool, // Advanced vector extension 2
- has_bmi1: bool, // Bit manipulation instruction set 1
- has_bmi2: bool, // Bit manipulation instruction set 2
- has_erms: bool, // Enhanced REP for MOVSB and STOSB
- has_fma: bool, // Fused-multiply-add instructions
- has_os_xsave: bool, // OS supports XSAVE/XRESTOR for saving/restoring XMM registers.
- has_pclmulqdq: bool, // PCLMULQDQ instruction - most often used for AES-GCM
- has_popcnt: bool, // Hamming weight instruction POPCNT.
- has_rdrand: bool, // RDRAND instruction (on-chip random number generator)
- has_rdseed: bool, // RDSEED instruction (on-chip random number generator)
- has_sse2: bool, // Streaming SIMD extension 2 (always available on amd64)
- has_sse3: bool, // Streaming SIMD extension 3
- has_ssse3: bool, // Supplemental streaming SIMD extension 3
- has_sse41: bool, // Streaming SIMD extension 4 and 4.1
- has_sse42: bool, // Streaming SIMD extension 4 and 4.2
- _: Cache_Line_Pad,
-};
-
-
-init :: proc() {
- _init();
-}
diff --git a/core/sys/cpu/cpu_x86.odin b/core/sys/cpu/cpu_x86.odin
deleted file mode 100644
index 146822e61..000000000
--- a/core/sys/cpu/cpu_x86.odin
+++ /dev/null
@@ -1,67 +0,0 @@
-//+build i386, amd64
-package sys_cpu
-
-_cache_line_size :: 64;
-
-cpuid :: proc(ax, cx: u32) -> (eax, ebc, ecx, edx: u32) {
- return expand_to_tuple(asm(u32, u32) -> struct{eax, ebc, ecx, edx: u32} {
- "cpuid",
- "={ax},={bx},={cx},={dx},{ax},{cx}",
- }(ax, cx));
-}
-
-xgetbv :: proc() -> (eax, edx: u32) {
- return expand_to_tuple(asm(u32) -> struct{eax, edx: u32} {
- "xgetbv",
- "={ax},={dx},{cx}",
- }(0));
-}
-
-_init :: proc() {
- is_set :: proc(hwc: u32, value: u32) -> bool {
- return hwc&value != 0;
- }
-
- initialized = true;
-
- max_id, _, _, _ := cpuid(0, 0);
-
- if max_id < 1 {
- return;
- }
-
- _, _, ecx1, edx1 := cpuid(1, 0);
-
- x86.has_sse2 = is_set(26, edx1);
-
- x86.has_sse3 = is_set(0, ecx1);
- x86.has_pclmulqdq = is_set(1, ecx1);
- x86.has_ssse3 = is_set(9, ecx1);
- x86.has_fma = is_set(12, ecx1);
- x86.has_sse41 = is_set(19, ecx1);
- x86.has_sse42 = is_set(20, ecx1);
- x86.has_popcnt = is_set(23, ecx1);
- x86.has_aes = is_set(25, ecx1);
- x86.has_os_xsave = is_set(27, ecx1);
- x86.has_rdrand = is_set(30, ecx1);
-
- os_supports_avx := false;
- if x86.has_os_xsave {
- eax, _ := xgetbv();
- os_supports_avx = is_set(1, eax) && is_set(2, eax);
- }
-
- x86.has_avx = is_set(28, ecx1) && os_supports_avx;
-
- if max_id < 7 {
- return;
- }
-
- _, ebx7, _, _ := cpuid(7, 0);
- x86.has_bmi1 = is_set(3, ebx7);
- x86.has_avx2 = is_set(5, ebx7) && os_supports_avx;
- x86.has_bmi2 = is_set(8, ebx7);
- x86.has_erms = is_set(9, ebx7);
- x86.has_rdseed = is_set(18, ebx7);
- x86.has_adx = is_set(19, ebx7);
-}
diff --git a/examples/all/all_main.odin b/examples/all/all_main.odin
index 36326b48e..1ab242305 100644
--- a/examples/all/all_main.odin
+++ b/examples/all/all_main.odin
@@ -96,6 +96,7 @@ import filepath "core:path/filepath"
import reflect "core:reflect"
import runtime "core:runtime"
+import simd "core:simd"
import slice "core:slice"
import sort "core:sort"
import strconv "core:strconv"
@@ -192,6 +193,7 @@ _ :: slashpath
_ :: filepath
_ :: reflect
_ :: runtime
+_ :: simd
_ :: slice
_ :: sort
_ :: strconv
diff --git a/src/build_settings.cpp b/src/build_settings.cpp
index e9f5f2099..a82cc80c9 100644
--- a/src/build_settings.cpp
+++ b/src/build_settings.cpp
@@ -256,7 +256,6 @@ struct BuildContext {
String extra_linker_flags;
String extra_assembler_flags;
String microarch;
- String target_features;
BuildModeKind build_mode;
bool generate_docs;
i32 optimization_level;
@@ -320,6 +319,10 @@ struct BuildContext {
PtrMap<char const *, ExactValue> defined_values;
+ BlockingMutex target_features_mutex;
+ StringSet target_features_set;
+ String target_features_string;
+
};
gb_global BuildContext build_context = {0};
@@ -629,6 +632,15 @@ bool is_arch_wasm(void) {
return false;
}
+bool is_arch_x86(void) {
+ switch (build_context.metrics.arch) {
+ case TargetArch_i386:
+ case TargetArch_amd64:
+ return true;
+ }
+ return false;
+}
+
bool allow_check_foreign_filepath(void) {
switch (build_context.metrics.arch) {
case TargetArch_wasm32:
@@ -1188,6 +1200,100 @@ void init_build_context(TargetMetrics *cross_target) {
#include "microsoft_craziness.h"
#endif
+
+Array<String> split_by_comma(String const &list) {
+ isize n = 1;
+ for (isize i = 0; i < list.len; i++) {
+ if (list.text[i] == ',') {
+ n++;
+ }
+ }
+ auto res = array_make<String>(heap_allocator(), n);
+
+ String s = list;
+ for (isize i = 0; i < n; i++) {
+ isize m = string_index_byte(s, ',');
+ if (m < 0) {
+ res[i] = s;
+ break;
+ }
+ res[i] = substring(s, 0, m);
+ s = substring(s, m+1, s.len);
+ }
+ return res;
+}
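+// e.g. (illustrative) split_by_comma(str_lit("sse2,sse4.2")) yields
+// {"sse2", "sse4.2"}; input with no comma yields a single-element array.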
+
+bool check_target_feature_is_valid(TokenPos pos, String const &feature) {
+ // TODO(bill): check_target_feature_is_valid
+ return true;
+}
+
+bool check_target_feature_is_enabled(TokenPos pos, String const &target_feature_list) {
+ BuildContext *bc = &build_context;
+ mutex_lock(&bc->target_features_mutex);
+ defer (mutex_unlock(&bc->target_features_mutex));
+
+ auto items = split_by_comma(target_feature_list);
+ defer (array_free(&items));
+ for_array(i, items) {
+ String const &item = items.data[i];
+ if (!check_target_feature_is_valid(pos, item)) {
+ error(pos, "Target feature '%.*s' is not valid", LIT(item));
+ return false;
+ }
+ if (!string_set_exists(&bc->target_features_set, item)) {
+ error(pos, "Target feature '%.*s' is not enabled", LIT(item));
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void enable_target_feature(TokenPos pos, String const &target_feature_list) {
+ BuildContext *bc = &build_context;
+ mutex_lock(&bc->target_features_mutex);
+ defer (mutex_unlock(&bc->target_features_mutex));
+
+ auto items = split_by_comma(target_feature_list);
+ defer (array_free(&items));
+ for_array(i, items) {
+ String const &item = items.data[i];
+ if (!check_target_feature_is_valid(pos, item)) {
+ error(pos, "Target feature '%.*s' is not valid", LIT(item));
+ }
+ }
+}
+
+
+char const *target_features_set_to_cstring(gbAllocator allocator, bool with_quotes) {
+ isize len = 0;
+ for_array(i, build_context.target_features_set.entries) {
+ if (i != 0) {
+ len += 1;
+ }
+ String feature = build_context.target_features_set.entries[i].value;
+ len += feature.len;
+ if (with_quotes) len += 2;
+ }
+ char *features = gb_alloc_array(allocator, char, len+1);
+ len = 0;
+ for_array(i, build_context.target_features_set.entries) {
+ if (i != 0) {
+ features[len++] = ',';
+ }
+
+ if (with_quotes) features[len++] = '"';
+ String feature = build_context.target_features_set.entries[i].value;
+ gb_memmove(features+len, feature.text, feature.len);
+ len += feature.len;
+ if (with_quotes) features[len++] = '"';
+ }
+ features[len++] = 0;
+
+ return features;
+}
+
// NOTE(Jeroen): Set/create the output and other paths and report an error as appropriate.
// We've previously called `parse_build_flags`, so `out_filepath` should be set.
bool init_build_paths(String init_filename) {
@@ -1197,6 +1303,9 @@ bool init_build_paths(String init_filename) {
// NOTE(Jeroen): We're pre-allocating BuildPathCOUNT slots so that certain paths are always at the same enumerated index.
array_init(&bc->build_paths, permanent_allocator(), BuildPathCOUNT);
+ string_set_init(&bc->target_features_set, heap_allocator(), 1024);
+ mutex_init(&bc->target_features_mutex);
+
// [BuildPathMainPackage] Turn given init path into a `Path`, which includes normalizing it into a full path.
bc->build_paths[BuildPath_Main_Package] = path_from_string(ha, init_filename);
@@ -1377,5 +1486,10 @@ bool init_build_paths(String init_filename) {
return false;
}
+ if (bc->target_features_string.len != 0) {
+ enable_target_feature({}, bc->target_features_string);
+ }
+
return true;
-}
\ No newline at end of file
+}
+
diff --git a/src/check_builtin.cpp b/src/check_builtin.cpp
index 55dd6b016..92e3987a0 100644
--- a/src/check_builtin.cpp
+++ b/src/check_builtin.cpp
@@ -246,7 +246,7 @@ bool is_constant_string(CheckerContext *c, String const &builtin_name, Ast *expr
}
bool check_builtin_objc_procedure(CheckerContext *c, Operand *operand, Ast *call, i32 id, Type *type_hint) {
- String builtin_name = builtin_procs[id].name;
+ String const &builtin_name = builtin_procs[id].name;
if (build_context.metrics.os != TargetOs_darwin) {
// allow on doc generation (e.g. Metal stuff)
@@ -409,6 +409,667 @@ bool check_atomic_memory_order_argument(CheckerContext *c, Ast *expr, String con
}
+
+bool check_builtin_simd_operation(CheckerContext *c, Operand *operand, Ast *call, i32 id, Type *type_hint) {
+ ast_node(ce, CallExpr, call);
+
+ String const &builtin_name = builtin_procs[id].name;
+ switch (id) {
+ // Any numeric
+ case BuiltinProc_simd_add:
+ case BuiltinProc_simd_sub:
+ case BuiltinProc_simd_mul:
+ case BuiltinProc_simd_div:
+ case BuiltinProc_simd_min:
+ case BuiltinProc_simd_max:
+ {
+ Operand x = {};
+ Operand y = {};
+ check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+ check_expr_with_type_hint(c, &y, ce->args[1], x.type); if (y.mode == Addressing_Invalid) return false;
+ convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
+ if (!is_type_simd_vector(x.type)) {
+ error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+ return false;
+ }
+ if (!is_type_simd_vector(y.type)) {
+ error(y.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+ return false;
+ }
+ if (!are_types_identical(x.type, y.type)) {
+ gbString xs = type_to_string(x.type);
+ gbString ys = type_to_string(y.type);
+ error(x.expr, "'%.*s' expected 2 arguments of the same type, got '%s' vs '%s'", LIT(builtin_name), xs, ys);
+ gb_string_free(ys);
+ gb_string_free(xs);
+ return false;
+ }
+ Type *elem = base_array_type(x.type);
+ if (!is_type_integer(elem) && !is_type_float(elem)) {
+ gbString xs = type_to_string(x.type);
+ error(x.expr, "'%.*s' expected a #simd type with an integer or floating point element, got '%s'", LIT(builtin_name), xs);
+ gb_string_free(xs);
+ return false;
+ }
+
+ operand->mode = Addressing_Value;
+ operand->type = x.type;
+ return true;
+ }
+
+ // Integer only
+ case BuiltinProc_simd_add_sat:
+ case BuiltinProc_simd_sub_sat:
+ case BuiltinProc_simd_rem:
+ case BuiltinProc_simd_and:
+ case BuiltinProc_simd_or:
+ case BuiltinProc_simd_xor:
+ case BuiltinProc_simd_and_not:
+ {
+ Operand x = {};
+ Operand y = {};
+ check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+ check_expr_with_type_hint(c, &y, ce->args[1], x.type); if (y.mode == Addressing_Invalid) return false;
+ convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
+ if (!is_type_simd_vector(x.type)) {
+ error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+ return false;
+ }
+ if (!is_type_simd_vector(y.type)) {
+ error(y.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+ return false;
+ }
+ if (!are_types_identical(x.type, y.type)) {
+ gbString xs = type_to_string(x.type);
+ gbString ys = type_to_string(y.type);
+ error(x.expr, "'%.*s' expected 2 arguments of the same type, got '%s' vs '%s'", LIT(builtin_name), xs, ys);
+ gb_string_free(ys);
+ gb_string_free(xs);
+ return false;
+ }
+ Type *elem = base_array_type(x.type);
+
+ switch (id) {
+ case BuiltinProc_simd_add_sat:
+ case BuiltinProc_simd_sub_sat:
+ case BuiltinProc_simd_rem:
+ if (!is_type_integer(elem)) {
+ gbString xs = type_to_string(x.type);
+ error(x.expr, "'%.*s' expected a #simd type with an integer element, got '%s'", LIT(builtin_name), xs);
+ gb_string_free(xs);
+ return false;
+ }
+ break;
+ default:
+ if (!is_type_integer(elem) && !is_type_boolean(elem)) {
+ gbString xs = type_to_string(x.type);
+ error(x.expr, "'%.*s' expected a #simd type with an integer or boolean element, got '%s'", LIT(builtin_name), xs);
+ gb_string_free(xs);
+ return false;
+ }
+ break;
+ }
+
+ operand->mode = Addressing_Value;
+ operand->type = x.type;
+ return true;
+ }
+
+ case BuiltinProc_simd_shl: // Odin-like
+ case BuiltinProc_simd_shr: // Odin-like
+ case BuiltinProc_simd_shl_masked: // C-like
+ case BuiltinProc_simd_shr_masked: // C-like
+ {
+ Operand x = {};
+ Operand y = {};
+ check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+ check_expr_with_type_hint(c, &y, ce->args[1], x.type); if (y.mode == Addressing_Invalid) return false;
+ convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
+ if (!is_type_simd_vector(x.type)) {
+ error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+ return false;
+ }
+ if (!is_type_simd_vector(y.type)) {
+ error(y.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+ return false;
+ }
+ GB_ASSERT(x.type->kind == Type_SimdVector);
+ GB_ASSERT(y.type->kind == Type_SimdVector);
+ Type *xt = x.type;
+ Type *yt = y.type;
+
+ if (xt->SimdVector.count != yt->SimdVector.count) {
+ error(x.expr, "'%.*s' mismatched simd vector lengths, got '%lld' vs '%lld'",
+ LIT(builtin_name),
+ cast(long long)xt->SimdVector.count,
+ cast(long long)yt->SimdVector.count);
+ return false;
+ }
+ if (!is_type_integer(base_array_type(x.type))) {
+ gbString xs = type_to_string(x.type);
+ error(x.expr, "'%.*s' expected a #simd type with an integer element, got '%s'", LIT(builtin_name), xs);
+ gb_string_free(xs);
+ return false;
+ }
+ if (!is_type_unsigned(base_array_type(y.type))) {
+ gbString ys = type_to_string(y.type);
+ error(y.expr, "'%.*s' expected a #simd type with an unsigned integer element as the shifting operand, got '%s'", LIT(builtin_name), ys);
+ gb_string_free(ys);
+ return false;
+ }
+
+ operand->mode = Addressing_Value;
+ operand->type = x.type;
+ return true;
+ }
+
+ // Unary
+ case BuiltinProc_simd_neg:
+ case BuiltinProc_simd_abs:
+ {
+ Operand x = {};
+ check_expr(c, &x, ce->args[0]);
+ if (x.mode == Addressing_Invalid) {
+ return false;
+ }
+ if (!is_type_simd_vector(x.type)) {
+ error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+ return false;
+ }
+ Type *elem = base_array_type(x.type);
+ if (!is_type_integer(elem) && !is_type_float(elem)) {
+ gbString xs = type_to_string(x.type);
+ error(x.expr, "'%.*s' expected a #simd type with an integer or floating point element, got '%s'", LIT(builtin_name), xs);
+ gb_string_free(xs);
+ return false;
+ }
+ operand->mode = Addressing_Value;
+ operand->type = x.type;
+ return true;
+ }
+
+ // Return integer masks
+ case BuiltinProc_simd_lanes_eq:
+ case BuiltinProc_simd_lanes_ne:
+ case BuiltinProc_simd_lanes_lt:
+ case BuiltinProc_simd_lanes_le:
+ case BuiltinProc_simd_lanes_gt:
+ case BuiltinProc_simd_lanes_ge:
+ {
+ // op(#simd[N]T, #simd[N]T) -> #simd[N]V
+ // where `V` is an integer, `size_of(T) == size_of(V)`
+ // `V` will be all 0s if false and all 1s if true (e.g. 0x00 and 0xff for false and true, respectively)
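+ // e.g. (illustrative) comparing two #simd[4]f32 values yields a #simd[4]u32
+ // whose lanes are each 0x0000_0000 (false) or 0xffff_ffff (true)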
+
+ Operand x = {};
+ Operand y = {};
+ check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+ check_expr_with_type_hint(c, &y, ce->args[1], x.type); if (y.mode == Addressing_Invalid) return false;
+ convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
+ if (!is_type_simd_vector(x.type)) {
+ error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+ return false;
+ }
+ Type *elem = base_array_type(x.type);
+ switch (id) {
+ case BuiltinProc_simd_lanes_eq:
+ case BuiltinProc_simd_lanes_ne:
+ if (!is_type_integer(elem) && !is_type_float(elem) && !is_type_boolean(elem)) {
+ gbString xs = type_to_string(x.type);
+ error(x.expr, "'%.*s' expected a #simd type with an integer, floating point, or boolean element, got '%s'", LIT(builtin_name), xs);
+ gb_string_free(xs);
+ return false;
+ }
+ break;
+ default:
+ if (!is_type_integer(elem) && !is_type_float(elem)) {
+ gbString xs = type_to_string(x.type);
+ error(x.expr, "'%.*s' expected a #simd type with an integer or floating point element, got '%s'", LIT(builtin_name), xs);
+ gb_string_free(xs);
+ return false;
+ }
+ break;
+ }
+
+
+ Type *vt = base_type(x.type);
+ GB_ASSERT(vt->kind == Type_SimdVector);
+ i64 count = vt->SimdVector.count;
+
+ i64 sz = type_size_of(elem);
+ Type *new_elem = nullptr;
+
+ switch (sz) {
+ case 1: new_elem = t_u8; break;
+ case 2: new_elem = t_u16; break;
+ case 4: new_elem = t_u32; break;
+ case 8: new_elem = t_u64; break;
+ case 16:
+ error(x.expr, "'%.*s' not supported 128-bit integer backed simd vector types", LIT(builtin_name));
+ return false;
+ }
+
+ operand->mode = Addressing_Value;
+ operand->type = alloc_type_simd_vector(count, new_elem);
+ return true;
+ }
+
+ case BuiltinProc_simd_extract:
+ {
+ Operand x = {};
+ check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+
+ if (!is_type_simd_vector(x.type)) {
+ error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+ return false;
+ }
+ Type *elem = base_array_type(x.type);
+ i64 max_count = x.type->SimdVector.count;
+ i64 value = -1;
+ if (!check_index_value(c, x.type, false, ce->args[1], max_count, &value)) {
+ return false;
+ }
+ if (value < 0) {
+ error(ce->args[1], "'%.*s' expected a constant integer index", LIT(builtin_name));
+ return false;
+ }
+
+ operand->mode = Addressing_Value;
+ operand->type = elem;
+ return true;
+ }
+ break;
+ case BuiltinProc_simd_replace:
+ {
+ Operand x = {};
+ check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+
+ if (!is_type_simd_vector(x.type)) {
+ error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+ return false;
+ }
+ Type *elem = base_array_type(x.type);
+ i64 max_count = x.type->SimdVector.count;
+ i64 value = -1;
+ if (!check_index_value(c, x.type, false, ce->args[1], max_count, &value)) {
+ return false;
+ }
+ if (value < 0) {
+ error(ce->args[1], "'%.*s' expected a constant integer index", LIT(builtin_name));
+ return false;
+ }
+
+ Operand y = {};
+ check_expr_with_type_hint(c, &y, ce->args[2], elem); if (y.mode == Addressing_Invalid) return false;
+ convert_to_typed(c, &y, elem); if (y.mode == Addressing_Invalid) return false;
+ if (!are_types_identical(y.type, elem)) {
+ gbString et = type_to_string(elem);
+ gbString yt = type_to_string(y.type);
+ error(y.expr, "'%.*s' expected a type of '%s' to insert, got '%s'", LIT(builtin_name), et, yt);
+ gb_string_free(yt);
+ gb_string_free(et);
+ return false;
+ }
+
+ operand->mode = Addressing_Value;
+ operand->type = x.type;
+ return true;
+ }
+ break;
+
+ case BuiltinProc_simd_reduce_add_ordered:
+ case BuiltinProc_simd_reduce_mul_ordered:
+ case BuiltinProc_simd_reduce_min:
+ case BuiltinProc_simd_reduce_max:
+ {
+ Operand x = {};
+ check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+
+ if (!is_type_simd_vector(x.type)) {
+ error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+ return false;
+ }
+ Type *elem = base_array_type(x.type);
+ if (!is_type_integer(elem) && !is_type_float(elem)) {
+ gbString xs = type_to_string(x.type);
+ error(x.expr, "'%.*s' expected a #simd type with an integer or floating point element, got '%s'", LIT(builtin_name), xs);
+ gb_string_free(xs);
+ return false;
+ }
+
+ operand->mode = Addressing_Value;
+ operand->type = base_array_type(x.type);
+ return true;
+ }
+
+ case BuiltinProc_simd_reduce_and:
+ case BuiltinProc_simd_reduce_or:
+ case BuiltinProc_simd_reduce_xor:
+ {
+ Operand x = {};
+ check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+
+ if (!is_type_simd_vector(x.type)) {
+ error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+ return false;
+ }
+ Type *elem = base_array_type(x.type);
+ if (!is_type_integer(elem) && !is_type_boolean(elem)) {
+ gbString xs = type_to_string(x.type);
+ error(x.expr, "'%.*s' expected a #simd type with an integer or boolean element, got '%s'", LIT(builtin_name), xs);
+ gb_string_free(xs);
+ return false;
+ }
+
+ operand->mode = Addressing_Value;
+ operand->type = base_array_type(x.type);
+ return true;
+ }
+
+
+ case BuiltinProc_simd_shuffle:
+ {
+ Operand x = {};
+ Operand y = {};
+ check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+ check_expr_with_type_hint(c, &y, ce->args[1], x.type); if (y.mode == Addressing_Invalid) return false;
+ convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
+ if (!is_type_simd_vector(x.type)) {
+ error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+ return false;
+ }
+ if (!is_type_simd_vector(y.type)) {
+ error(y.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+ return false;
+ }
+ if (!are_types_identical(x.type, y.type)) {
+ gbString xs = type_to_string(x.type);
+ gbString ys = type_to_string(y.type);
+ error(x.expr, "'%.*s' expected 2 arguments of the same type, got '%s' vs '%s'", LIT(builtin_name), xs, ys);
+ gb_string_free(ys);
+ gb_string_free(xs);
+ return false;
+ }
+ Type *elem = base_array_type(x.type);
+
+ i64 max_count = x.type->SimdVector.count + y.type->SimdVector.count;
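+ // shuffle indices select from the concatenation [x, y], so the valid
+ // index range is 0 ..< len(x)+len(y)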
+
+ i64 arg_count = 0;
+ for_array(i, ce->args) {
+ if (i < 2) {
+ continue;
+ }
+ Ast *arg = ce->args[i];
+ Operand op = {};
+ check_expr(c, &op, arg);
+ if (op.mode == Addressing_Invalid) {
+ return false;
+ }
+ Type *arg_type = base_type(op.type);
+ if (!is_type_integer(arg_type) || op.mode != Addressing_Constant) {
+ error(op.expr, "Indices to '%.*s' must be constant integers", LIT(builtin_name));
+ return false;
+ }
+
+ if (big_int_is_neg(&op.value.value_integer)) {
+ error(op.expr, "Negative '%.*s' index", LIT(builtin_name));
+ return false;
+ }
+
+ BigInt mc = {};
+ big_int_from_i64(&mc, max_count);
+ if (big_int_cmp(&mc, &op.value.value_integer) <= 0) {
+ error(op.expr, "'%.*s' index exceeds length", LIT(builtin_name));
+ return false;
+ }
+
+ arg_count++;
+ }
+
+ if (arg_count > max_count) {
+ error(call, "Too many '%.*s' indices, %td > %td", LIT(builtin_name), arg_count, max_count);
+ return false;
+ }
+
+
+ if (!is_power_of_two(arg_count)) {
+ error(call, "'%.*s' must have a power of two index arguments, got %lld", LIT(builtin_name), cast(long long)arg_count);
+ return false;
+ }
+
+ operand->mode = Addressing_Value;
+ operand->type = alloc_type_simd_vector(arg_count, elem);
+ return true;
+ }
+
+ case BuiltinProc_simd_select:
+ {
+ Operand cond = {};
+ check_expr(c, &cond, ce->args[0]); if (cond.mode == Addressing_Invalid) return false;
+
+ if (!is_type_simd_vector(cond.type)) {
+ error(cond.expr, "'%.*s' expected a simd vector type for the condition", LIT(builtin_name));
+ return false;
+ }
+ Type *cond_elem = base_array_type(cond.type);
+ if (!is_type_boolean(cond_elem) && !is_type_integer(cond_elem)) {
+ gbString cond_str = type_to_string(cond.type);
+ error(cond.expr, "'%.*s' expected a simd vector boolean or integer type, got '%s'", LIT(builtin_name), cond_str);
+ gb_string_free(cond_str);
+ return false;
+ }
+
+ Operand x = {};
+ Operand y = {};
+ check_expr(c, &x, ce->args[1]); if (x.mode == Addressing_Invalid) return false;
+ check_expr_with_type_hint(c, &y, ce->args[2], x.type); if (y.mode == Addressing_Invalid) return false;
+ convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
+ if (!is_type_simd_vector(x.type)) {
+ error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+ return false;
+ }
+ if (!is_type_simd_vector(y.type)) {
+ error(y.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+ return false;
+ }
+ if (!are_types_identical(x.type, y.type)) {
+ gbString xs = type_to_string(x.type);
+ gbString ys = type_to_string(y.type);
+ error(x.expr, "'%.*s' expected 2 results of the same type, got '%s' vs '%s'", LIT(builtin_name), xs, ys);
+ gb_string_free(ys);
+ gb_string_free(xs);
+ return false;
+ }
+
+ if (cond.type->SimdVector.count != x.type->SimdVector.count) {
+ error(x.expr, "'%.*s' expected condition vector to match the length of the result lengths, got '%lld' vs '%lld'",
+ LIT(builtin_name),
+ cast(long long)cond.type->SimdVector.count,
+ cast(long long)x.type->SimdVector.count);
+ return false;
+ }
+
+
+ operand->mode = Addressing_Value;
+ operand->type = x.type;
+ return true;
+ }
+
+ case BuiltinProc_simd_ceil:
+ case BuiltinProc_simd_floor:
+ case BuiltinProc_simd_trunc:
+ case BuiltinProc_simd_nearest:
+ {
+ Operand x = {};
+ check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+
+ if (!is_type_simd_vector(x.type)) {
+ error(x.expr, "'%.*s' expected a simd vector boolean type", LIT(builtin_name));
+ return false;
+ }
+ Type *elem = base_array_type(x.type);
+ if (!is_type_float(elem)) {
+ gbString x_str = type_to_string(x.type);
+ error(x.expr, "'%.*s' expected a simd vector floating point type, got '%s'", LIT(builtin_name), x_str);
+ gb_string_free(x_str);
+ return false;
+ }
+
+ operand->mode = Addressing_Value;
+ operand->type = x.type;
+ return true;
+ }
+
+ case BuiltinProc_simd_lanes_reverse:
+ {
+ Operand x = {};
+ check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+
+ if (!is_type_simd_vector(x.type)) {
+ error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+ return false;
+ }
+ operand->type = x.type;
+ operand->mode = Addressing_Value;
+ return true;
+ }
+
+ case BuiltinProc_simd_lanes_rotate_left:
+ case BuiltinProc_simd_lanes_rotate_right:
+ {
+ Operand x = {};
+ check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+ if (!is_type_simd_vector(x.type)) {
+ error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+ return false;
+ }
+ Operand offset = {};
+ check_expr(c, &offset, ce->args[1]); if (offset.mode == Addressing_Invalid) return false;
+ convert_to_typed(c, &offset, t_i64);
+ if (!is_type_integer(offset.type) || offset.mode != Addressing_Constant) {
+ error(offset.expr, "'%.*s' expected a constant integer offset", LIT(builtin_name));
+ return false;
+ }
+ check_assignment(c, &offset, t_i64, builtin_name);
+
+ operand->type = x.type;
+ operand->mode = Addressing_Value;
+ return true;
+ }
+
+ case BuiltinProc_simd_clamp:
+ {
+ Operand x = {};
+ Operand y = {};
+ Operand z = {};
+ check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+ check_expr_with_type_hint(c, &y, ce->args[1], x.type); if (y.mode == Addressing_Invalid) return false;
+ check_expr_with_type_hint(c, &z, ce->args[2], x.type); if (z.mode == Addressing_Invalid) return false;
+ convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
+ convert_to_typed(c, &z, x.type);
+ if (!is_type_simd_vector(x.type)) {
+ error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+ return false;
+ }
+ if (!is_type_simd_vector(y.type)) {
+ error(y.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+ return false;
+ }
+ if (!is_type_simd_vector(z.type)) {
+ error(z.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+ return false;
+ }
+ if (!are_types_identical(x.type, y.type)) {
+ gbString xs = type_to_string(x.type);
+ gbString ys = type_to_string(y.type);
+ error(x.expr, "'%.*s' expected 2 arguments of the same type, got '%s' vs '%s'", LIT(builtin_name), xs, ys);
+ gb_string_free(ys);
+ gb_string_free(xs);
+ return false;
+ }
+ if (!are_types_identical(x.type, z.type)) {
+ gbString xs = type_to_string(x.type);
+ gbString zs = type_to_string(z.type);
+ error(x.expr, "'%.*s' expected 2 arguments of the same type, got '%s' vs '%s'", LIT(builtin_name), xs, zs);
+ gb_string_free(zs);
+ gb_string_free(xs);
+ return false;
+ }
+ Type *elem = base_array_type(x.type);
+ if (!is_type_integer(elem) && !is_type_float(elem)) {
+ gbString xs = type_to_string(x.type);
+ error(x.expr, "'%.*s' expected a #simd type with an integer or floating point element, got '%s'", LIT(builtin_name), xs);
+ gb_string_free(xs);
+ return false;
+ }
+
+ operand->mode = Addressing_Value;
+ operand->type = x.type;
+ return true;
+ }
+
+ case BuiltinProc_simd_to_bits:
+ {
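+ // e.g. simd_to_bits of a #simd[4]f32 value yields #simd[4]u32, preserving
+ // each lane's bit pattern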
+ Operand x = {};
+ check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+
+ if (!is_type_simd_vector(x.type)) {
+ error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+ return false;
+ }
+ Type *elem = base_array_type(x.type);
+ i64 count = get_array_type_count(x.type);
+ i64 sz = type_size_of(elem);
+ Type *bit_elem = nullptr;
+ switch (sz) {
+ case 1: bit_elem = t_u8; break;
+ case 2: bit_elem = t_u16; break;
+ case 4: bit_elem = t_u32; break;
+ case 8: bit_elem = t_u64; break;
+ }
+ GB_ASSERT(bit_elem != nullptr);
+
+ operand->type = alloc_type_simd_vector(count, bit_elem);
+ operand->mode = Addressing_Value;
+ return true;
+ }
+
+ case BuiltinProc_simd_x86__MM_SHUFFLE:
+ {
+ Operand x[4] = {};
+ for (unsigned i = 0; i < 4; i++) {
+ check_expr(c, x+i, ce->args[i]); if (x[i].mode == Addressing_Invalid) return false;
+ }
+
+ u32 offsets[4] = {6, 4, 2, 0};
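+ // e.g. _MM_SHUFFLE(3, 2, 1, 0) == (3<<6)|(2<<4)|(1<<2)|(0<<0) == 0xE4,
+ // the identity shuffle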
+ u32 result = 0;
+ for (unsigned i = 0; i < 4; i++) {
+ if (!is_type_integer(x[i].type) || x[i].mode != Addressing_Constant) {
+ gbString xs = type_to_string(x[i].type);
+ error(x[i].expr, "'%.*s' expected a constant integer, got '%s'", LIT(builtin_name), xs);
+ gb_string_free(xs);
+ return false;
+ }
+ i64 val = exact_value_to_i64(x[i].value);
+ if (val < 0 || val > 3) {
+ error(x[i].expr, "'%.*s' expected a constant integer in the range 0..<4, got %lld", LIT(builtin_name), cast(long long)val);
+ return false;
+ }
+ result |= cast(u32)(val) << offsets[i];
+ }
+
+ operand->type = t_untyped_integer;
+ operand->mode = Addressing_Constant;
+ operand->value = exact_value_i64(result);
+ return true;
+ }
+ default:
+ GB_PANIC("Unhandled simd intrinsic: %.*s", LIT(builtin_name));
+ }
+
+ return false;
+}
+
+
bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32 id, Type *type_hint) {
ast_node(ce, CallExpr, call);
if (ce->inlining != ProcInlining_none) {
@@ -479,7 +1140,7 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
break;
}
- String builtin_name = builtin_procs[id].name;
+ String const &builtin_name = builtin_procs[id].name;
if (ce->args.count > 0) {
@@ -491,6 +1152,17 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
}
}
+ if (BuiltinProc__simd_begin < id && id < BuiltinProc__simd_end) {
+ bool ok = check_builtin_simd_operation(c, operand, call, id, type_hint);
+ if (!ok) {
+ operand->type = t_invalid;
+ }
+ operand->mode = Addressing_Value;
+ operand->value = {};
+ operand->expr = call;
+ return ok;
+ }
+
switch (id) {
default:
GB_PANIC("Implement built-in procedure: %.*s", LIT(builtin_name));
@@ -1031,6 +1703,11 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
bt->Struct.soa_kind == StructSoa_Dynamic) {
mode = Addressing_Value;
}
+ } else if (is_type_simd_vector(op_type)) {
+ Type *bt = base_type(op_type);
+ mode = Addressing_Constant;
+ value = exact_value_i64(bt->SimdVector.count);
+ type = t_untyped_integer;
}
if (operand->mode == Addressing_Type && mode != Addressing_Constant) {
mode = Addressing_Invalid;
@@ -1445,6 +2122,11 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
operand->mode = Addressing_Value;
}
+ if (is_type_simd_vector(type) && !is_power_of_two(arg_count)) {
+ error(call, "'swizzle' with a #simd vector must have a power of two arguments, got %lld", cast(long long)arg_count);
+ return false;
+ }
+
operand->type = determine_swizzle_array_type(original_type, type_hint, arg_count);
break;
}
@@ -2279,7 +2961,7 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
if (i == j) continue;
Operand *b = ops[j];
convert_to_typed(c, a, b->type);
- if (a->mode == Addressing_Invalid) { return false; }
+ if (a->mode == Addressing_Invalid) return false;
}
}
@@ -2685,46 +3367,6 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
break;
}
- case BuiltinProc_simd_vector: {
- Operand x = {};
- Operand y = {};
- x = *operand;
- if (!is_type_integer(x.type) || x.mode != Addressing_Constant) {
- error(call, "Expected a constant integer for 'intrinsics.simd_vector'");
- operand->mode = Addressing_Type;
- operand->type = t_invalid;
- return false;
- }
- if (big_int_is_neg(&x.value.value_integer)) {
- error(call, "Negative vector element length");
- operand->mode = Addressing_Type;
- operand->type = t_invalid;
- return false;
- }
- i64 count = big_int_to_i64(&x.value.value_integer);
-
- check_expr_or_type(c, &y, ce->args[1]);
- if (y.mode != Addressing_Type) {
- error(call, "Expected a type 'intrinsics.simd_vector'");
- operand->mode = Addressing_Type;
- operand->type = t_invalid;
- return false;
- }
- Type *elem = y.type;
- if (!is_type_valid_vector_elem(elem)) {
- gbString str = type_to_string(elem);
- error(call, "Invalid element type for 'intrinsics.simd_vector', expected an integer or float with no specific endianness, got '%s'", str);
- gb_string_free(str);
- operand->mode = Addressing_Type;
- operand->type = t_invalid;
- return false;
- }
-
- operand->mode = Addressing_Type;
- operand->type = alloc_type_simd_vector(count, elem);
- break;
- }
-
case BuiltinProc_is_package_imported: {
bool value = false;
@@ -2944,7 +3586,14 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
return false;
}
- if (!is_type_integer_like(x.type)) {
+ if (is_type_simd_vector(x.type)) {
+ Type *elem = base_array_type(x.type);
+ if (!is_type_integer_like(elem)) {
+ gbString xts = type_to_string(x.type);
+ error(x.expr, "#simd values passed to '%.*s' must have an element of an integer-like type (integer, boolean, enum, bit_set), got %s", LIT(builtin_name), xts);
+ gb_string_free(xts);
+ }
+ } else if (!is_type_integer_like(x.type)) {
gbString xts = type_to_string(x.type);
error(x.expr, "Values passed to '%.*s' must be an integer-like type (integer, boolean, enum, bit_set), got %s", LIT(builtin_name), xts);
gb_string_free(xts);
@@ -3002,7 +3651,7 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
if (y.mode == Addressing_Invalid) {
return false;
}
- convert_to_typed(c, &y, x.type);
+ convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
convert_to_typed(c, &x, y.type);
if (is_type_untyped(x.type)) {
gbString xts = type_to_string(x.type);
@@ -3039,14 +3688,23 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
if (x.mode == Addressing_Invalid) {
return false;
}
- if (!is_type_float(x.type)) {
+
+ Type *elem = core_array_type(x.type);
+ if (!is_type_float(x.type) && !(is_type_simd_vector(x.type) && is_type_float(elem))) {
gbString xts = type_to_string(x.type);
- error(x.expr, "Expected a floating point value for '%.*s', got %s", LIT(builtin_name), xts);
+ error(x.expr, "Expected a floating point or #simd vector value for '%.*s', got %s", LIT(builtin_name), xts);
gb_string_free(xts);
return false;
+ } else if (is_type_different_to_arch_endianness(elem)) {
+ GB_ASSERT(elem->kind == Type_Basic);
+ if (elem->Basic.flags & (BasicFlag_EndianLittle|BasicFlag_EndianBig)) {
+ gbString xts = type_to_string(x.type);
+ error(x.expr, "Expected a float which does not specify the explicit endianness for '%.*s', got %s", LIT(builtin_name), xts);
+ gb_string_free(xts);
+ return false;
+ }
}
-
- if (x.mode == Addressing_Constant) {
+ if (is_type_float(x.type) && x.mode == Addressing_Constant) {
f64 v = exact_value_to_f64(x.value);
operand->mode = Addressing_Constant;
@@ -3059,6 +3717,59 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
}
break;
+ case BuiltinProc_fused_mul_add:
+ {
+ Operand x = {};
+ Operand y = {};
+ Operand z = {};
+ check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+ check_expr(c, &y, ce->args[1]); if (y.mode == Addressing_Invalid) return false;
+ check_expr(c, &z, ce->args[2]); if (z.mode == Addressing_Invalid) return false;
+
+ convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
+ convert_to_typed(c, &x, y.type); if (x.mode == Addressing_Invalid) return false;
+ convert_to_typed(c, &z, x.type); if (z.mode == Addressing_Invalid) return false;
+ convert_to_typed(c, &x, z.type); if (x.mode == Addressing_Invalid) return false;
+ if (is_type_untyped(x.type)) {
+ gbString xts = type_to_string(x.type);
+ error(x.expr, "Expected a typed floating point value or #simd vector for '%.*s', got %s", LIT(builtin_name), xts);
+ gb_string_free(xts);
+ return false;
+ }
+
+ Type *elem = core_array_type(x.type);
+ if (!is_type_float(x.type) && !(is_type_simd_vector(x.type) && is_type_float(elem))) {
+ gbString xts = type_to_string(x.type);
+ error(x.expr, "Expected a floating point or #simd vector value for '%.*s', got %s", LIT(builtin_name), xts);
+ gb_string_free(xts);
+ return false;
+ }
+ if (is_type_different_to_arch_endianness(elem)) {
+ GB_ASSERT(elem->kind == Type_Basic);
+ if (elem->Basic.flags & (BasicFlag_EndianLittle|BasicFlag_EndianBig)) {
+ gbString xts = type_to_string(x.type);
+ error(x.expr, "Expected a float which does not specify the explicit endianness for '%.*s', got %s", LIT(builtin_name), xts);
+ gb_string_free(xts);
+ return false;
+ }
+ }
+
+ if (!are_types_identical(x.type, y.type) || !are_types_identical(y.type, z.type)) {
+ gbString xts = type_to_string(x.type);
+ gbString yts = type_to_string(y.type);
+ gbString zts = type_to_string(z.type);
+ error(x.expr, "Mismatched types for '%.*s', got %s vs %s vs %s", LIT(builtin_name), xts, yts, zts);
+ gb_string_free(zts);
+ gb_string_free(yts);
+ gb_string_free(xts);
+ return false;
+ }
+
+ operand->mode = Addressing_Value;
+ operand->type = default_type(x.type);
+ }
+ break;
+
case BuiltinProc_mem_copy:
case BuiltinProc_mem_copy_non_overlapping:
{
@@ -3309,9 +4020,8 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
break;
case BuiltinProc_volatile_store:
- /*fallthrough*/
case BuiltinProc_unaligned_store:
- /*fallthrough*/
+ case BuiltinProc_non_temporal_store:
case BuiltinProc_atomic_store:
{
Type *elem = nullptr;
@@ -3358,9 +4068,8 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
case BuiltinProc_volatile_load:
- /*fallthrough*/
case BuiltinProc_unaligned_load:
- /*fallthrough*/
+ case BuiltinProc_non_temporal_load:
case BuiltinProc_atomic_load:
{
Type *elem = nullptr;
@@ -3618,7 +4327,7 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
if (x.mode == Addressing_Invalid) {
return false;
}
- convert_to_typed(c, &y, x.type);
+ convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
if (x.mode == Addressing_Invalid) {
return false;
}
@@ -3675,7 +4384,7 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
if (y.mode == Addressing_Invalid) {
return false;
}
- convert_to_typed(c, &y, x.type);
+ convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
convert_to_typed(c, &x, y.type);
if (!are_types_identical(x.type, y.type)) {
gbString xts = type_to_string(x.type);
@@ -4566,6 +5275,64 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
}
break;
+ case BuiltinProc_x86_cpuid:
+ {
+ if (!is_arch_x86()) {
+ error(call, "'%.*s' is only allowed on x86 targets (i386, amd64)", LIT(builtin_name));
+ return false;
+ }
+
+ Operand ax = {};
+ Operand cx = {};
+
+ check_expr_with_type_hint(c, &ax, ce->args[0], t_u32); if (ax.mode == Addressing_Invalid) return false;
+ check_expr_with_type_hint(c, &cx, ce->args[1], t_u32); if (cx.mode == Addressing_Invalid) return false;
+ convert_to_typed(c, &ax, t_u32); if (ax.mode == Addressing_Invalid) return false;
+ convert_to_typed(c, &cx, t_u32); if (cx.mode == Addressing_Invalid) return false;
+ if (!are_types_identical(ax.type, t_u32)) {
+ gbString str = type_to_string(ax.type);
+ error(ax.expr, "'%.*s' expected a u32, got %s", LIT(builtin_name), str);
+ gb_string_free(str);
+ return false;
+ }
+ if (!are_types_identical(cx.type, t_u32)) {
+ gbString str = type_to_string(cx.type);
+ error(cx.expr, "'%.*s' expected a u32, got %s", LIT(builtin_name), str);
+ gb_string_free(str);
+ return false;
+ }
+ Type *types[4] = {t_u32, t_u32, t_u32, t_u32}; // eax ebx ecx edx
+ operand->type = alloc_type_tuple_from_field_types(types, gb_count_of(types), false, false);
+ operand->mode = Addressing_Value;
+ operand->value = {};
+ return true;
+ }
+ break;
+ case BuiltinProc_x86_xgetbv:
+ {
+ if (!is_arch_x86()) {
+ error(call, "'%.*s' is only allowed on x86 targets (i386, amd64)", LIT(builtin_name));
+ return false;
+ }
+
+ Operand cx = {};
+ check_expr_with_type_hint(c, &cx, ce->args[0], t_u32); if (cx.mode == Addressing_Invalid) return false;
+ convert_to_typed(c, &cx, t_u32); if (cx.mode == Addressing_Invalid) return false;
+ if (!are_types_identical(cx.type, t_u32)) {
+ gbString str = type_to_string(cx.type);
+ error(cx.expr, "'%.*s' expected a u32, got %s", LIT(builtin_name), str);
+ gb_string_free(str);
+ return false;
+ }
+
+ Type *types[2] = {t_u32, t_u32};
+ operand->type = alloc_type_tuple_from_field_types(types, gb_count_of(types), false, false);
+ operand->mode = Addressing_Value;
+ operand->value = {};
+ return true;
+ }
+ break;
+
}
return true;
diff --git a/src/check_decl.cpp b/src/check_decl.cpp
index 82ac6c677..86280b6cb 100644
--- a/src/check_decl.cpp
+++ b/src/check_decl.cpp
@@ -313,13 +313,19 @@ void check_type_decl(CheckerContext *ctx, Entity *e, Ast *init_expr, Type *def)
}
named->Named.base = base;
- if (is_distinct && is_type_typeid(e->type)) {
- error(init_expr, "'distinct' cannot be applied to 'typeid'");
- is_distinct = false;
- }
- if (is_distinct && is_type_any(e->type)) {
- error(init_expr, "'distinct' cannot be applied to 'any'");
- is_distinct = false;
+ if (is_distinct) {
+ if (is_type_typeid(e->type)) {
+ error(init_expr, "'distinct' cannot be applied to 'typeid'");
+ is_distinct = false;
+ } else if (is_type_any(e->type)) {
+ error(init_expr, "'distinct' cannot be applied to 'any'");
+ is_distinct = false;
+ } else if (is_type_simd_vector(e->type)) {
+ gbString str = type_to_string(e->type);
+ error(init_expr, "'distinct' cannot be applied to '%s'", str);
+ gb_string_free(str);
+ is_distinct = false;
+ }
}
if (!is_distinct) {
e->type = bt;
@@ -893,6 +899,18 @@ void check_proc_decl(CheckerContext *ctx, Entity *e, DeclInfo *d) {
}
}
+ if (ac.require_target_feature.len != 0 && ac.enable_target_feature.len != 0) {
+ error(e->token, "Attributes @(require_target_feature=...) and @(enable_target_feature=...) cannot be used together");
+ } else if (ac.require_target_feature.len != 0) {
+ if (check_target_feature_is_enabled(e->token.pos, ac.require_target_feature)) {
+ e->Procedure.target_feature = ac.require_target_feature;
+ } else {
+ e->Procedure.target_feature_disabled = true;
+ }
+ } else if (ac.enable_target_feature.len != 0) {
+ enable_target_feature(e->token.pos, ac.enable_target_feature);
+ e->Procedure.target_feature = ac.enable_target_feature;
+ }
switch (e->Procedure.optimization_mode) {
case ProcedureOptimizationMode_None:
@@ -996,10 +1014,12 @@ void check_proc_decl(CheckerContext *ctx, Entity *e, DeclInfo *d) {
}
}
- if (pt->result_count == 0 && ac.require_results) {
- error(pl->type, "'require_results' is not needed on a procedure with no results");
- } else {
- pt->require_results = ac.require_results;
+ if (ac.require_results) {
+ if (pt->result_count == 0) {
+ error(pl->type, "'require_results' is not needed on a procedure with no results");
+ } else {
+ pt->require_results = true;
+ }
}
if (ac.link_name.len > 0) {
@@ -1309,20 +1329,20 @@ void check_proc_group_decl(CheckerContext *ctx, Entity *&pg_entity, DeclInfo *d)
if (!both_have_where_clauses) switch (kind) {
case ProcOverload_Identical:
- error(p->token, "Overloaded procedure '%.*s' as the same type as another procedure in the procedure group '%.*s'", LIT(name), LIT(proc_group_name));
+ error(p->token, "Overloaded procedure '%.*s' has the same type as another procedure in the procedure group '%.*s'", LIT(name), LIT(proc_group_name));
is_invalid = true;
break;
// case ProcOverload_CallingConvention:
- // error(p->token, "Overloaded procedure '%.*s' as the same type as another procedure in the procedure group '%.*s'", LIT(name), LIT(proc_group_name));
+ // error(p->token, "Overloaded procedure '%.*s' has the same type as another procedure in the procedure group '%.*s'", LIT(name), LIT(proc_group_name));
// is_invalid = true;
// break;
case ProcOverload_ParamVariadic:
- error(p->token, "Overloaded procedure '%.*s' as the same type as another procedure in the procedure group '%.*s'", LIT(name), LIT(proc_group_name));
+ error(p->token, "Overloaded procedure '%.*s' has the same type as another procedure in the procedure group '%.*s'", LIT(name), LIT(proc_group_name));
is_invalid = true;
break;
case ProcOverload_ResultCount:
case ProcOverload_ResultTypes:
- error(p->token, "Overloaded procedure '%.*s' as the same parameters but different results in the procedure group '%.*s'", LIT(name), LIT(proc_group_name));
+ error(p->token, "Overloaded procedure '%.*s' has the same parameters but different results in the procedure group '%.*s'", LIT(name), LIT(proc_group_name));
is_invalid = true;
break;
case ProcOverload_Polymorphic:
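The two new attributes handled earlier in this file are mutually exclusive: require_target_feature only keeps the procedure when the build target already enables the feature, while enable_target_feature switches the feature on for that one procedure. A hedged sketch of the intended usage (the feature names are illustrative):

    @(enable_target_feature="sse4.1")
    add_lanes :: proc(a, b: #simd[4]f32) -> #simd[4]f32 {
        return intrinsics.simd_add(a, b)
    }

    @(require_target_feature="avx")
    needs_avx :: proc() {
        // only usable when the target micro-architecture provides AVX
    }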
diff --git a/src/check_expr.cpp b/src/check_expr.cpp
index 7b269e048..f954f1583 100644
--- a/src/check_expr.cpp
+++ b/src/check_expr.cpp
@@ -442,6 +442,14 @@ bool find_or_generate_polymorphic_procedure(CheckerContext *old_c, Entity *base_
final_proc_type->Proc.is_poly_specialized = true;
final_proc_type->Proc.is_polymorphic = true;
+ final_proc_type->Proc.variadic = src->Proc.variadic;
+ final_proc_type->Proc.require_results = src->Proc.require_results;
+ final_proc_type->Proc.c_vararg = src->Proc.c_vararg;
+ final_proc_type->Proc.has_named_results = src->Proc.has_named_results;
+ final_proc_type->Proc.diverging = src->Proc.diverging;
+ final_proc_type->Proc.return_by_pointer = src->Proc.return_by_pointer;
+ final_proc_type->Proc.optional_ok = src->Proc.optional_ok;
+
for (isize i = 0; i < operands.count; i++) {
Operand o = operands[i];
@@ -777,6 +785,14 @@ i64 check_distance_between_types(CheckerContext *c, Operand *operand, Type *type
return distance + 6;
}
}
+
+ if (is_type_simd_vector(dst)) {
+ Type *dst_elem = base_array_type(dst);
+ i64 distance = check_distance_between_types(c, operand, dst_elem);
+ if (distance >= 0) {
+ return distance + 6;
+ }
+ }
if (is_type_matrix(dst)) {
Type *dst_elem = base_array_type(dst);
@@ -786,6 +802,7 @@ i64 check_distance_between_types(CheckerContext *c, Operand *operand, Type *type
}
}
+
if (is_type_any(dst)) {
if (!is_type_polymorphic(src)) {
if (operand->mode == Addressing_Context && operand->type == t_context) {
@@ -1328,6 +1345,19 @@ bool is_polymorphic_type_assignable(CheckerContext *c, Type *poly, Type *source,
}
}
return false;
+
+ case Type_SimdVector:
+ if (source->kind == Type_SimdVector) {
+ if (poly->SimdVector.generic_count != nullptr) {
+ if (!polymorphic_assign_index(&poly->SimdVector.generic_count, &poly->SimdVector.count, source->SimdVector.count)) {
+ return false;
+ }
+ }
+ if (poly->SimdVector.count == source->SimdVector.count) {
+ return is_polymorphic_type_assignable(c, poly->SimdVector.elem, source->SimdVector.elem, true, modify_type);
+ }
+ }
+ return false;
}
return false;
}
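With the Type_SimdVector case above, both the lane count and the element type of a #simd parameter may be polymorphic. A minimal sketch, assuming the scalar-to-vector cast permitted by check_is_castable_to below:

    scale :: proc(v: #simd[$N]$T, s: T) -> #simd[N]T {
        return intrinsics.simd_mul(v, cast(#simd[N]T)s)
    }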
@@ -1567,9 +1597,11 @@ bool check_unary_op(CheckerContext *c, Operand *o, Token op) {
bool check_binary_op(CheckerContext *c, Operand *o, Token op) {
Type *main_type = o->type;
+
// TODO(bill): Handle errors correctly
Type *type = base_type(core_array_type(main_type));
Type *ct = core_type(type);
+
switch (op.kind) {
case Token_Sub:
case Token_SubEq:
@@ -1638,14 +1670,6 @@ bool check_binary_op(CheckerContext *c, Operand *o, Token op) {
error(op, "Operator '%.*s' is only allowed with integers", LIT(op.string));
return false;
}
- if (is_type_simd_vector(o->type)) {
- switch (op.kind) {
- case Token_ModMod:
- case Token_ModModEq:
- error(op, "Operator '%.*s' is only allowed with integers", LIT(op.string));
- return false;
- }
- }
break;
case Token_AndNot:
@@ -1654,14 +1678,6 @@ bool check_binary_op(CheckerContext *c, Operand *o, Token op) {
error(op, "Operator '%.*s' is only allowed with integers and bit sets", LIT(op.string));
return false;
}
- if (is_type_simd_vector(o->type)) {
- switch (op.kind) {
- case Token_AndNot:
- case Token_AndNotEq:
- error(op, "Operator '%.*s' is only allowed with integers", LIT(op.string));
- return false;
- }
- }
break;
case Token_CmpAnd:
@@ -2487,6 +2503,8 @@ void check_shift(CheckerContext *c, Operand *x, Operand *y, Ast *node, Type *typ
gb_string_free(err_str);
}
+ // TODO(bill): Should we support shifts for fixed arrays and #simd vectors?
+
if (!is_type_integer(x->type)) {
gbString err_str = expr_to_string(y->expr);
error(node, "Shift operand '%s' must be an integer", err_str);
@@ -2697,6 +2715,26 @@ bool check_is_castable_to(CheckerContext *c, Operand *operand, Type *y) {
return true;
}
+ if (is_type_simd_vector(src) && is_type_simd_vector(dst)) {
+ if (src->SimdVector.count != dst->SimdVector.count) {
+ return false;
+ }
+ Type *elem_src = base_array_type(src);
+ Type *elem_dst = base_array_type(dst);
+ Operand x = {};
+ x.type = elem_src;
+ x.mode = Addressing_Value;
+ return check_is_castable_to(c, &x, elem_dst);
+ }
+
+ if (is_type_simd_vector(dst)) {
+ Type *elem = base_array_type(dst);
+ if (check_is_castable_to(c, operand, elem)) {
+ return true;
+ }
+ }
+
return false;
}
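In short: vector-to-vector casts need identical lane counts plus element-castable lanes, and a scalar castable to the element type may be cast to the vector (a broadcast). For example:

    a: #simd[4]f32
    b := cast(#simd[4]i32)a  // ok: lane-wise float to int
    c := cast(#simd[4]f32)2  // ok: scalar broadcast into every lane
    // d := cast(#simd[8]i32)a // rejected: lane counts differ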
@@ -4116,7 +4154,11 @@ ExactValue get_constant_field(CheckerContext *c, Operand const *operand, Selecti
Type *determine_swizzle_array_type(Type *original_type, Type *type_hint, isize new_count) {
Type *array_type = base_type(type_deref(original_type));
- GB_ASSERT(array_type->kind == Type_Array);
+ GB_ASSERT(array_type->kind == Type_Array || array_type->kind == Type_SimdVector);
+ if (array_type->kind == Type_SimdVector) {
+ Type *elem_type = array_type->SimdVector.elem;
+ return alloc_type_simd_vector(new_count, elem_type);
+ }
Type *elem_type = array_type->Array.elem;
Type *swizzle_array_type = nullptr;
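This extends the array swizzle selectors to #simd operands, with the result typed as a vector of the selected width. A sketch, assuming the usual xyzw component names carry over:

    v: #simd[4]f32
    r := v.wzyx // #simd[4]f32 with the lanes reversed
    p := v.xy   // #simd[2]f32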
@@ -7738,111 +7780,106 @@ ExprKind check_compound_literal(CheckerContext *c, Operand *o, Ast *node, Type *
}
if (cl->elems.count > 0 && cl->elems[0]->kind == Ast_FieldValue) {
- if (is_type_simd_vector(t)) {
- error(cl->elems[0], "'field = value' is not allowed for SIMD vector literals");
- } else {
- RangeCache rc = range_cache_make(heap_allocator());
- defer (range_cache_destroy(&rc));
+ RangeCache rc = range_cache_make(heap_allocator());
+ defer (range_cache_destroy(&rc));
- for_array(i, cl->elems) {
- Ast *elem = cl->elems[i];
- if (elem->kind != Ast_FieldValue) {
- error(elem, "Mixture of 'field = value' and value elements in a literal is not allowed");
- continue;
- }
- ast_node(fv, FieldValue, elem);
+ for_array(i, cl->elems) {
+ Ast *elem = cl->elems[i];
+ if (elem->kind != Ast_FieldValue) {
+ error(elem, "Mixture of 'field = value' and value elements in a literal is not allowed");
+ continue;
+ }
+ ast_node(fv, FieldValue, elem);
- if (is_ast_range(fv->field)) {
- Token op = fv->field->BinaryExpr.op;
+ if (is_ast_range(fv->field)) {
+ Token op = fv->field->BinaryExpr.op;
- Operand x = {};
- Operand y = {};
- bool ok = check_range(c, fv->field, &x, &y, nullptr);
- if (!ok) {
- continue;
- }
- if (x.mode != Addressing_Constant || !is_type_integer(core_type(x.type))) {
- error(x.expr, "Expected a constant integer as an array field");
- continue;
- }
+ Operand x = {};
+ Operand y = {};
+ bool ok = check_range(c, fv->field, &x, &y, nullptr);
+ if (!ok) {
+ continue;
+ }
+ if (x.mode != Addressing_Constant || !is_type_integer(core_type(x.type))) {
+ error(x.expr, "Expected a constant integer as an array field");
+ continue;
+ }
- if (y.mode != Addressing_Constant || !is_type_integer(core_type(y.type))) {
- error(y.expr, "Expected a constant integer as an array field");
- continue;
- }
+ if (y.mode != Addressing_Constant || !is_type_integer(core_type(y.type))) {
+ error(y.expr, "Expected a constant integer as an array field");
+ continue;
+ }
- i64 lo = exact_value_to_i64(x.value);
- i64 hi = exact_value_to_i64(y.value);
- i64 max_index = hi;
- if (op.kind == Token_RangeHalf) { // ..< (exclusive)
- hi -= 1;
- } else { // .. (inclusive)
- max_index += 1;
- }
+ i64 lo = exact_value_to_i64(x.value);
+ i64 hi = exact_value_to_i64(y.value);
+ i64 max_index = hi;
+ if (op.kind == Token_RangeHalf) { // ..< (exclusive)
+ hi -= 1;
+ } else { // .. (inclusive)
+ max_index += 1;
+ }
- bool new_range = range_cache_add_range(&rc, lo, hi);
- if (!new_range) {
- error(elem, "Overlapping field range index %lld %.*s %lld for %.*s", lo, LIT(op.string), hi, LIT(context_name));
- continue;
- }
+ bool new_range = range_cache_add_range(&rc, lo, hi);
+ if (!new_range) {
+ error(elem, "Overlapping field range index %lld %.*s %lld for %.*s", lo, LIT(op.string), hi, LIT(context_name));
+ continue;
+ }
- if (max_type_count >= 0 && (lo < 0 || lo >= max_type_count)) {
- error(elem, "Index %lld is out of bounds (0..<%lld) for %.*s", lo, max_type_count, LIT(context_name));
- continue;
- }
- if (max_type_count >= 0 && (hi < 0 || hi >= max_type_count)) {
- error(elem, "Index %lld is out of bounds (0..<%lld) for %.*s", hi, max_type_count, LIT(context_name));
- continue;
- }
+ if (max_type_count >= 0 && (lo < 0 || lo >= max_type_count)) {
+ error(elem, "Index %lld is out of bounds (0..<%lld) for %.*s", lo, max_type_count, LIT(context_name));
+ continue;
+ }
+ if (max_type_count >= 0 && (hi < 0 || hi >= max_type_count)) {
+ error(elem, "Index %lld is out of bounds (0..<%lld) for %.*s", hi, max_type_count, LIT(context_name));
+ continue;
+ }
- if (max < hi) {
- max = max_index;
- }
+ if (max < hi) {
+ max = max_index;
+ }
- Operand operand = {};
- check_expr_with_type_hint(c, &operand, fv->value, elem_type);
- check_assignment(c, &operand, elem_type, context_name);
+ Operand operand = {};
+ check_expr_with_type_hint(c, &operand, fv->value, elem_type);
+ check_assignment(c, &operand, elem_type, context_name);
- is_constant = is_constant && operand.mode == Addressing_Constant;
- } else {
- Operand op_index = {};
- check_expr(c, &op_index, fv->field);
+ is_constant = is_constant && operand.mode == Addressing_Constant;
+ } else {
+ Operand op_index = {};
+ check_expr(c, &op_index, fv->field);
- if (op_index.mode != Addressing_Constant || !is_type_integer(core_type(op_index.type))) {
- error(elem, "Expected a constant integer as an array field");
- continue;
- }
- // add_type_and_value(c->info, op_index.expr, op_index.mode, op_index.type, op_index.value);
+ if (op_index.mode != Addressing_Constant || !is_type_integer(core_type(op_index.type))) {
+ error(elem, "Expected a constant integer as an array field");
+ continue;
+ }
+ // add_type_and_value(c->info, op_index.expr, op_index.mode, op_index.type, op_index.value);
- i64 index = exact_value_to_i64(op_index.value);
+ i64 index = exact_value_to_i64(op_index.value);
- if (max_type_count >= 0 && (index < 0 || index >= max_type_count)) {
- error(elem, "Index %lld is out of bounds (0..<%lld) for %.*s", index, max_type_count, LIT(context_name));
- continue;
- }
+ if (max_type_count >= 0 && (index < 0 || index >= max_type_count)) {
+ error(elem, "Index %lld is out of bounds (0..<%lld) for %.*s", index, max_type_count, LIT(context_name));
+ continue;
+ }
- bool new_index = range_cache_add_index(&rc, index);
- if (!new_index) {
- error(elem, "Duplicate field index %lld for %.*s", index, LIT(context_name));
- continue;
- }
+ bool new_index = range_cache_add_index(&rc, index);
+ if (!new_index) {
+ error(elem, "Duplicate field index %lld for %.*s", index, LIT(context_name));
+ continue;
+ }
- if (max < index+1) {
- max = index+1;
- }
+ if (max < index+1) {
+ max = index+1;
+ }
- Operand operand = {};
- check_expr_with_type_hint(c, &operand, fv->value, elem_type);
- check_assignment(c, &operand, elem_type, context_name);
+ Operand operand = {};
+ check_expr_with_type_hint(c, &operand, fv->value, elem_type);
+ check_assignment(c, &operand, elem_type, context_name);
- is_constant = is_constant && operand.mode == Addressing_Constant;
- }
+ is_constant = is_constant && operand.mode == Addressing_Constant;
}
-
- cl->max_count = max;
}
+ cl->max_count = max;
} else {
isize index = 0;
for (; index < cl->elems.count; index++) {
@@ -7887,7 +7924,7 @@ ExprKind check_compound_literal(CheckerContext *c, Operand *o, Ast *node, Type *
if (t->kind == Type_SimdVector) {
if (!is_constant) {
- error(node, "Expected all constant elements for a simd vector");
+ // error(node, "Expected all constant elements for a simd vector");
}
}
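'field = value' elements (including ranges) are now accepted for SIMD vector literals via the shared array path above; unnamed lanes default to zero. For example:

    v := #simd[8]u8{0 = 1, 2..<5 = 9} // lanes {1, 0, 9, 9, 9, 0, 0, 0}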
diff --git a/src/check_stmt.cpp b/src/check_stmt.cpp
index f2c830c1b..f061b4961 100644
--- a/src/check_stmt.cpp
+++ b/src/check_stmt.cpp
@@ -1381,6 +1381,18 @@ bool all_operands_valid(Array<Operand> const &operands) {
return true;
}
+bool check_stmt_internal_builtin_proc_id(Ast *expr, BuiltinProcId *id_) {
+ BuiltinProcId id = BuiltinProc_Invalid;
+ Entity *e = entity_of_node(expr);
+ if (e != nullptr && e->kind == Entity_Builtin) {
+ if (e->Builtin.id && e->Builtin.id != BuiltinProc_DIRECTIVE) {
+ id = cast(BuiltinProcId)e->Builtin.id;
+ }
+ }
+ if (id_) *id_ = id;
+ return id != BuiltinProc_Invalid;
+}
+
void check_stmt_internal(CheckerContext *ctx, Ast *node, u32 flags) {
u32 mod_flags = flags & (~Stmt_FallthroughAllowed);
switch (node->kind) {
@@ -1405,29 +1417,43 @@ void check_stmt_internal(CheckerContext *ctx, Ast *node, u32 flags) {
if (kind == Expr_Stmt) {
return;
}
- Ast *expr = strip_or_return_expr(operand.expr);
+ Ast *expr = strip_or_return_expr(operand.expr);
if (expr->kind == Ast_CallExpr) {
+ BuiltinProcId builtin_id = BuiltinProc_Invalid;
+ bool do_require = false;
+
AstCallExpr *ce = &expr->CallExpr;
- Type *t = type_of_expr(ce->proc);
- if (is_type_proc(t)) {
- if (t->Proc.require_results) {
- gbString expr_str = expr_to_string(ce->proc);
- error(node, "'%s' requires that its results must be handled", expr_str);
- gb_string_free(expr_str);
- }
+ Type *t = base_type(type_of_expr(ce->proc));
+ if (t->kind == Type_Proc) {
+ do_require = t->Proc.require_results;
+ } else if (check_stmt_internal_builtin_proc_id(ce->proc, &builtin_id)) {
+ auto const &bp = builtin_procs[builtin_id];
+ do_require = bp.kind == Expr_Expr && !bp.ignore_results;
+ }
+ if (do_require) {
+ gbString expr_str = expr_to_string(ce->proc);
+ error(node, "'%s' requires that its results must be handled", expr_str);
+ gb_string_free(expr_str);
}
return;
} else if (expr->kind == Ast_SelectorCallExpr) {
+ BuiltinProcId builtin_id = BuiltinProc_Invalid;
+ bool do_require = false;
+
AstSelectorCallExpr *se = &expr->SelectorCallExpr;
ast_node(ce, CallExpr, se->call);
- Type *t = type_of_expr(ce->proc);
- if (is_type_proc(t)) {
- if (t->Proc.require_results) {
- gbString expr_str = expr_to_string(ce->proc);
- error(node, "'%s' requires that its results must be handled", expr_str);
- gb_string_free(expr_str);
- }
+ Type *t = base_type(type_of_expr(ce->proc));
+ if (t->kind == Type_Proc) {
+ do_require = t->Proc.require_results;
+ } else if (check_stmt_internal_builtin_proc_id(ce->proc, &builtin_id)) {
+ auto const &bp = builtin_procs[builtin_id];
+ do_require = bp.kind == Expr_Expr && !bp.ignore_results;
+ }
+ if (do_require) {
+ gbString expr_str = expr_to_string(ce->proc);
+ error(node, "'%s' requires that its results must be handled", expr_str);
+ gb_string_free(expr_str);
}
return;
}
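The net effect is that value-returning intrinsics are now subject to the same results-must-be-handled rule when called as statements, except those flagged ignore_results in the table further down (the atomics, syscall, objc_send, ...). Sketch:

    x: int
    intrinsics.atomic_add(&x, 1)  // ok: flagged ignore_results, the old value may be dropped
    // intrinsics.count_ones(x)   // error: result must be handled
    n := intrinsics.count_ones(x) // ok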
diff --git a/src/check_type.cpp b/src/check_type.cpp
index 51f472961..fc5b7aed7 100644
--- a/src/check_type.cpp
+++ b/src/check_type.cpp
@@ -1234,7 +1234,7 @@ bool check_type_specialization_to(CheckerContext *ctx, Type *specialization, Typ
}
-Type *determine_type_from_polymorphic(CheckerContext *ctx, Type *poly_type, Operand operand) {
+Type *determine_type_from_polymorphic(CheckerContext *ctx, Type *poly_type, Operand const &operand) {
bool modify_type = !ctx->no_polymorphic_errors;
bool show_error = modify_type && !ctx->hide_polymorphic_errors;
if (!is_operand_value(operand)) {
@@ -2795,15 +2795,27 @@ bool check_type_internal(CheckerContext *ctx, Ast *e, Type **type, Type *named_t
if (name == "soa") {
*type = make_soa_struct_fixed(ctx, e, at->elem, elem, count, generic_type);
} else if (name == "simd") {
- if (!is_type_valid_vector_elem(elem)) {
+ if (!is_type_valid_vector_elem(elem) && !is_type_polymorphic(elem)) {
gbString str = type_to_string(elem);
- error(at->elem, "Invalid element type for 'intrinsics.simd_vector', expected an integer or float with no specific endianness, got '%s'", str);
+ error(at->elem, "Invalid element type for #simd, expected an integer, float, or boolean with no specific endianness, got '%s'", str);
gb_string_free(str);
*type = alloc_type_array(elem, count, generic_type);
goto array_end;
}
- *type = alloc_type_simd_vector(count, elem);
+ if (generic_type != nullptr) {
+ // Ignore
+ } else if (count < 1 || !is_power_of_two(count)) {
+ error(at->count, "Invalid length for #simd, expected a power of two length, got '%lld'", cast(long long)count);
+ *type = alloc_type_array(elem, count, generic_type);
+ goto array_end;
+ }
+
+ *type = alloc_type_simd_vector(count, elem, generic_type);
+
+ if (count > SIMD_ELEMENT_COUNT_MAX) {
+ error(at->count, "#simd support a maximum element count of %d, got %lld", SIMD_ELEMENT_COUNT_MAX, cast(long long)count);
+ }
} else {
error(at->tag, "Invalid tag applied to array, got #%.*s", LIT(name));
*type = alloc_type_array(elem, count, generic_type);
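So a concrete #simd type needs a power-of-two lane count no greater than SIMD_ELEMENT_COUNT_MAX, and an element type that is an integer, float, or boolean without explicit endianness. For example:

    v0: #simd[4]f32 // ok
    v1: #simd[16]u8 // ok
    // v2: #simd[3]f32   // error: length must be a power of two
    // v3: #simd[4]u32be // error: explicit-endianness elements are rejected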
diff --git a/src/checker.cpp b/src/checker.cpp
index 8afc6eb14..874839ece 100644
--- a/src/checker.cpp
+++ b/src/checker.cpp
@@ -3207,6 +3207,22 @@ DECL_ATTRIBUTE_PROC(proc_decl_attribute) {
}
}
return true;
+ } else if (name == "require_target_feature") {
+ ExactValue ev = check_decl_attribute_value(c, value);
+ if (ev.kind == ExactValue_String) {
+ ac->require_target_feature = ev.value_string;
+ } else {
+ error(elem, "Expected a string value for '%.*s'", LIT(name));
+ }
+ return true;
+ } else if (name == "enable_target_feature") {
+ ExactValue ev = check_decl_attribute_value(c, value);
+ if (ev.kind == ExactValue_String) {
+ ac->enable_target_feature = ev.value_string;
+ } else {
+ error(elem, "Expected a string value for '%.*s'", LIT(name));
+ }
+ return true;
}
return false;
}
diff --git a/src/checker.hpp b/src/checker.hpp
index 1c9ffd8c7..f11a00532 100644
--- a/src/checker.hpp
+++ b/src/checker.hpp
@@ -60,6 +60,7 @@ struct BuiltinProc {
ExprKind kind;
BuiltinProcPkg pkg;
bool diverging;
+ bool ignore_results; // exempt from 'require_results' handling
};
@@ -124,6 +125,9 @@ struct AttributeContext {
String objc_name;
bool objc_is_class_method;
Type * objc_type;
+
+ String require_target_feature; // features the target micro-architecture must provide
+ String enable_target_feature; // features enabled for this procedure only
};
AttributeContext make_attribute_context(String link_prefix) {
diff --git a/src/checker_builtin_procs.hpp b/src/checker_builtin_procs.hpp
index d407ef7c1..05f256775 100644
--- a/src/checker_builtin_procs.hpp
+++ b/src/checker_builtin_procs.hpp
@@ -45,7 +45,6 @@ enum BuiltinProcId {
// "Intrinsics"
BuiltinProc_is_package_imported,
- BuiltinProc_simd_vector,
BuiltinProc_soa_struct,
BuiltinProc_alloca,
@@ -66,6 +65,7 @@ enum BuiltinProcId {
BuiltinProc_overflow_mul,
BuiltinProc_sqrt,
+ BuiltinProc_fused_mul_add,
BuiltinProc_mem_copy,
BuiltinProc_mem_copy_non_overlapping,
@@ -80,6 +80,8 @@ enum BuiltinProcId {
BuiltinProc_unaligned_store,
BuiltinProc_unaligned_load,
+ BuiltinProc_non_temporal_store,
+ BuiltinProc_non_temporal_load,
BuiltinProc_prefetch_read_instruction,
BuiltinProc_prefetch_read_data,
@@ -118,10 +120,76 @@ enum BuiltinProcId {
BuiltinProc_fixed_point_div_sat,
BuiltinProc_expect,
+
+BuiltinProc__simd_begin,
+ BuiltinProc_simd_add,
+ BuiltinProc_simd_sub,
+ BuiltinProc_simd_mul,
+ BuiltinProc_simd_div,
+ BuiltinProc_simd_rem,
+ BuiltinProc_simd_shl, // Odin logic
+ BuiltinProc_simd_shr, // Odin logic
+ BuiltinProc_simd_shl_masked, // C logic
+ BuiltinProc_simd_shr_masked, // C logic
+
+ BuiltinProc_simd_add_sat, // saturating arithmetic
+ BuiltinProc_simd_sub_sat, // saturating arithmetic
+
+ BuiltinProc_simd_and,
+ BuiltinProc_simd_or,
+ BuiltinProc_simd_xor,
+ BuiltinProc_simd_and_not,
+
+ BuiltinProc_simd_neg,
+ BuiltinProc_simd_abs,
+
+ BuiltinProc_simd_min,
+ BuiltinProc_simd_max,
+ BuiltinProc_simd_clamp,
+
+ BuiltinProc_simd_lanes_eq,
+ BuiltinProc_simd_lanes_ne,
+ BuiltinProc_simd_lanes_lt,
+ BuiltinProc_simd_lanes_le,
+ BuiltinProc_simd_lanes_gt,
+ BuiltinProc_simd_lanes_ge,
+
+ BuiltinProc_simd_extract,
+ BuiltinProc_simd_replace,
+
+ BuiltinProc_simd_reduce_add_ordered,
+ BuiltinProc_simd_reduce_mul_ordered,
+ BuiltinProc_simd_reduce_min,
+ BuiltinProc_simd_reduce_max,
+ BuiltinProc_simd_reduce_and,
+ BuiltinProc_simd_reduce_or,
+ BuiltinProc_simd_reduce_xor,
+
+ BuiltinProc_simd_shuffle,
+ BuiltinProc_simd_select,
+
+ BuiltinProc_simd_ceil,
+ BuiltinProc_simd_floor,
+ BuiltinProc_simd_trunc,
+ BuiltinProc_simd_nearest,
+
+ BuiltinProc_simd_to_bits,
+
+ BuiltinProc_simd_lanes_reverse,
+ BuiltinProc_simd_lanes_rotate_left,
+ BuiltinProc_simd_lanes_rotate_right,
+
+
+ // Platform specific SIMD intrinsics
+ BuiltinProc_simd_x86__MM_SHUFFLE,
+BuiltinProc__simd_end,
// Platform specific intrinsics
BuiltinProc_syscall,
+ BuiltinProc_x86_cpuid,
+ BuiltinProc_x86_xgetbv,
+
// Constant type tests
BuiltinProc__type_begin,
@@ -268,7 +336,6 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
// "Intrinsics"
{STR_LIT("is_package_imported"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
- {STR_LIT("simd_vector"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, // Type
{STR_LIT("soa_struct"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, // Type
{STR_LIT("alloca"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
@@ -290,6 +357,7 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
{STR_LIT("overflow_mul"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
{STR_LIT("sqrt"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("fused_mul_add"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
{STR_LIT("mem_copy"), 3, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
{STR_LIT("mem_copy_non_overlapping"), 3, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
@@ -304,6 +372,8 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
{STR_LIT("unaligned_store"), 2, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
{STR_LIT("unaligned_load"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("non_temporal_store"), 2, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
+ {STR_LIT("non_temporal_load"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
{STR_LIT("prefetch_read_instruction"), 2, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
{STR_LIT("prefetch_read_data"), 2, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
@@ -315,26 +385,26 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
{STR_LIT("atomic_signal_fence"), 1, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
{STR_LIT("atomic_store"), 2, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
{STR_LIT("atomic_store_explicit"), 3, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
- {STR_LIT("atomic_load"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
- {STR_LIT("atomic_load_explicit"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
- {STR_LIT("atomic_add"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
- {STR_LIT("atomic_add_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
- {STR_LIT("atomic_sub"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
- {STR_LIT("atomic_sub_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
- {STR_LIT("atomic_and"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
- {STR_LIT("atomic_and_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
- {STR_LIT("atomic_nand"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
- {STR_LIT("atomic_nand_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
- {STR_LIT("atomic_or"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
- {STR_LIT("atomic_or_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
- {STR_LIT("atomic_xor"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
- {STR_LIT("atomic_xor_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
- {STR_LIT("atomic_exchange"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
- {STR_LIT("atomic_exchange_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
- {STR_LIT("atomic_compare_exchange_strong"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
- {STR_LIT("atomic_compare_exchange_strong_explicit"), 5, false, Expr_Expr, BuiltinProcPkg_intrinsics},
- {STR_LIT("atomic_compare_exchange_weak"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
- {STR_LIT("atomic_compare_exchange_weak_explicit"), 5, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("atomic_load"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+ {STR_LIT("atomic_load_explicit"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+ {STR_LIT("atomic_add"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+ {STR_LIT("atomic_add_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+ {STR_LIT("atomic_sub"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+ {STR_LIT("atomic_sub_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+ {STR_LIT("atomic_and"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+ {STR_LIT("atomic_and_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+ {STR_LIT("atomic_nand"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+ {STR_LIT("atomic_nand_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+ {STR_LIT("atomic_or"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+ {STR_LIT("atomic_or_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+ {STR_LIT("atomic_xor"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+ {STR_LIT("atomic_xor_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+ {STR_LIT("atomic_exchange"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+ {STR_LIT("atomic_exchange_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+ {STR_LIT("atomic_compare_exchange_strong"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+ {STR_LIT("atomic_compare_exchange_strong_explicit"), 5, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+ {STR_LIT("atomic_compare_exchange_weak"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+ {STR_LIT("atomic_compare_exchange_weak_explicit"), 5, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
{STR_LIT("fixed_point_mul"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
{STR_LIT("fixed_point_div"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
@@ -342,8 +412,74 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
{STR_LIT("fixed_point_div_sat"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
{STR_LIT("expect"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-
- {STR_LIT("syscall"), 1, true, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+ {STR_LIT(""), 0, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_add"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_sub"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_mul"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_div"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_rem"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_shl"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_shr"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_shl_masked"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_shr_masked"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+ {STR_LIT("simd_add_sat"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_sub_sat"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+ {STR_LIT("simd_and"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_or"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_xor"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_and_not"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+ {STR_LIT("simd_neg"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+ {STR_LIT("simd_abs"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+ {STR_LIT("simd_min"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_max"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_clamp"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+ {STR_LIT("simd_lanes_eq"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_lanes_ne"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_lanes_lt"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_lanes_le"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_lanes_gt"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_lanes_ge"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+ {STR_LIT("simd_extract"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_replace"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+ {STR_LIT("simd_reduce_add_ordered"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_reduce_mul_ordered"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_reduce_min"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_reduce_max"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_reduce_and"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_reduce_or"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_reduce_xor"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+ {STR_LIT("simd_shuffle"), 2, true, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_select"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+ {STR_LIT("simd_ceil") , 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_floor"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_trunc"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_nearest"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+ {STR_LIT("simd_to_bits"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+ {STR_LIT("simd_lanes_reverse"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_lanes_rotate_left"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("simd_lanes_rotate_right"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+ {STR_LIT("simd_x86__MM_SHUFFLE"), 4, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
+ {STR_LIT(""), 0, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
+
+
+ {STR_LIT("syscall"), 1, true, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+ {STR_LIT("x86_cpuid"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("x86_xgetbv"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
{STR_LIT(""), 0, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
@@ -429,12 +565,12 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
{STR_LIT("__entry_point"), 0, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
- {STR_LIT("objc_send"), 3, true, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("objc_send"), 3, true, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
{STR_LIT("objc_find_selector"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
{STR_LIT("objc_find_class"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
- {STR_LIT("objc_register_selector"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
- {STR_LIT("objc_register_class"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+ {STR_LIT("objc_register_selector"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
+ {STR_LIT("objc_register_class"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true},
{STR_LIT("constant_utf16_cstring"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
diff --git a/src/common.cpp b/src/common.cpp
index 94248fb62..77caddfe8 100644
--- a/src/common.cpp
+++ b/src/common.cpp
@@ -47,6 +47,13 @@ void debugf(char const *fmt, ...);
#include "range_cache.cpp"
+bool is_power_of_two(i64 x) {
+ if (x <= 0) {
+ return false;
+ }
+ return !(x & (x-1));
+}
+
int isize_cmp(isize x, isize y) {
if (x < y) {
return -1;
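The helper relies on the usual bit trick: x & (x-1) clears the lowest set bit, leaving zero only when exactly one bit was set, and non-positive inputs are rejected up front. The same check in Odin, for illustration:

    is_power_of_two :: proc(x: i64) -> bool {
        if x <= 0 { return false }
        return x & (x-1) == 0
    }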
diff --git a/src/entity.cpp b/src/entity.cpp
index 904a630fb..76e6912b9 100644
--- a/src/entity.cpp
+++ b/src/entity.cpp
@@ -233,10 +233,12 @@ struct Entity {
String link_name;
String link_prefix;
DeferredProcedure deferred_procedure;
- bool is_foreign;
- bool is_export;
- bool generated_from_polymorphic;
ProcedureOptimizationMode optimization_mode;
+ bool is_foreign : 1;
+ bool is_export : 1;
+ bool generated_from_polymorphic : 1;
+ bool target_feature_disabled : 1;
+ String target_feature;
} Procedure;
struct {
Array<Entity *> entities;
diff --git a/src/llvm_backend.cpp b/src/llvm_backend.cpp
index 7cf588853..cf7389ec1 100644
--- a/src/llvm_backend.cpp
+++ b/src/llvm_backend.cpp
@@ -1332,8 +1332,8 @@ void lb_generate_code(lbGenerator *gen) {
}
}
- if (build_context.target_features.len != 0) {
- llvm_features = alloc_cstring(permanent_allocator(), build_context.target_features);
+ if (build_context.target_features_set.entries.count != 0) {
+ llvm_features = target_features_set_to_cstring(permanent_allocator(), false);
}
// GB_ASSERT_MSG(LLVMTargetHasAsmBackend(target));
diff --git a/src/llvm_backend_const.cpp b/src/llvm_backend_const.cpp
index 8f17a1cfb..bd76400de 100644
--- a/src/llvm_backend_const.cpp
+++ b/src/llvm_backend_const.cpp
@@ -495,9 +495,9 @@ lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bool allow_loc
res.value = data;
return res;
} else if (is_type_array(type) &&
- value.kind != ExactValue_Invalid &&
- value.kind != ExactValue_String &&
- value.kind != ExactValue_Compound) {
+ value.kind != ExactValue_Invalid &&
+ value.kind != ExactValue_String &&
+ value.kind != ExactValue_Compound) {
i64 count = type->Array.count;
Type *elem = type->Array.elem;
@@ -513,8 +513,8 @@ lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bool allow_loc
res.value = llvm_const_array(lb_type(m, elem), elems, cast(unsigned)count);
return res;
} else if (is_type_matrix(type) &&
- value.kind != ExactValue_Invalid &&
- value.kind != ExactValue_Compound) {
+ value.kind != ExactValue_Invalid &&
+ value.kind != ExactValue_Compound) {
i64 row = type->Matrix.row_count;
i64 column = type->Matrix.column_count;
GB_ASSERT(row == column);
@@ -537,6 +537,22 @@ lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bool allow_loc
res.value = LLVMConstArray(lb_type(m, elem), elems, cast(unsigned)total_elem_count);
return res;
+ } else if (is_type_simd_vector(type) &&
+ value.kind != ExactValue_Invalid &&
+ value.kind != ExactValue_Compound) {
+ i64 count = type->SimdVector.count;
+ Type *elem = type->SimdVector.elem;
+
+ lbValue single_elem = lb_const_value(m, elem, value, allow_local);
+ single_elem.value = llvm_const_cast(single_elem.value, lb_type(m, elem));
+
+ LLVMValueRef *elems = gb_alloc_array(permanent_allocator(), LLVMValueRef, count);
+ for (i64 i = 0; i < count; i++) {
+ elems[i] = single_elem.value;
+ }
+
+ res.value = LLVMConstVector(elems, cast(unsigned)count);
+ return res;
}
switch (value.kind) {
@@ -819,26 +835,81 @@ lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bool allow_loc
return lb_const_nil(m, original_type);
}
GB_ASSERT(elem_type_can_be_constant(elem_type));
-
isize total_elem_count = cast(isize)type->SimdVector.count;
LLVMValueRef *values = gb_alloc_array(temporary_allocator(), LLVMValueRef, total_elem_count);
- for (isize i = 0; i < elem_count; i++) {
- TypeAndValue tav = cl->elems[i]->tav;
- GB_ASSERT(tav.mode != Addressing_Invalid);
- values[i] = lb_const_value(m, elem_type, tav.value, allow_local).value;
- }
- LLVMTypeRef et = lb_type(m, elem_type);
+ if (cl->elems[0]->kind == Ast_FieldValue) {
+ // TODO(bill): This is O(N*M) and will be quite slow; it should probably be sorted beforehand
+ isize value_index = 0;
+ for (i64 i = 0; i < total_elem_count; i++) {
+ bool found = false;
- for (isize i = elem_count; i < type->SimdVector.count; i++) {
- values[i] = LLVMConstNull(et);
- }
- for (isize i = 0; i < total_elem_count; i++) {
- values[i] = llvm_const_cast(values[i], et);
- }
+ for (isize j = 0; j < elem_count; j++) {
+ Ast *elem = cl->elems[j];
+ ast_node(fv, FieldValue, elem);
+ if (is_ast_range(fv->field)) {
+ ast_node(ie, BinaryExpr, fv->field);
+ TypeAndValue lo_tav = ie->left->tav;
+ TypeAndValue hi_tav = ie->right->tav;
+ GB_ASSERT(lo_tav.mode == Addressing_Constant);
+ GB_ASSERT(hi_tav.mode == Addressing_Constant);
- res.value = LLVMConstVector(values, cast(unsigned)total_elem_count);
- return res;
+ TokenKind op = ie->op.kind;
+ i64 lo = exact_value_to_i64(lo_tav.value);
+ i64 hi = exact_value_to_i64(hi_tav.value);
+ if (op != Token_RangeHalf) {
+ hi += 1;
+ }
+ if (lo == i) {
+ TypeAndValue tav = fv->value->tav;
+ LLVMValueRef val = lb_const_value(m, elem_type, tav.value, allow_local).value;
+ for (i64 k = lo; k < hi; k++) {
+ values[value_index++] = val;
+ }
+
+ found = true;
+ i += (hi-lo-1);
+ break;
+ }
+ } else {
+ TypeAndValue index_tav = fv->field->tav;
+ GB_ASSERT(index_tav.mode == Addressing_Constant);
+ i64 index = exact_value_to_i64(index_tav.value);
+ if (index == i) {
+ TypeAndValue tav = fv->value->tav;
+ LLVMValueRef val = lb_const_value(m, elem_type, tav.value, allow_local).value;
+ values[value_index++] = val;
+ found = true;
+ break;
+ }
+ }
+ }
+
+ if (!found) {
+ values[value_index++] = LLVMConstNull(lb_type(m, elem_type));
+ }
+ }
+
+ res.value = LLVMConstVector(values, cast(unsigned)total_elem_count);
+ return res;
+ } else {
+ for (isize i = 0; i < elem_count; i++) {
+ TypeAndValue tav = cl->elems[i]->tav;
+ GB_ASSERT(tav.mode != Addressing_Invalid);
+ values[i] = lb_const_value(m, elem_type, tav.value, allow_local).value;
+ }
+ LLVMTypeRef et = lb_type(m, elem_type);
+
+ for (isize i = elem_count; i < total_elem_count; i++) {
+ values[i] = LLVMConstNull(et);
+ }
+ for (isize i = 0; i < total_elem_count; i++) {
+ values[i] = llvm_const_cast(values[i], et);
+ }
+
+ res.value = LLVMConstVector(values, cast(unsigned)total_elem_count);
+ return res;
+ }
} else if (is_type_struct(type)) {
ast_node(cl, CompoundLit, value.value_compound);
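This mirrors the array and matrix paths just above: a single non-compound constant assigned to a #simd type is broadcast across every lane at compile time, e.g.:

    v: #simd[4]f32 = 2.5 // constant vector <2.5, 2.5, 2.5, 2.5>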
diff --git a/src/llvm_backend_expr.cpp b/src/llvm_backend_expr.cpp
index 133df4d41..1894e85f6 100644
--- a/src/llvm_backend_expr.cpp
+++ b/src/llvm_backend_expr.cpp
@@ -258,7 +258,13 @@ lbValue lb_emit_unary_arith(lbProcedure *p, TokenKind op, lbValue x, Type *type)
LLVMBuildStore(p->builder, v2, LLVMBuildStructGEP(p->builder, addr.addr.value, 2, ""));
LLVMBuildStore(p->builder, v3, LLVMBuildStructGEP(p->builder, addr.addr.value, 3, ""));
return lb_addr_load(p, addr);
-
+ } else if (is_type_simd_vector(x.type)) {
+ Type *elem = base_array_type(x.type);
+ if (is_type_float(elem)) {
+ res.value = LLVMBuildFNeg(p->builder, x.value, "");
+ } else {
+ res.value = LLVMBuildNeg(p->builder, x.value, "");
+ }
} else {
GB_PANIC("Unhandled type %s", type_to_string(x.type));
}
@@ -1820,6 +1826,59 @@ lbValue lb_emit_conv(lbProcedure *p, lbValue value, Type *t) {
return res;
}
+ if (is_type_simd_vector(dst)) {
+ Type *et = base_array_type(dst);
+ if (is_type_simd_vector(src)) {
+ Type *src_elem = core_array_type(src);
+ Type *dst_elem = core_array_type(dst);
+
+ GB_ASSERT(src->SimdVector.count == dst->SimdVector.count);
+
+ lbValue res = {};
+ res.type = t;
+ if (are_types_identical(src_elem, dst_elem)) {
+ res.value = value.value;
+ } else if (is_type_float(src_elem) && is_type_integer(dst_elem)) {
+ if (is_type_unsigned(dst_elem)) {
+ res.value = LLVMBuildFPToUI(p->builder, value.value, lb_type(m, t), "");
+ } else {
+ res.value = LLVMBuildFPToSI(p->builder, value.value, lb_type(m, t), "");
+ }
+ } else if (is_type_integer(src_elem) && is_type_float(dst_elem)) {
+ if (is_type_unsigned(src_elem)) {
+ res.value = LLVMBuildUIToFP(p->builder, value.value, lb_type(m, t), "");
+ } else {
+ res.value = LLVMBuildSIToFP(p->builder, value.value, lb_type(m, t), "");
+ }
+ } else if ((is_type_integer(src_elem) || is_type_boolean(src_elem)) && is_type_integer(dst_elem)) {
+ res.value = LLVMBuildIntCast2(p->builder, value.value, lb_type(m, t), !is_type_unsigned(src_elem), "");
+ } else if (is_type_float(src_elem) && is_type_float(dst_elem)) {
+ res.value = LLVMBuildFPCast(p->builder, value.value, lb_type(m, t), "");
+ } else if (is_type_integer(src_elem) && is_type_boolean(dst_elem)) {
+ LLVMValueRef i1vector = LLVMBuildICmp(p->builder, LLVMIntNE, value.value, LLVMConstNull(LLVMTypeOf(value.value)), "");
+ res.value = LLVMBuildIntCast2(p->builder, i1vector, lb_type(m, t), !is_type_unsigned(src_elem), "");
+ } else {
+ GB_PANIC("Unhandled simd vector conversion: %s -> %s", type_to_string(src), type_to_string(dst));
+ }
+ return res;
+ } else {
+ i64 count = get_array_type_count(dst);
+ LLVMTypeRef vt = lb_type(m, t);
+ LLVMTypeRef llvm_u32 = lb_type(m, t_u32);
+ LLVMValueRef elem = lb_emit_conv(p, value, et).value;
+ LLVMValueRef vector = LLVMConstNull(vt);
+ for (i64 i = 0; i < count; i++) {
+ LLVMValueRef idx = LLVMConstInt(llvm_u32, i, false);
+ vector = LLVMBuildInsertElement(p->builder, vector, elem, idx, "");
+ }
+ lbValue res = {};
+ res.type = t;
+ res.value = vector;
+ return res;
+ }
+ }
+
+
// Pointer <-> uintptr
if (is_type_pointer(src) && is_type_uintptr(dst)) {
lbValue res = {};
@@ -2506,6 +2565,57 @@ lbValue lb_emit_comp(lbProcedure *p, TokenKind op_kind, lbValue left, lbValue ri
case Token_NotEq: pred = LLVMIntNE; break;
}
res.value = LLVMBuildICmp(p->builder, pred, left.value, right.value, "");
+ } else if (is_type_simd_vector(a)) {
+ LLVMValueRef mask = nullptr;
+ Type *elem = base_array_type(a);
+ if (is_type_float(elem)) {
+ LLVMRealPredicate pred = {};
+ switch (op_kind) {
+ case Token_CmpEq: pred = LLVMRealOEQ; break;
+ case Token_NotEq: pred = LLVMRealONE; break;
+ }
+ mask = LLVMBuildFCmp(p->builder, pred, left.value, right.value, "");
+ } else {
+ LLVMIntPredicate pred = {};
+ switch (op_kind) {
+ case Token_CmpEq: pred = LLVMIntEQ; break;
+ case Token_NotEq: pred = LLVMIntNE; break;
+ }
+ mask = LLVMBuildICmp(p->builder, pred, left.value, right.value, "");
+ }
+ GB_ASSERT_MSG(mask != nullptr, "Unhandled comparison kind %s (%s) %.*s %s (%s)", type_to_string(left.type), type_to_string(base_type(left.type)), LIT(token_strings[op_kind]), type_to_string(right.type), type_to_string(base_type(right.type)));
+
+ /* NOTE(bill, 2022-05-28):
+ Thanks to Per Vognsen, sign extending <N x i1> to
+ a vector of the same width as the input vector, bit casting to an integer,
+ and then comparing against zero is the better option
+ See: https://lists.llvm.org/pipermail/llvm-dev/2012-September/053046.html
+
+ // Example assuming 128-bit vector
+
+ %1 = <4 x float> ...
+ %2 = <4 x float> ...
+ %3 = fcmp oeq <4 x float> %1, %2
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ %5 = bitcast <4 x i32> %4 to i128
+ %6 = icmp ne i128 %5, 0
+ br i1 %6, label %true1, label %false2
+
+ This will result in 1 cmpps + 1 ptest + 1 br
+ (even without SSE4.1, contrary to what the mailing list states, because of pmovmskb)
+
+ */
+
+ unsigned count = cast(unsigned)get_array_type_count(a);
+ unsigned elem_sz = cast(unsigned)(type_size_of(elem)*8);
+ LLVMTypeRef mask_type = LLVMVectorType(LLVMIntTypeInContext(p->module->ctx, elem_sz), count);
+ mask = LLVMBuildSExtOrBitCast(p->builder, mask, mask_type, "");
+
+ LLVMTypeRef mask_int_type = LLVMIntTypeInContext(p->module->ctx, cast(unsigned)(8*type_size_of(a)));
+ LLVMValueRef mask_int = LLVMBuildBitCast(p->builder, mask, mask_int_type, "");
+ res.value = LLVMBuildICmp(p->builder, LLVMIntNE, mask_int, LLVMConstNull(LLVMTypeOf(mask_int)), "");
+ return res;
+
} else {
GB_PANIC("Unhandled comparison kind %s (%s) %.*s %s (%s)", type_to_string(left.type), type_to_string(base_type(left.type)), LIT(token_strings[op_kind]), type_to_string(right.type), type_to_string(base_type(right.type)));
}
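With this branch, == and != between #simd values collapse the per-lane comparison into one boolean suitable for branching (via the sext/bitcast/icmp sequence described in the note), while per-lane masks stay available through the intrinsics:

    a := #simd[4]f32{1, 2, 3, 4}
    b := #simd[4]f32{1, 2, 3, 5}
    if a != b { /* a single bool */ }
    m := intrinsics.simd_lanes_eq(a, b) // per-lane mask instead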
@@ -4609,6 +4719,102 @@ lbAddr lb_build_addr(lbProcedure *p, Ast *expr) {
break;
}
+ case Type_SimdVector: {
+ if (cl->elems.count > 0) {
+ lbValue vector_value = lb_const_value(p->module, type, exact_value_compound(expr));
+ defer (lb_addr_store(p, v, vector_value));
+
+ auto temp_data = array_make<lbCompoundLitElemTempData>(temporary_allocator(), 0, cl->elems.count);
+
+ // NOTE(bill): Separate value, store into their own chunks
+ for_array(i, cl->elems) {
+ Ast *elem = cl->elems[i];
+ if (elem->kind == Ast_FieldValue) {
+ ast_node(fv, FieldValue, elem);
+ if (lb_is_elem_const(fv->value, et)) {
+ continue;
+ }
+ if (is_ast_range(fv->field)) {
+ ast_node(ie, BinaryExpr, fv->field);
+ TypeAndValue lo_tav = ie->left->tav;
+ TypeAndValue hi_tav = ie->right->tav;
+ GB_ASSERT(lo_tav.mode == Addressing_Constant);
+ GB_ASSERT(hi_tav.mode == Addressing_Constant);
+
+ TokenKind op = ie->op.kind;
+ i64 lo = exact_value_to_i64(lo_tav.value);
+ i64 hi = exact_value_to_i64(hi_tav.value);
+ if (op != Token_RangeHalf) {
+ hi += 1;
+ }
+
+ lbValue value = lb_build_expr(p, fv->value);
+
+ for (i64 k = lo; k < hi; k++) {
+ lbCompoundLitElemTempData data = {};
+ data.value = value;
+ data.elem_index = cast(i32)k;
+ array_add(&temp_data, data);
+ }
+
+ } else {
+ auto tav = fv->field->tav;
+ GB_ASSERT(tav.mode == Addressing_Constant);
+ i64 index = exact_value_to_i64(tav.value);
+
+ lbValue value = lb_build_expr(p, fv->value);
+ lbCompoundLitElemTempData data = {};
+ data.value = lb_emit_conv(p, value, et);
+ data.expr = fv->value;
+ data.elem_index = cast(i32)index;
+ array_add(&temp_data, data);
+ }
+
+ } else {
+ if (lb_is_elem_const(elem, et)) {
+ continue;
+ }
+ lbCompoundLitElemTempData data = {};
+ data.expr = elem;
+ data.elem_index = cast(i32)i;
+ array_add(&temp_data, data);
+ }
+ }
+
+
+ for_array(i, temp_data) {
+ lbValue field_expr = temp_data[i].value;
+ Ast *expr = temp_data[i].expr;
+
+ auto prev_hint = lb_set_copy_elision_hint(p, lb_addr(temp_data[i].gep), expr);
+
+ if (field_expr.value == nullptr) {
+ field_expr = lb_build_expr(p, expr);
+ }
+ Type *t = field_expr.type;
+ GB_ASSERT(t->kind != Type_Tuple);
+ lbValue ev = lb_emit_conv(p, field_expr, et);
+
+ if (!p->copy_elision_hint.used) {
+ temp_data[i].value = ev;
+ }
+
+ lb_reset_copy_elision_hint(p, prev_hint);
+ }
+
+
+ // TODO(bill): reduce the need for individual `insertelement`s where a
+ // `shufflevector` would be a better option
+
+ for_array(i, temp_data) {
+ if (temp_data[i].value.value != nullptr) {
+ LLVMValueRef index = lb_const_int(p->module, t_u32, temp_data[i].elem_index).value;
+ vector_value.value = LLVMBuildInsertElement(p->builder, vector_value.value, temp_data[i].value.value, index, "");
+ }
+ }
+ }
+ break;
+ }
}
return v;
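The literal is first materialized from its constant lanes via lb_const_value, then any runtime elements are patched in with insertelement, so constants and runtime values may now be mixed (get_lane below is a hypothetical helper):

    x := get_lane() // hypothetical runtime f32
    v := #simd[4]f32{x, 1, 2, 3}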
diff --git a/src/llvm_backend_proc.cpp b/src/llvm_backend_proc.cpp
index 154be2f1f..75ca77641 100644
--- a/src/llvm_backend_proc.cpp
+++ b/src/llvm_backend_proc.cpp
@@ -169,6 +169,19 @@ lbProcedure *lb_create_procedure(lbModule *m, Entity *entity, bool ignore_body)
}
}
+ if (!entity->Procedure.target_feature_disabled &&
+ entity->Procedure.target_feature.len != 0) {
+ auto features = split_by_comma(entity->Procedure.target_feature);
+ for_array(i, features) {
+ String feature = features[i];
+ LLVMAttributeRef ref = LLVMCreateStringAttribute(
+ m->ctx,
+ cast(char const *)feature.text, cast(unsigned)feature.len,
+ "", 0);
+ LLVMAddAttributeAtIndex(p->value, LLVMAttributeIndex_FunctionIndex, ref);
+ }
+ }
+
if (entity->flags & EntityFlag_Cold) {
lb_add_attribute_to_proc(m, p->value, "cold");
}
@@ -981,10 +994,466 @@ lbValue lb_emit_call(lbProcedure *p, lbValue value, Array<lbValue> const &args,
return result;
}
+LLVMValueRef llvm_splat_float(i64 count, LLVMTypeRef type, f64 value) {
+ LLVMValueRef v = LLVMConstReal(type, value);
+ LLVMValueRef *values = gb_alloc_array(temporary_allocator(), LLVMValueRef, count);
+ for (i64 i = 0; i < count; i++) {
+ values[i] = v;
+ }
+ return LLVMConstVector(values, cast(unsigned)count);
+}
+LLVMValueRef llvm_splat_int(i64 count, LLVMTypeRef type, i64 value, bool is_signed=false) {
+ LLVMValueRef v = LLVMConstInt(type, value, is_signed);
+ LLVMValueRef *values = gb_alloc_array(temporary_allocator(), LLVMValueRef, count);
+ for (i64 i = 0; i < count; i++) {
+ values[i] = v;
+ }
+ return LLVMConstVector(values, cast(unsigned)count);
+}
+
+
+lbValue lb_build_builtin_simd_proc(lbProcedure *p, Ast *expr, TypeAndValue const &tv, BuiltinProcId builtin_id) {
+ ast_node(ce, CallExpr, expr);
+
+ lbModule *m = p->module;
+
+ lbValue res = {};
+ res.type = tv.type;
+
+ lbValue arg0 = {}; if (ce->args.count > 0) arg0 = lb_build_expr(p, ce->args[0]);
+ lbValue arg1 = {}; if (ce->args.count > 1) arg1 = lb_build_expr(p, ce->args[1]);
+ lbValue arg2 = {}; if (ce->args.count > 2) arg2 = lb_build_expr(p, ce->args[2]);
+
+ Type *elem = base_array_type(arg0.type);
+
+ bool is_float = is_type_float(elem);
+ bool is_signed = !is_type_unsigned(elem);
+
+ LLVMOpcode op_code = cast(LLVMOpcode)0;
+
+ switch (builtin_id) {
+ case BuiltinProc_simd_add:
+ case BuiltinProc_simd_sub:
+ case BuiltinProc_simd_mul:
+ case BuiltinProc_simd_div:
+ case BuiltinProc_simd_rem:
+ if (is_float) {
+ switch (builtin_id) {
+ case BuiltinProc_simd_add: op_code = LLVMFAdd; break;
+ case BuiltinProc_simd_sub: op_code = LLVMFSub; break;
+ case BuiltinProc_simd_mul: op_code = LLVMFMul; break;
+ case BuiltinProc_simd_div: op_code = LLVMFDiv; break;
+ }
+ } else {
+ switch (builtin_id) {
+ case BuiltinProc_simd_add: op_code = LLVMAdd; break;
+ case BuiltinProc_simd_sub: op_code = LLVMSub; break;
+ case BuiltinProc_simd_mul: op_code = LLVMMul; break;
+ case BuiltinProc_simd_div:
+ if (is_signed) {
+ op_code = LLVMSDiv;
+ } else {
+ op_code = LLVMUDiv;
+ }
+ break;
+ case BuiltinProc_simd_rem:
+ if (is_signed) {
+ op_code = LLVMSRem;
+ } else {
+ op_code = LLVMURem;
+ }
+ break;
+ }
+ }
+ if (op_code) {
+ res.value = LLVMBuildBinOp(p->builder, op_code, arg0.value, arg1.value, "");
+ return res;
+ }
+ break;
+ case BuiltinProc_simd_shl: // Odin logic
+ case BuiltinProc_simd_shr: // Odin logic
+ case BuiltinProc_simd_shl_masked: // C logic
+ case BuiltinProc_simd_shr_masked: // C logic
+ {
+ i64 sz = type_size_of(elem);
+ GB_ASSERT(arg0.type->kind == Type_SimdVector);
+
+ i64 count = arg0.type->SimdVector.count;
+ Type *elem1 = base_array_type(arg1.type);
+
+ bool is_masked = false;
+ switch (builtin_id) {
+ case BuiltinProc_simd_shl: op_code = LLVMShl; is_masked = false; break;
+ case BuiltinProc_simd_shr: op_code = is_signed ? LLVMAShr : LLVMLShr; is_masked = false; break;
+ case BuiltinProc_simd_shl_masked: op_code = LLVMShl; is_masked = true; break;
+ case BuiltinProc_simd_shr_masked: op_code = is_signed ? LLVMAShr : LLVMLShr; is_masked = true; break;
+ }
+ if (op_code) {
+ LLVMValueRef bits = llvm_splat_int(count, lb_type(m, elem1), sz*8 - 1);
+ if (is_masked) {
+ // C logic
+ LLVMValueRef shift = LLVMBuildAnd(p->builder, arg1.value, bits, "");
+ res.value = LLVMBuildBinOp(p->builder, op_code, arg0.value, shift, "");
+ } else {
+ // Odin logic
+ LLVMValueRef zero = lb_const_nil(m, arg1.type).value;
+ LLVMValueRef mask = LLVMBuildICmp(p->builder, LLVMIntULE, arg1.value, bits, "");
+ LLVMValueRef shift = LLVMBuildBinOp(p->builder, op_code, arg0.value, arg1.value, "");
+ res.value = LLVMBuildSelect(p->builder, mask, shift, zero, "");
+ }
+ return res;
+ }
+ }
+ break;
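The two shift families differ only in how an out-of-range lane amount is treated: simd_shl/simd_shr (Odin logic) produce 0 once the amount reaches the element bit width, whereas the _masked variants (C logic) mask the amount with bits-1. For u32 lanes:

    a := #simd[4]u32{1, 1, 1, 1}
    s := #simd[4]u32{1, 31, 32, 40}
    b := intrinsics.simd_shl(a, s)        // {2, 1<<31, 0, 0}
    c := intrinsics.simd_shl_masked(a, s) // {2, 1<<31, 1, 1<<8}, amounts & 31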
+ case BuiltinProc_simd_and:
+ case BuiltinProc_simd_or:
+ case BuiltinProc_simd_xor:
+ case BuiltinProc_simd_and_not:
+ switch (builtin_id) {
+ case BuiltinProc_simd_and: op_code = LLVMAnd; break;
+ case BuiltinProc_simd_or: op_code = LLVMOr; break;
+ case BuiltinProc_simd_xor: op_code = LLVMXor; break;
+ case BuiltinProc_simd_and_not:
+ op_code = LLVMAnd;
+ arg1.value = LLVMBuildNot(p->builder, arg1.value, "");
+ break;
+ }
+ if (op_code) {
+ res.value = LLVMBuildBinOp(p->builder, op_code, arg0.value, arg1.value, "");
+ return res;
+ }
+ break;
+ case BuiltinProc_simd_neg:
+ if (is_float) {
+ res.value = LLVMBuildFNeg(p->builder, arg0.value, "");
+ } else {
+ res.value = LLVMBuildNeg(p->builder, arg0.value, "");
+ }
+ return res;
+ case BuiltinProc_simd_abs:
+ if (is_float) {
+ LLVMValueRef pos = arg0.value;
+ LLVMValueRef neg = LLVMBuildFNeg(p->builder, pos, "");
+ LLVMValueRef cond = LLVMBuildFCmp(p->builder, LLVMRealOGT, pos, neg, "");
+ res.value = LLVMBuildSelect(p->builder, cond, pos, neg, "");
+ } else {
+ LLVMValueRef pos = arg0.value;
+ LLVMValueRef neg = LLVMBuildNeg(p->builder, pos, "");
+ LLVMValueRef cond = LLVMBuildICmp(p->builder, is_signed ? LLVMIntSGT : LLVMIntUGT, pos, neg, "");
+ res.value = LLVMBuildSelect(p->builder, cond, pos, neg, "");
+ }
+ return res;
+ case BuiltinProc_simd_min:
+ if (is_float) {
+ LLVMValueRef cond = LLVMBuildFCmp(p->builder, LLVMRealOLT, arg0.value, arg1.value, "");
+ res.value = LLVMBuildSelect(p->builder, cond, arg0.value, arg1.value, "");
+ } else {
+ LLVMValueRef cond = LLVMBuildICmp(p->builder, is_signed ? LLVMIntSLT : LLVMIntULT, arg0.value, arg1.value, "");
+ res.value = LLVMBuildSelect(p->builder, cond, arg0.value, arg1.value, "");
+ }
+ return res;
+ case BuiltinProc_simd_max:
+ if (is_float) {
+ LLVMValueRef cond = LLVMBuildFCmp(p->builder, LLVMRealOGT, arg0.value, arg1.value, "");
+ res.value = LLVMBuildSelect(p->builder, cond, arg0.value, arg1.value, "");
+ } else {
+ LLVMValueRef cond = LLVMBuildICmp(p->builder, is_signed ? LLVMIntSGT : LLVMIntUGT, arg0.value, arg1.value, "");
+ res.value = LLVMBuildSelect(p->builder, cond, arg0.value, arg1.value, "");
+ }
+ return res;
+ case BuiltinProc_simd_lanes_eq:
+ case BuiltinProc_simd_lanes_ne:
+ case BuiltinProc_simd_lanes_lt:
+ case BuiltinProc_simd_lanes_le:
+ case BuiltinProc_simd_lanes_gt:
+ case BuiltinProc_simd_lanes_ge:
+ if (is_float) {
+ LLVMRealPredicate pred = cast(LLVMRealPredicate)0;
+ switch (builtin_id) {
+ case BuiltinProc_simd_lanes_eq: pred = LLVMRealOEQ; break;
+ case BuiltinProc_simd_lanes_ne: pred = LLVMRealONE; break;
+ case BuiltinProc_simd_lanes_lt: pred = LLVMRealOLT; break;
+ case BuiltinProc_simd_lanes_le: pred = LLVMRealOLE; break;
+ case BuiltinProc_simd_lanes_gt: pred = LLVMRealOGT; break;
+ case BuiltinProc_simd_lanes_ge: pred = LLVMRealOGE; break;
+ }
+ if (pred) {
+ res.value = LLVMBuildFCmp(p->builder, pred, arg0.value, arg1.value, "");
+ res.value = LLVMBuildSExtOrBitCast(p->builder, res.value, lb_type(m, tv.type), "");
+ return res;
+ }
+ } else {
+ LLVMIntPredicate pred = cast(LLVMIntPredicate)0;
+ switch (builtin_id) {
+ case BuiltinProc_simd_lanes_eq: pred = LLVMIntEQ; break;
+ case BuiltinProc_simd_lanes_ne: pred = LLVMIntNE; break;
+ case BuiltinProc_simd_lanes_lt: pred = is_signed ? LLVMIntSLT : LLVMIntULT; break;
+ case BuiltinProc_simd_lanes_le: pred = is_signed ? LLVMIntSLE : LLVMIntULE; break;
+ case BuiltinProc_simd_lanes_gt: pred = is_signed ? LLVMIntSGT : LLVMIntUGT; break;
+ case BuiltinProc_simd_lanes_ge: pred = is_signed ? LLVMIntSGE : LLVMIntUGE; break;
+ }
+ if (pred) {
+ res.value = LLVMBuildICmp(p->builder, pred, arg0.value, arg1.value, "");
+ res.value = LLVMBuildSExtOrBitCast(p->builder, res.value, lb_type(m, tv.type), "");
+ return res;
+ }
+ }
+ break;
+
+ case BuiltinProc_simd_extract:
+ res.value = LLVMBuildExtractElement(p->builder, arg0.value, arg1.value, "");
+ return res;
+ case BuiltinProc_simd_replace:
+ res.value = LLVMBuildInsertElement(p->builder, arg0.value, arg2.value, arg1.value, "");
+ return res;
+
+ case BuiltinProc_simd_reduce_add_ordered:
+ case BuiltinProc_simd_reduce_mul_ordered:
+ {
+ LLVMTypeRef llvm_elem = lb_type(m, elem);
+ LLVMValueRef args[2] = {};
+ isize args_count = 0;
+
+ char const *name = nullptr;
+ switch (builtin_id) {
+ case BuiltinProc_simd_reduce_add_ordered:
+ if (is_float) {
+ name = "llvm.vector.reduce.fadd";
+ args[args_count++] = LLVMConstReal(llvm_elem, 0.0);
+ } else {
+ name = "llvm.vector.reduce.add";
+ }
+ break;
+ case BuiltinProc_simd_reduce_mul_ordered:
+ if (is_float) {
+ name = "llvm.vector.reduce.fmul";
+ args[args_count++] = LLVMConstReal(llvm_elem, 1.0);
+ } else {
+ name = "llvm.vector.reduce.mul";
+ }
+ break;
+ }
+ args[args_count++] = arg0.value;
+
+
+ LLVMTypeRef types[1] = {lb_type(p->module, arg0.type)};
+ unsigned id = LLVMLookupIntrinsicID(name, gb_strlen(name));
+ GB_ASSERT_MSG(id != 0, "Unable to find %s.%s", name, LLVMPrintTypeToString(types[0]));
+ LLVMValueRef ip = LLVMGetIntrinsicDeclaration(p->module->mod, id, types, gb_count_of(types));
+
+ res.value = LLVMBuildCall(p->builder, ip, args, cast(unsigned)args_count, "");
+ return res;
+ }
+ case BuiltinProc_simd_reduce_min:
+ case BuiltinProc_simd_reduce_max:
+ case BuiltinProc_simd_reduce_and:
+ case BuiltinProc_simd_reduce_or:
+ case BuiltinProc_simd_reduce_xor:
+ {
+ char const *name = nullptr;
+ switch (builtin_id) {
+ case BuiltinProc_simd_reduce_min:
+ if (is_float) {
+ name = "llvm.vector.reduce.fmin";
+ } else if (is_signed) {
+ name = "llvm.vector.reduce.smin";
+ } else {
+ name = "llvm.vector.reduce.umin";
+ }
+ break;
+ case BuiltinProc_simd_reduce_max:
+ if (is_float) {
+ name = "llvm.vector.reduce.fmax";
+ } else if (is_signed) {
+ name = "llvm.vector.reduce.smax";
+ } else {
+ name = "llvm.vector.reduce.umax";
+ }
+ break;
+ case BuiltinProc_simd_reduce_and: name = "llvm.vector.reduce.and"; break;
+ case BuiltinProc_simd_reduce_or: name = "llvm.vector.reduce.or"; break;
+ case BuiltinProc_simd_reduce_xor: name = "llvm.vector.reduce.xor"; break;
+ }
+ LLVMTypeRef types[1] = {lb_type(p->module, arg0.type)};
+ unsigned id = LLVMLookupIntrinsicID(name, gb_strlen(name));
+ GB_ASSERT_MSG(id != 0, "Unable to find %s.%s", name, LLVMPrintTypeToString(types[0]));
+ LLVMValueRef ip = LLVMGetIntrinsicDeclaration(p->module->mod, id, types, gb_count_of(types));
+
+ LLVMValueRef args[1] = {};
+ args[0] = arg0.value;
+
+ res.value = LLVMBuildCall(p->builder, ip, args, gb_count_of(args), "");
+ return res;
+ }
+
+ case BuiltinProc_simd_shuffle:
+ {
+ Type *vt = arg0.type;
+ GB_ASSERT(vt->kind == Type_SimdVector);
+
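+ // A shuffle mask may draw lanes from both input vectors, so each constant
+ // index may range over 0..2*count-1.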
+ i64 indices_count = ce->args.count-2;
+ i64 max_count = vt->SimdVector.count*2;
+ GB_ASSERT(indices_count <= max_count);
+
+ LLVMValueRef *values = gb_alloc_array(temporary_allocator(), LLVMValueRef, indices_count);
+ for (isize i = 0; i < indices_count; i++) {
+ lbValue idx = lb_build_expr(p, ce->args[i+2]);
+ GB_ASSERT(LLVMIsConstant(idx.value));
+ values[i] = idx.value;
+ }
+ LLVMValueRef indices = LLVMConstVector(values, cast(unsigned)indices_count);
+
+ res.value = LLVMBuildShuffleVector(p->builder, arg0.value, arg1.value, indices, "");
+ return res;
+ }
+
+ case BuiltinProc_simd_select:
+ {
+ LLVMValueRef cond = arg0.value;
+ LLVMValueRef x = lb_build_expr(p, ce->args[1]).value;
+ LLVMValueRef y = lb_build_expr(p, ce->args[2]).value;
+
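+ // LLVM's select expects an <N x i1> mask, so compare the boolean vector
+ // against zero: any non-zero lane selects from x, a zero lane from y.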
+ cond = LLVMBuildICmp(p->builder, LLVMIntNE, cond, LLVMConstNull(LLVMTypeOf(cond)), "");
+ res.value = LLVMBuildSelect(p->builder, cond, x, y, "");
+ return res;
+ }
+
+ case BuiltinProc_simd_ceil:
+ case BuiltinProc_simd_floor:
+ case BuiltinProc_simd_trunc:
+ case BuiltinProc_simd_nearest:
+ {
+ char const *name = nullptr;
+ switch (builtin_id) {
+ case BuiltinProc_simd_ceil: name = "llvm.ceil"; break;
+ case BuiltinProc_simd_floor: name = "llvm.floor"; break;
+ case BuiltinProc_simd_trunc: name = "llvm.trunc"; break;
+ case BuiltinProc_simd_nearest: name = "llvm.nearbyint"; break;
+ }
+
+ LLVMTypeRef types[1] = {lb_type(p->module, arg0.type)};
+ unsigned id = LLVMLookupIntrinsicID(name, gb_strlen(name));
+ GB_ASSERT_MSG(id != 0, "Unable to find %s.%s", name, LLVMPrintTypeToString(types[0]));
+ LLVMValueRef ip = LLVMGetIntrinsicDeclaration(p->module->mod, id, types, gb_count_of(types));
+
+ LLVMValueRef args[1] = {};
+ args[0] = arg0.value;
+
+ res.value = LLVMBuildCall(p->builder, ip, args, gb_count_of(args), "");
+ return res;
+ }
+
+ case BuiltinProc_simd_lanes_reverse:
+ {
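+ // Reversing the lanes is a shuffle with the constant mask [count-1, ..., 1, 0].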
+ i64 count = get_array_type_count(arg0.type);
+ LLVMValueRef *values = gb_alloc_array(temporary_allocator(), LLVMValueRef, count);
+ LLVMTypeRef llvm_u32 = lb_type(m, t_u32);
+ for (i64 i = 0; i < count; i++) {
+ values[i] = LLVMConstInt(llvm_u32, count-1-i, false);
+ }
+ LLVMValueRef mask = LLVMConstVector(values, cast(unsigned)count);
+
+ LLVMValueRef v = arg0.value;
+ res.value = LLVMBuildShuffleVector(p->builder, v, v, mask, "");
+ return res;
+ }
+
+ case BuiltinProc_simd_lanes_rotate_left:
+ case BuiltinProc_simd_lanes_rotate_right:
+ {
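+ // A lane rotation is a shuffle by a constant mask. The lane count must be a
+ // power of two so the rotation amount can be reduced with a bitwise mask;
+ // a right rotation is a left rotation by the negated amount.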
+ i64 count = get_array_type_count(arg0.type);
+ GB_ASSERT(is_power_of_two(count));
+ BigInt bi_count = {};
+ big_int_from_i64(&bi_count, count);
+
+ TypeAndValue const &tv = ce->args[1]->tav;
+ ExactValue val = exact_value_to_integer(tv.value);
+ GB_ASSERT(val.kind == ExactValue_Integer);
+ BigInt *bi = &val.value_integer;
+ if (builtin_id == BuiltinProc_simd_lanes_rotate_right) {
+ big_int_neg(bi, bi);
+ }
+ big_int_rem(bi, bi, &bi_count);
+ big_int_dealloc(&bi_count);
+
+ i64 left = big_int_to_i64(bi);
+
+ LLVMValueRef *values = gb_alloc_array(temporary_allocator(), LLVMValueRef, count);
+ LLVMTypeRef llvm_u32 = lb_type(m, t_u32);
+ for (i64 i = 0; i < count; i++) {
+ u64 idx = cast(u64)(i+left) & cast(u64)(count-1);
+ values[i] = LLVMConstInt(llvm_u32, idx, false);
+ }
+ LLVMValueRef mask = LLVMConstVector(values, cast(unsigned)count);
+
+ LLVMValueRef v = arg0.value;
+ res.value = LLVMBuildShuffleVector(p->builder, v, v, mask, "");
+ return res;
+ }
+
+ case BuiltinProc_simd_add_sat:
+ case BuiltinProc_simd_sub_sat:
+ {
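+ // Saturating arithmetic maps directly onto llvm.{s,u}add.sat and llvm.{s,u}sub.sat.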
+ char const *name = nullptr;
+ switch (builtin_id) {
+ case BuiltinProc_simd_add_sat: name = is_signed ? "llvm.sadd.sat" : "llvm.uadd.sat"; break;
+ case BuiltinProc_simd_sub_sat: name = is_signed ? "llvm.ssub.sat" : "llvm.usub.sat"; break;
+ }
+
+ LLVMTypeRef types[1] = {lb_type(p->module, arg0.type)};
+ unsigned id = LLVMLookupIntrinsicID(name, gb_strlen(name));
+ GB_ASSERT_MSG(id != 0, "Unable to find %s.%s", name, LLVMPrintTypeToString(types[0]));
+ LLVMValueRef ip = LLVMGetIntrinsicDeclaration(p->module->mod, id, types, gb_count_of(types));
+
+ LLVMValueRef args[2] = {};
+ args[0] = arg0.value;
+ args[1] = arg1.value;
+
+ res.value = LLVMBuildCall(p->builder, ip, args, gb_count_of(args), "");
+ return res;
+ }
+
+ case BuiltinProc_simd_clamp:
+ {
+ LLVMValueRef v = arg0.value;
+ LLVMValueRef min = arg1.value;
+ LLVMValueRef max = arg2.value;
+
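+ // Clamp lowers to two compare+select pairs: first raise v to min,
+ // then cap the intermediate result at max.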
+ if (is_float) {
+ v = LLVMBuildSelect(p->builder, LLVMBuildFCmp(p->builder, LLVMRealOLT, v, min, ""), min, v, "");
+ res.value = LLVMBuildSelect(p->builder, LLVMBuildFCmp(p->builder, LLVMRealOGT, v, max, ""), max, v, "");
+ } else if (is_signed) {
+ v = LLVMBuildSelect(p->builder, LLVMBuildICmp(p->builder, LLVMIntSLT, v, min, ""), min, v, "");
+ res.value = LLVMBuildSelect(p->builder, LLVMBuildICmp(p->builder, LLVMIntSGT, v, max, ""), max, v, "");
+ } else {
+ v = LLVMBuildSelect(p->builder, LLVMBuildICmp(p->builder, LLVMIntULT, v, min, ""), min, v, "");
+ res.value = LLVMBuildSelect(p->builder, LLVMBuildICmp(p->builder, LLVMIntUGT, v, max, ""), max, v, "");
+ }
+ return res;
+ }
+
+ case BuiltinProc_simd_to_bits:
+ {
+ res.value = LLVMBuildBitCast(p->builder, arg0.value, lb_type(m, tv.type), "");
+ return res;
+ }
+
+ }
+ GB_PANIC("Unhandled simd intrinsic: '%.*s'", LIT(builtin_procs[builtin_id].name));
+
+ return {};
+}
+
lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValue const &tv, BuiltinProcId id) {
ast_node(ce, CallExpr, expr);
+ if (BuiltinProc__simd_begin < id && id < BuiltinProc__simd_end) {
+ return lb_build_builtin_simd_proc(p, expr, tv, id);
+ }
+
switch (id) {
case BuiltinProc_DIRECTIVE: {
ast_node(bd, BasicDirective, ce->proc);
@@ -1532,6 +2001,31 @@ lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValue const &tv,
return res;
}
+ case BuiltinProc_fused_mul_add:
+ {
+ Type *type = tv.type;
+ lbValue x = lb_emit_conv(p, lb_build_expr(p, ce->args[0]), type);
+ lbValue y = lb_emit_conv(p, lb_build_expr(p, ce->args[1]), type);
+ lbValue z = lb_emit_conv(p, lb_build_expr(p, ce->args[2]), type);
+
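+ // Lower directly to llvm.fma: a fused multiply-add with a single rounding step.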
+ char const *name = "llvm.fma";
+ LLVMTypeRef types[1] = {lb_type(p->module, type)};
+ unsigned id = LLVMLookupIntrinsicID(name, gb_strlen(name));
+ GB_ASSERT_MSG(id != 0, "Unable to find %s.%s", name, LLVMPrintTypeToString(types[0]));
+ LLVMValueRef ip = LLVMGetIntrinsicDeclaration(p->module->mod, id, types, gb_count_of(types));
+
+ LLVMValueRef args[3] = {};
+ args[0] = x.value;
+ args[1] = y.value;
+ args[2] = z.value;
+
+ lbValue res = {};
+ res.value = LLVMBuildCall(p->builder, ip, args, gb_count_of(args), "");
+ res.type = type;
+ return res;
+ }
+
case BuiltinProc_mem_copy:
{
lbValue dst = lb_build_expr(p, ce->args[0]);
@@ -1614,6 +2108,7 @@ lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValue const &tv,
return {};
case BuiltinProc_volatile_store:
+ case BuiltinProc_non_temporal_store:
case BuiltinProc_atomic_store:
case BuiltinProc_atomic_store_explicit: {
lbValue dst = lb_build_expr(p, ce->args[0]);
@@ -1622,6 +2117,13 @@ lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValue const &tv,
LLVMValueRef instr = LLVMBuildStore(p->builder, val.value, dst.value);
switch (id) {
+ case BuiltinProc_non_temporal_store:
+ {
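+ // Attach !nontemporal metadata (i32 1) to the store so the backend may
+ // emit streaming stores that bypass the cache hierarchy.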
+ unsigned kind_id = LLVMGetMDKindIDInContext(p->module->ctx, "nontemporal", 11);
+ LLVMMetadataRef node = LLVMValueAsMetadata(LLVMConstInt(lb_type(p->module, t_u32), 1, false));
+ LLVMSetMetadata(instr, kind_id, LLVMMetadataAsValue(p->module->ctx, node));
+ }
+ break;
case BuiltinProc_volatile_store: LLVMSetVolatile(instr, true); break;
case BuiltinProc_atomic_store: LLVMSetOrdering(instr, LLVMAtomicOrderingSequentiallyConsistent); break;
case BuiltinProc_atomic_store_explicit: LLVMSetOrdering(instr, llvm_atomic_ordering_from_odin(ce->args[2])); break;
@@ -1633,12 +2135,21 @@ lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValue const &tv,
}
case BuiltinProc_volatile_load:
+ case BuiltinProc_non_temporal_load:
case BuiltinProc_atomic_load:
case BuiltinProc_atomic_load_explicit: {
lbValue dst = lb_build_expr(p, ce->args[0]);
LLVMValueRef instr = LLVMBuildLoad(p->builder, dst.value, "");
switch (id) {
+ case BuiltinProc_non_temporal_load:
+ {
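+ // As with the store, !nontemporal metadata marks the load as non-temporal.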
+ unsigned kind_id = LLVMGetMDKindIDInContext(p->module->ctx, "nontemporal", 11);
+ LLVMMetadataRef node = LLVMValueAsMetadata(LLVMConstInt(lb_type(p->module, t_u32), 1, false));
+ LLVMSetMetadata(instr, kind_id, LLVMMetadataAsValue(p->module->ctx, node));
+ }
+ break;
case BuiltinProc_volatile_load: LLVMSetVolatile(instr, true); break;
case BuiltinProc_atomic_load: LLVMSetOrdering(instr, LLVMAtomicOrderingSequentiallyConsistent); break;
case BuiltinProc_atomic_load_explicit: LLVMSetOrdering(instr, llvm_atomic_ordering_from_odin(ce->args[1])); break;
@@ -2232,6 +2743,47 @@ lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValue const &tv,
return res;
}
+
+ case BuiltinProc_x86_cpuid:
+ {
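+ // The constraint string binds the leaf to eax and the subleaf to ecx, and
+ // returns the eax, ebx, ecx, and edx outputs of cpuid as a tuple.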
+ Type *param_types[2] = {t_u32, t_u32};
+ Type *type = alloc_type_proc_from_types(param_types, gb_count_of(param_types), tv.type, false, ProcCC_None);
+ LLVMTypeRef func_type = LLVMGetElementType(lb_type(p->module, type));
+ LLVMValueRef the_asm = llvm_get_inline_asm(
+ func_type,
+ str_lit("cpuid"),
+ str_lit("={ax},={bx},={cx},={dx},{ax},{cx}"),
+ true
+ );
+ GB_ASSERT(the_asm != nullptr);
+
+ LLVMValueRef args[2] = {};
+ args[0] = lb_emit_conv(p, lb_build_expr(p, ce->args[0]), t_u32).value;
+ args[1] = lb_emit_conv(p, lb_build_expr(p, ce->args[1]), t_u32).value;
+ lbValue res = {};
+ res.type = tv.type;
+ res.value = LLVMBuildCall2(p->builder, func_type, the_asm, args, gb_count_of(args), "");
+ return res;
+ }
+ case BuiltinProc_x86_xgetbv:
+ {
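+ // xgetbv reads the extended control register selected by ecx and returns
+ // its value in edx:eax.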
+ Type *type = alloc_type_proc_from_types(&t_u32, 1, tv.type, false, ProcCC_None);
+ LLVMTypeRef func_type = LLVMGetElementType(lb_type(p->module, type));
+ LLVMValueRef the_asm = llvm_get_inline_asm(
+ func_type,
+ str_lit("xgetbv"),
+ str_lit("={ax},={dx},{cx}"),
+ true
+ );
+ GB_ASSERT(the_asm != nullptr);
+
+ LLVMValueRef args[1] = {};
+ args[0] = lb_emit_conv(p, lb_build_expr(p, ce->args[0]), t_u32).value;
+ lbValue res = {};
+ res.type = tv.type;
+ res.value = LLVMBuildCall2(p->builder, func_type, the_asm, args, gb_count_of(args), "");
+ return res;
+ }
}
GB_PANIC("Unhandled built-in procedure %.*s", LIT(builtin_procs[id].name));
diff --git a/src/llvm_backend_utility.cpp b/src/llvm_backend_utility.cpp
index 037171637..52d3a17cf 100644
--- a/src/llvm_backend_utility.cpp
+++ b/src/llvm_backend_utility.cpp
@@ -201,6 +201,11 @@ lbValue lb_emit_transmute(lbProcedure *p, lbValue value, Type *t) {
return res;
}
+ if (is_type_simd_vector(src) && is_type_simd_vector(dst)) {
+ res.value = LLVMBuildBitCast(p->builder, value.value, lb_type(p->module, t), "");
+ return res;
+ }
+
if (lb_is_type_aggregate(src) || lb_is_type_aggregate(dst)) {
lbValue s = lb_address_from_load_or_generate_local(p, value);
lbValue d = lb_emit_transmute(p, s, alloc_type_pointer(t));
@@ -480,8 +485,10 @@ lbValue lb_emit_count_ones(lbProcedure *p, lbValue x, Type *type) {
}
lbValue lb_emit_count_zeros(lbProcedure *p, lbValue x, Type *type) {
- i64 sz = 8*type_size_of(type);
- lbValue size = lb_const_int(p->module, type, cast(u64)sz);
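+ // For #simd vectors, compute the bit width from the element type and then
+ // convert (splat) the scalar constant to the vector type so the
+ // subtraction below stays lane-wise.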
+ Type *elem = base_array_type(type);
+ i64 sz = 8*type_size_of(elem);
+ lbValue size = lb_const_int(p->module, elem, cast(u64)sz);
+ size = lb_emit_conv(p, size, type);
lbValue count = lb_emit_count_ones(p, x, type);
return lb_emit_arith(p, Token_Sub, size, count, type);
}
diff --git a/src/main.cpp b/src/main.cpp
index 13c8bd74d..ee71b91df 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1376,8 +1376,8 @@ bool parse_build_flags(Array<String> args) {
}
case BuildFlag_TargetFeatures: {
GB_ASSERT(value.kind == ExactValue_String);
- build_context.target_features = value.value_string;
- string_to_lower(&build_context.target_features);
+ build_context.target_features_string = value.value_string;
+ string_to_lower(&build_context.target_features_string);
break;
}
case BuildFlag_RelocMode: {
diff --git a/src/parser.cpp b/src/parser.cpp
index d19e249e5..5280fd4b0 100644
--- a/src/parser.cpp
+++ b/src/parser.cpp
@@ -360,6 +360,7 @@ Ast *clone_ast(Ast *node) {
case Ast_ArrayType:
n->ArrayType.count = clone_ast(n->ArrayType.count);
n->ArrayType.elem = clone_ast(n->ArrayType.elem);
+ n->ArrayType.tag = clone_ast(n->ArrayType.tag);
break;
case Ast_DynamicArrayType:
n->DynamicArrayType.elem = clone_ast(n->DynamicArrayType.elem);
@@ -2127,7 +2128,18 @@ Ast *parse_operand(AstFile *f, bool lhs) {
Token name = expect_token(f, Token_Ident);
if (name.string == "type") {
return ast_helper_type(f, token, parse_type(f));
- } else if (name.string == "soa" || name.string == "simd") {
+ } else if (name.string == "simd") {
+ Ast *tag = ast_basic_directive(f, token, name);
+ Ast *original_type = parse_type(f);
+ Ast *type = unparen_expr(original_type);
+ switch (type->kind) {
+ case Ast_ArrayType: type->ArrayType.tag = tag; break;
+ default:
+ syntax_error(type, "Expected a fixed array type after #%.*s, got %.*s", LIT(name.string), LIT(ast_strings[type->kind]));
+ break;
+ }
+ return original_type;
+ } else if (name.string == "soa") {
Ast *tag = ast_basic_directive(f, token, name);
Ast *original_type = parse_type(f);
Ast *type = unparen_expr(original_type);
diff --git a/src/parser.hpp b/src/parser.hpp
index dc294b6ce..a648828fb 100644
--- a/src/parser.hpp
+++ b/src/parser.hpp
@@ -411,7 +411,6 @@ AST_KIND(_ExprBegin, "", bool) \
Token ellipsis; \
ProcInlining inlining; \
bool optional_ok_one; \
- i32 builtin_id; \
void *sce_temp_data; \
}) \
AST_KIND(FieldValue, "field value", struct { Token eq; Ast *field, *value; }) \
diff --git a/src/string.cpp b/src/string.cpp
index 616761265..44eccd2d2 100644
--- a/src/string.cpp
+++ b/src/string.cpp
@@ -157,6 +157,15 @@ int string_compare(String const &x, String const &y) {
return 0;
}
+isize string_index_byte(String const &s, u8 x) {
+ for (isize i = 0; i < s.len; i++) {
+ if (s.text[i] == x) {
+ return i;
+ }
+ }
+ return -1;
+}
+
GB_COMPARE_PROC(string_cmp_proc) {
String x = *(String *)a;
String y = *(String *)b;
diff --git a/src/types.cpp b/src/types.cpp
index c79b8e652..ad83e0568 100644
--- a/src/types.cpp
+++ b/src/types.cpp
@@ -261,6 +261,7 @@ struct TypeProc {
TYPE_KIND(SimdVector, struct { \
i64 count; \
Type *elem; \
+ Type *generic_count; \
}) \
TYPE_KIND(RelativePointer, struct { \
Type *pointer_type; \
@@ -362,6 +363,9 @@ enum : int {
MATRIX_ELEMENT_COUNT_MIN = 1,
MATRIX_ELEMENT_COUNT_MAX = 16,
MATRIX_ELEMENT_MAX_SIZE = MATRIX_ELEMENT_COUNT_MAX * (2 * 8), // complex128
+
+ SIMD_ELEMENT_COUNT_MIN = 1,
+ SIMD_ELEMENT_COUNT_MAX = 64,
};
@@ -1085,10 +1089,11 @@ Type *alloc_type_bit_set() {
-Type *alloc_type_simd_vector(i64 count, Type *elem) {
+Type *alloc_type_simd_vector(i64 count, Type *elem, Type *generic_count=nullptr) {
Type *t = alloc_type(Type_SimdVector);
t->SimdVector.count = count;
t->SimdVector.elem = elem;
+ t->SimdVector.generic_count = generic_count;
return t;
}
@@ -1593,6 +1598,8 @@ i64 get_array_type_count(Type *t) {
return bt->Array.count;
} else if (bt->kind == Type_EnumeratedArray) {
return bt->EnumeratedArray.count;
+ } else if (bt->kind == Type_SimdVector) {
+ return bt->SimdVector.count;
}
GB_ASSERT(is_type_array_like(t));
return -1;
@@ -1932,11 +1939,14 @@ bool is_type_valid_vector_elem(Type *t) {
return false;
}
if (is_type_integer(t)) {
- return true;
+ return !is_type_integer_128bit(t);
}
if (is_type_float(t)) {
return true;
}
+ if (is_type_boolean(t)) {
+ return true;
+ }
}
return false;
}
@@ -2078,6 +2088,11 @@ bool is_type_polymorphic(Type *t, bool or_specialized=false) {
return true;
}
return is_type_polymorphic(t->Array.elem, or_specialized);
+ case Type_SimdVector:
+ if (t->SimdVector.generic_count != nullptr) {
+ return true;
+ }
+ return is_type_polymorphic(t->SimdVector.elem, or_specialized);
case Type_DynamicArray:
return is_type_polymorphic(t->DynamicArray.elem, or_specialized);
case Type_Slice:
@@ -2291,6 +2306,9 @@ bool is_type_comparable(Type *t) {
}
}
return true;
+
+ case Type_SimdVector:
+ return true;
}
return false;
}
@@ -3446,7 +3464,7 @@ i64 type_align_of_internal(Type *t, TypePath *path) {
case Type_SimdVector: {
// IMPORTANT TODO(bill): Figure out the alignment of vector types
- return gb_clamp(next_pow2(type_size_of_internal(t, path)), 1, build_context.max_align);
+ return gb_clamp(next_pow2(type_size_of_internal(t, path)), 1, build_context.max_align*2);
}
case Type_Matrix: