| field | value | date |
|---|---|---|
| author | gingerBill <gingerBill@users.noreply.github.com> | 2022-05-31 11:52:24 +0100 |
| committer | GitHub <noreply@github.com> | 2022-05-31 11:52:24 +0100 |
| commit | a1f15c2c69b557be5a95882d18137d1f74d980ee (patch) | |
| tree | 3f484753712a6d9d9cf1074f56bc91af6d6432c1 | |
| parent | a6c779b50ecf5c8c0cb86c9d49768ab34508b1d2 (diff) | |
| parent | 516f6647b46c69a67139154c02c74b436cd4b999 (diff) | |
Merge pull request #1807 from odin-lang/simd-dev
Generic #simd type and intrinsics
43 files changed, 5432 insertions, 364 deletions
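As context for the diff below: this merge makes `#simd[N]T` a first-class generic vector type, operated on through `core:intrinsics` and the new `core:simd` package. A minimal sketch of the surface area (illustrative only, not part of the patch):

```odin
package example

import "core:fmt"
import "core:intrinsics"

main :: proc() {
	// #simd[N]T is a fixed-width vector type; operations act lane-wise.
	a := #simd[4]f32{1, 2, 3, 4}
	b := #simd[4]f32{10, 20, 30, 40}
	c := intrinsics.simd_add(a, b)             // {11, 22, 33, 44}
	fmt.println(intrinsics.simd_extract(c, 2)) // 33
}
```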
diff --git a/core/intrinsics/intrinsics.odin b/core/intrinsics/intrinsics.odin index d71522936..9994a1914 100644 --- a/core/intrinsics/intrinsics.odin +++ b/core/intrinsics/intrinsics.odin @@ -6,12 +6,14 @@ package intrinsics is_package_imported :: proc(package_name: string) -> bool --- // Types -simd_vector :: proc($N: int, $T: typeid) -> type/#simd[N]T soa_struct :: proc($N: int, $T: typeid) -> type/#soa[N]T // Volatile volatile_load :: proc(dst: ^$T) -> T --- -volatile_store :: proc(dst: ^$T, val: T) -> T --- +volatile_store :: proc(dst: ^$T, val: T) --- + +non_temporal_load :: proc(dst: ^$T) -> T --- +non_temporal_store :: proc(dst: ^$T, val: T) --- // Trapping debug_trap :: proc() --- @@ -23,18 +25,20 @@ alloca :: proc(size, align: int) -> [^]u8 --- cpu_relax :: proc() --- read_cycle_counter :: proc() -> i64 --- -count_ones :: proc(x: $T) -> T where type_is_integer(T) --- -count_zeros :: proc(x: $T) -> T where type_is_integer(T) --- -count_trailing_zeros :: proc(x: $T) -> T where type_is_integer(T) --- -count_leading_zeros :: proc(x: $T) -> T where type_is_integer(T) --- -reverse_bits :: proc(x: $T) -> T where type_is_integer(T) --- +count_ones :: proc(x: $T) -> T where type_is_integer(T) || type_is_simd_vector(T) --- +count_zeros :: proc(x: $T) -> T where type_is_integer(T) || type_is_simd_vector(T) --- +count_trailing_zeros :: proc(x: $T) -> T where type_is_integer(T) || type_is_simd_vector(T) --- +count_leading_zeros :: proc(x: $T) -> T where type_is_integer(T) || type_is_simd_vector(T) --- +reverse_bits :: proc(x: $T) -> T where type_is_integer(T) || type_is_simd_vector(T) --- byte_swap :: proc(x: $T) -> T where type_is_integer(T) || type_is_float(T) --- overflow_add :: proc(lhs, rhs: $T) -> (T, bool) #optional_ok --- overflow_sub :: proc(lhs, rhs: $T) -> (T, bool) #optional_ok --- overflow_mul :: proc(lhs, rhs: $T) -> (T, bool) #optional_ok --- -sqrt :: proc(x: $T) -> T where type_is_float(T) --- +sqrt :: proc(x: $T) -> T where type_is_float(T) || (type_is_simd_vector(T) && type_is_float(type_elem_type(T))) --- + +fused_mul_add :: proc(a, b, c: $T) -> T where type_is_float(T) || (type_is_simd_vector(T) && type_is_float(type_elem_type(T))) --- mem_copy :: proc(dst, src: rawptr, len: int) --- mem_copy_non_overlapping :: proc(dst, src: rawptr, len: int) --- @@ -186,6 +190,81 @@ type_hasher_proc :: proc($T: typeid) -> (hasher: proc "contextless" (data: rawpt constant_utf16_cstring :: proc($literal: string) -> [^]u16 --- +// SIMD related +simd_add :: proc(a, b: #simd[N]T) -> #simd[N]T --- +simd_sub :: proc(a, b: #simd[N]T) -> #simd[N]T --- +simd_mul :: proc(a, b: #simd[N]T) -> #simd[N]T --- +simd_div :: proc(a, b: #simd[N]T) -> #simd[N]T --- +simd_rem :: proc(a, b: #simd[N]T) -> #simd[N]T --- + +// Keeps Odin's Behaviour +// (x << y) if y <= mask else 0 +simd_shl :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T --- +simd_shr :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T --- + +// Similar to C's Behaviour +// x << (y & mask) +simd_shl_masked :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T --- +simd_shr_masked :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T --- + +simd_add_sat :: proc(a, b: #simd[N]T) -> #simd[N]T --- +simd_sub_sat :: proc(a, b: #simd[N]T) -> #simd[N]T --- + +simd_and :: proc(a, b: #simd[N]T) -> #simd[N]T --- +simd_or :: proc(a, b: #simd[N]T) -> #simd[N]T --- +simd_xor :: proc(a, b: #simd[N]T) -> #simd[N]T --- +simd_and_not :: proc(a, b: #simd[N]T) -> #simd[N]T --- + +simd_neg :: proc(a: #simd[N]T) -> 
#simd[N]T --- + +simd_abs :: proc(a: #simd[N]T) -> #simd[N]T --- + +simd_min :: proc(a, b: #simd[N]T) -> #simd[N]T --- +simd_max :: proc(a, b: #simd[N]T) -> #simd[N]T --- +simd_clamp :: proc(v, min, max: #simd[N]T) -> #simd[N]T --- + +// Return an unsigned integer of the same size as the input type +// NOT A BOOLEAN +// element-wise: +// false => 0x00...00 +// true => 0xff...ff +simd_lanes_eq :: proc(a, b: #simd[N]T) -> #simd[N]Integer --- +simd_lanes_ne :: proc(a, b: #simd[N]T) -> #simd[N]Integer --- +simd_lanes_lt :: proc(a, b: #simd[N]T) -> #simd[N]Integer --- +simd_lanes_le :: proc(a, b: #simd[N]T) -> #simd[N]Integer --- +simd_lanes_gt :: proc(a, b: #simd[N]T) -> #simd[N]Integer --- +simd_lanes_ge :: proc(a, b: #simd[N]T) -> #simd[N]Integer --- + +simd_extract :: proc(a: #simd[N]T, idx: uint) -> T --- +simd_replace :: proc(a: #simd[N]T, idx: uint, elem: T) -> #simd[N]T --- + +simd_reduce_add_ordered :: proc(a: #simd[N]T) -> T --- +simd_reduce_mul_ordered :: proc(a: #simd[N]T) -> T --- +simd_reduce_min :: proc(a: #simd[N]T) -> T --- +simd_reduce_max :: proc(a: #simd[N]T) -> T --- +simd_reduce_and :: proc(a: #simd[N]T) -> T --- +simd_reduce_or :: proc(a: #simd[N]T) -> T --- +simd_reduce_xor :: proc(a: #simd[N]T) -> T --- + +simd_shuffle :: proc(a, b: #simd[N]T, indices: ..int) -> #simd[len(indices)]T --- +simd_select :: proc(cond: #simd[N]boolean_or_integer, true, false: #simd[N]T) -> #simd[N]T --- + +// Lane-wise operations +simd_ceil :: proc(a: #simd[N]any_float) -> #simd[N]any_float --- +simd_floor :: proc(a: #simd[N]any_float) -> #simd[N]any_float --- +simd_trunc :: proc(a: #simd[N]any_float) -> #simd[N]any_float --- +// rounding to the nearest integral value; if two values are equally near, rounds to the even one +simd_nearest :: proc(a: #simd[N]any_float) -> #simd[N]any_float --- + +simd_to_bits :: proc(v: #simd[N]T) -> #simd[N]Integer where size_of(T) == size_of(Integer), type_is_unsigned(Integer) --- + +// equivalent to a swizzle with descending indices, e.g. swizzle(a, 3, 2, 1, 0)
+simd_lanes_reverse :: proc(a: #simd[N]T) -> #simd[N]T --- + +simd_lanes_rotate_left :: proc(a: #simd[N]T, $offset: int) -> #simd[N]T --- +simd_lanes_rotate_right :: proc(a: #simd[N]T, $offset: int) -> #simd[N]T --- + + // WASM targets only wasm_memory_grow :: proc(index, delta: uintptr) -> int --- wasm_memory_size :: proc(index: uintptr) -> int --- @@ -199,6 +278,10 @@ wasm_memory_size :: proc(index: uintptr) -> int --- wasm_memory_atomic_wait32 :: proc(ptr: ^u32, expected: u32, timeout_ns: i64) -> u32 --- wasm_memory_atomic_notify32 :: proc(ptr: ^u32, waiters: u32) -> (waiters_woken_up: u32) --- +// x86 Targets (i386, amd64) +x86_cpuid :: proc(ax, cx: u32) -> (eax, ebx, ecx, edx: u32) --- +x86_xgetbv :: proc(cx: u32) -> (eax, edx: u32) --- + // Darwin targets only objc_object :: struct{} diff --git a/core/mem/raw.odin b/core/mem/raw.odin index 0a0780dfd..2bce2d7aa 100644 --- a/core/mem/raw.odin +++ b/core/mem/raw.odin @@ -21,6 +21,7 @@ make_any :: proc "contextless" (data: rawptr, id: typeid) -> any { } raw_array_data :: runtime.raw_array_data +raw_simd_data :: runtime.raw_simd_data raw_string_data :: runtime.raw_string_data raw_slice_data :: runtime.raw_slice_data raw_dynamic_array_data :: runtime.raw_dynamic_array_data diff --git a/core/runtime/core_builtin.odin b/core/runtime/core_builtin.odin index 4ddc3928a..7cb5287c0 100644 --- a/core/runtime/core_builtin.odin +++ b/core/runtime/core_builtin.odin @@ -604,6 +604,10 @@ raw_array_data :: proc "contextless" (a: $P/^($T/[$N]$E)) -> [^]E { return ([^]E)(a) } @builtin +raw_simd_data :: proc "contextless" (a: $P/^($T/#simd[$N]$E)) -> [^]E { + return ([^]E)(a) +} +@builtin raw_slice_data :: proc "contextless" (s: $S/[]$E) -> [^]E { ptr := (transmute(Raw_Slice)s).data return ([^]E)(ptr) @@ -619,7 +623,7 @@ raw_string_data :: proc "contextless" (s: $S/string) -> [^]u8 { } @builtin -raw_data :: proc{raw_array_data, raw_slice_data, raw_dynamic_array_data, raw_string_data} +raw_data :: proc{raw_array_data, raw_slice_data, raw_dynamic_array_data, raw_string_data, raw_simd_data} diff --git a/core/simd/simd.odin b/core/simd/simd.odin new file mode 100644 index 000000000..390ff377a --- /dev/null +++ b/core/simd/simd.odin @@ -0,0 +1,188 @@ +package simd + +import "core:builtin" +import "core:intrinsics" + +// 128-bit vector aliases +u8x16 :: #simd[16]u8 +i8x16 :: #simd[16]i8 +u16x8 :: #simd[8]u16 +i16x8 :: #simd[8]i16 +u32x4 :: #simd[4]u32 +i32x4 :: #simd[4]i32 +u64x2 :: #simd[2]u64 +i64x2 :: #simd[2]i64 +f32x4 :: #simd[4]f32 +f64x2 :: #simd[2]f64 + +boolx16 :: #simd[16]bool +b8x16 :: #simd[16]b8 +b16x8 :: #simd[8]b16 +b32x4 :: #simd[4]b32 +b64x2 :: #simd[2]b64 + +// 256-bit vector aliases +u8x32 :: #simd[32]u8 +i8x32 :: #simd[32]i8 +u16x16 :: #simd[16]u16 +i16x16 :: #simd[16]i16 +u32x8 :: #simd[8]u32 +i32x8 :: #simd[8]i32 +u64x4 :: #simd[4]u64 +i64x4 :: #simd[4]i64 +f32x8 :: #simd[8]f32 +f64x4 :: #simd[4]f64 + +boolx32 :: #simd[32]bool +b8x32 :: #simd[32]b8 +b16x16 :: #simd[16]b16 +b32x8 :: #simd[8]b32 +b64x4 :: #simd[4]b64 + +// 512-bit vector aliases +u8x64 :: #simd[64]u8 +i8x64 :: #simd[64]i8 +u16x32 :: #simd[32]u16 +i16x32 :: #simd[32]i16 +u32x16 :: #simd[16]u32 +i32x16 :: #simd[16]i32 +u64x8 :: #simd[8]u64 +i64x8 :: #simd[8]i64 +f32x16 :: #simd[16]f32 +f64x8 :: #simd[8]f64 + +boolx64 :: #simd[64]bool +b8x64 :: #simd[64]b8 +b16x32 :: #simd[32]b16 +b32x16 :: #simd[16]b32 +b64x8 :: #simd[8]b64 + + +add :: intrinsics.simd_add +sub :: intrinsics.simd_sub +mul :: intrinsics.simd_mul +div :: intrinsics.simd_div +rem :: intrinsics.simd_rem // 
integers only + +// Keeps Odin's Behaviour +// (x << y) if y <= mask else 0 +shl :: intrinsics.simd_shl +shr :: intrinsics.simd_shr + +// Similar to C's Behaviour +// x << (y & mask) +shl_masked :: intrinsics.simd_shl_masked +shr_masked :: intrinsics.simd_shr_masked + +// Saturation Arithmetic +add_sat :: intrinsics.simd_add_sat +sub_sat :: intrinsics.simd_sub_sat + +and :: intrinsics.simd_and +or :: intrinsics.simd_or +xor :: intrinsics.simd_xor +and_not :: intrinsics.simd_and_not + +neg :: intrinsics.simd_neg + +abs :: intrinsics.simd_abs + +min :: intrinsics.simd_min +max :: intrinsics.simd_max +clamp :: intrinsics.simd_clamp + +// Return an unsigned integer of the same size as the input type +// NOT A BOOLEAN +// element-wise: +// false => 0x00...00 +// true => 0xff...ff +lanes_eq :: intrinsics.simd_lanes_eq +lanes_ne :: intrinsics.simd_lanes_ne +lanes_lt :: intrinsics.simd_lanes_lt +lanes_le :: intrinsics.simd_lanes_le +lanes_gt :: intrinsics.simd_lanes_gt +lanes_ge :: intrinsics.simd_lanes_ge + +// extract :: proc(a: #simd[N]T, idx: uint) -> T +extract :: intrinsics.simd_extract +// replace :: proc(a: #simd[N]T, idx: uint, elem: T) -> #simd[N]T +replace :: intrinsics.simd_replace + +reduce_add_ordered :: intrinsics.simd_reduce_add_ordered +reduce_mul_ordered :: intrinsics.simd_reduce_mul_ordered +reduce_min :: intrinsics.simd_reduce_min +reduce_max :: intrinsics.simd_reduce_max +reduce_and :: intrinsics.simd_reduce_and +reduce_or :: intrinsics.simd_reduce_or +reduce_xor :: intrinsics.simd_reduce_xor + +// swizzle :: proc(a: #simd[N]T, indices: ..int) -> #simd[len(indices)]T +swizzle :: builtin.swizzle + +// shuffle :: proc(a, b: #simd[N]T, indices: #simd[max 2*N]u32) -> #simd[len(indices)]T +shuffle :: intrinsics.simd_shuffle + +// select :: proc(cond: #simd[N]boolean_or_integer, true, false: #simd[N]T) -> #simd[N]T +select :: intrinsics.simd_select + + +sqrt :: intrinsics.sqrt +ceil :: intrinsics.simd_ceil +floor :: intrinsics.simd_floor +trunc :: intrinsics.simd_trunc +nearest :: intrinsics.simd_nearest + +to_bits :: intrinsics.simd_to_bits + +lanes_reverse :: intrinsics.simd_lanes_reverse + +lanes_rotate_left :: intrinsics.simd_lanes_rotate_left +lanes_rotate_right :: intrinsics.simd_lanes_rotate_right + +count_ones :: intrinsics.count_ones +count_zeros :: intrinsics.count_zeros +count_trailing_zeros :: intrinsics.count_trailing_zeros +count_leading_zeros :: intrinsics.count_leading_zeros +reverse_bits :: intrinsics.reverse_bits + +fused_mul_add :: intrinsics.fused_mul_add +fma :: intrinsics.fused_mul_add + +to_array_ptr :: #force_inline proc "contextless" (v: ^#simd[$LANES]$E) -> ^[LANES]E { + return (^[LANES]E)(v) +} +to_array :: #force_inline proc "contextless" (v: #simd[$LANES]$E) -> [LANES]E { + return transmute([LANES]E)(v) +} +from_array :: #force_inline proc "contextless" (v: $A/[$LANES]$E) -> #simd[LANES]E { + return transmute(#simd[LANES]E)v +} + +from_slice :: proc($T: typeid/#simd[$LANES]$E, slice: []E) -> T { + assert(len(slice) >= LANES, "slice length must be at least the number of lanes") + array: [LANES]E + #no_bounds_check for i in 0..<LANES { + array[i] = slice[i] + } + return transmute(T)array +} + +bit_not :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_integer(E) { + return xor(v, T(~E(0))) +} + +copysign :: #force_inline proc "contextless" (v, sign: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) { + neg_zero := to_bits(T(-0.0)) + sign_bit := to_bits(sign) & neg_zero + magnitude := to_bits(v) &~ neg_zero + 
return transmute(T)(sign_bit|magnitude) +} + +signum :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) { + is_nan := lanes_ne(v, v) + return select(is_nan, v, copysign(T(1), v)) +} + +recip :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) { + return T(1) / v +} diff --git a/core/simd/x86/abm.odin b/core/simd/x86/abm.odin new file mode 100644 index 000000000..79b806242 --- /dev/null +++ b/core/simd/x86/abm.odin @@ -0,0 +1,24 @@ +//+build i386, amd64 +package simd_x86 + +import "core:intrinsics" + +@(require_results, enable_target_feature="lzcnt") +_lzcnt_u32 :: #force_inline proc "c" (x: u32) -> u32 { + return intrinsics.count_leading_zeros(x) +} +@(require_results, enable_target_feature="popcnt") +_popcnt32 :: #force_inline proc "c" (x: u32) -> i32 { + return i32(intrinsics.count_ones(x)) +} + +when ODIN_ARCH == .amd64 { + @(require_results, enable_target_feature="lzcnt") + _lzcnt_u64 :: #force_inline proc "c" (x: u64) -> u64 { + return intrinsics.count_leading_zeros(x) + } + @(require_results, enable_target_feature="popcnt") + _popcnt64 :: #force_inline proc "c" (x: u64) -> i32 { + return i32(intrinsics.count_ones(x)) + } +}
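A short usage sketch for the core/simd/simd.odin helpers above (`from_array`, `signum`, and `to_array` are taken from the diff; the values are illustrative):

```odin
package example

import "core:fmt"
import "core:simd"

main :: proc() {
	data := [4]f32{1.5, -2.25, 3, -4}
	v := simd.from_array(data)    // [4]f32 -> #simd[4]f32
	s := simd.signum(v)           // lane-wise sign; NaN lanes pass through
	fmt.println(simd.to_array(s)) // [1, -1, 1, -1]
}
```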
\ No newline at end of file diff --git a/core/simd/x86/adx.odin b/core/simd/x86/adx.odin new file mode 100644 index 000000000..d03cffcff --- /dev/null +++ b/core/simd/x86/adx.odin @@ -0,0 +1,56 @@ +//+build i386, amd64 +package simd_x86 + +@(require_results) +_addcarry_u32 :: #force_inline proc "c" (c_in: u8, a: u32, b: u32, out: ^u32) -> u8 { + x, y := llvm_addcarry_u32(c_in, a, b) + out^ = y + return x +} +@(require_results) +_addcarryx_u32 :: #force_inline proc "c" (c_in: u8, a: u32, b: u32, out: ^u32) -> u8 { + return llvm_addcarryx_u32(c_in, a, b, out) +} +@(require_results) +_subborrow_u32 :: #force_inline proc "c" (c_in: u8, a: u32, b: u32, out: ^u32) -> u8 { + x, y := llvm_subborrow_u32(c_in, a, b) + out^ = y + return x +} + +when ODIN_ARCH == .amd64 { + @(require_results) + _addcarry_u64 :: #force_inline proc "c" (c_in: u8, a: u64, b: u64, out: ^u64) -> u8 { + x, y := llvm_addcarry_u64(c_in, a, b) + out^ = y + return x + } + @(require_results) + _addcarryx_u64 :: #force_inline proc "c" (c_in: u8, a: u64, b: u64, out: ^u64) -> u8 { + return llvm_addcarryx_u64(c_in, a, b, out) + } + @(require_results) + _subborrow_u64 :: #force_inline proc "c" (c_in: u8, a: u64, b: u64, out: ^u64) -> u8 { + x, y := llvm_subborrow_u64(c_in, a, b) + out^ = y + return x + } +} + +@(private, default_calling_convention="c") +foreign _ { + @(link_name="llvm.x86.addcarry.32") + llvm_addcarry_u32 :: proc(a: u8, b: u32, c: u32) -> (u8, u32) --- + @(link_name="llvm.x86.addcarryx.u32") + llvm_addcarryx_u32 :: proc(a: u8, b: u32, c: u32, d: rawptr) -> u8 --- + @(link_name="llvm.x86.subborrow.32") + llvm_subborrow_u32 :: proc(a: u8, b: u32, c: u32) -> (u8, u32) --- + + // amd64 only + @(link_name="llvm.x86.addcarry.64") + llvm_addcarry_u64 :: proc(a: u8, b: u64, c: u64) -> (u8, u64) --- + @(link_name="llvm.x86.addcarryx.u64") + llvm_addcarryx_u64 :: proc(a: u8, b: u64, c: u64, d: rawptr) -> u8 --- + @(link_name="llvm.x86.subborrow.64") + llvm_subborrow_u64 :: proc(a: u8, b: u64, c: u64) -> (u8, u64) --- +} diff --git a/core/simd/x86/cmpxchg16b.odin b/core/simd/x86/cmpxchg16b.odin new file mode 100644 index 000000000..d575dd9df --- /dev/null +++ b/core/simd/x86/cmpxchg16b.odin @@ -0,0 +1,8 @@ +//+build amd64 +package simd_x86 + +import "core:intrinsics" + +cmpxchg16b :: #force_inline proc "c" (dst: ^u128, old, new: u128, $success, $failure: intrinsics.Atomic_Memory_Order) -> (val: u128) { + return intrinsics.atomic_compare_exchange_strong_explicit(dst, old, new, success, failure) +}
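The add-with-carry wrappers in adx.odin chain naturally for multi-precision arithmetic. A hedged sketch (amd64 only; the 256-bit limb layout and the `add_u256` name are assumptions for illustration, not part of the diff):

```odin
package example

import x86 "core:simd/x86"

// Adds two 256-bit integers held as four u64 limbs, least-significant first.
add_u256 :: proc "c" (a, b: [4]u64) -> (sum: [4]u64, carry_out: u8) {
	c: u8
	c = x86._addcarry_u64(c, a[0], b[0], &sum[0])
	c = x86._addcarry_u64(c, a[1], b[1], &sum[1])
	c = x86._addcarry_u64(c, a[2], b[2], &sum[2])
	c = x86._addcarry_u64(c, a[3], b[3], &sum[3])
	return sum, c
}
```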
\ No newline at end of file diff --git a/core/simd/x86/cpu.odin b/core/simd/x86/cpu.odin new file mode 100644 index 000000000..14e90c0f0 --- /dev/null +++ b/core/simd/x86/cpu.odin @@ -0,0 +1,94 @@ +//+build i386, amd64 +package simd_x86 + +import "core:intrinsics" + +// cpuid :: proc(ax, cx: u32) -> (eax, ebx, ecx, edx: u32) --- +cpuid :: intrinsics.x86_cpuid + +// xgetbv :: proc(cx: u32) -> (eax, edx: u32) --- +xgetbv :: intrinsics.x86_xgetbv + + +CPU_Feature :: enum u64 { + aes, // AES hardware implementation (AES NI) + adx, // Multi-precision add-carry instruction extensions + avx, // Advanced vector extension + avx2, // Advanced vector extension 2 + bmi1, // Bit manipulation instruction set 1 + bmi2, // Bit manipulation instruction set 2 + erms, // Enhanced REP for MOVSB and STOSB + fma, // Fused-multiply-add instructions + os_xsave, // OS supports XSAVE/XRESTOR for saving/restoring XMM registers. + pclmulqdq, // PCLMULQDQ instruction - most often used for AES-GCM + popcnt, // Hamming weight instruction POPCNT. + rdrand, // RDRAND instruction (on-chip random number generator) + rdseed, // RDSEED instruction (on-chip random number generator) + sse2, // Streaming SIMD extension 2 (always available on amd64) + sse3, // Streaming SIMD extension 3 + ssse3, // Supplemental streaming SIMD extension 3 + sse41, // Streaming SIMD extension 4 and 4.1 + sse42, // Streaming SIMD extension 4 and 4.2 +} + +CPU_Features :: distinct bit_set[CPU_Feature; u64] + +cpu_features: Maybe(CPU_Features) + +@(init, private) +init_cpu_features :: proc "c" () { + is_set :: #force_inline proc "c" (hwc: u32, value: u32) -> bool { + return value & (1 << hwc) != 0 + } + try_set :: #force_inline proc "c" (set: ^CPU_Features, feature: CPU_Feature, hwc: u32, value: u32) { + if is_set(hwc, value) { + set^ += {feature} + } + } + + max_id, _, _, _ := cpuid(0, 0) + if max_id < 1 { + return + } + + set: CPU_Features + + _, _, ecx1, edx1 := cpuid(1, 0) + + try_set(&set, .sse2, 26, edx1) + try_set(&set, .sse3, 0, ecx1) + try_set(&set, .pclmulqdq, 1, ecx1) + try_set(&set, .ssse3, 9, ecx1) + try_set(&set, .fma, 12, ecx1) + try_set(&set, .sse41, 19, ecx1) + try_set(&set, .sse42, 20, ecx1) + try_set(&set, .popcnt, 23, ecx1) + try_set(&set, .aes, 25, ecx1) + try_set(&set, .os_xsave, 27, ecx1) + try_set(&set, .rdrand, 30, ecx1) + + os_supports_avx := false + if .os_xsave in set { + eax, _ := xgetbv(0) + os_supports_avx = is_set(1, eax) && is_set(2, eax) + } + if os_supports_avx { + try_set(&set, .avx, 28, ecx1) + } + + if max_id < 7 { + return + } + + _, ebx7, _, _ := cpuid(7, 0) + try_set(&set, .bmi1, 3, ebx7) + if os_supports_avx { + try_set(&set, .avx2, 5, ebx7) + } + try_set(&set, .bmi2, 8, ebx7) + try_set(&set, .erms, 9, ebx7) + try_set(&set, .rdseed, 18, ebx7) + try_set(&set, .adx, 19, ebx7) + + cpu_features = set +} diff --git a/core/simd/x86/fxsr.odin b/core/simd/x86/fxsr.odin new file mode 100644 index 000000000..cd78de7d4 --- /dev/null +++ b/core/simd/x86/fxsr.odin @@ -0,0 +1,36 @@ +//+build i386, amd64 +package simd_x86 + +@(enable_target_feature="fxsr") +_fxsave :: #force_inline proc "c" (mem_addr: rawptr) { + fxsave(mem_addr) +} +@(enable_target_feature="fxsr") +_fxrstor :: #force_inline proc "c" (mem_addr: rawptr) { + fxrstor(mem_addr) +} + +when ODIN_ARCH == .amd64 { + @(enable_target_feature="fxsr") + _fxsave64 :: #force_inline proc "c" (mem_addr: rawptr) { + fxsave64(mem_addr) + } + @(enable_target_feature="fxsr") + _fxrstor64 :: #force_inline proc "c" (mem_addr: rawptr) { + fxrstor64(mem_addr) + } +} + +@(private, 
default_calling_convention="c") +foreign _ { + @(link_name="llvm.x86.fxsave") + fxsave :: proc(p: rawptr) --- + @(link_name="llvm.x86.fxrstor") + fxrstor :: proc(p: rawptr) --- + + // amd64 only + @(link_name="llvm.x86.fxsave64") + fxsave64 :: proc(p: rawptr) --- + @(link_name="llvm.x86.fxrstor64") + fxrstor64 :: proc(p: rawptr) --- +}
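The `cpu_features` table from cpu.odin above is filled in at program startup by its `@(init)` procedure, so runtime dispatch can look like the following sketch (the feature choice and fallback are illustrative assumptions):

```odin
package example

import "core:fmt"
import x86 "core:simd/x86"

main :: proc() {
	// cpu_features is a Maybe in case CPUID information is unavailable.
	if features, ok := x86.cpu_features.?; ok && .sse42 in features {
		fmt.println("taking the SSE4.2 path")
	} else {
		fmt.println("taking the scalar fallback")
	}
}
```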
\ No newline at end of file diff --git a/core/simd/x86/pclmulqdq.odin b/core/simd/x86/pclmulqdq.odin new file mode 100644 index 000000000..692fb7ce1 --- /dev/null +++ b/core/simd/x86/pclmulqdq.odin @@ -0,0 +1,13 @@ +//+build i386, amd64 +package simd_x86 + +@(require_results, enable_target_feature="pclmulqdq") +_mm_clmulepi64_si128 :: #force_inline proc "c" (a, b: __m128i, $IMM8: u8) -> __m128i { + return pclmulqdq(a, b, u8(IMM8)) +} + +@(private, default_calling_convention="c") +foreign _ { + @(link_name="llvm.x86.pclmulqdq") + pclmulqdq :: proc(a, round_key: __m128i, #const imm8: u8) -> __m128i --- +}
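A sketch of driving `_mm_clmulepi64_si128` above; the IMM8 operand selects which 64-bit halves are multiplied (0x00 takes both low halves), the usual building block for CRC and GHASH-style code. The helper name and operand packing here are assumptions:

```odin
package example

import x86 "core:simd/x86"

// Carry-less multiply of two 64-bit values, producing a 128-bit product.
clmul_lo :: proc "c" (a, b: u64) -> x86.__m128i {
	va := x86._mm_set_epi64x(0, i64(a)) // value placed in the low lane
	vb := x86._mm_set_epi64x(0, i64(b))
	return x86._mm_clmulepi64_si128(va, vb, 0x00)
}
```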
\ No newline at end of file diff --git a/core/simd/x86/rdtsc.odin b/core/simd/x86/rdtsc.odin new file mode 100644 index 000000000..54024c3f2 --- /dev/null +++ b/core/simd/x86/rdtsc.odin @@ -0,0 +1,20 @@ +//+build i386, amd64 +package simd_x86 + +@(require_results) +_rdtsc :: #force_inline proc "c" () -> u64 { + return rdtsc() +} + +@(require_results) +__rdtscp :: #force_inline proc "c" (aux: ^u32) -> u64 { + return rdtscp(aux) +} + +@(private, default_calling_convention="c") +foreign _ { + @(link_name="llvm.x86.rdtsc") + rdtsc :: proc() -> u64 --- + @(link_name="llvm.x86.rdtscp") + rdtscp :: proc(aux: rawptr) -> u64 --- +}
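A timing sketch using the TSC wrappers above. Converting cycles to wall time and the extra serialization `rdtscp` provides are glossed over; treat this as an assumption-laden example rather than guidance from the patch:

```odin
package example

import "core:fmt"
import x86 "core:simd/x86"

main :: proc() {
	start := x86._rdtsc()
	// ... the work being measured ...
	aux: u32
	stop := x86.__rdtscp(&aux) // also reports IA32_TSC_AUX in aux
	fmt.println("elapsed cycles:", stop - start)
}
```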
\ No newline at end of file diff --git a/core/simd/x86/sha.odin b/core/simd/x86/sha.odin new file mode 100644 index 000000000..f015f4b8a --- /dev/null +++ b/core/simd/x86/sha.odin @@ -0,0 +1,49 @@ +//+build i386, amd64 +package simd_x86 + +@(require_results, enable_target_feature="sha") +_mm_sha1msg1_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)sha1msg1(transmute(i32x4)a, transmute(i32x4)b) +} +@(require_results, enable_target_feature="sha") +_mm_sha1msg2_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)sha1msg2(transmute(i32x4)a, transmute(i32x4)b) +} +@(require_results, enable_target_feature="sha") +_mm_sha1nexte_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)sha1nexte(transmute(i32x4)a, transmute(i32x4)b) +} +@(require_results, enable_target_feature="sha") +_mm_sha1rnds4_epu32 :: #force_inline proc "c" (a, b: __m128i, $FUNC: u32) -> __m128i where 0 <= FUNC, FUNC <= 3 { + return transmute(__m128i)sha1rnds4(transmute(i32x4)a, transmute(i32x4)b, u8(FUNC & 0xff)) +} +@(require_results, enable_target_feature="sha") +_mm_sha256msg1_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)sha256msg1(transmute(i32x4)a, transmute(i32x4)b) +} +@(require_results, enable_target_feature="sha") +_mm_sha256msg2_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)sha256msg2(transmute(i32x4)a, transmute(i32x4)b) +} +@(require_results, enable_target_feature="sha") +_mm_sha256rnds2_epu32 :: #force_inline proc "c" (a, b, k: __m128i) -> __m128i { + return transmute(__m128i)sha256rnds2(transmute(i32x4)a, transmute(i32x4)b, transmute(i32x4)k) +} + +@(private, default_calling_convention="c") +foreign _ { + @(link_name="llvm.x86.sha1msg1") + sha1msg1 :: proc(a, b: i32x4) -> i32x4 --- + @(link_name="llvm.x86.sha1msg2") + sha1msg2 :: proc(a, b: i32x4) -> i32x4 --- + @(link_name="llvm.x86.sha1nexte") + sha1nexte :: proc(a, b: i32x4) -> i32x4 --- + @(link_name="llvm.x86.sha1rnds4") + sha1rnds4 :: proc(a, b: i32x4, #const c: u8) -> i32x4 --- + @(link_name="llvm.x86.sha256msg1") + sha256msg1 :: proc(a, b: i32x4) -> i32x4 --- + @(link_name="llvm.x86.sha256msg2") + sha256msg2 :: proc(a, b: i32x4) -> i32x4 --- + @(link_name="llvm.x86.sha256rnds2") + sha256rnds2 :: proc(a, b, k: i32x4) -> i32x4 --- +}
\ No newline at end of file diff --git a/core/simd/x86/sse.odin b/core/simd/x86/sse.odin new file mode 100644 index 000000000..3efdeccba --- /dev/null +++ b/core/simd/x86/sse.odin @@ -0,0 +1,618 @@ +//+build i386, amd64 +package simd_x86 + +import "core:intrinsics" +import "core:simd" + +// _MM_SHUFFLE(z, y, x, w) -> (z<<6 | y<<4 | x<<2 | w) +_MM_SHUFFLE :: intrinsics.simd_x86__MM_SHUFFLE + +_MM_HINT_T0 :: 3 +_MM_HINT_T1 :: 2 +_MM_HINT_T2 :: 1 +_MM_HINT_NTA :: 0 +_MM_HINT_ET0 :: 7 +_MM_HINT_ET1 :: 6 + + +_MM_EXCEPT_INVALID :: 0x0001 +_MM_EXCEPT_DENORM :: 0x0002 +_MM_EXCEPT_DIV_ZERO :: 0x0004 +_MM_EXCEPT_OVERFLOW :: 0x0008 +_MM_EXCEPT_UNDERFLOW :: 0x0010 +_MM_EXCEPT_INEXACT :: 0x0020 +_MM_EXCEPT_MASK :: 0x003f + +_MM_MASK_INVALID :: 0x0080 +_MM_MASK_DENORM :: 0x0100 +_MM_MASK_DIV_ZERO :: 0x0200 +_MM_MASK_OVERFLOW :: 0x0400 +_MM_MASK_UNDERFLOW :: 0x0800 +_MM_MASK_INEXACT :: 0x1000 +_MM_MASK_MASK :: 0x1f80 + +_MM_ROUND_NEAREST :: 0x0000 +_MM_ROUND_DOWN :: 0x2000 +_MM_ROUND_UP :: 0x4000 +_MM_ROUND_TOWARD_ZERO :: 0x6000 + +_MM_ROUND_MASK :: 0x6000 + +_MM_FLUSH_ZERO_MASK :: 0x8000 +_MM_FLUSH_ZERO_ON :: 0x8000 +_MM_FLUSH_ZERO_OFF :: 0x0000 + + +@(require_results, enable_target_feature="sse") +_mm_add_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return addss(a, b) +} +@(require_results, enable_target_feature="sse") +_mm_add_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return simd.add(a, b) +} + +@(require_results, enable_target_feature="sse") +_mm_sub_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return subss(a, b) +} +@(require_results, enable_target_feature="sse") +_mm_sub_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return simd.sub(a, b) +} + +@(require_results, enable_target_feature="sse") +_mm_mul_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return mulss(a, b) +} +@(require_results, enable_target_feature="sse") +_mm_mul_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return simd.mul(a, b) +} + +@(require_results, enable_target_feature="sse") +_mm_div_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return divss(a, b) +} +@(require_results, enable_target_feature="sse") +_mm_div_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return simd.div(a, b) +} + +@(require_results, enable_target_feature="sse") +_mm_sqrt_ss :: #force_inline proc "c" (a: __m128) -> __m128 { + return sqrtss(a) +} +@(require_results, enable_target_feature="sse") +_mm_sqrt_ps :: #force_inline proc "c" (a: __m128) -> __m128 { + return sqrtps(a) +} + +@(require_results, enable_target_feature="sse") +_mm_rcp_ss :: #force_inline proc "c" (a: __m128) -> __m128 { + return rcpss(a) +} +@(require_results, enable_target_feature="sse") +_mm_rcp_ps :: #force_inline proc "c" (a: __m128) -> __m128 { + return rcpps(a) +} + +@(require_results, enable_target_feature="sse") +_mm_rsqrt_ss :: #force_inline proc "c" (a: __m128) -> __m128 { + return rsqrtss(a) +} +@(require_results, enable_target_feature="sse") +_mm_rsqrt_ps :: #force_inline proc "c" (a: __m128) -> __m128 { + return rsqrtps(a) +} + +@(require_results, enable_target_feature="sse") +_mm_min_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return minss(a, b) +} +@(require_results, enable_target_feature="sse") +_mm_min_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return minps(a, b) +} + +@(require_results, enable_target_feature="sse") +_mm_max_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return maxss(a, b) +} +@(require_results, 
enable_target_feature="sse") +_mm_max_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return maxps(a, b) +} + +@(require_results, enable_target_feature="sse") +_mm_and_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return transmute(__m128)simd.and(transmute(__m128i)a, transmute(__m128i)b) +} +@(require_results, enable_target_feature="sse") +_mm_andnot_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return transmute(__m128)simd.and_not(transmute(__m128i)a, transmute(__m128i)b) +} +@(require_results, enable_target_feature="sse") +_mm_or_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return transmute(__m128)simd.or(transmute(__m128i)a, transmute(__m128i)b) +} +@(require_results, enable_target_feature="sse") +_mm_xor_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return transmute(__m128)simd.xor(transmute(__m128i)a, transmute(__m128i)b) +} + + +@(require_results, enable_target_feature="sse") +_mm_cmpeq_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return cmpss(a, b, 0) +} +@(require_results, enable_target_feature="sse") +_mm_cmplt_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return cmpss(a, b, 1) +} +@(require_results, enable_target_feature="sse") +_mm_cmple_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return cmpss(a, b, 2) +} +@(require_results, enable_target_feature="sse") +_mm_cmpgt_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return simd.shuffle(a, cmpss(b, a, 1), 4, 1, 2, 3) +} +@(require_results, enable_target_feature="sse") +_mm_cmpge_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return simd.shuffle(a, cmpss(b, a, 2), 4, 1, 2, 3) +} +@(require_results, enable_target_feature="sse") +_mm_cmpneq_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return cmpss(a, b, 4) +} +@(require_results, enable_target_feature="sse") +_mm_cmpnlt_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return cmpss(a, b, 5) +} +@(require_results, enable_target_feature="sse") +_mm_cmpnle_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return cmpss(a, b, 6) +} +@(require_results, enable_target_feature="sse") +_mm_cmpngt_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return simd.shuffle(a, cmpss(b, a, 5), 4, 1, 2, 3) +} +@(require_results, enable_target_feature="sse") +_mm_cmpnge_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return simd.shuffle(a, cmpss(b, a, 6), 4, 1, 2, 3) +} +@(require_results, enable_target_feature="sse") +_mm_cmpord_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return cmpss(a, b, 7) +} +@(require_results, enable_target_feature="sse") +_mm_cmpunord_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return cmpss(a, b, 3) +} + + +@(require_results, enable_target_feature="sse") +_mm_cmpeq_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return cmpps(a, b, 0) +} +@(require_results, enable_target_feature="sse") +_mm_cmplt_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return cmpps(a, b, 1) +} +@(require_results, enable_target_feature="sse") +_mm_cmple_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return cmpps(a, b, 2) +} +@(require_results, enable_target_feature="sse") +_mm_cmpgt_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return cmpps(b, a, 1) +} +@(require_results, enable_target_feature="sse") +_mm_cmpge_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return cmpps(b, a, 2) +} +@(require_results, enable_target_feature="sse") +_mm_cmpneq_ps :: #force_inline proc "c" 
(a, b: __m128) -> __m128 { + return cmpps(a, b, 4) +} +@(require_results, enable_target_feature="sse") +_mm_cmpnlt_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return cmpps(a, b, 5) +} +@(require_results, enable_target_feature="sse") +_mm_cmpnle_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return cmpps(a, b, 6) +} +@(require_results, enable_target_feature="sse") +_mm_cmpngt_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return cmpps(b, a, 5) +} +@(require_results, enable_target_feature="sse") +_mm_cmpnge_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return cmpps(b, a, 6) +} +@(require_results, enable_target_feature="sse") +_mm_cmpord_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return cmpps(b, a, 7) +} +@(require_results, enable_target_feature="sse") +_mm_cmpunord_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return cmpps(b, a, 3) +} + + +@(require_results, enable_target_feature="sse") +_mm_comieq_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { + return comieq_ss(a, b) +} +@(require_results, enable_target_feature="sse") +_mm_comilt_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { + return comilt_ss(a, b) +} +@(require_results, enable_target_feature="sse") +_mm_comile_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { + return comile_ss(a, b) +} +@(require_results, enable_target_feature="sse") +_mm_comigt_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { + return comigt_ss(a, b) +} +@(require_results, enable_target_feature="sse") +_mm_comige_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { + return comige_ss(a, b) +} +@(require_results, enable_target_feature="sse") +_mm_comineq_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { + return comineq_ss(a, b) +} + +@(require_results, enable_target_feature="sse") +_mm_ucomieq_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { + return ucomieq_ss(a, b) +} +@(require_results, enable_target_feature="sse") +_mm_ucomilt_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { + return ucomilt_ss(a, b) +} +@(require_results, enable_target_feature="sse") +_mm_ucomile_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { + return ucomile_ss(a, b) +} +@(require_results, enable_target_feature="sse") +_mm_ucomigt_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { + return ucomigt_ss(a, b) +} +@(require_results, enable_target_feature="sse") +_mm_ucomige_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { + return ucomige_ss(a, b) +} +@(require_results, enable_target_feature="sse") +_mm_ucomineq_ss :: #force_inline proc "c" (a, b: __m128) -> b32 { + return ucomineq_ss(a, b) +} + +@(require_results, enable_target_feature="sse") +_mm_cvtss_si32 :: #force_inline proc "c" (a: __m128) -> i32 { + return cvtss2si(a) +} +_mm_cvt_ss2si :: _mm_cvtss_si32 + +@(require_results, enable_target_feature="sse") +_mm_cvttss_si32 :: #force_inline proc "c" (a: __m128) -> i32 { + return cvttss2si(a) +} + +@(require_results, enable_target_feature="sse") +_mm_cvtss_f32 :: #force_inline proc "c" (a: __m128) -> f32 { + return simd.extract(a, 0) +} + +@(require_results, enable_target_feature="sse") +_mm_cvtsi32_ss :: #force_inline proc "c" (a: __m128, b: i32) -> __m128 { + return cvtsi2ss(a, b) +} +_mm_cvt_si2ss :: _mm_cvtsi32_ss + + +@(require_results, enable_target_feature="sse") +_mm_set_ss :: #force_inline proc "c" (a: f32) -> __m128 { + return __m128{a, 0, 0, 0} +} +@(require_results, enable_target_feature="sse") +_mm_set1_ps :: #force_inline proc "c" (a: f32) -> __m128 { + return __m128(a) +} +_mm_set_ps1 :: _mm_set1_ps + +@(require_results, enable_target_feature="sse") +_mm_set_ps :: 
#force_inline proc "c" (a, b, c, d: f32) -> __m128 { + return __m128{d, c, b, a} +} +@(require_results, enable_target_feature="sse") +_mm_setr_ps :: #force_inline proc "c" (a, b, c, d: f32) -> __m128 { + return __m128{a, b, c, d} +} + +@(require_results, enable_target_feature="sse") +_mm_setzero_ps :: #force_inline proc "c" () -> __m128 { + return __m128{0, 0, 0, 0} +} + +@(require_results, enable_target_feature="sse") +_mm_shuffle_ps :: #force_inline proc "c" (a, b: __m128, $MASK: u32) -> __m128 { + return simd.shuffle( + a, b, + u32(MASK) & 0b11, + (u32(MASK)>>2) & 0b11, + ((u32(MASK)>>4) & 0b11)+4, + ((u32(MASK)>>6) & 0b11)+4) +} + + +@(require_results, enable_target_feature="sse") +_mm_unpackhi_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return simd.shuffle(a, b, 2, 6, 3, 7) +} +@(require_results, enable_target_feature="sse") +_mm_unpacklo_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return simd.shuffle(a, b, 0, 4, 1, 5) +} + +@(require_results, enable_target_feature="sse") +_mm_movehl_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return simd.shuffle(a, b, 6, 7, 2, 3) +} +@(require_results, enable_target_feature="sse") +_mm_movelh_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return simd.shuffle(a, b, 0, 1, 4, 5) +} + +@(require_results, enable_target_feature="sse") +_mm_movemask_ps :: #force_inline proc "c" (a: __m128) -> u32 { + return movmskps(a) +} + +@(require_results, enable_target_feature="sse") +_mm_load_ss :: #force_inline proc "c" (p: ^f32) -> __m128 { + return __m128{p^, 0, 0, 0} +} +@(require_results, enable_target_feature="sse") +_mm_load1_ps :: #force_inline proc "c" (p: ^f32) -> __m128 { + a := p^ + return __m128(a) +} +_mm_load_ps1 :: _mm_load1_ps + +@(require_results, enable_target_feature="sse") +_mm_load_ps :: #force_inline proc "c" (p: [^]f32) -> __m128 { + return (^__m128)(p)^ +} + +@(require_results, enable_target_feature="sse") +_mm_loadu_ps :: #force_inline proc "c" (p: [^]f32) -> __m128 { + dst := _mm_undefined_ps() + intrinsics.mem_copy_non_overlapping(&dst, p, size_of(__m128)) + return dst +} + +@(require_results, enable_target_feature="sse") +_mm_loadr_ps :: #force_inline proc "c" (p: [^]f32) -> __m128 { + return simd.lanes_reverse(_mm_load_ps(p)) +} + +@(require_results, enable_target_feature="sse") +_mm_loadu_si64 :: #force_inline proc "c" (mem_addr: rawptr) -> __m128i { + a := intrinsics.unaligned_load((^i64)(mem_addr)) + return __m128i{a, 0} +} + +@(enable_target_feature="sse") +_mm_store_ss :: #force_inline proc "c" (p: ^f32, a: __m128) { + p^ = simd.extract(a, 0) +} + +@(enable_target_feature="sse") +_mm_store1_ps :: #force_inline proc "c" (p: [^]f32, a: __m128) { + b := simd.swizzle(a, 0, 0, 0, 0) + (^__m128)(p)^ = b +} +_mm_store_ps1 :: _mm_store1_ps + + +@(enable_target_feature="sse") +_mm_store_ps :: #force_inline proc "c" (p: [^]f32, a: __m128) { + (^__m128)(p)^ = a +} +@(enable_target_feature="sse") +_mm_storeu_ps :: #force_inline proc "c" (p: [^]f32, a: __m128) { + b := a + intrinsics.mem_copy_non_overlapping(p, &b, size_of(__m128)) +} +@(enable_target_feature="sse") +_mm_storer_ps :: #force_inline proc "c" (p: [^]f32, a: __m128) { + (^__m128)(p)^ = simd.lanes_reverse(a) +} + + +@(require_results, enable_target_feature="sse") +_mm_move_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return simd.shuffle(a, b, 4, 1, 2, 3) +} + +@(enable_target_feature="sse") +_mm_sfence :: #force_inline proc "c" () { + sfence() +} + +@(require_results, enable_target_feature="sse") +_mm_getcsr :: 
#force_inline proc "c" () -> (result: u32) { + stmxcsr(&result) + return result +} + +@(enable_target_feature="sse") +_mm_setcsr :: #force_inline proc "c" (val: u32) { + val := val + ldmxcsr(&val) +} + +@(require_results, enable_target_feature="sse") +_MM_GET_EXCEPTION_MASK :: #force_inline proc "c" () -> u32 { + return _mm_getcsr() & _MM_MASK_MASK +} +@(require_results, enable_target_feature="sse") +_MM_GET_EXCEPTION_STATE :: #force_inline proc "c" () -> u32 { + return _mm_getcsr() & _MM_EXCEPT_MASK +} +@(require_results, enable_target_feature="sse") +_MM_GET_FLUSH_ZERO_MODE :: #force_inline proc "c" () -> u32 { + return _mm_getcsr() & _MM_FLUSH_ZERO_MASK +} +@(require_results, enable_target_feature="sse") +_MM_GET_ROUNDING_MODE :: #force_inline proc "c" () -> u32 { + return _mm_getcsr() & _MM_ROUND_MASK +} + +@(enable_target_feature="sse") +_MM_SET_EXCEPTION_MASK :: #force_inline proc "c" (x: u32) { + _mm_setcsr((_mm_getcsr() &~ _MM_MASK_MASK) | x) +} +@(enable_target_feature="sse") +_MM_SET_EXCEPTION_STATE :: #force_inline proc "c" (x: u32) { + _mm_setcsr((_mm_getcsr() &~ _MM_EXCEPT_MASK) | x) +} +@(enable_target_feature="sse") +_MM_SET_FLUSH_ZERO_MODE :: #force_inline proc "c" (x: u32) { + _mm_setcsr((_mm_getcsr() &~ _MM_FLUSH_ZERO_MASK) | x) +} +@(enable_target_feature="sse") +_MM_SET_ROUNDING_MODE :: #force_inline proc "c" (x: u32) { + _mm_setcsr((_mm_getcsr() &~ _MM_ROUND_MASK) | x) +} + +@(enable_target_feature="sse") +_mm_prefetch :: #force_inline proc "c" (p: rawptr, $STRATEGY: u32) { + prefetch(p, (STRATEGY>>2)&1, STRATEGY&3, 1) +} + + +@(require_results, enable_target_feature="sse") +_mm_undefined_ps :: #force_inline proc "c" () -> __m128 { + return _mm_set1_ps(0) +} + +@(enable_target_feature="sse") +_MM_TRANSPOSE4_PS :: #force_inline proc "c" (row0, row1, row2, row3: ^__m128) { + tmp0 := _mm_unpacklo_ps(row0^, row1^) + tmp1 := _mm_unpacklo_ps(row2^, row3^) + tmp2 := _mm_unpackhi_ps(row0^, row1^) + tmp3 := _mm_unpackhi_ps(row2^, row3^) + + row0^ = _mm_movelh_ps(tmp0, tmp2) + row1^ = _mm_movelh_ps(tmp2, tmp0) + row2^ = _mm_movelh_ps(tmp1, tmp3) + row3^ = _mm_movelh_ps(tmp3, tmp1) +} + +@(enable_target_feature="sse") +_mm_stream_ps :: #force_inline proc "c" (addr: [^]f32, a: __m128) { + intrinsics.non_temporal_store((^__m128)(addr), a) +} + +when ODIN_ARCH == .amd64 { + @(require_results, enable_target_feature="sse") + _mm_cvtss_si64 :: #force_inline proc "c"(a: __m128) -> i64 { + return cvtss2si64(a) + } + @(require_results, enable_target_feature="sse") + _mm_cvttss_si64 :: #force_inline proc "c"(a: __m128) -> i64 { + return cvttss2si64(a) + } + @(require_results, enable_target_feature="sse") + _mm_cvtsi64_ss :: #force_inline proc "c"(a: __m128, b: i64) -> __m128 { + return cvtsi642ss(a, b) + } +} + + +@(private, default_calling_convention="c") +foreign _ { + @(link_name="llvm.x86.sse.add.ss") + addss :: proc(a, b: __m128) -> __m128 --- + @(link_name="llvm.x86.sse.sub.ss") + subss :: proc(a, b: __m128) -> __m128 --- + @(link_name="llvm.x86.sse.mul.ss") + mulss :: proc(a, b: __m128) -> __m128 --- + @(link_name="llvm.x86.sse.div.ss") + divss :: proc(a, b: __m128) -> __m128 --- + @(link_name="llvm.x86.sse.sqrt.ss") + sqrtss :: proc(a: __m128) -> __m128 --- + @(link_name="llvm.x86.sse.sqrt.ps") + sqrtps :: proc(a: __m128) -> __m128 --- + @(link_name="llvm.x86.sse.rcp.ss") + rcpss :: proc(a: __m128) -> __m128 --- + @(link_name="llvm.x86.sse.rcp.ps") + rcpps :: proc(a: __m128) -> __m128 --- + @(link_name="llvm.x86.sse.rsqrt.ss") + rsqrtss :: proc(a: __m128) -> __m128 --- + 
@(link_name="llvm.x86.sse.rsqrt.ps") + rsqrtps :: proc(a: __m128) -> __m128 --- + @(link_name="llvm.x86.sse.min.ss") + minss :: proc(a, b: __m128) -> __m128 --- + @(link_name="llvm.x86.sse.min.ps") + minps :: proc(a, b: __m128) -> __m128 --- + @(link_name="llvm.x86.sse.max.ss") + maxss :: proc(a, b: __m128) -> __m128 --- + @(link_name="llvm.x86.sse.max.ps") + maxps :: proc(a, b: __m128) -> __m128 --- + @(link_name="llvm.x86.sse.movmsk.ps") + movmskps :: proc(a: __m128) -> u32 --- + @(link_name="llvm.x86.sse.cmp.ps") + cmpps :: proc(a, b: __m128, #const imm8: u8) -> __m128 --- + @(link_name="llvm.x86.sse.comieq.ss") + comieq_ss :: proc(a, b: __m128) -> b32 --- + @(link_name="llvm.x86.sse.comilt.ss") + comilt_ss :: proc(a, b: __m128) -> b32 --- + @(link_name="llvm.x86.sse.comile.ss") + comile_ss :: proc(a, b: __m128) -> b32 --- + @(link_name="llvm.x86.sse.comigt.ss") + comigt_ss :: proc(a, b: __m128) -> b32 --- + @(link_name="llvm.x86.sse.comige.ss") + comige_ss :: proc(a, b: __m128) -> b32 --- + @(link_name="llvm.x86.sse.comineq.ss") + comineq_ss :: proc(a, b: __m128) -> b32 --- + @(link_name="llvm.x86.sse.ucomieq.ss") + ucomieq_ss :: proc(a, b: __m128) -> b32 --- + @(link_name="llvm.x86.sse.ucomilt.ss") + ucomilt_ss :: proc(a, b: __m128) -> b32 --- + @(link_name="llvm.x86.sse.ucomile.ss") + ucomile_ss :: proc(a, b: __m128) -> b32 --- + @(link_name="llvm.x86.sse.ucomigt.ss") + ucomigt_ss :: proc(a, b: __m128) -> b32 --- + @(link_name="llvm.x86.sse.ucomige.ss") + ucomige_ss :: proc(a, b: __m128) -> b32 --- + @(link_name="llvm.x86.sse.ucomineq.ss") + ucomineq_ss :: proc(a, b: __m128) -> b32 --- + @(link_name="llvm.x86.sse.cvtss2si") + cvtss2si :: proc(a: __m128) -> i32 --- + @(link_name="llvm.x86.sse.cvttss2si") + cvttss2si :: proc(a: __m128) -> i32 --- + @(link_name="llvm.x86.sse.cvtsi2ss") + cvtsi2ss :: proc(a: __m128, b: i32) -> __m128 --- + @(link_name="llvm.x86.sse.sfence") + sfence :: proc() --- + @(link_name="llvm.x86.sse.stmxcsr") + stmxcsr :: proc(p: rawptr) --- + @(link_name="llvm.x86.sse.ldmxcsr") + ldmxcsr :: proc(p: rawptr) --- + @(link_name="llvm.prefetch") + prefetch :: proc(p: rawptr, #const rw, loc, ty: u32) --- + @(link_name="llvm.x86.sse.cmp.ss") + cmpss :: proc(a, b: __m128, #const imm8: u8) -> __m128 --- + + + // amd64 only + @(link_name="llvm.x86.sse.cvtss2si64") + cvtss2si64 :: proc(a: __m128) -> i64 --- + @(link_name="llvm.x86.sse.cvttss2si64") + cvttss2si64 :: proc(a: __m128) -> i64 --- + @(link_name="llvm.x86.sse.cvtsi642ss") + cvtsi642ss :: proc(a: __m128, b: i64) -> __m128 --- +} diff --git a/core/simd/x86/sse2.odin b/core/simd/x86/sse2.odin new file mode 100644 index 000000000..f33bd2195 --- /dev/null +++ b/core/simd/x86/sse2.odin @@ -0,0 +1,1191 @@ +//+build i386, amd64 +package simd_x86 + +import "core:intrinsics" +import "core:simd" + +@(enable_target_feature="sse2") +_mm_pause :: #force_inline proc "c" () { + pause() +} +@(enable_target_feature="sse2") +_mm_clflush :: #force_inline proc "c" (p: rawptr) { + clflush(p) +} +@(enable_target_feature="sse2") +_mm_lfence :: #force_inline proc "c" () { + lfence() +} +@(enable_target_feature="sse2") +_mm_mfence :: #force_inline proc "c" () { + mfence() +} + +@(require_results, enable_target_feature="sse2") +_mm_add_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.add(transmute(i8x16)a, transmute(i8x16)b) +} +@(require_results, enable_target_feature="sse2") +_mm_add_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return 
transmute(__m128i)simd.add(transmute(i16x8)a, transmute(i16x8)b) +} +@(require_results, enable_target_feature="sse2") +_mm_add_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.add(transmute(i32x4)a, transmute(i32x4)b) +} +@(require_results, enable_target_feature="sse2") +_mm_add_epi64 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.add(transmute(i64x2)a, transmute(i64x2)b) +} +@(require_results, enable_target_feature="sse2") +_mm_adds_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.add_sat(transmute(i8x16)a, transmute(i8x16)b) +} +@(require_results, enable_target_feature="sse2") +_mm_adds_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.add_sat(transmute(i16x8)a, transmute(i16x8)b) +} +@(require_results, enable_target_feature="sse2") +_mm_adds_epu8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.add_sat(transmute(u8x16)a, transmute(u8x16)b) +} +@(require_results, enable_target_feature="sse2") +_mm_adds_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.add_sat(transmute(u16x8)a, transmute(u16x8)b) +} +@(require_results, enable_target_feature="sse2") +_mm_avg_epu8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)pavgb(transmute(u8x16)a, transmute(u8x16)b) +} +@(require_results, enable_target_feature="sse2") +_mm_avg_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)pavgw(transmute(u16x8)a, transmute(u16x8)b) +} + +@(require_results, enable_target_feature="sse2") +_mm_madd_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)pmaddwd(transmute(i16x8)a, transmute(i16x8)b) +} +@(require_results, enable_target_feature="sse2") +_mm_max_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)pmaxsw(transmute(i16x8)a, transmute(i16x8)b) +} +@(require_results, enable_target_feature="sse2") +_mm_max_epu8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)pmaxub(transmute(u8x16)a, transmute(u8x16)b) +} +@(require_results, enable_target_feature="sse2") +_mm_min_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)pminsw(transmute(i16x8)a, transmute(i16x8)b) +} +@(require_results, enable_target_feature="sse2") +_mm_min_epu8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)pminub(transmute(u8x16)a, transmute(u8x16)b) +} + + +@(require_results, enable_target_feature="sse2") +_mm_mulhi_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)pmulhw(transmute(i16x8)a, transmute(i16x8)b) +} +@(require_results, enable_target_feature="sse2") +_mm_mulhi_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)pmulhuw(transmute(u16x8)a, transmute(u16x8)b) +} +@(require_results, enable_target_feature="sse2") +_mm_mullo_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.mul(transmute(i16x8)a, transmute(i16x8)b) +} +@(require_results, enable_target_feature="sse2") +_mm_mul_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)pmuludq(transmute(u32x4)a, transmute(u32x4)b) +} +@(require_results, enable_target_feature="sse2") +_mm_sad_epu8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return 
transmute(__m128i)psadbw(transmute(u8x16)a, transmute(u8x16)b) +} +@(require_results, enable_target_feature="sse2") +_mm_sub_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.sub(transmute(i8x16)a, transmute(i8x16)b) +} +@(require_results, enable_target_feature="sse2") +_mm_sub_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.sub(transmute(i16x8)a, transmute(i16x8)b) +} +@(require_results, enable_target_feature="sse2") +_mm_sub_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.sub(transmute(i32x4)a, transmute(i32x4)b) +} +@(require_results, enable_target_feature="sse2") +_mm_sub_epi64 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.sub(transmute(i64x2)a, transmute(i64x2)b) +} +@(require_results, enable_target_feature="sse2") +_mm_subs_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.sub_sat(transmute(i8x16)a, transmute(i8x16)b) +} +@(require_results, enable_target_feature="sse2") +_mm_subs_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.sub_sat(transmute(i16x8)a, transmute(i16x8)b) +} +@(require_results, enable_target_feature="sse2") +_mm_subs_epu8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.sub_sat(transmute(u8x16)a, transmute(u8x16)b) +} +@(require_results, enable_target_feature="sse2") +_mm_subs_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.sub_sat(transmute(u16x8)a, transmute(u16x8)b) +} + + + +@(private) +@(require_results, enable_target_feature="sse2") +_mm_slli_si128_impl :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { + shift :: IMM8 & 0xff + + return transmute(__m128i)simd.shuffle( + transmute(i8x16)a, + i8x16(0), + 0 when shift > 15 else (16 - shift + 0), + 1 when shift > 15 else (16 - shift + 1), + 2 when shift > 15 else (16 - shift + 2), + 3 when shift > 15 else (16 - shift + 3), + 4 when shift > 15 else (16 - shift + 4), + 5 when shift > 15 else (16 - shift + 5), + 6 when shift > 15 else (16 - shift + 6), + 7 when shift > 15 else (16 - shift + 7), + 8 when shift > 15 else (16 - shift + 8), + 9 when shift > 15 else (16 - shift + 9), + 10 when shift > 15 else (16 - shift + 10), + 11 when shift > 15 else (16 - shift + 11), + 12 when shift > 15 else (16 - shift + 12), + 13 when shift > 15 else (16 - shift + 13), + 14 when shift > 15 else (16 - shift + 14), + 15 when shift > 15 else (16 - shift + 15), + ) +} + +@(private) +@(require_results, enable_target_feature="sse2") +_mm_srli_si128_impl :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { + shift :: IMM8 + return transmute(__m128i)simd.shuffle( + transmute(i8x16)a, + i8x16(0), + 0 + 16 when shift > 15 else (shift + 0), + 1 + 16 when shift > 15 else (shift + 1), + 2 + 16 when shift > 15 else (shift + 2), + 3 + 16 when shift > 15 else (shift + 3), + 4 + 16 when shift > 15 else (shift + 4), + 5 + 16 when shift > 15 else (shift + 5), + 6 + 16 when shift > 15 else (shift + 6), + 7 + 16 when shift > 15 else (shift + 7), + 8 + 16 when shift > 15 else (shift + 8), + 9 + 16 when shift > 15 else (shift + 9), + 10 + 16 when shift > 15 else (shift + 10), + 11 + 16 when shift > 15 else (shift + 11), + 12 + 16 when shift > 15 else (shift + 12), + 13 + 16 when shift > 15 else (shift + 13), + 14 + 16 when shift > 15 else (shift + 14), + 15 + 16 when shift > 15 else (shift + 
15), + ) +} + + +@(require_results, enable_target_feature="sse2") +_mm_slli_si128 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { + return _mm_slli_si128_impl(a, IMM8) +} +@(require_results, enable_target_feature="sse2") +_mm_bslli_si128 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { + return _mm_slli_si128_impl(a, IMM8) +} + + +@(require_results, enable_target_feature="sse2") +_mm_bsrli_si128 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { + return _mm_srli_si128_impl(a, IMM8) +} +@(require_results, enable_target_feature="sse2") +_mm_slli_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { + return transmute(__m128i)pslliw(transmute(i16x8)a, IMM8) +} +@(require_results, enable_target_feature="sse2") +_mm_sll_epi16 :: #force_inline proc "c" (a, count: __m128i) -> __m128i { + return transmute(__m128i)psllw(transmute(i16x8)a, transmute(i16x8)count) +} +@(require_results, enable_target_feature="sse2") +_mm_slli_epi32 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { + return transmute(__m128i)psllid(transmute(i32x4)a, IMM8) +} +@(require_results, enable_target_feature="sse2") +_mm_sll_epi32 :: #force_inline proc "c" (a, count: __m128i) -> __m128i { + return transmute(__m128i)pslld(transmute(i32x4)a, transmute(i32x4)count) +} +@(require_results, enable_target_feature="sse2") +_mm_slli_epi64 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { + return transmute(__m128i)pslliq(transmute(i64x2)a, IMM8) +} +@(require_results, enable_target_feature="sse2") +_mm_sll_epi64 :: #force_inline proc "c" (a, count: __m128i) -> __m128i { + return transmute(__m128i)psllq(transmute(i64x2)a, transmute(i64x2)count) +} +@(require_results, enable_target_feature="sse2") +_mm_srai_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { + return transmute(__m128i)psraiw(transmute(i16x8)a, IMM8) +} +@(require_results, enable_target_feature="sse2") +_mm_sra_epi16 :: #force_inline proc "c" (a, count: __m128i) -> __m128i { + return transmute(__m128i)psraw(transmute(i16x8)a, transmute(i16x8)count) +} +@(require_results, enable_target_feature="sse2") +_mm_srai_epi32 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { + return transmute(__m128i)psraid(transmute(i32x4)a, IMM8) +} +@(require_results, enable_target_feature="sse2") +_mm_sra_epi32 :: #force_inline proc "c" (a, count: __m128i) -> __m128i { + return transmute(__m128i)psrad(transmute(i32x4)a, transmute(i32x4)count) +} + + +@(require_results, enable_target_feature="sse2") +_mm_srli_si128 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { + return _mm_srli_si128_impl(a, IMM8) +} +@(require_results, enable_target_feature="sse2") +_mm_srli_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { + return transmute(__m128i)psrliw(transmute(i16x8)a, IMM8) +}
+@(require_results, enable_target_feature="sse2") +_mm_srl_epi16 :: #force_inline proc "c" (a, count: __m128i) -> __m128i { + return transmute(__m128i)psrlw(transmute(i16x8)a, transmute(i16x8)count) +} +@(require_results, enable_target_feature="sse2") +_mm_srli_epi32 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { + return transmute(__m128i)psrlid(transmute(i32x4)a, IMM8) +} +@(require_results, enable_target_feature="sse2") +_mm_srl_epi32 :: #force_inline proc "c" (a, count: __m128i) -> __m128i { + return transmute(__m128i)psrld(transmute(i32x4)a, transmute(i32x4)count) +} +@(require_results, enable_target_feature="sse2") +_mm_srli_epi64 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { + return transmute(__m128i)psrliq(transmute(i64x2)a, IMM8) +} +@(require_results, enable_target_feature="sse2") +_mm_srl_epi64 :: #force_inline proc "c" (a, count: __m128i) -> __m128i { + return transmute(__m128i)psrlq(transmute(i64x2)a, transmute(i64x2)count) +} + + +@(require_results, enable_target_feature="sse2") +_mm_and_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return simd.and(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_andnot_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return simd.and_not(b, a) +} +@(require_results, enable_target_feature="sse2") +_mm_or_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return simd.or(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_xor_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return simd.xor(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpeq_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.lanes_eq(transmute(i8x16)a, transmute(i8x16)b) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpeq_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.lanes_eq(transmute(i16x8)a, transmute(i16x8)b) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpeq_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.lanes_eq(transmute(i32x4)a, transmute(i32x4)b) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpgt_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.lanes_gt(transmute(i8x16)a, transmute(i8x16)b) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpgt_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.lanes_gt(transmute(i16x8)a, transmute(i16x8)b) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpgt_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.lanes_gt(transmute(i32x4)a, transmute(i32x4)b) +} +@(require_results, enable_target_feature="sse2") +_mm_cmplt_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.lanes_lt(transmute(i8x16)a, transmute(i8x16)b) +} +@(require_results, enable_target_feature="sse2") +_mm_cmplt_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.lanes_lt(transmute(i16x8)a, transmute(i16x8)b) +} +@(require_results, enable_target_feature="sse2") +_mm_cmplt_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.lanes_lt(transmute(i32x4)a, transmute(i32x4)b) +} + + +@(require_results, enable_target_feature="sse2") +_mm_cvtepi32_pd :: #force_inline proc "c" (a: __m128i) -> __m128d { + 
v := transmute(i32x4)a + return cast(__m128d)simd.shuffle(v, v, 0, 1) +} +@(require_results, enable_target_feature="sse2") +_mm_cvtsi32_sd :: #force_inline proc "c" (a: __m128d, b: i32) -> __m128d { + return simd.replace(a, 0, f64(b)) +} +@(require_results, enable_target_feature="sse2") +_mm_cvtepi32_ps :: #force_inline proc "c" (a: __m128i) -> __m128 { + return cvtdq2ps(transmute(i32x4)a) +} +@(require_results, enable_target_feature="sse2") +_mm_cvtps_epi32 :: #force_inline proc "c" (a: __m128) -> __m128i { + return transmute(__m128i)cvtps2dq(a) +} +@(require_results, enable_target_feature="sse2") +_mm_cvtsi32_si128 :: #force_inline proc "c" (a: i32) -> __m128i { + return transmute(__m128i)i32x4{a, 0, 0, 0} +} +@(require_results, enable_target_feature="sse2") +_mm_cvtsi128_si32 :: #force_inline proc "c" (a: __m128i) -> i32 { + return simd.extract(transmute(i32x4)a, 0) +} + + + +@(require_results, enable_target_feature="sse2") +_mm_set_epi64x :: #force_inline proc "c" (e1, e0: i64) -> __m128i { + return transmute(__m128i)i64x2{e0, e1} +} +@(require_results, enable_target_feature="sse2") +_mm_set_epi32 :: #force_inline proc "c" (e3, e2, e1, e0: i32) -> __m128i { + return transmute(__m128i)i32x4{e0, e1, e2, e3} +} +@(require_results, enable_target_feature="sse2") +_mm_set_epi16 :: #force_inline proc "c" (e7, e6, e5, e4, e3, e2, e1, e0: i16) -> __m128i { + return transmute(__m128i)i16x8{e0, e1, e2, e3, e4, e5, e6, e7} +} +@(require_results, enable_target_feature="sse2") +_mm_set_epi8 :: #force_inline proc "c" (e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0: i8) -> __m128i { + return transmute(__m128i)i8x16{e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15} +} +@(require_results, enable_target_feature="sse2") +_mm_set1_epi64x :: #force_inline proc "c" (a: i64) -> __m128i { + return _mm_set_epi64x(a, a) +} +@(require_results, enable_target_feature="sse2") +_mm_set1_epi32 :: #force_inline proc "c" (a: i32) -> __m128i { + return _mm_set_epi32(a, a, a, a) +} +@(require_results, enable_target_feature="sse2") +_mm_set1_epi16 :: #force_inline proc "c" (a: i16) -> __m128i { + return _mm_set_epi16(a, a, a, a, a, a, a, a) +} +@(require_results, enable_target_feature="sse2") +_mm_set1_epi8 :: #force_inline proc "c" (a: i8) -> __m128i { + return _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} +@(require_results, enable_target_feature="sse2") +_mm_setr_epi32 :: #force_inline proc "c" (e3, e2, e1, e0: i32) -> __m128i { + return _mm_set_epi32(e0, e1, e2, e3) +} +@(require_results, enable_target_feature="sse2") +_mm_setr_epi16 :: #force_inline proc "c" (e7, e6, e5, e4, e3, e2, e1, e0: i16) -> __m128i { + return _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7) +} +@(require_results, enable_target_feature="sse2") +_mm_setr_epi8 :: #force_inline proc "c" (e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0: i8) -> __m128i { + return _mm_set_epi8(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15) +} +@(require_results, enable_target_feature="sse2") +_mm_setzero_si128 :: #force_inline proc "c" () -> __m128i { + return _mm_set1_epi64x(0) +} + + +@(require_results, enable_target_feature="sse2") +_mm_loadl_epi64 :: #force_inline proc "c" (mem_addr: ^__m128i) -> __m128i { + return _mm_set_epi64x(0, intrinsics.unaligned_load((^i64)(mem_addr))) +} +@(require_results, enable_target_feature="sse2") +_mm_load_si128 :: #force_inline proc "c" (mem_addr: ^__m128i) -> __m128i { + return mem_addr^ +} +@(require_results, 
enable_target_feature="sse2") +_mm_loadu_si128 :: #force_inline proc "c" (mem_addr: ^__m128i) -> __m128i { + dst := _mm_undefined_si128() + intrinsics.mem_copy_non_overlapping(&dst, mem_addr, size_of(__m128i)) + return dst +} +@(enable_target_feature="sse2") +_mm_maskmoveu_si128 :: #force_inline proc "c" (a, mask: __m128i, mem_addr: rawptr) { + maskmovdqu(transmute(i8x16)a, transmute(i8x16)mask, mem_addr) +} +@(enable_target_feature="sse2") +_mm_store_si128 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) { + mem_addr^ = a +} +@(enable_target_feature="sse2") +_mm_storeu_si128 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) { + storeudq(mem_addr, a) +} +@(enable_target_feature="sse2") +_mm_storel_epi64 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) { + a := a + intrinsics.mem_copy_non_overlapping(mem_addr, &a, 8) +} +@(enable_target_feature="sse2") +_mm_stream_si128 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) { + intrinsics.non_temporal_store(mem_addr, a) +} +@(enable_target_feature="sse2") +_mm_stream_si32 :: #force_inline proc "c" (mem_addr: ^i32, a: i32) { + intrinsics.non_temporal_store(mem_addr, a) +} +@(require_results, enable_target_feature="sse2") +_mm_move_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i { + zero := _mm_setzero_si128() + return transmute(__m128i)simd.shuffle(transmute(i64x2)a, transmute(i64x2)zero, 0, 2) +} + + + + +@(require_results, enable_target_feature="sse2") +_mm_packs_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)packsswb(transmute(i16x8)a, transmute(i16x8)b) +} +@(require_results, enable_target_feature="sse2") +_mm_packs_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)packssdw(transmute(i32x4)a, transmute(i32x4)b) +} +@(require_results, enable_target_feature="sse2") +_mm_packus_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)packuswb(transmute(i16x8)a, transmute(i16x8)b) +} +@(require_results, enable_target_feature="sse2") +_mm_extract_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> i32 { + return i32(simd.extract(transmute(u16x8)a, IMM8)) +} +@(require_results, enable_target_feature="sse2") +_mm_insert_epi16 :: #force_inline proc "c" (a: __m128i, i: i32, $IMM8: u32) -> __m128i { + return i32(simd.replace(transmute(u16x8)a, IMM8, i16(i))) +} +@(require_results, enable_target_feature="sse2") +_mm_movemask_epi8 :: #force_inline proc "c" (a: __m128i) -> i32 { + return pmovmskb(transmute(i8x16)a) +} +@(require_results, enable_target_feature="sse2") +_mm_shuffle_epi32 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { + v := transmute(i32x4)a + return transmute(__m128i)simd.shuffle( + v, + v, + IMM8 & 0b11, + (IMM8 >> 2) & 0b11, + (IMM8 >> 4) & 0b11, + (IMM8 >> 6) & 0b11, + ) +} +@(require_results, enable_target_feature="sse2") +_mm_shufflehi_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { + v := transmute(i16x8)a + return transmute(__m128i)simd.shuffle( + v, + v, + 0, + 1, + 2, + 3, + (IMM8 & 0b11) + 4, + ((IMM8 >> 2) & 0b11) + 4, + ((IMM8 >> 4) & 0b11) + 4, + ((IMM8 >> 6) & 0b11) + 4, + ) +} +@(require_results, enable_target_feature="sse2") +_mm_shufflelo_epi16 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i { + v := transmute(i16x8)a + return transmute(__m128i)simd.shuffle( + v, + v, + IMM8 & 0b11, + (IMM8 >> 2) & 0b11, + (IMM8 >> 4) & 0b11, + (IMM8 >> 6) & 0b11, + 4, + 5, + 6, + 7, + ) +} +@(require_results, 
enable_target_feature="sse2") +_mm_unpackhi_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.shuffle( + transmute(i8x16)a, + transmute(i8x16)b, + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, + ) +} +@(require_results, enable_target_feature="sse2") +_mm_unpackhi_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.shuffle(transmute(i16x8)a, transmute(i16x8)b, 4, 12, 5, 13, 6, 14, 7, 15) +} +@(require_results, enable_target_feature="sse2") +_mm_unpackhi_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.shuffle(transmute(i32x4)a, transmute(i32x4)b, 2, 6, 3, 7) +} +@(require_results, enable_target_feature="sse2") +_mm_unpackhi_epi64 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.shuffle(transmute(i64x2)a, transmute(i64x2)b, 1, 3) +} +@(require_results, enable_target_feature="sse2") +_mm_unpacklo_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.shuffle( + transmute(i8x16)a, + transmute(i8x16)b, + 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, + ) +} +@(require_results, enable_target_feature="sse2") +_mm_unpacklo_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.shuffle(transmute(i16x8)a, transmute(i16x8)b, 0, 8, 1, 9, 2, 10, 3, 11) +} +@(require_results, enable_target_feature="sse2") +_mm_unpacklo_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.shuffle(transmute(i32x4)a, transmute(i32x4)b, 0, 4, 1, 5) +} +@(require_results, enable_target_feature="sse2") +_mm_unpacklo_epi64 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.shuffle(transmute(i64x2)a, transmute(i64x2)b, 0, 2) +} + + + + +@(require_results, enable_target_feature="sse2") +_mm_add_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return simd.replace(a, 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b)) +} +@(require_results, enable_target_feature="sse2") +_mm_add_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return simd.add(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_div_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return simd.replace(a, 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b)) +} +@(require_results, enable_target_feature="sse2") +_mm_div_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return simd.div(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_max_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return maxsd(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_max_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return maxpd(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_min_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return minsd(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_min_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return minpd(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_mul_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return simd.replace(a, 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b)) +} +@(require_results, enable_target_feature="sse2") +_mm_mul_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return simd.mul(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_sqrt_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return 
simd.replace(a, 0, _mm_cvtsd_f64(sqrtsd(b))) +} +@(require_results, enable_target_feature="sse2") +_mm_sqrt_pd :: #force_inline proc "c" (a: __m128d) -> __m128d { + return simd.sqrt(a) +} +@(require_results, enable_target_feature="sse2") +_mm_sub_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return simd.replace(a, 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b)) +} +@(require_results, enable_target_feature="sse2") +_mm_sub_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return simd.sub(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_and_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return transmute(__m128d)_mm_and_si128(transmute(__m128i)a, transmute(__m128i)b) +} +@(require_results, enable_target_feature="sse2") +_mm_andnot_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return transmute(__m128d)_mm_andnot_si128(transmute(__m128i)a, transmute(__m128i)b) +} +@(require_results, enable_target_feature="sse2") +_mm_or_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return transmute(__m128d)_mm_or_si128(transmute(__m128i)a, transmute(__m128i)b) +} +@(require_results, enable_target_feature="sse2") +_mm_xor_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return transmute(__m128d)_mm_xor_si128(transmute(__m128i)a, transmute(__m128i)b) +} + + + + +@(require_results, enable_target_feature="sse2") +_mm_cmpeq_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return cmpsd(a, b, 0) +} +@(require_results, enable_target_feature="sse2") +_mm_cmplt_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return cmpsd(a, b, 1) +} +@(require_results, enable_target_feature="sse2") +_mm_cmple_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return cmpsd(a, b, 2) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpgt_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return simd.replace(_mm_cmplt_sd(b, a), 1, simd.extract(a, 1)) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpge_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return simd.replace(_mm_cmple_sd(b, a), 1, simd.extract(a, 1)) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpord_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return cmpsd(a, b, 7) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpunord_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return cmpsd(a, b, 3) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpneq_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return cmpsd(a, b, 4) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpnlt_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return cmpsd(a, b, 5) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpnle_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return cmpsd(a, b, 6) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpngt_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return simd.replace(_mm_cmpnlt_sd(b, a), 1, simd.extract(a, 1)) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpnge_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return simd.replace(_mm_cmpnle_sd(b, a), 1, simd.extract(a, 1)) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpeq_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return cmppd(a, b, 0) +} +@(require_results, enable_target_feature="sse2") +_mm_cmplt_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return cmppd(a, b, 
1) +} +@(require_results, enable_target_feature="sse2") +_mm_cmple_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return cmppd(a, b, 2) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpgt_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return _mm_cmplt_pd(b, a) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpge_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return _mm_cmple_pd(b, a) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpord_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return cmppd(a, b, 7) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpunord_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return cmppd(a, b, 3) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpneq_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return cmppd(a, b, 4) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpnlt_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return cmppd(a, b, 5) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpnle_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return cmppd(a, b, 6) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpngt_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return _mm_cmpnlt_pd(b, a) +} +@(require_results, enable_target_feature="sse2") +_mm_cmpnge_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return _mm_cmpnle_pd(b, a) +} +@(require_results, enable_target_feature="sse2") +_mm_comieq_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { + return comieqsd(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_comilt_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { + return comiltsd(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_comile_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { + return comilesd(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_comigt_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { + return comigtsd(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_comige_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { + return comigesd(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_comineq_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { + return comineqsd(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_ucomieq_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { + return ucomieqsd(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_ucomilt_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { + return ucomiltsd(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_ucomile_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { + return ucomilesd(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_ucomigt_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { + return ucomigtsd(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_ucomige_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { + return ucomigesd(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_ucomineq_sd :: #force_inline proc "c" (a, b: __m128d) -> i32 { + return ucomineqsd(a, b) +} + + + + + +@(require_results, enable_target_feature="sse2") +_mm_cvtpd_ps :: #force_inline proc "c" (a: __m128d) -> __m128 { + return cvtpd2ps(a) +} +@(require_results, enable_target_feature="sse2") +_mm_cvtps_pd :: #force_inline proc "c" (a: __m128) -> __m128d { + return cvtps2pd(a) +} +@(require_results, 
enable_target_feature="sse2") +_mm_cvtpd_epi32 :: #force_inline proc "c" (a: __m128d) -> __m128i { + return transmute(__m128i)cvtpd2dq(a) +} +@(require_results, enable_target_feature="sse2") +_mm_cvtsd_si32 :: #force_inline proc "c" (a: __m128d) -> i32 { + return cvtsd2si(a) +} +@(require_results, enable_target_feature="sse2") +_mm_cvtsd_ss :: #force_inline proc "c" (a, b: __m128d) -> __m128 { + return cvtsd2ss(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_cvtsd_f64 :: #force_inline proc "c" (a: __m128d) -> f64 { + return simd.extract(a, 0) +} +@(require_results, enable_target_feature="sse2") +_mm_cvtss_sd :: #force_inline proc "c" (a, b: __m128) -> __m128d { + return cvtss2sd(a, b) +} +@(require_results, enable_target_feature="sse2") +_mm_cvttpd_epi32 :: #force_inline proc "c" (a: __m128d) -> __m128i { + return transmute(__m128i)cvttpd2dq(a) +} +@(require_results, enable_target_feature="sse2") +_mm_cvttsd_si32 :: #force_inline proc "c" (a: __m128d) -> i32 { + return cvttsd2si(a) +} +@(require_results, enable_target_feature="sse2") +_mm_cvttps_epi32 :: #force_inline proc "c" (a: __m128) -> __m128i { + return transmute(__m128i)cvttps2dq(a) +} +@(require_results, enable_target_feature="sse2") +_mm_set_sd :: #force_inline proc "c" (a: f64) -> __m128d { + return _mm_set_pd(0.0, a) +} +@(require_results, enable_target_feature="sse2") +_mm_set1_pd :: #force_inline proc "c" (a: f64) -> __m128d { + return _mm_set_pd(a, a) +} +@(require_results, enable_target_feature="sse2") +_mm_set_pd1 :: #force_inline proc "c" (a: f64) -> __m128d { + return _mm_set_pd(a, a) +} +@(require_results, enable_target_feature="sse2") +_mm_set_pd :: #force_inline proc "c" (a: f64, b: f64) -> __m128d { + return __m128d{b, a} +} +@(require_results, enable_target_feature="sse2") +_mm_setr_pd :: #force_inline proc "c" (a: f64, b: f64) -> __m128d { + return _mm_set_pd(b, a) +} +@(require_results, enable_target_feature="sse2") +_mm_setzero_pd :: #force_inline proc "c" () -> __m128d { + return _mm_set_pd(0.0, 0.0) +} +@(require_results, enable_target_feature="sse2") +_mm_movemask_pd :: #force_inline proc "c" (a: __m128d) -> i32 { + return movmskpd(a) +} +@(require_results, enable_target_feature="sse2") +_mm_load_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d { + return (^__m128d)(mem_addr)^ +} +@(require_results, enable_target_feature="sse2") +_mm_load_sd :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d { + return _mm_setr_pd(mem_addr^, 0.) 
+} +@(require_results, enable_target_feature="sse2") +_mm_loadh_pd :: #force_inline proc "c" (a: __m128d, mem_addr: ^f64) -> __m128d { + return _mm_setr_pd(simd.extract(a, 0), mem_addr^) +} +@(require_results, enable_target_feature="sse2") +_mm_loadl_pd :: #force_inline proc "c" (a: __m128d, mem_addr: ^f64) -> __m128d { + return _mm_setr_pd(mem_addr^, simd.extract(a, 1)) +} +@(enable_target_feature="sse2") +_mm_stream_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) { + intrinsics.non_temporal_store((^__m128d)(mem_addr), a) +} +@(enable_target_feature="sse2") +_mm_store_sd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) { + mem_addr^ = simd.extract(a, 0) +} +@(enable_target_feature="sse2") +_mm_store_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) { + (^__m128d)(mem_addr)^ = a +} +@(enable_target_feature="sse2") +_mm_storeu_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) { + storeupd(mem_addr, a) +} +@(enable_target_feature="sse2") +_mm_store1_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) { + (^__m128d)(mem_addr)^ = simd.shuffle(a, a, 0, 0) +} +@(enable_target_feature="sse2") +_mm_store_pd1 :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) { + (^__m128d)(mem_addr)^ = simd.shuffle(a, a, 0, 0) +} +@(enable_target_feature="sse2") +_mm_storer_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) { + (^__m128d)(mem_addr)^ = simd.shuffle(a, a, 1, 0) +} +@(enable_target_feature="sse2") +_mm_storeh_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) { + mem_addr^ = simd.extract(a, 1) +} +@(enable_target_feature="sse2") +_mm_storel_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m128d) { + mem_addr^ = simd.extract(a, 0) +} +@(require_results, enable_target_feature="sse2") +_mm_load1_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d { + d := mem_addr^ + return _mm_setr_pd(d, d) +} +@(require_results, enable_target_feature="sse2") +_mm_load_pd1 :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d { + return _mm_load1_pd(mem_addr) +} +@(require_results, enable_target_feature="sse2") +_mm_loadr_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d { + a := _mm_load_pd(mem_addr) + return simd.shuffle(a, a, 1, 0) +} +@(require_results, enable_target_feature="sse2") +_mm_loadu_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m128d { + dst := _mm_undefined_pd() + intrinsics.mem_copy_non_overlapping(&dst, mem_addr, size_of(__m128d)) + return dst +} +@(require_results, enable_target_feature="sse2") +_mm_shuffle_pd :: #force_inline proc "c" (a, b: __m128d, $MASK: u32) -> __m128d { + return simd.shuffle(a, b, MASK&0b1, ((MASK>>1)&0b1) + 2) +} +@(require_results, enable_target_feature="sse2") +_mm_move_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return _mm_setr_pd(simd.extract(b, 0), simd.extract(a, 1)) +} + + + + +@(require_results, enable_target_feature="sse2") +_mm_castpd_ps :: #force_inline proc "c" (a: __m128d) -> __m128 { + return transmute(__m128)a +} +@(require_results, enable_target_feature="sse2") +_mm_castpd_si128 :: #force_inline proc "c" (a: __m128d) -> __m128i { + return transmute(__m128i)a +} +@(require_results, enable_target_feature="sse2") +_mm_castps_pd :: #force_inline proc "c" (a: __m128) -> __m128d { + return transmute(__m128d)a +} +@(require_results, enable_target_feature="sse2") +_mm_castps_si128 :: #force_inline proc "c" (a: __m128) -> __m128i { + return transmute(__m128i)a +} +@(require_results, enable_target_feature="sse2") +_mm_castsi128_pd :: #force_inline proc "c" (a: __m128i) -> __m128d { 
+ return transmute(__m128d)a +} +@(require_results, enable_target_feature="sse2") +_mm_castsi128_ps :: #force_inline proc "c" (a: __m128i) -> __m128 { + return transmute(__m128)a +} + + +@(require_results, enable_target_feature="sse2") +_mm_undefined_pd :: #force_inline proc "c" () -> __m128d { + return __m128d{0, 0} +} +@(require_results, enable_target_feature="sse2") +_mm_undefined_si128 :: #force_inline proc "c" () -> __m128i { + return __m128i{0, 0} +} +@(require_results, enable_target_feature="sse2") +_mm_unpackhi_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return simd.shuffle(a, b, 1, 3) +} +@(require_results, enable_target_feature="sse2") +_mm_unpacklo_pd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return simd.shuffle(a, b, 0, 2) +} + + +when ODIN_ARCH == .amd64 { + @(require_results, enable_target_feature="sse2") + _mm_cvtsd_si64 :: #force_inline proc "c" (a: __m128d) -> i64 { + return cvtsd2si64(a) + } + @(require_results, enable_target_feature="sse2") + _mm_cvtsd_si64x :: #force_inline proc "c" (a: __m128d) -> i64 { + return _mm_cvtsd_si64(a) + } + @(require_results, enable_target_feature="sse2") + _mm_cvttsd_si64 :: #force_inline proc "c" (a: __m128d) -> i64 { + return cvttsd2si64(a) + } + @(require_results, enable_target_feature="sse2") + _mm_cvttsd_si64x :: #force_inline proc "c" (a: __m128d) -> i64 { + return _mm_cvttsd_si64(a) + } + @(enable_target_feature="sse2") + _mm_stream_si64 :: #force_inline proc "c" (mem_addr: ^i64, a: i64) { + intrinsics.non_temporal_store(mem_addr, a) + } + @(require_results, enable_target_feature="sse2") + _mm_cvtsi64_si128 :: #force_inline proc "c" (a: i64) -> __m128i { + return _mm_set_epi64x(0, a) + } + @(require_results, enable_target_feature="sse2") + _mm_cvtsi64x_si128 :: #force_inline proc "c" (a: i64) -> __m128i { + return _mm_cvtsi64_si128(a) + } + @(require_results, enable_target_feature="sse2") + _mm_cvtsi128_si64 :: #force_inline proc "c" (a: __m128i) -> i64 { + return simd.extract(transmute(i64x2)a, 0) + } + @(require_results, enable_target_feature="sse2") + _mm_cvtsi128_si64x :: #force_inline proc "c" (a: __m128i) -> i64 { + return _mm_cvtsi128_si64(a) + } + @(require_results, enable_target_feature="sse2") + _mm_cvtsi64_sd :: #force_inline proc "c" (a: __m128d, b: i64) -> __m128d { + return simd.replace(a, 0, f64(b)) + } + @(require_results, enable_target_feature="sse2") + _mm_cvtsi64x_sd :: #force_inline proc "c" (a: __m128d, b: i64) -> __m128d { + return _mm_cvtsi64_sd(a, b) + } +} + + +@(private, default_calling_convention="c") +foreign _ { + @(link_name="llvm.x86.sse2.pause") + pause :: proc() --- + @(link_name="llvm.x86.sse2.clflush") + clflush :: proc(p: rawptr) --- + @(link_name="llvm.x86.sse2.lfence") + lfence :: proc() --- + @(link_name="llvm.x86.sse2.mfence") + mfence :: proc() --- + @(link_name="llvm.x86.sse2.pavg.b") + pavgb :: proc(a, b: u8x16) -> u8x16 --- + @(link_name="llvm.x86.sse2.pavg.w") + pavgw :: proc(a, b: u16x8) -> u16x8 --- + @(link_name="llvm.x86.sse2.pmadd.wd") + pmaddwd :: proc(a, b: i16x8) -> i32x4 --- + @(link_name="llvm.x86.sse2.pmaxs.w") + pmaxsw :: proc(a, b: i16x8) -> i16x8 --- + @(link_name="llvm.x86.sse2.pmaxu.b") + pmaxub :: proc(a, b: u8x16) -> u8x16 --- + @(link_name="llvm.x86.sse2.pmins.w") + pminsw :: proc(a, b: i16x8) -> i16x8 --- + @(link_name="llvm.x86.sse2.pminu.b") + pminub :: proc(a, b: u8x16) -> u8x16 --- + @(link_name="llvm.x86.sse2.pmulh.w") + pmulhw :: proc(a, b: i16x8) -> i16x8 --- + @(link_name="llvm.x86.sse2.pmulhu.w") + pmulhuw :: proc(a, b: u16x8) -> 
u16x8 --- + @(link_name="llvm.x86.sse2.pmulu.dq") + pmuludq :: proc(a, b: u32x4) -> u64x2 --- + @(link_name="llvm.x86.sse2.psad.bw") + psadbw :: proc(a, b: u8x16) -> u64x2 --- + @(link_name="llvm.x86.sse2.pslli.w") + pslliw :: proc(a: i16x8, #const imm8: u32) -> i16x8 --- + @(link_name="llvm.x86.sse2.psll.w") + psllw :: proc(a: i16x8, count: i16x8) -> i16x8 --- + @(link_name="llvm.x86.sse2.pslli.d") + psllid :: proc(a: i32x4, #const imm8: u32) -> i32x4 --- + @(link_name="llvm.x86.sse2.psll.d") + pslld :: proc(a: i32x4, count: i32x4) -> i32x4 --- + @(link_name="llvm.x86.sse2.pslli.q") + pslliq :: proc(a: i64x2, #const imm8: u32) -> i64x2 --- + @(link_name="llvm.x86.sse2.psll.q") + psllq :: proc(a: i64x2, count: i64x2) -> i64x2 --- + @(link_name="llvm.x86.sse2.psrai.w") + psraiw :: proc(a: i16x8, #const imm8: u32) -> i16x8 --- + @(link_name="llvm.x86.sse2.psra.w") + psraw :: proc(a: i16x8, count: i16x8) -> i16x8 --- + @(link_name="llvm.x86.sse2.psrai.d") + psraid :: proc(a: i32x4, #const imm8: u32) -> i32x4 --- + @(link_name="llvm.x86.sse2.psra.d") + psrad :: proc(a: i32x4, count: i32x4) -> i32x4 --- + @(link_name="llvm.x86.sse2.psrli.w") + psrliw :: proc(a: i16x8, #const imm8: u32) -> i16x8 --- + @(link_name="llvm.x86.sse2.psrl.w") + psrlw :: proc(a: i16x8, count: i16x8) -> i16x8 --- + @(link_name="llvm.x86.sse2.psrli.d") + psrlid :: proc(a: i32x4, #const imm8: u32) -> i32x4 --- + @(link_name="llvm.x86.sse2.psrl.d") + psrld :: proc(a: i32x4, count: i32x4) -> i32x4 --- + @(link_name="llvm.x86.sse2.psrli.q") + psrliq :: proc(a: i64x2, #const imm8: u32) -> i64x2 --- + @(link_name="llvm.x86.sse2.psrl.q") + psrlq :: proc(a: i64x2, count: i64x2) -> i64x2 --- + @(link_name="llvm.x86.sse2.cvtdq2ps") + cvtdq2ps :: proc(a: i32x4) -> __m128 --- + @(link_name="llvm.x86.sse2.cvtps2dq") + cvtps2dq :: proc(a: __m128) -> i32x4 --- + @(link_name="llvm.x86.sse2.maskmov.dqu") + maskmovdqu :: proc(a: i8x16, mask: i8x16, mem_addr: rawptr) --- + @(link_name="llvm.x86.sse2.packsswb.128") + packsswb :: proc(a, b: i16x8) -> i8x16 --- + @(link_name="llvm.x86.sse2.packssdw.128") + packssdw :: proc(a, b: i32x4) -> i16x8 --- + @(link_name="llvm.x86.sse2.packuswb.128") + packuswb :: proc(a, b: i16x8) -> u8x16 --- + @(link_name="llvm.x86.sse2.pmovmskb.128") + pmovmskb :: proc(a: i8x16) -> i32 --- + @(link_name="llvm.x86.sse2.max.sd") + maxsd :: proc(a, b: __m128d) -> __m128d --- + @(link_name="llvm.x86.sse2.max.pd") + maxpd :: proc(a, b: __m128d) -> __m128d --- + @(link_name="llvm.x86.sse2.min.sd") + minsd :: proc(a, b: __m128d) -> __m128d --- + @(link_name="llvm.x86.sse2.min.pd") + minpd :: proc(a, b: __m128d) -> __m128d --- + @(link_name="llvm.x86.sse2.sqrt.sd") + sqrtsd :: proc(a: __m128d) -> __m128d --- + @(link_name="llvm.x86.sse2.sqrt.pd") + sqrtpd :: proc(a: __m128d) -> __m128d --- + @(link_name="llvm.x86.sse2.cmp.sd") + cmpsd :: proc(a, b: __m128d, imm8: i8) -> __m128d --- + @(link_name="llvm.x86.sse2.cmp.pd") + cmppd :: proc(a, b: __m128d, imm8: i8) -> __m128d --- + @(link_name="llvm.x86.sse2.comieq.sd") + comieqsd :: proc(a, b: __m128d) -> i32 --- + @(link_name="llvm.x86.sse2.comilt.sd") + comiltsd :: proc(a, b: __m128d) -> i32 --- + @(link_name="llvm.x86.sse2.comile.sd") + comilesd :: proc(a, b: __m128d) -> i32 --- + @(link_name="llvm.x86.sse2.comigt.sd") + comigtsd :: proc(a, b: __m128d) -> i32 --- + @(link_name="llvm.x86.sse2.comige.sd") + comigesd :: proc(a, b: __m128d) -> i32 --- + @(link_name="llvm.x86.sse2.comineq.sd") + comineqsd :: proc(a, b: __m128d) -> i32 --- + 
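+	// Note: the comi* comparisons above raise the invalid-operation
+	// exception for QNaN operands, whereas the ucomi* variants below
+	// are quiet and signal only on SNaN (COMISD vs. UCOMISD semantics).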
@(link_name="llvm.x86.sse2.ucomieq.sd") + ucomieqsd :: proc(a, b: __m128d) -> i32 --- + @(link_name="llvm.x86.sse2.ucomilt.sd") + ucomiltsd :: proc(a, b: __m128d) -> i32 --- + @(link_name="llvm.x86.sse2.ucomile.sd") + ucomilesd :: proc(a, b: __m128d) -> i32 --- + @(link_name="llvm.x86.sse2.ucomigt.sd") + ucomigtsd :: proc(a, b: __m128d) -> i32 --- + @(link_name="llvm.x86.sse2.ucomige.sd") + ucomigesd :: proc(a, b: __m128d) -> i32 --- + @(link_name="llvm.x86.sse2.ucomineq.sd") + ucomineqsd :: proc(a, b: __m128d) -> i32 --- + @(link_name="llvm.x86.sse2.movmsk.pd") + movmskpd :: proc(a: __m128d) -> i32 --- + @(link_name="llvm.x86.sse2.cvtpd2ps") + cvtpd2ps :: proc(a: __m128d) -> __m128 --- + @(link_name="llvm.x86.sse2.cvtps2pd") + cvtps2pd :: proc(a: __m128) -> __m128d --- + @(link_name="llvm.x86.sse2.cvtpd2dq") + cvtpd2dq :: proc(a: __m128d) -> i32x4 --- + @(link_name="llvm.x86.sse2.cvtsd2si") + cvtsd2si :: proc(a: __m128d) -> i32 --- + @(link_name="llvm.x86.sse2.cvtsd2ss") + cvtsd2ss :: proc(a, b: __m128d) -> __m128 --- + @(link_name="llvm.x86.sse2.cvtss2sd") + cvtss2sd :: proc(a, b: __m128) -> __m128d --- + @(link_name="llvm.x86.sse2.cvttpd2dq") + cvttpd2dq :: proc(a: __m128d) -> i32x4 --- + @(link_name="llvm.x86.sse2.cvttsd2si") + cvttsd2si :: proc(a: __m128d) -> i32 --- + @(link_name="llvm.x86.sse2.cvttps2dq") + cvttps2dq :: proc(a: __m128) -> i32x4 --- + @(link_name="llvm.x86.sse2.storeu.dq") + storeudq :: proc(mem_addr: rawptr, a: __m128i) --- + @(link_name="llvm.x86.sse2.storeu.pd") + storeupd :: proc(mem_addr: rawptr, a: __m128d) --- + + // amd64 only + @(link_name="llvm.x86.sse2.cvtsd2si64") + cvtsd2si64 :: proc(a: __m128d) -> i64 --- + @(link_name="llvm.x86.sse2.cvttsd2si64") + cvttsd2si64 :: proc(a: __m128d) -> i64 --- +} diff --git a/core/simd/x86/sse3.odin b/core/simd/x86/sse3.odin new file mode 100644 index 000000000..7a3073c18 --- /dev/null +++ b/core/simd/x86/sse3.odin @@ -0,0 +1,68 @@ +//+build i386, amd64 +package simd_x86 + +import "core:intrinsics" +import "core:simd" + +@(require_results, enable_target_feature="sse3") +_mm_addsub_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return addsubps(a, b) +} +@(require_results, enable_target_feature="sse3") +_mm_addsub_pd :: #force_inline proc "c" (a: __m128d, b: __m128d) -> __m128d { + return addsubpd(a, b) +} +@(require_results, enable_target_feature="sse3") +_mm_hadd_pd :: #force_inline proc "c" (a: __m128d, b: __m128d) -> __m128d { + return haddpd(a, b) +} +@(require_results, enable_target_feature="sse3") +_mm_hadd_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return haddps(a, b) +} +@(require_results, enable_target_feature="sse3") +_mm_hsub_pd :: #force_inline proc "c" (a: __m128d, b: __m128d) -> __m128d { + return hsubpd(a, b) +} +@(require_results, enable_target_feature="sse3") +_mm_hsub_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return hsubps(a, b) +} +@(require_results, enable_target_feature="sse3") +_mm_lddqu_si128 :: #force_inline proc "c" (mem_addr: ^__m128i) -> __m128i { + return transmute(__m128i)lddqu(mem_addr) +} +@(require_results, enable_target_feature="sse3") +_mm_movedup_pd :: #force_inline proc "c" (a: __m128d) -> __m128d { + return simd.shuffle(a, a, 0, 0) +} +@(require_results, enable_target_feature="sse3") +_mm_loaddup_pd :: #force_inline proc "c" (mem_addr: [^]f64) -> __m128d { + return _mm_load1_pd(mem_addr) +} +@(require_results, enable_target_feature="sse3") +_mm_movehdup_ps :: #force_inline proc "c" (a: __m128) -> __m128 { + return simd.shuffle(a, a, 1, 1, 3, 3) 
+}
+@(require_results, enable_target_feature="sse3")
+_mm_moveldup_ps :: #force_inline proc "c" (a: __m128) -> __m128 {
+	return simd.shuffle(a, a, 0, 0, 2, 2)
+}
+
+@(private, default_calling_convention="c")
+foreign _ {
+	@(link_name = "llvm.x86.sse3.addsub.ps")
+	addsubps :: proc(a, b: __m128) -> __m128 ---
+	@(link_name = "llvm.x86.sse3.addsub.pd")
+	addsubpd :: proc(a: __m128d, b: __m128d) -> __m128d ---
+	@(link_name = "llvm.x86.sse3.hadd.pd")
+	haddpd :: proc(a: __m128d, b: __m128d) -> __m128d ---
+	@(link_name = "llvm.x86.sse3.hadd.ps")
+	haddps :: proc(a, b: __m128) -> __m128 ---
+	@(link_name = "llvm.x86.sse3.hsub.pd")
+	hsubpd :: proc(a: __m128d, b: __m128d) -> __m128d ---
+	@(link_name = "llvm.x86.sse3.hsub.ps")
+	hsubps :: proc(a, b: __m128) -> __m128 ---
+	@(link_name = "llvm.x86.sse3.ldu.dq")
+	lddqu :: proc(mem_addr: rawptr) -> i8x16 ---
+}
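A short usage sketch for the horizontal double-precision ops above (illustrative only; the `sum2_f64` helper is a hypothetical name, not part of this patch):

package example

import x86 "core:simd/x86"

// Sums both f64 lanes of a __m128d with one SSE3 horizontal add.
sum2_f64 :: proc(v: x86.__m128d) -> f64 {
	h := x86._mm_hadd_pd(v, v)   // {v0+v1, v0+v1}
	return x86._mm_cvtsd_f64(h)  // extract lane 0
}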
\ No newline at end of file diff --git a/core/simd/x86/sse41.odin b/core/simd/x86/sse41.odin new file mode 100644 index 000000000..b35be33f2 --- /dev/null +++ b/core/simd/x86/sse41.odin @@ -0,0 +1,352 @@ +//+build i386, amd64 +package simd_x86 + +import "core:simd" + +// SSE4 rounding constants +_MM_FROUND_TO_NEAREST_INT :: 0x00 +_MM_FROUND_TO_NEG_INF :: 0x01 +_MM_FROUND_TO_POS_INF :: 0x02 +_MM_FROUND_TO_ZERO :: 0x03 +_MM_FROUND_CUR_DIRECTION :: 0x04 +_MM_FROUND_RAISE_EXC :: 0x00 +_MM_FROUND_NO_EXC :: 0x08 +_MM_FROUND_NINT :: 0x00 +_MM_FROUND_FLOOR :: _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF +_MM_FROUND_CEIL :: _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF +_MM_FROUND_TRUNC :: _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO +_MM_FROUND_RINT :: _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION +_MM_FROUND_NEARBYINT :: _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION + + + +@(require_results, enable_target_feature="sse4.1") +_mm_blendv_epi8 :: #force_inline proc "c" (a, b, mask: __m128i) -> __m128i { + return transmute(__m128i)pblendvb(transmute(i8x16)a, transmute(i8x16)b, transmute(i8x16)mask) +} +@(require_results, enable_target_feature="sse4.1") +_mm_blend_epi16 :: #force_inline proc "c" (a, b: __m128i, $IMM8: u8) -> __m128i { + return transmute(__m128i)pblendw(transmute(i16x8)a, transmute(i16x8)b, IMM8) +} +@(require_results, enable_target_feature="sse4.1") +_mm_blendv_pd :: #force_inline proc "c" (a, b, mask: __m128d) -> __m128d { + return blendvpd(a, b, mask) +} +@(require_results, enable_target_feature="sse4.1") +_mm_blendv_ps :: #force_inline proc "c" (a, b, mask: __m128) -> __m128 { + return blendvps(a, b, mask) +} +@(require_results, enable_target_feature="sse4.1") +_mm_blend_pd :: #force_inline proc "c" (a, b: __m128d, $IMM2: u8) -> __m128d { + return blendpd(a, b, IMM2) +} +@(require_results, enable_target_feature="sse4.1") +_mm_blend_ps :: #force_inline proc "c" (a, b: __m128, $IMM4: u8) -> __m128 { + return blendps(a, b, IMM4) +} +@(require_results, enable_target_feature="sse4.1") +_mm_extract_ps :: #force_inline proc "c" (a: __m128, $IMM8: u32) -> i32 { + return transmute(i32)simd.extract(a, IMM8) +} +@(require_results, enable_target_feature="sse4.1") +_mm_extract_epi8 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> i32 { + return i32(simd.extract(transmute(u8x16)a, IMM8)) +} +@(require_results, enable_target_feature="sse4.1") +_mm_extract_epi32 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> i32 { + return simd.extract(transmute(i32x4)a, IMM8) +} +@(require_results, enable_target_feature="sse4.1") +_mm_insert_ps :: #force_inline proc "c" (a, b: __m128, $IMM8: u8) -> __m128 { + return insertps(a, b, IMM8) +} +@(require_results, enable_target_feature="sse4.1") +_mm_insert_epi8 :: #force_inline proc "c" (a: __m128i, i: i32, $IMM8: u32) -> __m128i { + return transmute(__m128i)simd.replace(transmute(i8x16)a, IMM8, i8(i)) +} +@(require_results, enable_target_feature="sse4.1") +_mm_insert_epi32 :: #force_inline proc "c" (a: __m128i, i: i32, $IMM8: u32) -> __m128i { + return transmute(__m128i)simd.replace(transmute(i32x4)a, IMM8, i) +} +@(require_results, enable_target_feature="sse4.1") +_mm_max_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)pmaxsb(transmute(i8x16)a, transmute(i8x16)b) +} +@(require_results, enable_target_feature="sse4.1") +_mm_max_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)pmaxuw(transmute(u16x8)a, transmute(u16x8)b) +} +@(require_results, enable_target_feature="sse4.1") 
+_mm_max_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)pmaxsd(transmute(i32x4)a, transmute(i32x4)b) +} +@(require_results, enable_target_feature="sse4.1") +_mm_max_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)pmaxud(transmute(u32x4)a, transmute(u32x4)b) +} +@(require_results, enable_target_feature="sse4.1") +_mm_min_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)pminsb(transmute(i8x16)a, transmute(i8x16)b) +} +@(require_results, enable_target_feature="sse4.1") +_mm_min_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)pminuw(transmute(u16x8)a, transmute(u16x8)b) +} +@(require_results, enable_target_feature="sse4.1") +_mm_min_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)pminsd(transmute(i32x4)a, transmute(i32x4)b) +} +@(require_results, enable_target_feature="sse4.1") +_mm_min_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)pminud(transmute(u32x4)a, transmute(u32x4)b) +} +@(require_results, enable_target_feature="sse4.1") +_mm_packus_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)packusdw(transmute(i32x4)a, transmute(i32x4)b) +} +@(require_results, enable_target_feature="sse4.1") +_mm_cmpeq_epi64 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.lanes_eq(transmute(i64x2)a, transmute(i64x2)b) +} +@(require_results, enable_target_feature="sse4.1") +_mm_cvtepi8_epi16 :: #force_inline proc "c" (a: __m128i) -> __m128i { + x := transmute(i8x16)a + y := simd.shuffle(x, x, 0, 1, 2, 3, 4, 5, 6, 7) + return transmute(__m128i)i16x8(y) +} +@(require_results, enable_target_feature="sse4.1") +_mm_cvtepi8_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i { + x := transmute(i8x16)a + y := simd.shuffle(x, x, 0, 1, 2, 3) + return transmute(__m128i)i32x4(y) +} +@(require_results, enable_target_feature="sse4.1") +_mm_cvtepi8_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i { + x := transmute(i8x16)a + y := simd.shuffle(x, x, 0, 1) + return transmute(__m128i)i64x2(y) +} +@(require_results, enable_target_feature="sse4.1") +_mm_cvtepi16_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i { + x := transmute(i16x8)a + y := simd.shuffle(x, x, 0, 1, 2, 3) + return transmute(__m128i)i32x4(y) +} +@(require_results, enable_target_feature="sse4.1") +_mm_cvtepi16_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i { + x := transmute(i16x8)a + y := simd.shuffle(x, x, 0, 1) + return transmute(__m128i)i64x2(y) +} +@(require_results, enable_target_feature="sse4.1") +_mm_cvtepi32_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i { + x := transmute(i32x4)a + y := simd.shuffle(x, x, 0, 1) + return transmute(__m128i)i64x2(y) +} +@(require_results, enable_target_feature="sse4.1") +_mm_cvtepu8_epi16 :: #force_inline proc "c" (a: __m128i) -> __m128i { + x := transmute(u8x16)a + y := simd.shuffle(x, x, 0, 1, 2, 3, 4, 5, 6, 7) + return transmute(__m128i)i16x8(y) +} +@(require_results, enable_target_feature="sse4.1") +_mm_cvtepu8_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i { + x := transmute(u8x16)a + y := simd.shuffle(x, x, 0, 1, 2, 3) + return transmute(__m128i)i32x4(y) +} +@(require_results, enable_target_feature="sse4.1") +_mm_cvtepu8_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i { + x := transmute(u8x16)a + y := simd.shuffle(x, x, 0, 1) + return 
transmute(__m128i)i64x2(y) +} +@(require_results, enable_target_feature="sse4.1") +_mm_cvtepu16_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i { + x := transmute(u16x8)a + y := simd.shuffle(x, x, 0, 1, 2, 3) + return transmute(__m128i)i32x4(y) +} +@(require_results, enable_target_feature="sse4.1") +_mm_cvtepu16_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i { + x := transmute(u16x8)a + y := simd.shuffle(x, x, 0, 1) + return transmute(__m128i)i64x2(y) +} +@(require_results, enable_target_feature="sse4.1") +_mm_cvtepu32_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i { + x := transmute(u32x4)a + y := simd.shuffle(x, x, 0, 1) + return transmute(__m128i)i64x2(y) +} +@(require_results, enable_target_feature="sse4.1") +_mm_dp_pd :: #force_inline proc "c" (a, b: __m128d, $IMM8: u8) -> __m128d { + return dppd(a, b, IMM8) +} +@(require_results, enable_target_feature="sse4.1") +_mm_dp_ps :: #force_inline proc "c" (a, b: __m128, $IMM8: u8) -> __m128 { + return dpps(a, b, IMM8) +} +@(require_results, enable_target_feature="sse4.1") +_mm_floor_pd :: #force_inline proc "c" (a: __m128d) -> __m128d { + return simd.floor(a) +} +@(require_results, enable_target_feature="sse4.1") +_mm_floor_ps :: #force_inline proc "c" (a: __m128) -> __m128 { + return simd.floor(a) +} +@(require_results, enable_target_feature="sse4.1") +_mm_floor_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return roundsd(a, b, _MM_FROUND_FLOOR) +} +@(require_results, enable_target_feature="sse4.1") +_mm_floor_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return roundss(a, b, _MM_FROUND_FLOOR) +} +@(require_results, enable_target_feature="sse4.1") +_mm_ceil_pd :: #force_inline proc "c" (a: __m128d) -> __m128d { + return simd.ceil(a) +} +@(require_results, enable_target_feature="sse4.1") +_mm_ceil_ps :: #force_inline proc "c" (a: __m128) -> __m128 { + return simd.ceil(a) +} +@(require_results, enable_target_feature="sse4.1") +_mm_ceil_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d { + return roundsd(a, b, _MM_FROUND_CEIL) +} +@(require_results, enable_target_feature="sse4.1") +_mm_ceil_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 { + return roundss(a, b, _MM_FROUND_CEIL) +} +@(require_results, enable_target_feature="sse4.1") +_mm_round_pd :: #force_inline proc "c" (a: __m128d, $ROUNDING: i32) -> __m128d { + return roundpd(a, ROUNDING) +} +@(require_results, enable_target_feature="sse4.1") +_mm_round_ps :: #force_inline proc "c" (a: __m128, $ROUNDING: i32) -> __m128 { + return roundps(a, ROUNDING) +} +@(require_results, enable_target_feature="sse4.1") +_mm_round_sd :: #force_inline proc "c" (a, b: __m128d, $ROUNDING: i32) -> __m128d { + return roundsd(a, b, ROUNDING) +} +@(require_results, enable_target_feature="sse4.1") +_mm_round_ss :: #force_inline proc "c" (a, b: __m128, $ROUNDING: i32) -> __m128 { + return roundss(a, b, ROUNDING) +} +@(require_results, enable_target_feature="sse4.1") +_mm_minpos_epu16 :: #force_inline proc "c" (a: __m128i) -> __m128i { + return transmute(__m128i)phminposuw(transmute(u16x8)a) +} +@(require_results, enable_target_feature="sse4.1") +_mm_mul_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)pmuldq(transmute(i32x4)a, transmute(i32x4)b) +} +@(require_results, enable_target_feature="sse4.1") +_mm_mullo_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)simd.mul(transmute(i32x4)a, transmute(i32x4)b) +} +@(require_results, enable_target_feature="sse4.1") 
+_mm_mpsadbw_epu8 :: #force_inline proc "c" (a, b: __m128i, $IMM8: u8) -> __m128i { + return transmute(__m128i)mpsadbw(transmute(u8x16)a, transmute(u8x16)b, IMM8) +} +@(require_results, enable_target_feature="sse4.1") +_mm_testz_si128 :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 { + return ptestz(transmute(i64x2)a, transmute(i64x2)mask) +} +@(require_results, enable_target_feature="sse4.1") +_mm_testc_si128 :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 { + return ptestc(transmute(i64x2)a, transmute(i64x2)mask) +} +@(require_results, enable_target_feature="sse4.1") +_mm_testnzc_si128 :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 { + return ptestnzc(transmute(i64x2)a, transmute(i64x2)mask) +} +@(require_results, enable_target_feature="sse4.1") +_mm_test_all_zeros :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 { + return _mm_testz_si128(a, mask) +} +@(require_results, enable_target_feature="sse4.1") +_mm_test_all_ones :: #force_inline proc "c" (a: __m128i) -> i32 { + return _mm_testc_si128(a, _mm_cmpeq_epi32(a, a)) +} +@(require_results, enable_target_feature="sse4.1") +_mm_test_mix_ones_zeros :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 { + return _mm_testnzc_si128(a, mask) +} + + +when ODIN_ARCH == .amd64 { + @(require_results, enable_target_feature="sse4.1") + _mm_extract_epi64 :: #force_inline proc "c" (a: __m128i, $IMM1: u32) -> i64 { + return simd.extract(transmute(i64x2)a, IMM1) + } + + @(require_results, enable_target_feature="sse4.1") + _mm_insert_epi64 :: #force_inline proc "c" (a: __m128i, i: i64, $IMM1: u32) -> __m128i { + return transmute(__m128i)simd.replace(transmute(i64x2)a, IMM1, i) + } +} + + +@(private, default_calling_convention="c") +foreign _ { + @(link_name = "llvm.x86.sse41.pblendvb") + pblendvb :: proc(a, b: i8x16, mask: i8x16) -> i8x16 --- + @(link_name = "llvm.x86.sse41.blendvpd") + blendvpd :: proc(a, b, mask: __m128d) -> __m128d --- + @(link_name = "llvm.x86.sse41.blendvps") + blendvps :: proc(a, b, mask: __m128) -> __m128 --- + @(link_name = "llvm.x86.sse41.blendpd") + blendpd :: proc(a, b: __m128d, #const imm2: u8) -> __m128d --- + @(link_name = "llvm.x86.sse41.blendps") + blendps :: proc(a, b: __m128, #const imm4: u8) -> __m128 --- + @(link_name = "llvm.x86.sse41.pblendw") + pblendw :: proc(a: i16x8, b: i16x8, #const imm8: u8) -> i16x8 --- + @(link_name = "llvm.x86.sse41.insertps") + insertps :: proc(a, b: __m128, #const imm8: u8) -> __m128 --- + @(link_name = "llvm.x86.sse41.pmaxsb") + pmaxsb :: proc(a, b: i8x16) -> i8x16 --- + @(link_name = "llvm.x86.sse41.pmaxuw") + pmaxuw :: proc(a, b: u16x8) -> u16x8 --- + @(link_name = "llvm.x86.sse41.pmaxsd") + pmaxsd :: proc(a, b: i32x4) -> i32x4 --- + @(link_name = "llvm.x86.sse41.pmaxud") + pmaxud :: proc(a, b: u32x4) -> u32x4 --- + @(link_name = "llvm.x86.sse41.pminsb") + pminsb :: proc(a, b: i8x16) -> i8x16 --- + @(link_name = "llvm.x86.sse41.pminuw") + pminuw :: proc(a, b: u16x8) -> u16x8 --- + @(link_name = "llvm.x86.sse41.pminsd") + pminsd :: proc(a, b: i32x4) -> i32x4 --- + @(link_name = "llvm.x86.sse41.pminud") + pminud :: proc(a, b: u32x4) -> u32x4 --- + @(link_name = "llvm.x86.sse41.packusdw") + packusdw :: proc(a, b: i32x4) -> u16x8 --- + @(link_name = "llvm.x86.sse41.dppd") + dppd :: proc(a, b: __m128d, #const imm8: u8) -> __m128d --- + @(link_name = "llvm.x86.sse41.dpps") + dpps :: proc(a, b: __m128, #const imm8: u8) -> __m128 --- + @(link_name = "llvm.x86.sse41.round.pd") + roundpd :: proc(a: __m128d, rounding: i32) -> __m128d --- 
+ @(link_name = "llvm.x86.sse41.round.ps") + roundps :: proc(a: __m128, rounding: i32) -> __m128 --- + @(link_name = "llvm.x86.sse41.round.sd") + roundsd :: proc(a, b: __m128d, rounding: i32) -> __m128d --- + @(link_name = "llvm.x86.sse41.round.ss") + roundss :: proc(a, b: __m128, rounding: i32) -> __m128 --- + @(link_name = "llvm.x86.sse41.phminposuw") + phminposuw :: proc(a: u16x8) -> u16x8 --- + @(link_name = "llvm.x86.sse41.pmuldq") + pmuldq :: proc(a, b: i32x4) -> i64x2 --- + @(link_name = "llvm.x86.sse41.mpsadbw") + mpsadbw :: proc(a, b: u8x16, #const imm8: u8) -> u16x8 --- + @(link_name = "llvm.x86.sse41.ptestz") + ptestz :: proc(a, mask: i64x2) -> i32 --- + @(link_name = "llvm.x86.sse41.ptestc") + ptestc :: proc(a, mask: i64x2) -> i32 --- + @(link_name = "llvm.x86.sse41.ptestnzc") + ptestnzc :: proc(a, mask: i64x2) -> i32 --- +}
\ No newline at end of file diff --git a/core/simd/x86/sse42.odin b/core/simd/x86/sse42.odin new file mode 100644 index 000000000..62b4f0478 --- /dev/null +++ b/core/simd/x86/sse42.odin @@ -0,0 +1,149 @@ +//+build i386, amd64 +package simd_x86 + +import "core:simd" + +_SIDD_UBYTE_OPS :: 0b0000_0000 +_SIDD_UWORD_OPS :: 0b0000_0001 +_SIDD_SBYTE_OPS :: 0b0000_0010 +_SIDD_SWORD_OPS :: 0b0000_0011 + +_SIDD_CMP_EQUAL_ANY :: 0b0000_0000 +_SIDD_CMP_RANGES :: 0b0000_0100 +_SIDD_CMP_EQUAL_EACH :: 0b0000_1000 +_SIDD_CMP_EQUAL_ORDERED :: 0b0000_1100 + +_SIDD_POSITIVE_POLARITY :: 0b0000_0000 +_SIDD_NEGATIVE_POLARITY :: 0b0001_0000 +_SIDD_MASKED_POSITIVE_POLARITY :: 0b0010_0000 +_SIDD_MASKED_NEGATIVE_POLARITY :: 0b0011_0000 + +_SIDD_LEAST_SIGNIFICANT :: 0b0000_0000 +_SIDD_MOST_SIGNIFICANT :: 0b0100_0000 + +_SIDD_BIT_MASK :: 0b0000_0000 +_SIDD_UNIT_MASK :: 0b0100_0000 + +@(require_results, enable_target_feature="sse4.2") +_mm_cmpistrm :: #force_inline proc "c" (a: __m128i, b: __m128i, $IMM8: i8) -> __m128i { + return transmute(__m128i)pcmpistrm128(transmute(i8x16)a, transmute(i8x16)b, IMM8) +} +@(require_results, enable_target_feature="sse4.2") +_mm_cmpistri :: #force_inline proc "c" (a: __m128i, b: __m128i, $IMM8: i8) -> i32 { + return pcmpistri128(transmute(i8x16)a, transmute(i8x16)b, IMM8) +} +@(require_results, enable_target_feature="sse4.2") +_mm_cmpistrz :: #force_inline proc "c" (a: __m128i, b: __m128i, $IMM8: i8) -> i32 { + return pcmpistriz128(transmute(i8x16)a, transmute(i8x16)b, IMM8) +} +@(require_results, enable_target_feature="sse4.2") +_mm_cmpistrc :: #force_inline proc "c" (a: __m128i, b: __m128i, $IMM8: i8) -> i32 { + return pcmpistric128(transmute(i8x16)a, transmute(i8x16)b, IMM8) +} +@(require_results, enable_target_feature="sse4.2") +_mm_cmpistrs :: #force_inline proc "c" (a: __m128i, b: __m128i, $IMM8: i8) -> i32 { + return pcmpistris128(transmute(i8x16)a, transmute(i8x16)b, IMM8) +} +@(require_results, enable_target_feature="sse4.2") +_mm_cmpistro :: #force_inline proc "c" (a: __m128i, b: __m128i, $IMM8: i8) -> i32 { + return pcmpistrio128(transmute(i8x16)a, transmute(i8x16)b, IMM8) +} +@(require_results, enable_target_feature="sse4.2") +_mm_cmpistra :: #force_inline proc "c" (a: __m128i, b: __m128i, $IMM8: i8) -> i32 { + return pcmpistria128(transmute(i8x16)a, transmute(i8x16)b, IMM8) +} +@(require_results, enable_target_feature="sse4.2") +_mm_cmpestrm :: #force_inline proc "c" (a: __m128i, la: i32, b: __m128i, lb: i32, $IMM8: i8) -> __m128i { + return transmute(__m128i)pcmpestrm128(transmute(i8x16)a, la, transmute(i8x16)b, lb, IMM8) +} +@(require_results, enable_target_feature="sse4.2") +_mm_cmpestri :: #force_inline proc "c" (a: __m128i, la: i32, b: __m128i, lb: i32, $IMM8: i8) -> i32 { + return pcmpestri128(transmute(i8x16)a, la, transmute(i8x16)b, lb, IMM8) +} +@(require_results, enable_target_feature="sse4.2") +_mm_cmpestrz :: #force_inline proc "c" (a: __m128i, la: i32, b: __m128i, lb: i32, $IMM8: i8) -> i32 { + return pcmpestriz128(transmute(i8x16)a, la, transmute(i8x16)b, lb, IMM8) +} +@(require_results, enable_target_feature="sse4.2") +_mm_cmpestrc :: #force_inline proc "c" (a: __m128i, la: i32, b: __m128i, lb: i32, $IMM8: i8) -> i32 { + return pcmpestric128(transmute(i8x16)a, la, transmute(i8x16)b, lb, IMM8) +} +@(require_results, enable_target_feature="sse4.2") +_mm_cmpestrs :: #force_inline proc "c" (a: __m128i, la: i32, b: __m128i, lb: i32, $IMM8: i8) -> i32 { + return pcmpestris128(transmute(i8x16)a, la, transmute(i8x16)b, lb, IMM8) +} +@(require_results, 
enable_target_feature="sse4.2") +_mm_cmpestro :: #force_inline proc "c" (a: __m128i, la: i32, b: __m128i, lb: i32, $IMM8: i8) -> i32 { + return pcmpestrio128(transmute(i8x16)a, la, transmute(i8x16)b, lb, IMM8) +} +@(require_results, enable_target_feature="sse4.2") +_mm_cmpestra :: #force_inline proc "c" (a: __m128i, la: i32, b: __m128i, lb: i32, $IMM8: i8) -> i32 { + return pcmpestria128(transmute(i8x16)a, la, transmute(i8x16)b, lb, IMM8) +} +@(require_results, enable_target_feature="sse4.2") +_mm_crc32_u8 :: #force_inline proc "c" (crc: u32, v: u8) -> u32 { + return crc32_32_8(crc, v) +} +@(require_results, enable_target_feature="sse4.2") +_mm_crc32_u16 :: #force_inline proc "c" (crc: u32, v: u16) -> u32 { + return crc32_32_16(crc, v) +} +@(require_results, enable_target_feature="sse4.2") +_mm_crc32_u32 :: #force_inline proc "c" (crc: u32, v: u32) -> u32 { + return crc32_32_32(crc, v) +} +@(require_results, enable_target_feature="sse4.2") +_mm_cmpgt_epi64 :: #force_inline proc "c" (a: __m128i, b: __m128i) -> __m128i { + return transmute(__m128i)simd.lanes_gt(transmute(i64x2)a, transmute(i64x2)b) +} + +when ODIN_ARCH == .amd64 { + @(require_results, enable_target_feature="sse4.2") + _mm_crc32_u64 :: #force_inline proc "c" (crc: u64, v: u64) -> u64 { + return crc32_64_64(crc, v) + } +} + +@(private, default_calling_convention="c") +foreign _ { + // SSE 4.2 string and text comparison ops + @(link_name="llvm.x86.sse42.pcmpestrm128") + pcmpestrm128 :: proc(a: i8x16, la: i32, b: i8x16, lb: i32, #const imm8: i8) -> u8x16 --- + @(link_name="llvm.x86.sse42.pcmpestri128") + pcmpestri128 :: proc(a: i8x16, la: i32, b: i8x16, lb: i32, #const imm8: i8) -> i32 --- + @(link_name="llvm.x86.sse42.pcmpestriz128") + pcmpestriz128 :: proc(a: i8x16, la: i32, b: i8x16, lb: i32, #const imm8: i8) -> i32 --- + @(link_name="llvm.x86.sse42.pcmpestric128") + pcmpestric128 :: proc(a: i8x16, la: i32, b: i8x16, lb: i32, #const imm8: i8) -> i32 --- + @(link_name="llvm.x86.sse42.pcmpestris128") + pcmpestris128 :: proc(a: i8x16, la: i32, b: i8x16, lb: i32, #const imm8: i8) -> i32 --- + @(link_name="llvm.x86.sse42.pcmpestrio128") + pcmpestrio128 :: proc(a: i8x16, la: i32, b: i8x16, lb: i32, #const imm8: i8) -> i32 --- + @(link_name="llvm.x86.sse42.pcmpestria128") + pcmpestria128 :: proc(a: i8x16, la: i32, b: i8x16, lb: i32, #const imm8: i8) -> i32 --- + @(link_name="llvm.x86.sse42.pcmpistrm128") + pcmpistrm128 :: proc(a, b: i8x16, #const imm8: i8) -> i8x16 --- + @(link_name="llvm.x86.sse42.pcmpistri128") + pcmpistri128 :: proc(a, b: i8x16, #const imm8: i8) -> i32 --- + @(link_name="llvm.x86.sse42.pcmpistriz128") + pcmpistriz128 :: proc(a, b: i8x16, #const imm8: i8) -> i32 --- + @(link_name="llvm.x86.sse42.pcmpistric128") + pcmpistric128 :: proc(a, b: i8x16, #const imm8: i8) -> i32 --- + @(link_name="llvm.x86.sse42.pcmpistris128") + pcmpistris128 :: proc(a, b: i8x16, #const imm8: i8) -> i32 --- + @(link_name="llvm.x86.sse42.pcmpistrio128") + pcmpistrio128 :: proc(a, b: i8x16, #const imm8: i8) -> i32 --- + @(link_name="llvm.x86.sse42.pcmpistria128") + pcmpistria128 :: proc(a, b: i8x16, #const imm8: i8) -> i32 --- + // SSE 4.2 CRC instructions + @(link_name="llvm.x86.sse42.crc32.32.8") + crc32_32_8 :: proc(crc: u32, v: u8) -> u32 --- + @(link_name="llvm.x86.sse42.crc32.32.16") + crc32_32_16 :: proc(crc: u32, v: u16) -> u32 --- + @(link_name="llvm.x86.sse42.crc32.32.32") + crc32_32_32 :: proc(crc: u32, v: u32) -> u32 --- + + // AMD64 Only + @(link_name="llvm.x86.sse42.crc32.64.64") + crc32_64_64 :: proc(crc: u64, v: u64) -> u64 
--- +} diff --git a/core/simd/x86/ssse3.odin b/core/simd/x86/ssse3.odin new file mode 100644 index 000000000..f11ef6774 --- /dev/null +++ b/core/simd/x86/ssse3.odin @@ -0,0 +1,140 @@ +//+build i386, amd64 +package simd_x86 + +import "core:intrinsics" +import "core:simd" +_ :: simd + +@(require_results, enable_target_feature="ssse3") +_mm_abs_epi8 :: #force_inline proc "c" (a: __m128i) -> __m128i { + return transmute(__m128i)pabsb128(transmute(i8x16)a) +} +@(require_results, enable_target_feature="ssse3") +_mm_abs_epi16 :: #force_inline proc "c" (a: __m128i) -> __m128i { + return transmute(__m128i)pabsw128(transmute(i16x8)a) +} +@(require_results, enable_target_feature="ssse3") +_mm_abs_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i { + return transmute(__m128i)pabsd128(transmute(i32x4)a) +} +@(require_results, enable_target_feature="ssse3") +_mm_shuffle_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)pshufb128(transmute(u8x16)a, transmute(u8x16)b) +} +@(require_results, enable_target_feature="ssse3") +_mm_alignr_epi8 :: #force_inline proc "c" (a, b: __m128i, $IMM8: u32) -> __m128i { + shift :: IMM8 + + // If palignr is shifting the pair of vectors more than the size of two + // lanes, emit zero. + if shift > 32 { + return _mm_set1_epi8(0) + } + a, b := a, b + if shift > 16 { + a, b = _mm_set1_epi8(0), a + } + + return transmute(__m128i)simd.shuffle( + transmute(i8x16)b, + transmute(i8x16)a, + 0 when shift > 32 else shift - 16 + 0 when shift > 16 else shift + 0, + 1 when shift > 32 else shift - 16 + 1 when shift > 16 else shift + 1, + 2 when shift > 32 else shift - 16 + 2 when shift > 16 else shift + 2, + 3 when shift > 32 else shift - 16 + 3 when shift > 16 else shift + 3, + 4 when shift > 32 else shift - 16 + 4 when shift > 16 else shift + 4, + 5 when shift > 32 else shift - 16 + 5 when shift > 16 else shift + 5, + 6 when shift > 32 else shift - 16 + 6 when shift > 16 else shift + 6, + 7 when shift > 32 else shift - 16 + 7 when shift > 16 else shift + 7, + 8 when shift > 32 else shift - 16 + 8 when shift > 16 else shift + 8, + 9 when shift > 32 else shift - 16 + 9 when shift > 16 else shift + 9, + 10 when shift > 32 else shift - 16 + 10 when shift > 16 else shift + 10, + 11 when shift > 32 else shift - 16 + 11 when shift > 16 else shift + 11, + 12 when shift > 32 else shift - 16 + 12 when shift > 16 else shift + 12, + 13 when shift > 32 else shift - 16 + 13 when shift > 16 else shift + 13, + 14 when shift > 32 else shift - 16 + 14 when shift > 16 else shift + 14, + 15 when shift > 32 else shift - 16 + 15 when shift > 16 else shift + 15, + ) +} + + +@(require_results, enable_target_feature="ssse3") +_mm_hadd_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)phaddw128(transmute(i16x8)a, transmute(i16x8)b) +} +@(require_results, enable_target_feature="ssse3") +_mm_hadds_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)phaddsw128(transmute(i16x8)a, transmute(i16x8)b) +} +@(require_results, enable_target_feature="ssse3") +_mm_hadd_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)phaddd128(transmute(i32x4)a, transmute(i32x4)b) +} +@(require_results, enable_target_feature="ssse3") +_mm_hsub_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)phsubw128(transmute(i16x8)a, transmute(i16x8)b) +} +@(require_results, enable_target_feature="ssse3") +_mm_hsubs_epi16 :: #force_inline proc "c" (a, b: __m128i) 
-> __m128i { + return transmute(__m128i)phsubsw128(transmute(i16x8)a, transmute(i16x8)b) +} +@(require_results, enable_target_feature="ssse3") +_mm_hsub_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)phsubd128(transmute(i32x4)a, transmute(i32x4)b) +} +@(require_results, enable_target_feature="ssse3") +_mm_maddubs_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)pmaddubsw128(transmute(u8x16)a, transmute(i8x16)b) +} +@(require_results, enable_target_feature="ssse3") +_mm_mulhrs_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)pmulhrsw128(transmute(i16x8)a, transmute(i16x8)b) +} +@(require_results, enable_target_feature="ssse3") +_mm_sign_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)psignb128(transmute(i8x16)a, transmute(i8x16)b) +} +@(require_results, enable_target_feature="ssse3") +_mm_sign_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)psignw128(transmute(i16x8)a, transmute(i16x8)b) +} +@(require_results, enable_target_feature="ssse3") +_mm_sign_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { + return transmute(__m128i)psignd128(transmute(i32x4)a, transmute(i32x4)b) +} + + + +@(private, default_calling_convention="c") +foreign _ { + @(link_name = "llvm.x86.ssse3.pabs.b.128") + pabsb128 :: proc(a: i8x16) -> u8x16 --- + @(link_name = "llvm.x86.ssse3.pabs.w.128") + pabsw128 :: proc(a: i16x8) -> u16x8 --- + @(link_name = "llvm.x86.ssse3.pabs.d.128") + pabsd128 :: proc(a: i32x4) -> u32x4 --- + @(link_name = "llvm.x86.ssse3.pshuf.b.128") + pshufb128 :: proc(a, b: u8x16) -> u8x16 --- + @(link_name = "llvm.x86.ssse3.phadd.w.128") + phaddw128 :: proc(a, b: i16x8) -> i16x8 --- + @(link_name = "llvm.x86.ssse3.phadd.sw.128") + phaddsw128 :: proc(a, b: i16x8) -> i16x8 --- + @(link_name = "llvm.x86.ssse3.phadd.d.128") + phaddd128 :: proc(a, b: i32x4) -> i32x4 --- + @(link_name = "llvm.x86.ssse3.phsub.w.128") + phsubw128 :: proc(a, b: i16x8) -> i16x8 --- + @(link_name = "llvm.x86.ssse3.phsub.sw.128") + phsubsw128 :: proc(a, b: i16x8) -> i16x8 --- + @(link_name = "llvm.x86.ssse3.phsub.d.128") + phsubd128 :: proc(a, b: i32x4) -> i32x4 --- + @(link_name = "llvm.x86.ssse3.pmadd.ub.sw.128") + pmaddubsw128 :: proc(a: u8x16, b: i8x16) -> i16x8 --- + @(link_name = "llvm.x86.ssse3.pmul.hr.sw.128") + pmulhrsw128 :: proc(a, b: i16x8) -> i16x8 --- + @(link_name = "llvm.x86.ssse3.psign.b.128") + psignb128 :: proc(a, b: i8x16) -> i8x16 --- + @(link_name = "llvm.x86.ssse3.psign.w.128") + psignw128 :: proc(a, b: i16x8) -> i16x8 --- + @(link_name = "llvm.x86.ssse3.psign.d.128") + psignd128 :: proc(a, b: i32x4) -> i32x4 --- +}
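As a usage sketch for the SSSE3 wrappers above (not part of the diff itself; the helper name reverse_bytes and the index constants are illustrative only): _mm_shuffle_epi8 performs a byte-wise table lookup, so a descending index vector reverses the sixteen bytes of a register.

package example

import x86 "core:simd/x86"

// Reverse the 16 bytes of a __m128i with a single pshufb.
// The index vector {15, 14, ..., 0} is written as two little-endian i64 lanes.
@(enable_target_feature="ssse3")
reverse_bytes :: proc "c" (v: x86.__m128i) -> x86.__m128i {
	idx := x86.__m128i{0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607}
	return x86._mm_shuffle_epi8(v, idx)
}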
\ No newline at end of file diff --git a/core/simd/x86/types.odin b/core/simd/x86/types.odin new file mode 100644 index 000000000..06a2cd41e --- /dev/null +++ b/core/simd/x86/types.odin @@ -0,0 +1,57 @@
+//+build i386, amd64
+package simd_x86
+
+import "core:simd"
+
+bf16 :: u16
+
+__m128i :: #simd[2]i64
+__m128 :: #simd[4]f32
+__m128d :: #simd[2]f64
+
+__m256i :: #simd[4]i64
+__m256 :: #simd[8]f32
+__m256d :: #simd[4]f64
+
+__m512i :: #simd[8]i64
+__m512 :: #simd[16]f32
+__m512d :: #simd[8]f64
+
+__m128bh :: #simd[8]bf16
+__m256bh :: #simd[16]bf16
+__m512bh :: #simd[32]bf16
+
+
+/// The `__mmask64` type used in AVX-512 intrinsics, a 64-bit integer
+__mmask64 :: u64
+
+/// The `__mmask32` type used in AVX-512 intrinsics, a 32-bit integer
+__mmask32 :: u32
+
+/// The `__mmask16` type used in AVX-512 intrinsics, a 16-bit integer
+__mmask16 :: u16
+
+/// The `__mmask8` type used in AVX-512 intrinsics, an 8-bit integer
+__mmask8 :: u8
+
+/// The `_MM_CMPINT_ENUM` type used to specify comparison operations in AVX-512 intrinsics.
+_MM_CMPINT_ENUM :: i32
+
+/// The `_MM_MANTISSA_NORM_ENUM` type used to specify mantissa normalization operations in AVX-512 intrinsics.
+_MM_MANTISSA_NORM_ENUM :: i32
+
+/// The `_MM_MANTISSA_SIGN_ENUM` type used to specify mantissa sign operations in AVX-512 intrinsics.
+_MM_MANTISSA_SIGN_ENUM :: i32
+
+_MM_PERM_ENUM :: i32
+
+@(private) u8x16 :: simd.u8x16
+@(private) i8x16 :: simd.i8x16
+@(private) u16x8 :: simd.u16x8
+@(private) i16x8 :: simd.i16x8
+@(private) u32x4 :: simd.u32x4
+@(private) i32x4 :: simd.i32x4
+@(private) u64x2 :: simd.u64x2
+@(private) i64x2 :: simd.i64x2
+@(private) f32x4 :: simd.f32x4
+@(private) f64x2 :: simd.f64x2 diff --git a/core/sys/cpu/cpu.odin b/core/sys/cpu/cpu.odin deleted file mode 100644 index b99fe01d8..000000000 --- a/core/sys/cpu/cpu.odin +++ /dev/null @@ -1,33 +0,0 @@
-package sys_cpu
-
-Cache_Line_Pad :: struct {_: [_cache_line_size]byte};
-
-initialized: bool;
-
-x86: struct {
-	_: Cache_Line_Pad,
-	has_aes:       bool, // AES hardware implementation (AES NI)
-	has_adx:       bool, // Multi-precision add-carry instruction extensions
-	has_avx:       bool, // Advanced vector extension
-	has_avx2:      bool, // Advanced vector extension 2
-	has_bmi1:      bool, // Bit manipulation instruction set 1
-	has_bmi2:      bool, // Bit manipulation instruction set 2
-	has_erms:      bool, // Enhanced REP for MOVSB and STOSB
-	has_fma:       bool, // Fused-multiply-add instructions
-	has_os_xsave:  bool, // OS supports XSAVE/XRESTOR for saving/restoring XMM registers.
-	has_pclmulqdq: bool, // PCLMULQDQ instruction - most often used for AES-GCM
-	has_popcnt:    bool, // Hamming weight instruction POPCNT.
- has_rdrand: bool, // RDRAND instruction (on-chip random number generator) - has_rdseed: bool, // RDSEED instruction (on-chip random number generator) - has_sse2: bool, // Streaming SIMD extension 2 (always available on amd64) - has_sse3: bool, // Streaming SIMD extension 3 - has_ssse3: bool, // Supplemental streaming SIMD extension 3 - has_sse41: bool, // Streaming SIMD extension 4 and 4.1 - has_sse42: bool, // Streaming SIMD extension 4 and 4.2 - _: Cache_Line_Pad, -}; - - -init :: proc() { - _init(); -} diff --git a/core/sys/cpu/cpu_x86.odin b/core/sys/cpu/cpu_x86.odin deleted file mode 100644 index 146822e61..000000000 --- a/core/sys/cpu/cpu_x86.odin +++ /dev/null @@ -1,67 +0,0 @@ -//+build i386, amd64 -package sys_cpu - -_cache_line_size :: 64; - -cpuid :: proc(ax, cx: u32) -> (eax, ebc, ecx, edx: u32) { - return expand_to_tuple(asm(u32, u32) -> struct{eax, ebc, ecx, edx: u32} { - "cpuid", - "={ax},={bx},={cx},={dx},{ax},{cx}", - }(ax, cx)); -} - -xgetbv :: proc() -> (eax, edx: u32) { - return expand_to_tuple(asm(u32) -> struct{eax, edx: u32} { - "xgetbv", - "={ax},={dx},{cx}", - }(0)); -} - -_init :: proc() { - is_set :: proc(hwc: u32, value: u32) -> bool { - return hwc&value != 0; - } - - initialized = true; - - max_id, _, _, _ := cpuid(0, 0); - - if max_id < 1 { - return; - } - - _, _, ecx1, edx1 := cpuid(1, 0); - - x86.has_sse2 = is_set(26, edx1); - - x86.has_sse3 = is_set(0, ecx1); - x86.has_pclmulqdq = is_set(1, ecx1); - x86.has_ssse3 = is_set(9, ecx1); - x86.has_fma = is_set(12, ecx1); - x86.has_sse41 = is_set(19, ecx1); - x86.has_sse42 = is_set(20, ecx1); - x86.has_popcnt = is_set(23, ecx1); - x86.has_aes = is_set(25, ecx1); - x86.has_os_xsave = is_set(27, ecx1); - x86.has_rdrand = is_set(30, ecx1); - - os_supports_avx := false; - if x86.has_os_xsave { - eax, _ := xgetbv(); - os_supports_avx = is_set(1, eax) && is_set(2, eax); - } - - x86.has_avx = is_set(28, ecx1) && os_supports_avx; - - if max_id < 7 { - return; - } - - _, ebx7, _, _ := cpuid(7, 0); - x86.has_bmi1 = is_set(3, ebx7); - x86.has_avx2 = is_set(5, ebx7) && os_supports_avx; - x86.has_bmi2 = is_set(8, ebx7); - x86.has_erms = is_set(9, ebx7); - x86.has_rdseed = is_set(18, ebx7); - x86.has_adx = is_set(19, ebx7); -} diff --git a/examples/all/all_main.odin b/examples/all/all_main.odin index 36326b48e..1ab242305 100644 --- a/examples/all/all_main.odin +++ b/examples/all/all_main.odin @@ -96,6 +96,7 @@ import filepath "core:path/filepath" import reflect "core:reflect" import runtime "core:runtime" +import simd "core:simd" import slice "core:slice" import sort "core:sort" import strconv "core:strconv" @@ -192,6 +193,7 @@ _ :: slashpath _ :: filepath _ :: reflect _ :: runtime +_ :: simd _ :: slice _ :: sort _ :: strconv diff --git a/src/build_settings.cpp b/src/build_settings.cpp index e9f5f2099..a82cc80c9 100644 --- a/src/build_settings.cpp +++ b/src/build_settings.cpp @@ -256,7 +256,6 @@ struct BuildContext { String extra_linker_flags; String extra_assembler_flags; String microarch; - String target_features; BuildModeKind build_mode; bool generate_docs; i32 optimization_level; @@ -320,6 +319,10 @@ struct BuildContext { PtrMap<char const *, ExactValue> defined_values; + BlockingMutex target_features_mutex; + StringSet target_features_set; + String target_features_string; + }; gb_global BuildContext build_context = {0}; @@ -629,6 +632,15 @@ bool is_arch_wasm(void) { return false; } +bool is_arch_x86(void) { + switch (build_context.metrics.arch) { + case TargetArch_i386: + case TargetArch_amd64: + return true; + } + 
return false;
+}
+
 bool allow_check_foreign_filepath(void) {
 	switch (build_context.metrics.arch) {
 	case TargetArch_wasm32:
@@ -1188,6 +1200,100 @@ void init_build_context(TargetMetrics *cross_target) {
 #include "microsoft_craziness.h"
 #endif
+
+Array<String> split_by_comma(String const &list) {
+	isize n = 1;
+	for (isize i = 0; i < list.len; i++) {
+		if (list.text[i] == ',') {
+			n++;
+		}
+	}
+	auto res = array_make<String>(heap_allocator(), n);
+
+	String s = list;
+	for (isize i = 0; i < n; i++) {
+		isize m = string_index_byte(s, ',');
+		if (m < 0) {
+			res[i] = s;
+			break;
+		}
+		res[i] = substring(s, 0, m);
+		s = substring(s, m+1, s.len);
+	}
+	return res;
+}
+
+bool check_target_feature_is_valid(TokenPos pos, String const &feature) {
+	// TODO(bill): check_target_feature_is_valid
+	return true;
+}
+
+bool check_target_feature_is_enabled(TokenPos pos, String const &target_feature_list) {
+	BuildContext *bc = &build_context;
+	mutex_lock(&bc->target_features_mutex);
+	defer (mutex_unlock(&bc->target_features_mutex));
+
+	auto items = split_by_comma(target_feature_list);
+	defer (array_free(&items)); // NOTE: freeing before the loop below would be a use-after-free
+	for_array(i, items) {
+		String const &item = items.data[i];
+		if (!check_target_feature_is_valid(pos, item)) {
+			error(pos, "Target feature '%.*s' is not valid", LIT(item));
+			return false;
+		}
+		if (!string_set_exists(&bc->target_features_set, item)) {
+			error(pos, "Target feature '%.*s' is not enabled", LIT(item));
+			return false;
+		}
+	}
+
+	return true;
+}
+
+void enable_target_feature(TokenPos pos, String const &target_feature_list) {
+	BuildContext *bc = &build_context;
+	mutex_lock(&bc->target_features_mutex);
+	defer (mutex_unlock(&bc->target_features_mutex));
+
+	auto items = split_by_comma(target_feature_list);
+	defer (array_free(&items));
+	for_array(i, items) {
+		String const &item = items.data[i];
+		if (!check_target_feature_is_valid(pos, item)) {
+			error(pos, "Target feature '%.*s' is not valid", LIT(item));
+		}
+		string_set_add(&bc->target_features_set, item); // record the feature so later checks see it as enabled
+	}
+}
+
+
+char const *target_features_set_to_cstring(gbAllocator allocator, bool with_quotes) {
+	isize len = 0;
+	for_array(i, build_context.target_features_set.entries) {
+		if (i != 0) {
+			len += 1;
+		}
+		String feature = build_context.target_features_set.entries[i].value;
+		len += feature.len;
+		if (with_quotes) len += 2;
+	}
+	char *features = gb_alloc_array(allocator, char, len+1);
+	len = 0;
+	for_array(i, build_context.target_features_set.entries) {
+		if (i != 0) {
+			features[len++] = ',';
+		}
+
+		if (with_quotes) features[len++] = '"';
+		String feature = build_context.target_features_set.entries[i].value;
+		gb_memmove(features+len, feature.text, feature.len); // NOTE: copy at the current offset, not the start
+		len += feature.len;
+		if (with_quotes) features[len++] = '"';
+	}
+	features[len++] = 0;
+
+	return features;
+}
+
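As a sketch of how this machinery is consumed from user code (not part of the diff itself; crc32_bytes is an illustrative name): an @(enable_target_feature="...") attribute on a procedure funnels into enable_target_feature above, while @(require_target_feature="...") is validated by check_target_feature_is_enabled against the set populated here, e.g. from the command line.

package example

import x86 "core:simd/x86"

// Hardware CRC32-C over a byte slice; the attribute enables "sse4.2"
// for this procedure alone rather than for the whole build.
@(enable_target_feature="sse4.2")
crc32_bytes :: proc(crc: u32, data: []byte) -> u32 {
	crc := crc
	for b in data {
		crc = x86._mm_crc32_u8(crc, b)
	}
	return crc
}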
 // NOTE(Jeroen): Set/create the output and other paths and report an error as appropriate.
 // We've previously called `parse_build_flags`, so `out_filepath` should be set.
 bool init_build_paths(String init_filename) {
@@ -1197,6 +1303,9 @@ bool init_build_paths(String init_filename) {
 	// NOTE(Jeroen): We're pre-allocating BuildPathCOUNT slots so that certain paths are always at the same enumerated index.
 	array_init(&bc->build_paths, permanent_allocator(), BuildPathCOUNT);
+	string_set_init(&bc->target_features_set, heap_allocator(), 1024);
+	mutex_init(&bc->target_features_mutex);
+
 	// [BuildPathMainPackage] Turn given init path into a `Path`, which includes normalizing it into a full path.
 	bc->build_paths[BuildPath_Main_Package] = path_from_string(ha, init_filename);
@@ -1377,5 +1486,10 @@ bool init_build_paths(String init_filename) {
 		return false;
 	}
+	if (bc->target_features_string.len != 0) {
+		enable_target_feature({}, bc->target_features_string);
+	}
+
 	return true;
-}
\ No newline at end of file +} + diff --git a/src/check_builtin.cpp b/src/check_builtin.cpp index 55dd6b016..92e3987a0 100644 --- a/src/check_builtin.cpp +++ b/src/check_builtin.cpp @@ -246,7 +246,7 @@ bool is_constant_string(CheckerContext *c, String const &builtin_name, Ast *expr } bool check_builtin_objc_procedure(CheckerContext *c, Operand *operand, Ast *call, i32 id, Type *type_hint) { - String builtin_name = builtin_procs[id].name; + String const &builtin_name = builtin_procs[id].name; if (build_context.metrics.os != TargetOs_darwin) { // allow on doc generation (e.g. Metal stuff) @@ -409,6 +409,667 @@ bool check_atomic_memory_order_argument(CheckerContext *c, Ast *expr, String con } + +bool check_builtin_simd_operation(CheckerContext *c, Operand *operand, Ast *call, i32 id, Type *type_hint) { + ast_node(ce, CallExpr, call); + + String const &builtin_name = builtin_procs[id].name; + switch (id) { + // Any numeric + case BuiltinProc_simd_add: + case BuiltinProc_simd_sub: + case BuiltinProc_simd_mul: + case BuiltinProc_simd_div: + case BuiltinProc_simd_min: + case BuiltinProc_simd_max: + { + Operand x = {}; + Operand y = {}; + check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false; + check_expr_with_type_hint(c, &y, ce->args[1], x.type); if (y.mode == Addressing_Invalid) return false; + convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false; + if (!is_type_simd_vector(x.type)) { + error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name)); + return false; + } + if (!is_type_simd_vector(y.type)) { + error(y.expr, "'%.*s' expected a simd vector type", LIT(builtin_name)); + return false; + } + if (!are_types_identical(x.type, y.type)) { + gbString xs = type_to_string(x.type); + gbString ys = type_to_string(y.type); + error(x.expr, "'%.*s' expected 2 arguments of the same type, got '%s' vs '%s'", LIT(builtin_name), xs, ys); + gb_string_free(ys); + gb_string_free(xs); + return false; + } + Type *elem = base_array_type(x.type); + if (!is_type_integer(elem) && !is_type_float(elem)) { + gbString xs = type_to_string(x.type); + error(x.expr, "'%.*s' expected a #simd type with an integer or floating point element, got '%s'", LIT(builtin_name), xs); + gb_string_free(xs); + return false; + } + + operand->mode = Addressing_Value; + operand->type = x.type; + return true; + } + + // Integer only + case BuiltinProc_simd_add_sat: + case BuiltinProc_simd_sub_sat: + case BuiltinProc_simd_rem: + case BuiltinProc_simd_and: + case BuiltinProc_simd_or: + case BuiltinProc_simd_xor: + case BuiltinProc_simd_and_not: + { + Operand x = {}; + Operand y = {}; + check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false; + check_expr_with_type_hint(c, &y, ce->args[1], x.type); if (y.mode == Addressing_Invalid) return false; + convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false; + if (!is_type_simd_vector(x.type)) { + error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name)); + return false; + } + if (!is_type_simd_vector(y.type)) { + error(y.expr, "'%.*s' expected a simd vector type", LIT(builtin_name)); + return false; + } + if (!are_types_identical(x.type, y.type)) { + gbString xs = type_to_string(x.type); + gbString ys = type_to_string(y.type); + error(x.expr, "'%.*s' expected 2 arguments of the same type, got '%s' vs '%s'", LIT(builtin_name), xs, ys); + gb_string_free(ys); + gb_string_free(xs); + return false; + } + Type *elem = base_array_type(x.type); + + switch (id) { + case 
BuiltinProc_simd_add_sat: + case BuiltinProc_simd_sub_sat: + case BuiltinProc_simd_rem: + if (!is_type_integer(elem)) { + gbString xs = type_to_string(x.type); + error(x.expr, "'%.*s' expected a #simd type with an integer element, got '%s'", LIT(builtin_name), xs); + gb_string_free(xs); + return false; + } + break; + default: + if (!is_type_integer(elem) && !is_type_boolean(elem)) { + gbString xs = type_to_string(x.type); + error(x.expr, "'%.*s' expected a #simd type with an integer or boolean element, got '%s'", LIT(builtin_name), xs); + gb_string_free(xs); + return false; + } + break; + } + + operand->mode = Addressing_Value; + operand->type = x.type; + return true; + } + + case BuiltinProc_simd_shl: // Odin-like + case BuiltinProc_simd_shr: // Odin-like + case BuiltinProc_simd_shl_masked: // C-like + case BuiltinProc_simd_shr_masked: // C-like + { + Operand x = {}; + Operand y = {}; + check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false; + check_expr_with_type_hint(c, &y, ce->args[1], x.type); if (y.mode == Addressing_Invalid) return false; + convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false; + if (!is_type_simd_vector(x.type)) { + error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name)); + return false; + } + if (!is_type_simd_vector(y.type)) { + error(y.expr, "'%.*s' expected a simd vector type", LIT(builtin_name)); + return false; + } + GB_ASSERT(x.type->kind == Type_SimdVector); + GB_ASSERT(y.type->kind == Type_SimdVector); + Type *xt = x.type; + Type *yt = y.type; + + if (xt->SimdVector.count != yt->SimdVector.count) { + error(x.expr, "'%.*s' mismatched simd vector lengths, got '%lld' vs '%lld'", + LIT(builtin_name), + cast(long long)xt->SimdVector.count, + cast(long long)yt->SimdVector.count); + return false; + } + if (!is_type_integer(base_array_type(x.type))) { + gbString xs = type_to_string(x.type); + error(x.expr, "'%.*s' expected a #simd type with an integer element, got '%s'", LIT(builtin_name), xs); + gb_string_free(xs); + return false; + } + if (!is_type_unsigned(base_array_type(y.type))) { + gbString ys = type_to_string(y.type); + error(y.expr, "'%.*s' expected a #simd type with an unsigned integer element as the shifting operand, got '%s'", LIT(builtin_name), ys); + gb_string_free(ys); + return false; + } + + operand->mode = Addressing_Value; + operand->type = x.type; + return true; + } + + // Unary + case BuiltinProc_simd_neg: + case BuiltinProc_simd_abs: + { + Operand x = {}; + check_expr(c, &x, ce->args[0]); + if (x.mode == Addressing_Invalid) { + return false; + } + if (!is_type_simd_vector(x.type)) { + error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name)); + return false; + } + Type *elem = base_array_type(x.type); + if (!is_type_integer(elem) && !is_type_float(elem)) { + gbString xs = type_to_string(x.type); + error(x.expr, "'%.*s' expected a #simd type with an integer or floating point element, got '%s'", LIT(builtin_name), xs); + gb_string_free(xs); + return false; + } + operand->mode = Addressing_Value; + operand->type = x.type; + return true; + } + + // Return integer masks + case BuiltinProc_simd_lanes_eq: + case BuiltinProc_simd_lanes_ne: + case BuiltinProc_simd_lanes_lt: + case BuiltinProc_simd_lanes_le: + case BuiltinProc_simd_lanes_gt: + case BuiltinProc_simd_lanes_ge: + { + // op(#simd[N]T, #simd[N]T) -> #simd[N]V + // where `V` is an integer, `size_of(T) == size_of(V)` + // `V` will all 0s if false and all 1s if true (e.g. 
0x00 and 0xff for false and true, respectively)
+
+		Operand x = {};
+		Operand y = {};
+		check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+		check_expr_with_type_hint(c, &y, ce->args[1], x.type); if (y.mode == Addressing_Invalid) return false;
+		convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
+		if (!is_type_simd_vector(x.type)) {
+			error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+			return false;
+		}
+		Type *elem = base_array_type(x.type);
+		switch (id) {
+		case BuiltinProc_simd_lanes_eq:
+		case BuiltinProc_simd_lanes_ne:
+			if (!is_type_integer(elem) && !is_type_float(elem) && !is_type_boolean(elem)) {
+				gbString xs = type_to_string(x.type);
+				error(x.expr, "'%.*s' expected a #simd type with an integer, floating point, or boolean element, got '%s'", LIT(builtin_name), xs);
+				gb_string_free(xs);
+				return false;
+			}
+			break;
+		default:
+			if (!is_type_integer(elem) && !is_type_float(elem)) {
+				gbString xs = type_to_string(x.type);
+				error(x.expr, "'%.*s' expected a #simd type with an integer or floating point element, got '%s'", LIT(builtin_name), xs);
+				gb_string_free(xs);
+				return false;
+			}
+			break;
+		}
+
+
+		Type *vt = base_type(x.type);
+		GB_ASSERT(vt->kind == Type_SimdVector);
+		i64 count = vt->SimdVector.count;
+
+		i64 sz = type_size_of(elem);
+		Type *new_elem = nullptr;
+
+		switch (sz) {
+		case 1: new_elem = t_u8;  break;
+		case 2: new_elem = t_u16; break;
+		case 4: new_elem = t_u32; break;
+		case 8: new_elem = t_u64; break;
+		case 16:
+			error(x.expr, "'%.*s' is not supported for 128-bit integer backed #simd vector types", LIT(builtin_name));
+			return false;
+		}
+
+		operand->mode = Addressing_Value;
+		operand->type = alloc_type_simd_vector(count, new_elem);
+		return true;
+	}
+
+	case BuiltinProc_simd_extract:
+	{
+		Operand x = {};
+		check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+
+		if (!is_type_simd_vector(x.type)) {
+			error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+			return false;
+		}
+		Type *elem = base_array_type(x.type);
+		i64 max_count = x.type->SimdVector.count;
+		i64 value = -1;
+		if (!check_index_value(c, x.type, false, ce->args[1], max_count, &value)) {
+			return false;
+		}
+		if (value < 0) {
+			error(ce->args[1], "'%.*s' expected a constant integer index, got '%lld'", LIT(builtin_name), cast(long long)value);
+			return false;
+		}
+
+		operand->mode = Addressing_Value;
+		operand->type = elem;
+		return true;
+	}
+	break;
+	case BuiltinProc_simd_replace:
+	{
+		Operand x = {};
+		check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+
+		if (!is_type_simd_vector(x.type)) {
+			error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+			return false;
+		}
+		Type *elem = base_array_type(x.type);
+		i64 max_count = x.type->SimdVector.count;
+		i64 value = -1;
+		if (!check_index_value(c, x.type, false, ce->args[1], max_count, &value)) {
+			return false;
+		}
+		if (value < 0) {
+			error(ce->args[1], "'%.*s' expected a constant integer index, got '%lld'", LIT(builtin_name), cast(long long)value);
+			return false;
+		}
+
+		Operand y = {};
+		check_expr_with_type_hint(c, &y, ce->args[2], elem); if (y.mode == Addressing_Invalid) return false;
+		convert_to_typed(c, &y, elem); if (y.mode == Addressing_Invalid) return false;
+		if (!are_types_identical(y.type, elem)) {
+			gbString et = type_to_string(elem);
+			gbString yt = type_to_string(y.type);
+			error(y.expr, "'%.*s' expected a type of '%s' to insert, got '%s'",
LIT(builtin_name), et, yt); + gb_string_free(yt); + gb_string_free(et); + return false; + } + + operand->mode = Addressing_Value; + operand->type = x.type; + return true; + } + break; + + case BuiltinProc_simd_reduce_add_ordered: + case BuiltinProc_simd_reduce_mul_ordered: + case BuiltinProc_simd_reduce_min: + case BuiltinProc_simd_reduce_max: + { + Operand x = {}; + check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false; + + if (!is_type_simd_vector(x.type)) { + error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name)); + return false; + } + Type *elem = base_array_type(x.type); + if (!is_type_integer(elem) && !is_type_float(elem)) { + gbString xs = type_to_string(x.type); + error(x.expr, "'%.*s' expected a #simd type with an integer or floating point element, got '%s'", LIT(builtin_name), xs); + gb_string_free(xs); + return false; + } + + operand->mode = Addressing_Value; + operand->type = base_array_type(x.type); + return true; + } + + case BuiltinProc_simd_reduce_and: + case BuiltinProc_simd_reduce_or: + case BuiltinProc_simd_reduce_xor: + { + Operand x = {}; + check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false; + + if (!is_type_simd_vector(x.type)) { + error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name)); + return false; + } + Type *elem = base_array_type(x.type); + if (!is_type_integer(elem) && !is_type_boolean(elem)) { + gbString xs = type_to_string(x.type); + error(x.expr, "'%.*s' expected a #simd type with an integer or boolean element, got '%s'", LIT(builtin_name), xs); + gb_string_free(xs); + return false; + } + + operand->mode = Addressing_Value; + operand->type = base_array_type(x.type); + return true; + } + + + case BuiltinProc_simd_shuffle: + { + Operand x = {}; + Operand y = {}; + check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false; + check_expr_with_type_hint(c, &y, ce->args[1], x.type); if (y.mode == Addressing_Invalid) return false; + convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false; + if (!is_type_simd_vector(x.type)) { + error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name)); + return false; + } + if (!is_type_simd_vector(y.type)) { + error(y.expr, "'%.*s' expected a simd vector type", LIT(builtin_name)); + return false; + } + if (!are_types_identical(x.type, y.type)) { + gbString xs = type_to_string(x.type); + gbString ys = type_to_string(y.type); + error(x.expr, "'%.*s' expected 2 arguments of the same type, got '%s' vs '%s'", LIT(builtin_name), xs, ys); + gb_string_free(ys); + gb_string_free(xs); + return false; + } + Type *elem = base_array_type(x.type); + + i64 max_count = x.type->SimdVector.count + y.type->SimdVector.count; + + i64 arg_count = 0; + for_array(i, ce->args) { + if (i < 2) { + continue; + } + Ast *arg = ce->args[i]; + Operand op = {}; + check_expr(c, &op, arg); + if (op.mode == Addressing_Invalid) { + return false; + } + Type *arg_type = base_type(op.type); + if (!is_type_integer(arg_type) || op.mode != Addressing_Constant) { + error(op.expr, "Indices to '%.*s' must be constant integers", LIT(builtin_name)); + return false; + } + + if (big_int_is_neg(&op.value.value_integer)) { + error(op.expr, "Negative '%.*s' index", LIT(builtin_name)); + return false; + } + + BigInt mc = {}; + big_int_from_i64(&mc, max_count); + if (big_int_cmp(&mc, &op.value.value_integer) <= 0) { + error(op.expr, "'%.*s' index exceeds length", LIT(builtin_name)); + return false; + } + + arg_count++; + } + + if (arg_count > 
max_count) {
+			error(call, "Too many '%.*s' indices, %td > %td", LIT(builtin_name), arg_count, max_count);
+			return false;
+		}
+
+
+		if (!is_power_of_two(arg_count)) {
+			error(call, "'%.*s' must have a power-of-two number of index arguments, got %lld", LIT(builtin_name), cast(long long)arg_count);
+			return false;
+		}
+
+		operand->mode = Addressing_Value;
+		operand->type = alloc_type_simd_vector(arg_count, elem);
+		return true;
+	}
+
+	case BuiltinProc_simd_select:
+	{
+		Operand cond = {};
+		check_expr(c, &cond, ce->args[0]); if (cond.mode == Addressing_Invalid) return false;
+
+		if (!is_type_simd_vector(cond.type)) {
+			error(cond.expr, "'%.*s' expected a simd vector type for the condition", LIT(builtin_name));
+			return false;
+		}
+		Type *cond_elem = base_array_type(cond.type);
+		if (!is_type_boolean(cond_elem) && !is_type_integer(cond_elem)) {
+			gbString cond_str = type_to_string(cond.type);
+			error(cond.expr, "'%.*s' expected a simd vector boolean or integer type, got '%s'", LIT(builtin_name), cond_str);
+			gb_string_free(cond_str);
+			return false;
+		}
+
+		Operand x = {};
+		Operand y = {};
+		check_expr(c, &x, ce->args[1]); if (x.mode == Addressing_Invalid) return false;
+		check_expr_with_type_hint(c, &y, ce->args[2], x.type); if (y.mode == Addressing_Invalid) return false;
+		convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
+		if (!is_type_simd_vector(x.type)) {
+			error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+			return false;
+		}
+		if (!is_type_simd_vector(y.type)) {
+			error(y.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+			return false;
+		}
+		if (!are_types_identical(x.type, y.type)) {
+			gbString xs = type_to_string(x.type);
+			gbString ys = type_to_string(y.type);
+			error(x.expr, "'%.*s' expected 2 results of the same type, got '%s' vs '%s'", LIT(builtin_name), xs, ys);
+			gb_string_free(ys);
+			gb_string_free(xs);
+			return false;
+		}
+
+		if (cond.type->SimdVector.count != x.type->SimdVector.count) {
+			error(x.expr, "'%.*s' expected the condition vector length to match the result vector length, got '%lld' vs '%lld'",
+			      LIT(builtin_name),
+			      cast(long long)cond.type->SimdVector.count,
+			      cast(long long)x.type->SimdVector.count);
+			return false;
+		}
+
+
+		operand->mode = Addressing_Value;
+		operand->type = x.type;
+		return true;
+	}
+
+	case BuiltinProc_simd_ceil:
+	case BuiltinProc_simd_floor:
+	case BuiltinProc_simd_trunc:
+	case BuiltinProc_simd_nearest:
+	{
+		Operand x = {};
+		check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+
+		if (!is_type_simd_vector(x.type)) {
+			error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+			return false;
+		}
+		Type *elem = base_array_type(x.type);
+		if (!is_type_float(elem)) {
+			gbString x_str = type_to_string(x.type);
+			error(x.expr, "'%.*s' expected a simd vector floating point type, got '%s'", LIT(builtin_name), x_str);
+			gb_string_free(x_str);
+			return false;
+		}
+
+		operand->mode = Addressing_Value;
+		operand->type = x.type;
+		return true;
+	}
+
+	case BuiltinProc_simd_lanes_reverse:
+	{
+		Operand x = {};
+		check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+
+		if (!is_type_simd_vector(x.type)) {
+			error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+			return false;
+		}
+		operand->type = x.type;
+		operand->mode = Addressing_Value;
+		return true;
+	}
+
+	case BuiltinProc_simd_lanes_rotate_left:
+	case BuiltinProc_simd_lanes_rotate_right:
+	{
+		Operand x = {};
+		check_expr(c, &x, ce->args[0]); if (x.mode
== Addressing_Invalid) return false;
+		if (!is_type_simd_vector(x.type)) {
+			error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+			return false;
+		}
+		Operand offset = {};
+		check_expr(c, &offset, ce->args[1]); if (offset.mode == Addressing_Invalid) return false;
+		convert_to_typed(c, &offset, t_i64);
+		if (!is_type_integer(offset.type) || offset.mode != Addressing_Constant) {
+			error(offset.expr, "'%.*s' expected a constant integer offset", LIT(builtin_name));
+			return false;
+		}
+		check_assignment(c, &offset, t_i64, builtin_name);
+
+		operand->type = x.type;
+		operand->mode = Addressing_Value;
+		return true;
+	}
+
+	case BuiltinProc_simd_clamp:
+	{
+		Operand x = {};
+		Operand y = {};
+		Operand z = {};
+		check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+		check_expr_with_type_hint(c, &y, ce->args[1], x.type); if (y.mode == Addressing_Invalid) return false;
+		check_expr_with_type_hint(c, &z, ce->args[2], x.type); if (z.mode == Addressing_Invalid) return false;
+		convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
+		convert_to_typed(c, &z, x.type); if (z.mode == Addressing_Invalid) return false;
+		if (!is_type_simd_vector(x.type)) {
+			error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+			return false;
+		}
+		if (!is_type_simd_vector(y.type)) {
+			error(y.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+			return false;
+		}
+		if (!is_type_simd_vector(z.type)) {
+			error(z.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+			return false;
+		}
+		if (!are_types_identical(x.type, y.type)) {
+			gbString xs = type_to_string(x.type);
+			gbString ys = type_to_string(y.type);
+			error(x.expr, "'%.*s' expected all arguments to be of the same type, got '%s' vs '%s'", LIT(builtin_name), xs, ys);
+			gb_string_free(ys);
+			gb_string_free(xs);
+			return false;
+		}
+		if (!are_types_identical(x.type, z.type)) {
+			gbString xs = type_to_string(x.type);
+			gbString zs = type_to_string(z.type);
+			error(x.expr, "'%.*s' expected all arguments to be of the same type, got '%s' vs '%s'", LIT(builtin_name), xs, zs);
+			gb_string_free(zs);
+			gb_string_free(xs);
+			return false;
+		}
+		Type *elem = base_array_type(x.type);
+		if (!is_type_integer(elem) && !is_type_float(elem)) {
+			gbString xs = type_to_string(x.type);
+			error(x.expr, "'%.*s' expected a #simd type with an integer or floating point element, got '%s'", LIT(builtin_name), xs);
+			gb_string_free(xs);
+			return false;
+		}
+
+		operand->mode = Addressing_Value;
+		operand->type = x.type;
+		return true;
+	}
+
+	case BuiltinProc_simd_to_bits:
+	{
+		Operand x = {};
+		check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false;
+
+		if (!is_type_simd_vector(x.type)) {
+			error(x.expr, "'%.*s' expected a simd vector type", LIT(builtin_name));
+			return false;
+		}
+		Type *elem = base_array_type(x.type);
+		i64 count = get_array_type_count(x.type);
+		i64 sz = type_size_of(elem);
+		Type *bit_elem = nullptr;
+		switch (sz) {
+		case 1: bit_elem = t_u8;  break;
+		case 2: bit_elem = t_u16; break;
+		case 4: bit_elem = t_u32; break;
+		case 8: bit_elem = t_u64; break;
+		}
+		GB_ASSERT(bit_elem != nullptr);
+
+		operand->type = alloc_type_simd_vector(count, bit_elem);
+		operand->mode = Addressing_Value;
+		return true;
+	}
+
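As a sketch of what the simd_to_bits rule just checked looks like from user code (not part of the diff itself; sign_bits is an illustrative name): the lane count is preserved and each element maps to the unsigned integer of the same size, here f32 to u32.

package example

import "core:intrinsics"

sign_bits :: proc() -> #simd[4]u32 {
	v    := #simd[4]f32{1, -2, 3, -4}
	bits := intrinsics.simd_to_bits(v) // #simd[4]u32 holding the raw IEEE-754 lane bits
	mask := #simd[4]u32{0x8000_0000, 0x8000_0000, 0x8000_0000, 0x8000_0000}
	return intrinsics.simd_and(bits, mask)
}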
+	case BuiltinProc_simd_x86__MM_SHUFFLE:
+	{
+		Operand x[4] = {};
+		for (unsigned i = 0; i < 4; i++) {
+			check_expr(c, x+i, ce->args[i]); if (x[i].mode == Addressing_Invalid) return false;
+		}
+
+		u32 offsets[4] = {6, 4, 2, 0};
+		u32 result = 0;
+		for (unsigned i = 0; i < 4; i++) {
+			if (!is_type_integer(x[i].type) || x[i].mode != Addressing_Constant) {
+				gbString xs = type_to_string(x[i].type);
+				error(x[i].expr, "'%.*s' expected a constant integer, got '%s'", LIT(builtin_name), xs);
+				gb_string_free(xs);
+				return false;
+			}
+			i64 val = exact_value_to_i64(x[i].value);
+			if (val < 0 || val > 3) {
+				error(x[i].expr, "'%.*s' expected a constant integer in the range 0..<4, got %lld", LIT(builtin_name), cast(long long)val);
+				return false;
+			}
+			result |= cast(u32)(val) << offsets[i];
+		}
+
+		operand->type = t_untyped_integer;
+		operand->mode = Addressing_Constant;
+		operand->value = exact_value_i64(result);
+		return true;
+	}
+	default:
+		GB_PANIC("Unhandled simd intrinsic: %.*s", LIT(builtin_name));
+	}
+
+	return false;
+}
+
+
 bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32 id, Type *type_hint) {
 	ast_node(ce, CallExpr, call);
 	if (ce->inlining != ProcInlining_none) {
@@ -479,7 +1140,7 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 		break;
 	}
-	String builtin_name = builtin_procs[id].name;
+	String const &builtin_name = builtin_procs[id].name;
 	if (ce->args.count > 0) {
@@ -491,6 +1152,17 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 		}
 	}
+	if (BuiltinProc__simd_begin < id && id < BuiltinProc__simd_end) {
+		bool ok = check_builtin_simd_operation(c, operand, call, id, type_hint);
+		if (!ok) {
+			// NOTE: only reset on failure; on success keep the mode/value the
+			// operation set (e.g. the constant result of simd_x86__MM_SHUFFLE)
+			operand->type = t_invalid;
+			operand->mode = Addressing_Value;
+			operand->value = {};
+		}
+		operand->expr = call;
+		return ok;
+	}
+
 	switch (id) {
 	default:
 		GB_PANIC("Implement built-in procedure: %.*s", LIT(builtin_name));
@@ -1031,6 +1703,11 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 			bt->Struct.soa_kind == StructSoa_Dynamic) {
 			mode = Addressing_Value;
 		}
+	} else if (is_type_simd_vector(op_type)) {
+		Type *bt = base_type(op_type);
+		mode = Addressing_Constant;
+		value = exact_value_i64(bt->SimdVector.count);
+		type = t_untyped_integer;
 	}
 	if (operand->mode == Addressing_Type && mode != Addressing_Constant) {
 		mode = Addressing_Invalid;
@@ -1445,6 +2122,11 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 		operand->mode = Addressing_Value;
 	}
+	if (is_type_simd_vector(type) && !is_power_of_two(arg_count)) {
+		error(call, "'swizzle' with a #simd vector must have a power-of-two number of arguments, got %lld", cast(long long)arg_count);
+		return false;
+	}
+
 	operand->type = determine_swizzle_array_type(original_type, type_hint, arg_count);
 	break;
 }
@@ -2279,7 +2961,7 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 			if (i == j) continue;
 			Operand *b = ops[j];
 			convert_to_typed(c, a, b->type);
-			if (a->mode == Addressing_Invalid) { return false; }
+			if (a->mode == Addressing_Invalid) return false;
 		}
 	}
@@ -2685,46 +3367,6 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 		break;
 	}
-	case BuiltinProc_simd_vector: {
-		Operand x = {};
-		Operand y = {};
-		x = *operand;
-		if (!is_type_integer(x.type) || x.mode != Addressing_Constant) {
-			error(call, "Expected a constant integer for 'intrinsics.simd_vector'");
-			operand->mode = Addressing_Type;
-			operand->type = t_invalid;
-			return false;
-		}
-		if (big_int_is_neg(&x.value.value_integer)) {
-			error(call, "Negative vector element length");
-			operand->mode = Addressing_Type;
-			operand->type = t_invalid;
-			return false;
-		}
-		i64 count = big_int_to_i64(&x.value.value_integer);
-
-		check_expr_or_type(c, &y, ce->args[1]);
-		if (y.mode != Addressing_Type) {
-
error(call, "Expected a type 'intrinsics.simd_vector'"); - operand->mode = Addressing_Type; - operand->type = t_invalid; - return false; - } - Type *elem = y.type; - if (!is_type_valid_vector_elem(elem)) { - gbString str = type_to_string(elem); - error(call, "Invalid element type for 'intrinsics.simd_vector', expected an integer or float with no specific endianness, got '%s'", str); - gb_string_free(str); - operand->mode = Addressing_Type; - operand->type = t_invalid; - return false; - } - - operand->mode = Addressing_Type; - operand->type = alloc_type_simd_vector(count, elem); - break; - } - case BuiltinProc_is_package_imported: { bool value = false; @@ -2944,7 +3586,14 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32 return false; } - if (!is_type_integer_like(x.type)) { + if (is_type_simd_vector(x.type)) { + Type *elem = base_array_type(x.type); + if (!is_type_integer_like(elem)) { + gbString xts = type_to_string(x.type); + error(x.expr, "#simd values passed to '%.*s' must have an element of an integer-like type (integer, boolean, enum, bit_set), got %s", LIT(builtin_name), xts); + gb_string_free(xts); + } + } else if (!is_type_integer_like(x.type)) { gbString xts = type_to_string(x.type); error(x.expr, "Values passed to '%.*s' must be an integer-like type (integer, boolean, enum, bit_set), got %s", LIT(builtin_name), xts); gb_string_free(xts); @@ -3002,7 +3651,7 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32 if (y.mode == Addressing_Invalid) { return false; } - convert_to_typed(c, &y, x.type); + convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false; convert_to_typed(c, &x, y.type); if (is_type_untyped(x.type)) { gbString xts = type_to_string(x.type); @@ -3039,14 +3688,23 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32 if (x.mode == Addressing_Invalid) { return false; } - if (!is_type_float(x.type)) { + + Type *elem = core_array_type(x.type); + if (!is_type_float(x.type) && !(is_type_simd_vector(x.type) && is_type_float(elem))) { gbString xts = type_to_string(x.type); - error(x.expr, "Expected a floating point value for '%.*s', got %s", LIT(builtin_name), xts); + error(x.expr, "Expected a floating point or #simd vector value for '%.*s', got %s", LIT(builtin_name), xts); gb_string_free(xts); return false; + } else if (is_type_different_to_arch_endianness(elem)) { + GB_ASSERT(elem->kind == Type_Basic); + if (elem->Basic.flags & (BasicFlag_EndianLittle|BasicFlag_EndianBig)) { + gbString xts = type_to_string(x.type); + error(x.expr, "Expected a float which does not specify the explicit endianness for '%.*s', got %s", LIT(builtin_name), xts); + gb_string_free(xts); + return false; + } } - - if (x.mode == Addressing_Constant) { + if (is_type_float(x.type) && x.mode == Addressing_Constant) { f64 v = exact_value_to_f64(x.value); operand->mode = Addressing_Constant; @@ -3059,6 +3717,59 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32 } break; + case BuiltinProc_fused_mul_add: + { + Operand x = {}; + Operand y = {}; + Operand z = {}; + check_expr(c, &x, ce->args[0]); if (x.mode == Addressing_Invalid) return false; + check_expr(c, &y, ce->args[1]); if (y.mode == Addressing_Invalid) return false; + check_expr(c, &z, ce->args[2]); if (z.mode == Addressing_Invalid) return false; + + convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false; + convert_to_typed(c, &x, y.type); if (x.mode == Addressing_Invalid) 
return false; + convert_to_typed(c, &z, x.type); if (z.mode == Addressing_Invalid) return false; + convert_to_typed(c, &x, z.type); if (x.mode == Addressing_Invalid) return false; + if (is_type_untyped(x.type)) { + gbString xts = type_to_string(x.type); + error(x.expr, "Expected a typed floating point value or #simd vector for '%.*s', got %s", LIT(builtin_name), xts); + gb_string_free(xts); + return false; + } + + Type *elem = core_array_type(x.type); + if (!is_type_float(x.type) && !(is_type_simd_vector(x.type) && is_type_float(elem))) { + gbString xts = type_to_string(x.type); + error(x.expr, "Expected a floating point or #simd vector value for '%.*s', got %s", LIT(builtin_name), xts); + gb_string_free(xts); + return false; + } + if (is_type_different_to_arch_endianness(elem)) { + GB_ASSERT(elem->kind == Type_Basic); + if (elem->Basic.flags & (BasicFlag_EndianLittle|BasicFlag_EndianBig)) { + gbString xts = type_to_string(x.type); + error(x.expr, "Expected a float which does not specify the explicit endianness for '%.*s', got %s", LIT(builtin_name), xts); + gb_string_free(xts); + return false; + } + } + + if (!are_types_identical(x.type, y.type) || !are_types_identical(y.type, z.type)) { + gbString xts = type_to_string(x.type); + gbString yts = type_to_string(y.type); + gbString zts = type_to_string(z.type); + error(x.expr, "Mismatched types for '%.*s', got %s vs %s vs %s", LIT(builtin_name), xts, yts, zts); + gb_string_free(zts); + gb_string_free(yts); + gb_string_free(xts); + return false; + } + + operand->mode = Addressing_Value; + operand->type = default_type(x.type); + } + break; + case BuiltinProc_mem_copy: case BuiltinProc_mem_copy_non_overlapping: { @@ -3309,9 +4020,8 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32 break; case BuiltinProc_volatile_store: - /*fallthrough*/ case BuiltinProc_unaligned_store: - /*fallthrough*/ + case BuiltinProc_non_temporal_store: case BuiltinProc_atomic_store: { Type *elem = nullptr; @@ -3358,9 +4068,8 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32 case BuiltinProc_volatile_load: - /*fallthrough*/ case BuiltinProc_unaligned_load: - /*fallthrough*/ + case BuiltinProc_non_temporal_load: case BuiltinProc_atomic_load: { Type *elem = nullptr; @@ -3618,7 +4327,7 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32 if (x.mode == Addressing_Invalid) { return false; } - convert_to_typed(c, &y, x.type); + convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false; if (x.mode == Addressing_Invalid) { return false; } @@ -3675,7 +4384,7 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32 if (y.mode == Addressing_Invalid) { return false; } - convert_to_typed(c, &y, x.type); + convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false; convert_to_typed(c, &x, y.type); if (!are_types_identical(x.type, y.type)) { gbString xts = type_to_string(x.type); @@ -4566,6 +5275,64 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32 } break; + case BuiltinProc_x86_cpuid: + { + if (!is_arch_x86()) { + error(call, "'%.*s' is only allowed on x86 targets (i386, amd64)", LIT(builtin_name)); + return false; + } + + Operand ax = {}; + Operand cx = {}; + + check_expr_with_type_hint(c, &ax, ce->args[0], t_u32); if (ax.mode == Addressing_Invalid) return false; + check_expr_with_type_hint(c, &cx, ce->args[1], t_u32); if (cx.mode == Addressing_Invalid) return false; + 
convert_to_typed(c, &ax, t_u32); if (ax.mode == Addressing_Invalid) return false; + convert_to_typed(c, &cx, t_u32); if (cx.mode == Addressing_Invalid) return false; + if (!are_types_identical(ax.type, t_u32)) { + gbString str = type_to_string(ax.type); + error(ax.expr, "'%.*s' expected a u32, got %s", LIT(builtin_name), str); + gb_string_free(str); + return false; + } + if (!are_types_identical(cx.type, t_u32)) { + gbString str = type_to_string(cx.type); + error(cx.expr, "'%.*s' expected a u32, got %s", LIT(builtin_name), str); + gb_string_free(str); + return false; + } + Type *types[4] = {t_u32, t_u32, t_u32, t_u32}; // eax ebc ecx edx + operand->type = alloc_type_tuple_from_field_types(types, gb_count_of(types), false, false); + operand->mode = Addressing_Value; + operand->value = {}; + return true; + } + break; + case BuiltinProc_x86_xgetbv: + { + if (!is_arch_x86()) { + error(call, "'%.*s' is only allowed on x86 targets (i386, amd64)", LIT(builtin_name)); + return false; + } + + Operand cx = {}; + check_expr_with_type_hint(c, &cx, ce->args[0], t_u32); if (cx.mode == Addressing_Invalid) return false; + convert_to_typed(c, &cx, t_u32); if (cx.mode == Addressing_Invalid) return false; + if (!are_types_identical(cx.type, t_u32)) { + gbString str = type_to_string(cx.type); + error(cx.expr, "'%.*s' expected a u32, got %s", LIT(builtin_name), str); + gb_string_free(str); + return false; + } + + Type *types[2] = {t_u32, t_u32}; + operand->type = alloc_type_tuple_from_field_types(types, gb_count_of(types), false, false); + operand->mode = Addressing_Value; + operand->value = {}; + return true; + } + break; + } return true; diff --git a/src/check_decl.cpp b/src/check_decl.cpp index 82ac6c677..86280b6cb 100644 --- a/src/check_decl.cpp +++ b/src/check_decl.cpp @@ -313,13 +313,19 @@ void check_type_decl(CheckerContext *ctx, Entity *e, Ast *init_expr, Type *def) } named->Named.base = base; - if (is_distinct && is_type_typeid(e->type)) { - error(init_expr, "'distinct' cannot be applied to 'typeid'"); - is_distinct = false; - } - if (is_distinct && is_type_any(e->type)) { - error(init_expr, "'distinct' cannot be applied to 'any'"); - is_distinct = false; + if (is_distinct) { + if (is_type_typeid(e->type)) { + error(init_expr, "'distinct' cannot be applied to 'typeid'"); + is_distinct = false; + } else if (is_type_any(e->type)) { + error(init_expr, "'distinct' cannot be applied to 'any'"); + is_distinct = false; + } else if (is_type_simd_vector(e->type)) { + gbString str = type_to_string(e->type); + error(init_expr, "'distinct' cannot be applied to '%s'", str); + gb_string_free(str); + is_distinct = false; + } } if (!is_distinct) { e->type = bt; @@ -893,6 +899,18 @@ void check_proc_decl(CheckerContext *ctx, Entity *e, DeclInfo *d) { } } + if (ac.require_target_feature.len != 0 && ac.enable_target_feature.len != 0) { + error(e->token, "Attributes @(require_target_feature=...) and @(enable_target_feature=...) 
cannot be used together"); + } else if (ac.require_target_feature.len != 0) { + if (check_target_feature_is_enabled(e->token.pos, ac.require_target_feature)) { + e->Procedure.target_feature = ac.require_target_feature; + } else { + e->Procedure.target_feature_disabled = true; + } + } else if (ac.enable_target_feature.len != 0) { + enable_target_feature(e->token.pos, ac.enable_target_feature); + e->Procedure.target_feature = ac.enable_target_feature; + } switch (e->Procedure.optimization_mode) { case ProcedureOptimizationMode_None: @@ -996,10 +1014,12 @@ void check_proc_decl(CheckerContext *ctx, Entity *e, DeclInfo *d) { } } - if (pt->result_count == 0 && ac.require_results) { - error(pl->type, "'require_results' is not needed on a procedure with no results"); - } else { - pt->require_results = ac.require_results; + if (ac.require_results) { + if (pt->result_count == 0) { + error(pl->type, "'require_results' is not needed on a procedure with no results"); + } else { + pt->require_results = true; + } } if (ac.link_name.len > 0) { @@ -1309,20 +1329,20 @@ void check_proc_group_decl(CheckerContext *ctx, Entity *&pg_entity, DeclInfo *d) if (!both_have_where_clauses) switch (kind) { case ProcOverload_Identical: - error(p->token, "Overloaded procedure '%.*s' as the same type as another procedure in the procedure group '%.*s'", LIT(name), LIT(proc_group_name)); + error(p->token, "Overloaded procedure '%.*s' has the same type as another procedure in the procedure group '%.*s'", LIT(name), LIT(proc_group_name)); is_invalid = true; break; // case ProcOverload_CallingConvention: - // error(p->token, "Overloaded procedure '%.*s' as the same type as another procedure in the procedure group '%.*s'", LIT(name), LIT(proc_group_name)); + // error(p->token, "Overloaded procedure '%.*s' has the same type as another procedure in the procedure group '%.*s'", LIT(name), LIT(proc_group_name)); // is_invalid = true; // break; case ProcOverload_ParamVariadic: - error(p->token, "Overloaded procedure '%.*s' as the same type as another procedure in the procedure group '%.*s'", LIT(name), LIT(proc_group_name)); + error(p->token, "Overloaded procedure '%.*s' has the same type as another procedure in the procedure group '%.*s'", LIT(name), LIT(proc_group_name)); is_invalid = true; break; case ProcOverload_ResultCount: case ProcOverload_ResultTypes: - error(p->token, "Overloaded procedure '%.*s' as the same parameters but different results in the procedure group '%.*s'", LIT(name), LIT(proc_group_name)); + error(p->token, "Overloaded procedure '%.*s' has the same parameters but different results in the procedure group '%.*s'", LIT(name), LIT(proc_group_name)); is_invalid = true; break; case ProcOverload_Polymorphic: diff --git a/src/check_expr.cpp b/src/check_expr.cpp index 7b269e048..f954f1583 100644 --- a/src/check_expr.cpp +++ b/src/check_expr.cpp @@ -442,6 +442,14 @@ bool find_or_generate_polymorphic_procedure(CheckerContext *old_c, Entity *base_ final_proc_type->Proc.is_poly_specialized = true; final_proc_type->Proc.is_polymorphic = true; + final_proc_type->Proc.variadic = src->Proc.variadic; + final_proc_type->Proc.require_results = src->Proc.require_results; + final_proc_type->Proc.c_vararg = src->Proc.c_vararg; + final_proc_type->Proc.has_named_results = src->Proc.has_named_results; + final_proc_type->Proc.diverging = src->Proc.diverging; + final_proc_type->Proc.return_by_pointer = src->Proc.return_by_pointer; + final_proc_type->Proc.optional_ok = src->Proc.optional_ok; + for (isize i = 0; i < operands.count; i++) 
{ Operand o = operands[i]; @@ -777,6 +785,14 @@ i64 check_distance_between_types(CheckerContext *c, Operand *operand, Type *type return distance + 6; } } + + if (is_type_simd_vector(dst)) { + Type *dst_elem = base_array_type(dst); + i64 distance = check_distance_between_types(c, operand, dst_elem); + if (distance >= 0) { + return distance + 6; + } + } if (is_type_matrix(dst)) { Type *dst_elem = base_array_type(dst); @@ -786,6 +802,7 @@ i64 check_distance_between_types(CheckerContext *c, Operand *operand, Type *type } } + if (is_type_any(dst)) { if (!is_type_polymorphic(src)) { if (operand->mode == Addressing_Context && operand->type == t_context) { @@ -1328,6 +1345,19 @@ bool is_polymorphic_type_assignable(CheckerContext *c, Type *poly, Type *source, } } return false; + + case Type_SimdVector: + if (source->kind == Type_SimdVector) { + if (poly->SimdVector.generic_count != nullptr) { + if (!polymorphic_assign_index(&poly->SimdVector.generic_count, &poly->SimdVector.count, source->SimdVector.count)) { + return false; + } + } + if (poly->SimdVector.count == source->SimdVector.count) { + return is_polymorphic_type_assignable(c, poly->SimdVector.elem, source->SimdVector.elem, true, modify_type); + } + } + return false; } return false; } @@ -1567,9 +1597,11 @@ bool check_unary_op(CheckerContext *c, Operand *o, Token op) { bool check_binary_op(CheckerContext *c, Operand *o, Token op) { Type *main_type = o->type; + // TODO(bill): Handle errors correctly Type *type = base_type(core_array_type(main_type)); Type *ct = core_type(type); + switch (op.kind) { case Token_Sub: case Token_SubEq: @@ -1638,14 +1670,6 @@ bool check_binary_op(CheckerContext *c, Operand *o, Token op) { error(op, "Operator '%.*s' is only allowed with integers", LIT(op.string)); return false; } - if (is_type_simd_vector(o->type)) { - switch (op.kind) { - case Token_ModMod: - case Token_ModModEq: - error(op, "Operator '%.*s' is only allowed with integers", LIT(op.string)); - return false; - } - } break; case Token_AndNot: @@ -1654,14 +1678,6 @@ bool check_binary_op(CheckerContext *c, Operand *o, Token op) { error(op, "Operator '%.*s' is only allowed with integers and bit sets", LIT(op.string)); return false; } - if (is_type_simd_vector(o->type)) { - switch (op.kind) { - case Token_AndNot: - case Token_AndNotEq: - error(op, "Operator '%.*s' is only allowed with integers", LIT(op.string)); - return false; - } - } break; case Token_CmpAnd: @@ -2487,6 +2503,8 @@ void check_shift(CheckerContext *c, Operand *x, Operand *y, Ast *node, Type *typ gb_string_free(err_str); } + // TODO(bill): Should we support shifts for fixed arrays and #simd vectors? 
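As an aside on that TODO (not part of the diff itself; shift_flavours is an illustrative name): at the intrinsic level the distinction already exists. simd_shl/simd_shr follow Odin's shift semantics, where an oversized shift yields zero, while the _masked variants follow C's masking behaviour, as the checker above enforces with its unsigned-shift-operand rule.

package example

import "core:intrinsics"

shift_flavours :: proc() {
	x := #simd[4]u32{1, 1, 1, 1}
	s := #simd[4]u32{0, 8, 31, 40}

	a := intrinsics.simd_shl(x, s)        // lane 3 shifts by 40 >= 32, so it becomes 0
	b := intrinsics.simd_shl_masked(x, s) // lane 3 shifts by 40 & 31 == 8, C-style
	_, _ = a, b
}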
+ if (!is_type_integer(x->type)) { gbString err_str = expr_to_string(y->expr); error(node, "Shift operand '%s' must be an integer", err_str); @@ -2697,6 +2715,26 @@ bool check_is_castable_to(CheckerContext *c, Operand *operand, Type *y) { return true; } + if (is_type_simd_vector(src) && is_type_simd_vector(dst)) { + if (src->SimdVector.count != dst->SimdVector.count) { + return false; + } + Type *elem_src = base_array_type(src); + Type *elem_dst = base_array_type(dst); + Operand x = {}; + x.type = elem_src; + x.mode = Addressing_Value; + return check_is_castable_to(c, &x, elem_dst); + } + + if (is_type_simd_vector(dst)) { + Type *elem = base_array_type(dst); + if (check_is_castable_to(c, operand, elem)) { + return true; + } + } + + return false; } @@ -4116,7 +4154,11 @@ ExactValue get_constant_field(CheckerContext *c, Operand const *operand, Selecti Type *determine_swizzle_array_type(Type *original_type, Type *type_hint, isize new_count) { Type *array_type = base_type(type_deref(original_type)); - GB_ASSERT(array_type->kind == Type_Array); + GB_ASSERT(array_type->kind == Type_Array || array_type->kind == Type_SimdVector); + if (array_type->kind == Type_SimdVector) { + Type *elem_type = array_type->SimdVector.elem; + return alloc_type_simd_vector(new_count, elem_type); + } Type *elem_type = array_type->Array.elem; Type *swizzle_array_type = nullptr; @@ -7738,111 +7780,106 @@ ExprKind check_compound_literal(CheckerContext *c, Operand *o, Ast *node, Type * } if (cl->elems.count > 0 && cl->elems[0]->kind == Ast_FieldValue) { - if (is_type_simd_vector(t)) { - error(cl->elems[0], "'field = value' is not allowed for SIMD vector literals"); - } else { - RangeCache rc = range_cache_make(heap_allocator()); - defer (range_cache_destroy(&rc)); + RangeCache rc = range_cache_make(heap_allocator()); + defer (range_cache_destroy(&rc)); - for_array(i, cl->elems) { - Ast *elem = cl->elems[i]; - if (elem->kind != Ast_FieldValue) { - error(elem, "Mixture of 'field = value' and value elements in a literal is not allowed"); - continue; - } - ast_node(fv, FieldValue, elem); + for_array(i, cl->elems) { + Ast *elem = cl->elems[i]; + if (elem->kind != Ast_FieldValue) { + error(elem, "Mixture of 'field = value' and value elements in a literal is not allowed"); + continue; + } + ast_node(fv, FieldValue, elem); - if (is_ast_range(fv->field)) { - Token op = fv->field->BinaryExpr.op; + if (is_ast_range(fv->field)) { + Token op = fv->field->BinaryExpr.op; - Operand x = {}; - Operand y = {}; - bool ok = check_range(c, fv->field, &x, &y, nullptr); - if (!ok) { - continue; - } - if (x.mode != Addressing_Constant || !is_type_integer(core_type(x.type))) { - error(x.expr, "Expected a constant integer as an array field"); - continue; - } + Operand x = {}; + Operand y = {}; + bool ok = check_range(c, fv->field, &x, &y, nullptr); + if (!ok) { + continue; + } + if (x.mode != Addressing_Constant || !is_type_integer(core_type(x.type))) { + error(x.expr, "Expected a constant integer as an array field"); + continue; + } - if (y.mode != Addressing_Constant || !is_type_integer(core_type(y.type))) { - error(y.expr, "Expected a constant integer as an array field"); - continue; - } + if (y.mode != Addressing_Constant || !is_type_integer(core_type(y.type))) { + error(y.expr, "Expected a constant integer as an array field"); + continue; + } - i64 lo = exact_value_to_i64(x.value); - i64 hi = exact_value_to_i64(y.value); - i64 max_index = hi; - if (op.kind == Token_RangeHalf) { // ..< (exclusive) - hi -= 1; - } else { // .. 
(inclusive) - max_index += 1; - } + i64 lo = exact_value_to_i64(x.value); + i64 hi = exact_value_to_i64(y.value); + i64 max_index = hi; + if (op.kind == Token_RangeHalf) { // ..< (exclusive) + hi -= 1; + } else { // .. (inclusive) + max_index += 1; + } - bool new_range = range_cache_add_range(&rc, lo, hi); - if (!new_range) { - error(elem, "Overlapping field range index %lld %.*s %lld for %.*s", lo, LIT(op.string), hi, LIT(context_name)); - continue; - } + bool new_range = range_cache_add_range(&rc, lo, hi); + if (!new_range) { + error(elem, "Overlapping field range index %lld %.*s %lld for %.*s", lo, LIT(op.string), hi, LIT(context_name)); + continue; + } - if (max_type_count >= 0 && (lo < 0 || lo >= max_type_count)) { - error(elem, "Index %lld is out of bounds (0..<%lld) for %.*s", lo, max_type_count, LIT(context_name)); - continue; - } - if (max_type_count >= 0 && (hi < 0 || hi >= max_type_count)) { - error(elem, "Index %lld is out of bounds (0..<%lld) for %.*s", hi, max_type_count, LIT(context_name)); - continue; - } + if (max_type_count >= 0 && (lo < 0 || lo >= max_type_count)) { + error(elem, "Index %lld is out of bounds (0..<%lld) for %.*s", lo, max_type_count, LIT(context_name)); + continue; + } + if (max_type_count >= 0 && (hi < 0 || hi >= max_type_count)) { + error(elem, "Index %lld is out of bounds (0..<%lld) for %.*s", hi, max_type_count, LIT(context_name)); + continue; + } - if (max < hi) { - max = max_index; - } + if (max < hi) { + max = max_index; + } - Operand operand = {}; - check_expr_with_type_hint(c, &operand, fv->value, elem_type); - check_assignment(c, &operand, elem_type, context_name); + Operand operand = {}; + check_expr_with_type_hint(c, &operand, fv->value, elem_type); + check_assignment(c, &operand, elem_type, context_name); - is_constant = is_constant && operand.mode == Addressing_Constant; - } else { - Operand op_index = {}; - check_expr(c, &op_index, fv->field); + is_constant = is_constant && operand.mode == Addressing_Constant; + } else { + Operand op_index = {}; + check_expr(c, &op_index, fv->field); - if (op_index.mode != Addressing_Constant || !is_type_integer(core_type(op_index.type))) { - error(elem, "Expected a constant integer as an array field"); - continue; - } - // add_type_and_value(c->info, op_index.expr, op_index.mode, op_index.type, op_index.value); + if (op_index.mode != Addressing_Constant || !is_type_integer(core_type(op_index.type))) { + error(elem, "Expected a constant integer as an array field"); + continue; + } + // add_type_and_value(c->info, op_index.expr, op_index.mode, op_index.type, op_index.value); - i64 index = exact_value_to_i64(op_index.value); + i64 index = exact_value_to_i64(op_index.value); - if (max_type_count >= 0 && (index < 0 || index >= max_type_count)) { - error(elem, "Index %lld is out of bounds (0..<%lld) for %.*s", index, max_type_count, LIT(context_name)); - continue; - } + if (max_type_count >= 0 && (index < 0 || index >= max_type_count)) { + error(elem, "Index %lld is out of bounds (0..<%lld) for %.*s", index, max_type_count, LIT(context_name)); + continue; + } - bool new_index = range_cache_add_index(&rc, index); - if (!new_index) { - error(elem, "Duplicate field index %lld for %.*s", index, LIT(context_name)); - continue; - } + bool new_index = range_cache_add_index(&rc, index); + if (!new_index) { + error(elem, "Duplicate field index %lld for %.*s", index, LIT(context_name)); + continue; + } - if (max < index+1) { - max = index+1; - } + if (max < index+1) { + max = index+1; + } - Operand operand = {}; - 
check_expr_with_type_hint(c, &operand, fv->value, elem_type); - check_assignment(c, &operand, elem_type, context_name); + Operand operand = {}; + check_expr_with_type_hint(c, &operand, fv->value, elem_type); + check_assignment(c, &operand, elem_type, context_name); - is_constant = is_constant && operand.mode == Addressing_Constant; - } + is_constant = is_constant && operand.mode == Addressing_Constant; } - - cl->max_count = max; } + cl->max_count = max; } else { isize index = 0; for (; index < cl->elems.count; index++) { @@ -7887,7 +7924,7 @@ ExprKind check_compound_literal(CheckerContext *c, Operand *o, Ast *node, Type * if (t->kind == Type_SimdVector) { if (!is_constant) { - error(node, "Expected all constant elements for a simd vector"); + // error(node, "Expected all constant elements for a simd vector"); } } diff --git a/src/check_stmt.cpp b/src/check_stmt.cpp index f2c830c1b..f061b4961 100644 --- a/src/check_stmt.cpp +++ b/src/check_stmt.cpp @@ -1381,6 +1381,18 @@ bool all_operands_valid(Array<Operand> const &operands) { return true; } +bool check_stmt_internal_builtin_proc_id(Ast *expr, BuiltinProcId *id_) { + BuiltinProcId id = BuiltinProc_Invalid; + Entity *e = entity_of_node(expr); + if (e != nullptr && e->kind == Entity_Builtin) { + if (e->Builtin.id && e->Builtin.id != BuiltinProc_DIRECTIVE) { + id = cast(BuiltinProcId)e->Builtin.id; + } + } + if (id_) *id_ = id; + return id != BuiltinProc_Invalid; +} + void check_stmt_internal(CheckerContext *ctx, Ast *node, u32 flags) { u32 mod_flags = flags & (~Stmt_FallthroughAllowed); switch (node->kind) { @@ -1405,29 +1417,43 @@ void check_stmt_internal(CheckerContext *ctx, Ast *node, u32 flags) { if (kind == Expr_Stmt) { return; } - Ast *expr = strip_or_return_expr(operand.expr); + Ast *expr = strip_or_return_expr(operand.expr); if (expr->kind == Ast_CallExpr) { + BuiltinProcId builtin_id = BuiltinProc_Invalid; + bool do_require = false; + AstCallExpr *ce = &expr->CallExpr; - Type *t = type_of_expr(ce->proc); - if (is_type_proc(t)) { - if (t->Proc.require_results) { - gbString expr_str = expr_to_string(ce->proc); - error(node, "'%s' requires that its results must be handled", expr_str); - gb_string_free(expr_str); - } + Type *t = base_type(type_of_expr(ce->proc)); + if (t->kind == Type_Proc) { + do_require = t->Proc.require_results; + } else if (check_stmt_internal_builtin_proc_id(ce->proc, &builtin_id)) { + auto const &bp = builtin_procs[builtin_id]; + do_require = bp.kind == Expr_Expr && !bp.ignore_results; + } + if (do_require) { + gbString expr_str = expr_to_string(ce->proc); + error(node, "'%s' requires that its results must be handled", expr_str); + gb_string_free(expr_str); } return; } else if (expr->kind == Ast_SelectorCallExpr) { + BuiltinProcId builtin_id = BuiltinProc_Invalid; + bool do_require = false; + AstSelectorCallExpr *se = &expr->SelectorCallExpr; ast_node(ce, CallExpr, se->call); - Type *t = type_of_expr(ce->proc); - if (is_type_proc(t)) { - if (t->Proc.require_results) { - gbString expr_str = expr_to_string(ce->proc); - error(node, "'%s' requires that its results must be handled", expr_str); - gb_string_free(expr_str); - } + Type *t = base_type(type_of_expr(ce->proc)); + if (t->kind == Type_Proc) { + do_require = t->Proc.require_results; + } else if (check_stmt_internal_builtin_proc_id(ce->proc, &builtin_id)) { + auto const &bp = builtin_procs[builtin_id]; + do_require = bp.kind == Expr_Expr && !bp.ignore_results; + } + if (do_require) { + gbString expr_str = expr_to_string(ce->proc); + error(node, "'%s' requires 
that its results must be handled", expr_str); + gb_string_free(expr_str); } return; } diff --git a/src/check_type.cpp b/src/check_type.cpp index 51f472961..fc5b7aed7 100644 --- a/src/check_type.cpp +++ b/src/check_type.cpp @@ -1234,7 +1234,7 @@ bool check_type_specialization_to(CheckerContext *ctx, Type *specialization, Typ } -Type *determine_type_from_polymorphic(CheckerContext *ctx, Type *poly_type, Operand operand) { +Type *determine_type_from_polymorphic(CheckerContext *ctx, Type *poly_type, Operand const &operand) { bool modify_type = !ctx->no_polymorphic_errors; bool show_error = modify_type && !ctx->hide_polymorphic_errors; if (!is_operand_value(operand)) { @@ -2795,15 +2795,27 @@ bool check_type_internal(CheckerContext *ctx, Ast *e, Type **type, Type *named_t if (name == "soa") { *type = make_soa_struct_fixed(ctx, e, at->elem, elem, count, generic_type); } else if (name == "simd") { - if (!is_type_valid_vector_elem(elem)) { + if (!is_type_valid_vector_elem(elem) && !is_type_polymorphic(elem)) { gbString str = type_to_string(elem); - error(at->elem, "Invalid element type for 'intrinsics.simd_vector', expected an integer or float with no specific endianness, got '%s'", str); + error(at->elem, "Invalid element type for #simd, expected an integer, float, or boolean with no specific endianness, got '%s'", str); gb_string_free(str); *type = alloc_type_array(elem, count, generic_type); goto array_end; } - *type = alloc_type_simd_vector(count, elem); + if (generic_type != nullptr) { + // Ignore + } else if (count < 1 || !is_power_of_two(count)) { + error(at->count, "Invalid length for #simd, expected a power-of-two length, got '%lld'", cast(long long)count); + *type = alloc_type_array(elem, count, generic_type); + goto array_end; + } + + *type = alloc_type_simd_vector(count, elem, generic_type); + + if (count > SIMD_ELEMENT_COUNT_MAX) { + error(at->count, "#simd supports a maximum element count of %d, got %lld", SIMD_ELEMENT_COUNT_MAX, cast(long long)count); + } } else { error(at->tag, "Invalid tag applied to array, got #%.*s", LIT(name)); *type = alloc_type_array(elem, count, generic_type); diff --git a/src/checker.cpp b/src/checker.cpp index 8afc6eb14..874839ece 100644 --- a/src/checker.cpp +++ b/src/checker.cpp @@ -3207,6 +3207,22 @@ DECL_ATTRIBUTE_PROC(proc_decl_attribute) { } } return true; + } else if (name == "require_target_feature") { + ExactValue ev = check_decl_attribute_value(c, value); + if (ev.kind == ExactValue_String) { + ac->require_target_feature = ev.value_string; + } else { + error(elem, "Expected a string value for '%.*s'", LIT(name)); + } + return true; + } else if (name == "enable_target_feature") { + ExactValue ev = check_decl_attribute_value(c, value); + if (ev.kind == ExactValue_String) { + ac->enable_target_feature = ev.value_string; + } else { + error(elem, "Expected a string value for '%.*s'", LIT(name)); + } + return true; } return false; } diff --git a/src/checker.hpp b/src/checker.hpp index 1c9ffd8c7..f11a00532 100644 --- a/src/checker.hpp +++ b/src/checker.hpp @@ -60,6 +60,7 @@ struct BuiltinProc { ExprKind kind; BuiltinProcPkg pkg; bool diverging; + bool ignore_results; // ignores 'require_results' handling }; @@ -124,6 +125,9 @@ struct AttributeContext { String objc_name; bool objc_is_class_method; Type * objc_type; + + String require_target_feature; // required by the target micro-architecture + String enable_target_feature; // will be enabled for the procedure only }; AttributeContext make_attribute_context(String link_prefix) { diff --git
a/src/checker_builtin_procs.hpp b/src/checker_builtin_procs.hpp index d407ef7c1..05f256775 100644 --- a/src/checker_builtin_procs.hpp +++ b/src/checker_builtin_procs.hpp @@ -45,7 +45,6 @@ enum BuiltinProcId { // "Intrinsics" BuiltinProc_is_package_imported, - BuiltinProc_simd_vector, BuiltinProc_soa_struct, BuiltinProc_alloca, @@ -66,6 +65,7 @@ enum BuiltinProcId { BuiltinProc_overflow_mul, BuiltinProc_sqrt, + BuiltinProc_fused_mul_add, BuiltinProc_mem_copy, BuiltinProc_mem_copy_non_overlapping, @@ -80,6 +80,8 @@ enum BuiltinProcId { BuiltinProc_unaligned_store, BuiltinProc_unaligned_load, + BuiltinProc_non_temporal_store, + BuiltinProc_non_temporal_load, BuiltinProc_prefetch_read_instruction, BuiltinProc_prefetch_read_data, @@ -118,10 +120,76 @@ enum BuiltinProcId { BuiltinProc_fixed_point_div_sat, BuiltinProc_expect, + +BuiltinProc__simd_begin, + BuiltinProc_simd_add, + BuiltinProc_simd_sub, + BuiltinProc_simd_mul, + BuiltinProc_simd_div, + BuiltinProc_simd_rem, + BuiltinProc_simd_shl, // Odin logic + BuiltinProc_simd_shr, // Odin logic + BuiltinProc_simd_shl_masked, // C logic + BuiltinProc_simd_shr_masked, // C logic + + BuiltinProc_simd_add_sat, // saturation arithmetic + BuiltinProc_simd_sub_sat, // saturation arithmetic + + BuiltinProc_simd_and, + BuiltinProc_simd_or, + BuiltinProc_simd_xor, + BuiltinProc_simd_and_not, + + BuiltinProc_simd_neg, + BuiltinProc_simd_abs, + + BuiltinProc_simd_min, + BuiltinProc_simd_max, + BuiltinProc_simd_clamp, + + BuiltinProc_simd_lanes_eq, + BuiltinProc_simd_lanes_ne, + BuiltinProc_simd_lanes_lt, + BuiltinProc_simd_lanes_le, + BuiltinProc_simd_lanes_gt, + BuiltinProc_simd_lanes_ge, + + BuiltinProc_simd_extract, + BuiltinProc_simd_replace, + + BuiltinProc_simd_reduce_add_ordered, + BuiltinProc_simd_reduce_mul_ordered, + BuiltinProc_simd_reduce_min, + BuiltinProc_simd_reduce_max, + BuiltinProc_simd_reduce_and, + BuiltinProc_simd_reduce_or, + BuiltinProc_simd_reduce_xor, + + BuiltinProc_simd_shuffle, + BuiltinProc_simd_select, + + BuiltinProc_simd_ceil, + BuiltinProc_simd_floor, + BuiltinProc_simd_trunc, + BuiltinProc_simd_nearest, + + BuiltinProc_simd_to_bits, + + BuiltinProc_simd_lanes_reverse, + BuiltinProc_simd_lanes_rotate_left, + BuiltinProc_simd_lanes_rotate_right, + + + // Platform specific SIMD intrinsics + BuiltinProc_simd_x86__MM_SHUFFLE, +BuiltinProc__simd_end, // Platform specific intrinsics BuiltinProc_syscall, + BuiltinProc_x86_cpuid, + BuiltinProc_x86_xgetbv, + // Constant type tests BuiltinProc__type_begin, @@ -268,7 +336,6 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = { // "Intrinsics" {STR_LIT("is_package_imported"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, - {STR_LIT("simd_vector"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, // Type {STR_LIT("soa_struct"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, // Type {STR_LIT("alloca"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, @@ -290,6 +357,7 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = { {STR_LIT("overflow_mul"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, {STR_LIT("sqrt"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("fused_mul_add"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics}, {STR_LIT("mem_copy"), 3, false, Expr_Stmt, BuiltinProcPkg_intrinsics}, {STR_LIT("mem_copy_non_overlapping"), 3, false, Expr_Stmt, BuiltinProcPkg_intrinsics}, @@ -304,6 +372,8 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = { {STR_LIT("unaligned_store"), 2, false, Expr_Stmt, BuiltinProcPkg_intrinsics}, 
{STR_LIT("unaligned_load"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("non_temporal_store"), 2, false, Expr_Stmt, BuiltinProcPkg_intrinsics}, + {STR_LIT("non_temporal_load"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, {STR_LIT("prefetch_read_instruction"), 2, false, Expr_Stmt, BuiltinProcPkg_intrinsics}, {STR_LIT("prefetch_read_data"), 2, false, Expr_Stmt, BuiltinProcPkg_intrinsics}, @@ -315,26 +385,26 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = { {STR_LIT("atomic_signal_fence"), 1, false, Expr_Stmt, BuiltinProcPkg_intrinsics}, {STR_LIT("atomic_store"), 2, false, Expr_Stmt, BuiltinProcPkg_intrinsics}, {STR_LIT("atomic_store_explicit"), 3, false, Expr_Stmt, BuiltinProcPkg_intrinsics}, - {STR_LIT("atomic_load"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, - {STR_LIT("atomic_load_explicit"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, - {STR_LIT("atomic_add"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, - {STR_LIT("atomic_add_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics}, - {STR_LIT("atomic_sub"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, - {STR_LIT("atomic_sub_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics}, - {STR_LIT("atomic_and"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, - {STR_LIT("atomic_and_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics}, - {STR_LIT("atomic_nand"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, - {STR_LIT("atomic_nand_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics}, - {STR_LIT("atomic_or"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, - {STR_LIT("atomic_or_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics}, - {STR_LIT("atomic_xor"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, - {STR_LIT("atomic_xor_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics}, - {STR_LIT("atomic_exchange"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, - {STR_LIT("atomic_exchange_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics}, - {STR_LIT("atomic_compare_exchange_strong"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics}, - {STR_LIT("atomic_compare_exchange_strong_explicit"), 5, false, Expr_Expr, BuiltinProcPkg_intrinsics}, - {STR_LIT("atomic_compare_exchange_weak"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics}, - {STR_LIT("atomic_compare_exchange_weak_explicit"), 5, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("atomic_load"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, + {STR_LIT("atomic_load_explicit"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, + {STR_LIT("atomic_add"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, + {STR_LIT("atomic_add_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, + {STR_LIT("atomic_sub"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, + {STR_LIT("atomic_sub_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, + {STR_LIT("atomic_and"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, + {STR_LIT("atomic_and_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, + {STR_LIT("atomic_nand"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, + {STR_LIT("atomic_nand_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, + {STR_LIT("atomic_or"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, + {STR_LIT("atomic_or_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, + {STR_LIT("atomic_xor"), 2, 
false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, + {STR_LIT("atomic_xor_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, + {STR_LIT("atomic_exchange"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, + {STR_LIT("atomic_exchange_explicit"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, + {STR_LIT("atomic_compare_exchange_strong"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, + {STR_LIT("atomic_compare_exchange_strong_explicit"), 5, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, + {STR_LIT("atomic_compare_exchange_weak"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, + {STR_LIT("atomic_compare_exchange_weak_explicit"), 5, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, {STR_LIT("fixed_point_mul"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics}, {STR_LIT("fixed_point_div"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics}, @@ -342,8 +412,74 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = { {STR_LIT("fixed_point_div_sat"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics}, {STR_LIT("expect"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, - - {STR_LIT("syscall"), 1, true, Expr_Expr, BuiltinProcPkg_intrinsics}, + + {STR_LIT(""), 0, false, Expr_Stmt, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_add"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_sub"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_mul"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_div"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_rem"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_shl"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_shr"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_shl_masked"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_shr_masked"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + + {STR_LIT("simd_add_sat"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_sub_sat"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + + {STR_LIT("simd_and"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_or"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_xor"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_and_not"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + + {STR_LIT("simd_neg"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + + {STR_LIT("simd_abs"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + + {STR_LIT("simd_min"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_max"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_clamp"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + + {STR_LIT("simd_lanes_eq"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_lanes_ne"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_lanes_lt"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_lanes_le"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_lanes_gt"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_lanes_ge"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + + {STR_LIT("simd_extract"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_replace"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + + {STR_LIT("simd_reduce_add_ordered"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + 
{STR_LIT("simd_reduce_mul_ordered"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_reduce_min"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_reduce_max"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_reduce_and"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_reduce_or"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_reduce_xor"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + + {STR_LIT("simd_shuffle"), 2, true, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_select"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + + {STR_LIT("simd_ceil") , 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_floor"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_trunc"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_nearest"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + + {STR_LIT("simd_to_bits"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + + {STR_LIT("simd_lanes_reverse"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_lanes_rotate_left"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_lanes_rotate_right"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + + {STR_LIT("simd_x86__MM_SHUFFLE"), 4, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + + {STR_LIT(""), 0, false, Expr_Stmt, BuiltinProcPkg_intrinsics}, + + + {STR_LIT("syscall"), 1, true, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, + {STR_LIT("x86_cpuid"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("x86_xgetbv"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, {STR_LIT(""), 0, false, Expr_Stmt, BuiltinProcPkg_intrinsics}, @@ -429,12 +565,12 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = { {STR_LIT("__entry_point"), 0, false, Expr_Stmt, BuiltinProcPkg_intrinsics}, - {STR_LIT("objc_send"), 3, true, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("objc_send"), 3, true, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, {STR_LIT("objc_find_selector"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, {STR_LIT("objc_find_class"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, - {STR_LIT("objc_register_selector"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, - {STR_LIT("objc_register_class"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("objc_register_selector"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, + {STR_LIT("objc_register_class"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics, false, true}, {STR_LIT("constant_utf16_cstring"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, diff --git a/src/common.cpp b/src/common.cpp index 94248fb62..77caddfe8 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -47,6 +47,13 @@ void debugf(char const *fmt, ...); #include "range_cache.cpp" +bool is_power_of_two(i64 x) { + if (x <= 0) { + return false; + } + return !(x & (x-1)); +} + int isize_cmp(isize x, isize y) { if (x < y) { return -1; diff --git a/src/entity.cpp b/src/entity.cpp index 904a630fb..76e6912b9 100644 --- a/src/entity.cpp +++ b/src/entity.cpp @@ -233,10 +233,12 @@ struct Entity { String link_name; String link_prefix; DeferredProcedure deferred_procedure; - bool is_foreign; - bool is_export; - bool generated_from_polymorphic; ProcedureOptimizationMode optimization_mode; + bool is_foreign : 1; + bool is_export : 1; + bool generated_from_polymorphic : 1; + bool target_feature_disabled : 1; + String target_feature; } Procedure; struct { Array<Entity *> 
entities; diff --git a/src/llvm_backend.cpp b/src/llvm_backend.cpp index 7cf588853..cf7389ec1 100644 --- a/src/llvm_backend.cpp +++ b/src/llvm_backend.cpp @@ -1332,8 +1332,8 @@ void lb_generate_code(lbGenerator *gen) { } } - if (build_context.target_features.len != 0) { - llvm_features = alloc_cstring(permanent_allocator(), build_context.target_features); + if (build_context.target_features_set.entries.count != 0) { + llvm_features = target_features_set_to_cstring(permanent_allocator(), false); } // GB_ASSERT_MSG(LLVMTargetHasAsmBackend(target)); diff --git a/src/llvm_backend_const.cpp b/src/llvm_backend_const.cpp index 8f17a1cfb..bd76400de 100644 --- a/src/llvm_backend_const.cpp +++ b/src/llvm_backend_const.cpp @@ -495,9 +495,9 @@ lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bool allow_loc res.value = data; return res; } else if (is_type_array(type) && - value.kind != ExactValue_Invalid && - value.kind != ExactValue_String && - value.kind != ExactValue_Compound) { + value.kind != ExactValue_Invalid && + value.kind != ExactValue_String && + value.kind != ExactValue_Compound) { i64 count = type->Array.count; Type *elem = type->Array.elem; @@ -513,8 +513,8 @@ lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bool allow_loc res.value = llvm_const_array(lb_type(m, elem), elems, cast(unsigned)count); return res; } else if (is_type_matrix(type) && - value.kind != ExactValue_Invalid && - value.kind != ExactValue_Compound) { + value.kind != ExactValue_Invalid && + value.kind != ExactValue_Compound) { i64 row = type->Matrix.row_count; i64 column = type->Matrix.column_count; GB_ASSERT(row == column); @@ -537,6 +537,22 @@ lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bool allow_loc res.value = LLVMConstArray(lb_type(m, elem), elems, cast(unsigned)total_elem_count); return res; + } else if (is_type_simd_vector(type) && + value.kind != ExactValue_Invalid && + value.kind != ExactValue_Compound) { + i64 count = type->SimdVector.count; + Type *elem = type->SimdVector.elem; + + lbValue single_elem = lb_const_value(m, elem, value, allow_local); + single_elem.value = llvm_const_cast(single_elem.value, lb_type(m, elem)); + + LLVMValueRef *elems = gb_alloc_array(permanent_allocator(), LLVMValueRef, count); + for (i64 i = 0; i < count; i++) { + elems[i] = single_elem.value; + } + + res.value = LLVMConstVector(elems, cast(unsigned)count); + return res; } switch (value.kind) { @@ -819,26 +835,81 @@ lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bool allow_loc return lb_const_nil(m, original_type); } GB_ASSERT(elem_type_can_be_constant(elem_type)); - isize total_elem_count = cast(isize)type->SimdVector.count; LLVMValueRef *values = gb_alloc_array(temporary_allocator(), LLVMValueRef, total_elem_count); - for (isize i = 0; i < elem_count; i++) { - TypeAndValue tav = cl->elems[i]->tav; - GB_ASSERT(tav.mode != Addressing_Invalid); - values[i] = lb_const_value(m, elem_type, tav.value, allow_local).value; - } - LLVMTypeRef et = lb_type(m, elem_type); + if (cl->elems[0]->kind == Ast_FieldValue) { + // TODO(bill): This is O(N*M) and will be quite slow; it should probably be sorted before hand + isize value_index = 0; + for (i64 i = 0; i < total_elem_count; i++) { + bool found = false; - for (isize i = elem_count; i < type->SimdVector.count; i++) { - values[i] = LLVMConstNull(et); - } - for (isize i = 0; i < total_elem_count; i++) { - values[i] = llvm_const_cast(values[i], et); - } + for (isize j = 0; j < elem_count; j++) { + Ast *elem = 
cl->elems[j]; + ast_node(fv, FieldValue, elem); + if (is_ast_range(fv->field)) { + ast_node(ie, BinaryExpr, fv->field); + TypeAndValue lo_tav = ie->left->tav; + TypeAndValue hi_tav = ie->right->tav; + GB_ASSERT(lo_tav.mode == Addressing_Constant); + GB_ASSERT(hi_tav.mode == Addressing_Constant); - res.value = LLVMConstVector(values, cast(unsigned)total_elem_count); - return res; + TokenKind op = ie->op.kind; + i64 lo = exact_value_to_i64(lo_tav.value); + i64 hi = exact_value_to_i64(hi_tav.value); + if (op != Token_RangeHalf) { + hi += 1; + } + if (lo == i) { + TypeAndValue tav = fv->value->tav; + LLVMValueRef val = lb_const_value(m, elem_type, tav.value, allow_local).value; + for (i64 k = lo; k < hi; k++) { + values[value_index++] = val; + } + + found = true; + i += (hi-lo-1); + break; + } + } else { + TypeAndValue index_tav = fv->field->tav; + GB_ASSERT(index_tav.mode == Addressing_Constant); + i64 index = exact_value_to_i64(index_tav.value); + if (index == i) { + TypeAndValue tav = fv->value->tav; + LLVMValueRef val = lb_const_value(m, elem_type, tav.value, allow_local).value; + values[value_index++] = val; + found = true; + break; + } + } + } + + if (!found) { + values[value_index++] = LLVMConstNull(lb_type(m, elem_type)); + } + } + + res.value = LLVMConstVector(values, cast(unsigned)total_elem_count); + return res; + } else { + for (isize i = 0; i < elem_count; i++) { + TypeAndValue tav = cl->elems[i]->tav; + GB_ASSERT(tav.mode != Addressing_Invalid); + values[i] = lb_const_value(m, elem_type, tav.value, allow_local).value; + } + LLVMTypeRef et = lb_type(m, elem_type); + + for (isize i = elem_count; i < total_elem_count; i++) { + values[i] = LLVMConstNull(et); + } + for (isize i = 0; i < total_elem_count; i++) { + values[i] = llvm_const_cast(values[i], et); + } + + res.value = LLVMConstVector(values, cast(unsigned)total_elem_count); + return res; + } } else if (is_type_struct(type)) { ast_node(cl, CompoundLit, value.value_compound); diff --git a/src/llvm_backend_expr.cpp b/src/llvm_backend_expr.cpp index 133df4d41..1894e85f6 100644 --- a/src/llvm_backend_expr.cpp +++ b/src/llvm_backend_expr.cpp @@ -258,7 +258,13 @@ lbValue lb_emit_unary_arith(lbProcedure *p, TokenKind op, lbValue x, Type *type) LLVMBuildStore(p->builder, v2, LLVMBuildStructGEP(p->builder, addr.addr.value, 2, "")); LLVMBuildStore(p->builder, v3, LLVMBuildStructGEP(p->builder, addr.addr.value, 3, "")); return lb_addr_load(p, addr); - + } else if (is_type_simd_vector(x.type)) { + Type *elem = base_array_type(x.type); + if (is_type_float(elem)) { + res.value = LLVMBuildFNeg(p->builder, x.value, ""); + } else { + res.value = LLVMBuildNeg(p->builder, x.value, ""); + } } else { GB_PANIC("Unhandled type %s", type_to_string(x.type)); } @@ -1820,6 +1826,59 @@ lbValue lb_emit_conv(lbProcedure *p, lbValue value, Type *t) { return res; } + if (is_type_simd_vector(dst)) { + Type *et = base_array_type(dst); + if (is_type_simd_vector(src)) { + Type *src_elem = core_array_type(src); + Type *dst_elem = core_array_type(dst); + + GB_ASSERT(src->SimdVector.count == dst->SimdVector.count); + + lbValue res = {}; + res.type = t; + if (are_types_identical(src_elem, dst_elem)) { + res.value = value.value; + } else if (is_type_float(src_elem) && is_type_integer(dst_elem)) { + if (is_type_unsigned(dst_elem)) { + res.value = LLVMBuildFPToUI(p->builder, value.value, lb_type(m, t), ""); + } else { + res.value = LLVMBuildFPToSI(p->builder, value.value, lb_type(m, t), ""); + } + } else if (is_type_integer(src_elem) && is_type_float(dst_elem)) { + if 
(is_type_unsigned(src_elem)) { + res.value = LLVMBuildUIToFP(p->builder, value.value, lb_type(m, t), ""); + } else { + res.value = LLVMBuildSIToFP(p->builder, value.value, lb_type(m, t), ""); + } + } else if ((is_type_integer(src_elem) || is_type_boolean(src_elem)) && is_type_integer(dst_elem)) { + res.value = LLVMBuildIntCast2(p->builder, value.value, lb_type(m, t), !is_type_unsigned(src_elem), ""); + } else if (is_type_float(src_elem) && is_type_float(dst_elem)) { + res.value = LLVMBuildFPCast(p->builder, value.value, lb_type(m, t), ""); + } else if (is_type_integer(src_elem) && is_type_boolean(dst_elem)) { + LLVMValueRef i1vector = LLVMBuildICmp(p->builder, LLVMIntNE, value.value, LLVMConstNull(LLVMTypeOf(value.value)), ""); + res.value = LLVMBuildIntCast2(p->builder, i1vector, lb_type(m, t), !is_type_unsigned(src_elem), ""); + } else { + GB_PANIC("Unhandled simd vector conversion: %s -> %s", type_to_string(src), type_to_string(dst)); + } + return res; + } else { + i64 count = get_array_type_count(dst); + LLVMTypeRef vt = lb_type(m, t); + LLVMTypeRef llvm_u32 = lb_type(m, t_u32); + LLVMValueRef elem = lb_emit_conv(p, value, et).value; + LLVMValueRef vector = LLVMConstNull(vt); + for (i64 i = 0; i < count; i++) { + LLVMValueRef idx = LLVMConstInt(llvm_u32, i, false); + vector = LLVMBuildInsertElement(p->builder, vector, elem, idx, ""); + } + lbValue res = {}; + res.type = t; + res.value = vector; + return res; + } + } + + // Pointer <-> uintptr if (is_type_pointer(src) && is_type_uintptr(dst)) { lbValue res = {}; @@ -2506,6 +2565,57 @@ lbValue lb_emit_comp(lbProcedure *p, TokenKind op_kind, lbValue left, lbValue ri case Token_NotEq: pred = LLVMIntNE; break; } res.value = LLVMBuildICmp(p->builder, pred, left.value, right.value, ""); + } else if (is_type_simd_vector(a)) { + LLVMValueRef mask = nullptr; + Type *elem = base_array_type(a); + if (is_type_float(elem)) { + LLVMRealPredicate pred = {}; + switch (op_kind) { + case Token_CmpEq: pred = LLVMRealOEQ; break; + case Token_NotEq: pred = LLVMRealONE; break; + } + mask = LLVMBuildFCmp(p->builder, pred, left.value, right.value, ""); + } else { + LLVMIntPredicate pred = {}; + switch (op_kind) { + case Token_CmpEq: pred = LLVMIntEQ; break; + case Token_NotEq: pred = LLVMIntNE; break; + } + mask = LLVMBuildICmp(p->builder, pred, left.value, right.value, ""); + } + GB_ASSERT_MSG(mask != nullptr, "Unhandled comparison kind %s (%s) %.*s %s (%s)", type_to_string(left.type), type_to_string(base_type(left.type)), LIT(token_strings[op_kind]), type_to_string(right.type), type_to_string(base_type(right.type))); + + /* NOTE(bill, 2022-05-28): + Thanks to Per Vognsen, sign extending <N x i1> to + a vector of the same width as the input vector, bit casting to an integer, + and then comparing against zero is the better option + See: https://lists.llvm.org/pipermail/llvm-dev/2012-September/053046.html + + // Example assuming 128-bit vector + + %1 = <4 x float> ... + %2 = <4 x float> ... 
+ %3 = fcmp oeq <4 x float> %1, %2 + %4 = sext <4 x i1> %3 to <4 x i32> + %5 = bitcast <4 x i32> %4 to i128 + %6 = icmp ne i128 %5, 0 + br i1 %6, label %true1, label %false2 + + This will result in 1 cmpps + 1 ptest + 1 br + (even without SSE4.1, contrary to what the mail list states, because of pmovmskb) + + */ + + unsigned count = cast(unsigned)get_array_type_count(a); + unsigned elem_sz = cast(unsigned)(type_size_of(elem)*8); + LLVMTypeRef mask_type = LLVMVectorType(LLVMIntTypeInContext(p->module->ctx, elem_sz), count); + mask = LLVMBuildSExtOrBitCast(p->builder, mask, mask_type, ""); + + LLVMTypeRef mask_int_type = LLVMIntTypeInContext(p->module->ctx, cast(unsigned)(8*type_size_of(a))); + LLVMValueRef mask_int = LLVMBuildBitCast(p->builder, mask, mask_int_type, ""); + res.value = LLVMBuildICmp(p->builder, LLVMIntNE, mask_int, LLVMConstNull(LLVMTypeOf(mask_int)), ""); + return res; + } else { GB_PANIC("Unhandled comparison kind %s (%s) %.*s %s (%s)", type_to_string(left.type), type_to_string(base_type(left.type)), LIT(token_strings[op_kind]), type_to_string(right.type), type_to_string(base_type(right.type))); } @@ -4609,6 +4719,102 @@ lbAddr lb_build_addr(lbProcedure *p, Ast *expr) { break; } + case Type_SimdVector: { + if (cl->elems.count > 0) { + lbValue vector_value = lb_const_value(p->module, type, exact_value_compound(expr)); + defer (lb_addr_store(p, v, vector_value)); + + auto temp_data = array_make<lbCompoundLitElemTempData>(temporary_allocator(), 0, cl->elems.count); + + // NOTE(bill): Separate value, store into their own chunks + for_array(i, cl->elems) { + Ast *elem = cl->elems[i]; + if (elem->kind == Ast_FieldValue) { + ast_node(fv, FieldValue, elem); + if (lb_is_elem_const(fv->value, et)) { + continue; + } + if (is_ast_range(fv->field)) { + ast_node(ie, BinaryExpr, fv->field); + TypeAndValue lo_tav = ie->left->tav; + TypeAndValue hi_tav = ie->right->tav; + GB_ASSERT(lo_tav.mode == Addressing_Constant); + GB_ASSERT(hi_tav.mode == Addressing_Constant); + + TokenKind op = ie->op.kind; + i64 lo = exact_value_to_i64(lo_tav.value); + i64 hi = exact_value_to_i64(hi_tav.value); + if (op != Token_RangeHalf) { + hi += 1; + } + + lbValue value = lb_build_expr(p, fv->value); + + for (i64 k = lo; k < hi; k++) { + lbCompoundLitElemTempData data = {}; + data.value = value; + data.elem_index = cast(i32)k; + array_add(&temp_data, data); + } + + } else { + auto tav = fv->field->tav; + GB_ASSERT(tav.mode == Addressing_Constant); + i64 index = exact_value_to_i64(tav.value); + + lbValue value = lb_build_expr(p, fv->value); + lbCompoundLitElemTempData data = {}; + data.value = lb_emit_conv(p, value, et); + data.expr = fv->value; + data.elem_index = cast(i32)index; + array_add(&temp_data, data); + } + + } else { + if (lb_is_elem_const(elem, et)) { + continue; + } + lbCompoundLitElemTempData data = {}; + data.expr = elem; + data.elem_index = cast(i32)i; + array_add(&temp_data, data); + } + } + + + for_array(i, temp_data) { + lbValue field_expr = temp_data[i].value; + Ast *expr = temp_data[i].expr; + + auto prev_hint = lb_set_copy_elision_hint(p, lb_addr(temp_data[i].gep), expr); + + if (field_expr.value == nullptr) { + field_expr = lb_build_expr(p, expr); + } + Type *t = field_expr.type; + GB_ASSERT(t->kind != Type_Tuple); + lbValue ev = lb_emit_conv(p, field_expr, et); + + if (!p->copy_elision_hint.used) { + temp_data[i].value = ev; + } + + lb_reset_copy_elision_hint(p, prev_hint); + } + + + // TODO(bill): reduce the need for individual `insertelement` if a `shufflevector` + // might be a better 
option + + for_array(i, temp_data) { + if (temp_data[i].value.value != nullptr) { + LLVMValueRef index = lb_const_int(p->module, t_u32, temp_data[i].elem_index).value; + vector_value.value = LLVMBuildInsertElement(p->builder, vector_value.value, temp_data[i].value.value, index, ""); + } + } + } + break; + } } return v; diff --git a/src/llvm_backend_proc.cpp b/src/llvm_backend_proc.cpp index 154be2f1f..75ca77641 100644 --- a/src/llvm_backend_proc.cpp +++ b/src/llvm_backend_proc.cpp @@ -169,6 +169,19 @@ lbProcedure *lb_create_procedure(lbModule *m, Entity *entity, bool ignore_body) } } + if (!entity->Procedure.target_feature_disabled && + entity->Procedure.target_feature.len != 0) { + auto features = split_by_comma(entity->Procedure.target_feature); + for_array(i, features) { + String feature = features[i]; + LLVMAttributeRef ref = LLVMCreateStringAttribute( + m->ctx, + cast(char const *)feature.text, cast(unsigned)feature.len, + "", 0); + LLVMAddAttributeAtIndex(p->value, LLVMAttributeIndex_FunctionIndex, ref); + } + } + if (entity->flags & EntityFlag_Cold) { lb_add_attribute_to_proc(m, p->value, "cold"); } @@ -981,10 +994,466 @@ lbValue lb_emit_call(lbProcedure *p, lbValue value, Array<lbValue> const &args, return result; } +LLVMValueRef llvm_splat_float(i64 count, LLVMTypeRef type, f64 value) { + LLVMValueRef v = LLVMConstReal(type, value); + LLVMValueRef *values = gb_alloc_array(temporary_allocator(), LLVMValueRef, count); + for (i64 i = 0; i < count; i++) { + values[i] = v; + } + return LLVMConstVector(values, cast(unsigned)count); +} +LLVMValueRef llvm_splat_int(i64 count, LLVMTypeRef type, i64 value, bool is_signed=false) { + LLVMValueRef v = LLVMConstInt(type, value, is_signed); + LLVMValueRef *values = gb_alloc_array(temporary_allocator(), LLVMValueRef, count); + for (i64 i = 0; i < count; i++) { + values[i] = v; + } + return LLVMConstVector(values, cast(unsigned)count); +} + + +lbValue lb_build_builtin_simd_proc(lbProcedure *p, Ast *expr, TypeAndValue const &tv, BuiltinProcId builtin_id) { + ast_node(ce, CallExpr, expr); + + lbModule *m = p->module; + + lbValue res = {}; + res.type = tv.type; + + lbValue arg0 = {}; if (ce->args.count > 0) arg0 = lb_build_expr(p, ce->args[0]); + lbValue arg1 = {}; if (ce->args.count > 1) arg1 = lb_build_expr(p, ce->args[1]); + lbValue arg2 = {}; if (ce->args.count > 2) arg2 = lb_build_expr(p, ce->args[2]); + + Type *elem = base_array_type(arg0.type); + + bool is_float = is_type_float(elem); + bool is_signed = !is_type_unsigned(elem); + + LLVMOpcode op_code = cast(LLVMOpcode)0; + + switch (builtin_id) { + case BuiltinProc_simd_add: + case BuiltinProc_simd_sub: + case BuiltinProc_simd_mul: + case BuiltinProc_simd_div: + case BuiltinProc_simd_rem: + if (is_float) { + switch (builtin_id) { + case BuiltinProc_simd_add: op_code = LLVMFAdd; break; + case BuiltinProc_simd_sub: op_code = LLVMFSub; break; + case BuiltinProc_simd_mul: op_code = LLVMFMul; break; + case BuiltinProc_simd_div: op_code = LLVMFDiv; break; + } + } else { + switch (builtin_id) { + case BuiltinProc_simd_add: op_code = LLVMAdd; break; + case BuiltinProc_simd_sub: op_code = LLVMSub; break; + case BuiltinProc_simd_mul: op_code = LLVMMul; break; + case BuiltinProc_simd_div: + if (is_signed) { + op_code = LLVMSDiv; + } else { + op_code = LLVMUDiv; + } + break; + case BuiltinProc_simd_rem: + if (is_signed) { + op_code = LLVMSRem; + } else { + op_code = LLVMURem; + } + break; + } + } + if (op_code) { + res.value = LLVMBuildBinOp(p->builder, op_code, arg0.value, arg1.value, ""); + return res; 
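The arm above shows how the element type of the vector, not the intrinsic name, selects the LLVM opcode: float lanes lower to fadd/fsub/fmul/fdiv, while integer division and remainder pick the signed or unsigned variant. A hedged sketch of what this looks like at the call site (assuming the simd_div/simd_rem signatures from core/intrinsics in this change):

```odin
package simd_binop_example

import "core:intrinsics"

divide :: proc(a, b: #simd[4]f32) -> #simd[4]f32 {
	return intrinsics.simd_div(a, b) // float lanes lower to one <4 x float> fdiv
}

remainder :: proc(a, b: #simd[4]u32) -> #simd[4]u32 {
	return intrinsics.simd_rem(a, b) // unsigned lanes lower to urem
}
```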
+ } + break; + case BuiltinProc_simd_shl: // Odin logic + case BuiltinProc_simd_shr: // Odin logic + case BuiltinProc_simd_shl_masked: // C logic + case BuiltinProc_simd_shr_masked: // C logic + { + i64 sz = type_size_of(elem); + GB_ASSERT(arg0.type->kind == Type_SimdVector); + + i64 count = arg0.type->SimdVector.count; + Type *elem1 = base_array_type(arg1.type); + + bool is_masked = false; + switch (builtin_id) { + case BuiltinProc_simd_shl: op_code = LLVMShl; is_masked = false; break; + case BuiltinProc_simd_shr: op_code = is_signed ? LLVMAShr : LLVMLShr; is_masked = false; break; + case BuiltinProc_simd_shl_masked: op_code = LLVMShl; is_masked = true; break; + case BuiltinProc_simd_shr_masked: op_code = is_signed ? LLVMAShr : LLVMLShr; is_masked = true; break; + } + if (op_code) { + LLVMValueRef bits = llvm_splat_int(count, lb_type(m, elem1), sz*8 - 1); + if (is_masked) { + // C logic + LLVMValueRef shift = LLVMBuildAnd(p->builder, arg1.value, bits, ""); + res.value = LLVMBuildBinOp(p->builder, op_code, arg0.value, shift, ""); + } else { + // Odin logic + LLVMValueRef zero = lb_const_nil(m, arg1.type).value; + LLVMValueRef mask = LLVMBuildICmp(p->builder, LLVMIntULE, arg1.value, bits, ""); + LLVMValueRef shift = LLVMBuildBinOp(p->builder, op_code, arg0.value, arg1.value, ""); + res.value = LLVMBuildSelect(p->builder, mask, shift, zero, ""); + } + return res; + } + } + break; + case BuiltinProc_simd_and: + case BuiltinProc_simd_or: + case BuiltinProc_simd_xor: + case BuiltinProc_simd_and_not: + switch (builtin_id) { + case BuiltinProc_simd_and: op_code = LLVMAnd; break; + case BuiltinProc_simd_or: op_code = LLVMOr; break; + case BuiltinProc_simd_xor: op_code = LLVMXor; break; + case BuiltinProc_simd_and_not: + op_code = LLVMAnd; + arg1.value = LLVMBuildNot(p->builder, arg1.value, ""); + break; + } + if (op_code) { + res.value = LLVMBuildBinOp(p->builder, op_code, arg0.value, arg1.value, ""); + return res; + } + break; + case BuiltinProc_simd_neg: + if (is_float) { + res.value = LLVMBuildFNeg(p->builder, arg0.value, ""); + } else { + res.value = LLVMBuildNeg(p->builder, arg0.value, ""); + } + return res; + case BuiltinProc_simd_abs: + if (is_float) { + LLVMValueRef pos = arg0.value; + LLVMValueRef neg = LLVMBuildFNeg(p->builder, pos, ""); + LLVMValueRef cond = LLVMBuildFCmp(p->builder, LLVMRealOGT, pos, neg, ""); + res.value = LLVMBuildSelect(p->builder, cond, pos, neg, ""); + } else { + LLVMValueRef pos = arg0.value; + LLVMValueRef neg = LLVMBuildNeg(p->builder, pos, ""); + LLVMValueRef cond = LLVMBuildICmp(p->builder, is_signed ? LLVMIntSGT : LLVMIntUGT, pos, neg, ""); + res.value = LLVMBuildSelect(p->builder, cond, pos, neg, ""); + } + return res; + case BuiltinProc_simd_min: + if (is_float) { + LLVMValueRef cond = LLVMBuildFCmp(p->builder, LLVMRealOLT, arg0.value, arg1.value, ""); + res.value = LLVMBuildSelect(p->builder, cond, arg0.value, arg1.value, ""); + } else { + LLVMValueRef cond = LLVMBuildICmp(p->builder, is_signed ? LLVMIntSLT : LLVMIntULT, arg0.value, arg1.value, ""); + res.value = LLVMBuildSelect(p->builder, cond, arg0.value, arg1.value, ""); + } + return res; + case BuiltinProc_simd_max: + if (is_float) { + LLVMValueRef cond = LLVMBuildFCmp(p->builder, LLVMRealOGT, arg0.value, arg1.value, ""); + res.value = LLVMBuildSelect(p->builder, cond, arg0.value, arg1.value, ""); + } else { + LLVMValueRef cond = LLVMBuildICmp(p->builder, is_signed ? 
LLVMIntSGT : LLVMIntUGT, arg0.value, arg1.value, ""); + res.value = LLVMBuildSelect(p->builder, cond, arg0.value, arg1.value, ""); + } + return res; + case BuiltinProc_simd_lanes_eq: + case BuiltinProc_simd_lanes_ne: + case BuiltinProc_simd_lanes_lt: + case BuiltinProc_simd_lanes_le: + case BuiltinProc_simd_lanes_gt: + case BuiltinProc_simd_lanes_ge: + if (is_float) { + LLVMRealPredicate pred = cast(LLVMRealPredicate)0; + switch (builtin_id) { + case BuiltinProc_simd_lanes_eq: pred = LLVMRealOEQ; break; + case BuiltinProc_simd_lanes_ne: pred = LLVMRealONE; break; + case BuiltinProc_simd_lanes_lt: pred = LLVMRealOLT; break; + case BuiltinProc_simd_lanes_le: pred = LLVMRealOLE; break; + case BuiltinProc_simd_lanes_gt: pred = LLVMRealOGT; break; + case BuiltinProc_simd_lanes_ge: pred = LLVMRealOGE; break; + } + if (pred) { + res.value = LLVMBuildFCmp(p->builder, pred, arg0.value, arg1.value, ""); + res.value = LLVMBuildSExtOrBitCast(p->builder, res.value, lb_type(m, tv.type), ""); + return res; + } + } else { + LLVMIntPredicate pred = cast(LLVMIntPredicate)0; + switch (builtin_id) { + case BuiltinProc_simd_lanes_eq: pred = LLVMIntEQ; break; + case BuiltinProc_simd_lanes_ne: pred = LLVMIntNE; break; + case BuiltinProc_simd_lanes_lt: pred = is_signed ? LLVMIntSLT :LLVMIntULT; break; + case BuiltinProc_simd_lanes_le: pred = is_signed ? LLVMIntSLE :LLVMIntULE; break; + case BuiltinProc_simd_lanes_gt: pred = is_signed ? LLVMIntSGT :LLVMIntUGT; break; + case BuiltinProc_simd_lanes_ge: pred = is_signed ? LLVMIntSGE :LLVMIntUGE; break; + } + if (pred) { + res.value = LLVMBuildICmp(p->builder, pred, arg0.value, arg1.value, ""); + res.value = LLVMBuildSExtOrBitCast(p->builder, res.value, lb_type(m, tv.type), ""); + return res; + } + } + break; + + case BuiltinProc_simd_extract: + res.value = LLVMBuildExtractElement(p->builder, arg0.value, arg1.value, ""); + return res; + case BuiltinProc_simd_replace: + res.value = LLVMBuildInsertElement(p->builder, arg0.value, arg2.value, arg1.value, ""); + return res; + + case BuiltinProc_simd_reduce_add_ordered: + case BuiltinProc_simd_reduce_mul_ordered: + { + LLVMTypeRef llvm_elem = lb_type(m, elem); + LLVMValueRef args[2] = {}; + isize args_count = 0; + + char const *name = nullptr; + switch (builtin_id) { + case BuiltinProc_simd_reduce_add_ordered: + if (is_float) { + name = "llvm.vector.reduce.fadd"; + args[args_count++] = LLVMConstReal(llvm_elem, 0.0); + } else { + name = "llvm.vector.reduce.add"; + } + break; + case BuiltinProc_simd_reduce_mul_ordered: + if (is_float) { + name = "llvm.vector.reduce.fmul"; + args[args_count++] = LLVMConstReal(llvm_elem, 1.0); + } else { + name = "llvm.vector.reduce.mul"; + } + break; + } + args[args_count++] = arg0.value; + + + LLVMTypeRef types[1] = {lb_type(p->module, arg0.type)}; + unsigned id = LLVMLookupIntrinsicID(name, gb_strlen(name)); + GB_ASSERT_MSG(id != 0, "Unable to find %s.%s", name, LLVMPrintTypeToString(types[0])); + LLVMValueRef ip = LLVMGetIntrinsicDeclaration(p->module->mod, id, types, gb_count_of(types)); + + res.value = LLVMBuildCall(p->builder, ip, args, cast(unsigned)args_count, ""); + return res; + } + case BuiltinProc_simd_reduce_min: + case BuiltinProc_simd_reduce_max: + case BuiltinProc_simd_reduce_and: + case BuiltinProc_simd_reduce_or: + case BuiltinProc_simd_reduce_xor: + { + char const *name = nullptr; + switch (builtin_id) { + case BuiltinProc_simd_reduce_min: + if (is_float) { + name = "llvm.vector.reduce.fmin"; + } else if (is_signed) { + name = "llvm.vector.reduce.smin"; + } else { + name = 
"llvm.vector.reduce.umin"; + } + break; + case BuiltinProc_simd_reduce_max: + if (is_float) { + name = "llvm.vector.reduce.fmax"; + } else if (is_signed) { + name = "llvm.vector.reduce.smax"; + } else { + name = "llvm.vector.reduce.umax"; + } + break; + case BuiltinProc_simd_reduce_and: name = "llvm.vector.reduce.and"; break; + case BuiltinProc_simd_reduce_or: name = "llvm.vector.reduce.or"; break; + case BuiltinProc_simd_reduce_xor: name = "llvm.vector.reduce.xor"; break; + } + LLVMTypeRef types[1] = {lb_type(p->module, arg0.type)}; + unsigned id = LLVMLookupIntrinsicID(name, gb_strlen(name)); + GB_ASSERT_MSG(id != 0, "Unable to find %s.%s", name, LLVMPrintTypeToString(types[0])); + LLVMValueRef ip = LLVMGetIntrinsicDeclaration(p->module->mod, id, types, gb_count_of(types)); + + LLVMValueRef args[1] = {}; + args[0] = arg0.value; + + res.value = LLVMBuildCall(p->builder, ip, args, gb_count_of(args), ""); + return res; + } + + case BuiltinProc_simd_shuffle: + { + Type *vt = arg0.type; + GB_ASSERT(vt->kind == Type_SimdVector); + + i64 indices_count = ce->args.count-2; + i64 max_count = vt->SimdVector.count*2; + GB_ASSERT(indices_count <= max_count); + + LLVMValueRef *values = gb_alloc_array(temporary_allocator(), LLVMValueRef, indices_count); + for (isize i = 0; i < indices_count; i++) { + lbValue idx = lb_build_expr(p, ce->args[i+2]); + GB_ASSERT(LLVMIsConstant(idx.value)); + values[i] = idx.value; + } + LLVMValueRef indices = LLVMConstVector(values, cast(unsigned)indices_count); + + res.value = LLVMBuildShuffleVector(p->builder, arg0.value, arg1.value, indices, ""); + return res; + } + + case BuiltinProc_simd_select: + { + LLVMValueRef cond = arg0.value; + LLVMValueRef x = lb_build_expr(p, ce->args[1]).value; + LLVMValueRef y = lb_build_expr(p, ce->args[2]).value; + + cond = LLVMBuildICmp(p->builder, LLVMIntNE, cond, LLVMConstNull(LLVMTypeOf(cond)), ""); + res.value = LLVMBuildSelect(p->builder, cond, x, y, ""); + return res; + } + + case BuiltinProc_simd_ceil: + case BuiltinProc_simd_floor: + case BuiltinProc_simd_trunc: + case BuiltinProc_simd_nearest: + { + char const *name = nullptr; + switch (builtin_id) { + case BuiltinProc_simd_ceil: name = "llvm.ceil"; break; + case BuiltinProc_simd_floor: name = "llvm.floor"; break; + case BuiltinProc_simd_trunc: name = "llvm.trunc"; break; + case BuiltinProc_simd_nearest: name = "llvm.nearbyint"; break; + } + + LLVMTypeRef types[1] = {lb_type(p->module, arg0.type)}; + unsigned id = LLVMLookupIntrinsicID(name, gb_strlen(name)); + GB_ASSERT_MSG(id != 0, "Unable to find %s.%s", name, LLVMPrintTypeToString(types[0])); + LLVMValueRef ip = LLVMGetIntrinsicDeclaration(p->module->mod, id, types, gb_count_of(types)); + + LLVMValueRef args[1] = {}; + args[0] = arg0.value; + + res.value = LLVMBuildCall(p->builder, ip, args, gb_count_of(args), ""); + return res; + } + + case BuiltinProc_simd_lanes_reverse: + { + i64 count = get_array_type_count(arg0.type); + LLVMValueRef *values = gb_alloc_array(temporary_allocator(), LLVMValueRef, count); + LLVMTypeRef llvm_u32 = lb_type(m, t_u32); + for (i64 i = 0; i < count; i++) { + values[i] = LLVMConstInt(llvm_u32, count-1-i, false); + } + LLVMValueRef mask = LLVMConstVector(values, cast(unsigned)count); + + LLVMValueRef v = arg0.value; + res.value = LLVMBuildShuffleVector(p->builder, v, v, mask, ""); + return res; + } + + case BuiltinProc_simd_lanes_rotate_left: + case BuiltinProc_simd_lanes_rotate_right: + { + + i64 count = get_array_type_count(arg0.type); + GB_ASSERT(is_power_of_two(count)); + BigInt bi_count = {}; 
+ big_int_from_i64(&bi_count, count); + + TypeAndValue const &tv = ce->args[1]->tav; + ExactValue val = exact_value_to_integer(tv.value); + GB_ASSERT(val.kind == ExactValue_Integer); + BigInt *bi = &val.value_integer; + if (builtin_id == BuiltinProc_simd_lanes_rotate_right) { + big_int_neg(bi, bi); + } + big_int_rem(bi, bi, &bi_count); + big_int_dealloc(&bi_count); + + i64 left = big_int_to_i64(bi); + + LLVMValueRef *values = gb_alloc_array(temporary_allocator(), LLVMValueRef, count); + LLVMTypeRef llvm_u32 = lb_type(m, t_u32); + for (i64 i = 0; i < count; i++) { + u64 idx = cast(u64)(i+left) & cast(u64)(count-1); + values[i] = LLVMConstInt(llvm_u32, idx, false); + } + LLVMValueRef mask = LLVMConstVector(values, cast(unsigned)count); + + LLVMValueRef v = arg0.value; + res.value = LLVMBuildShuffleVector(p->builder, v, v, mask, ""); + return res; + } + + + case BuiltinProc_simd_add_sat: + case BuiltinProc_simd_sub_sat: + { + char const *name = nullptr; + switch (builtin_id) { + case BuiltinProc_simd_add_sat: name = is_signed ? "llvm.sadd.sat" : "llvm.uadd.sat"; break; + case BuiltinProc_simd_sub_sat: name = is_signed ? "llvm.ssub.sat" : "llvm.usub.sat"; break; + } + + LLVMTypeRef types[1] = {lb_type(p->module, arg0.type)}; + unsigned id = LLVMLookupIntrinsicID(name, gb_strlen(name)); + GB_ASSERT_MSG(id != 0, "Unable to find %s.%s", name, LLVMPrintTypeToString(types[0])); + LLVMValueRef ip = LLVMGetIntrinsicDeclaration(p->module->mod, id, types, gb_count_of(types)); + + LLVMValueRef args[2] = {}; + args[0] = arg0.value; + args[1] = arg1.value; + + res.value = LLVMBuildCall(p->builder, ip, args, gb_count_of(args), ""); + return res; + } + + case BuiltinProc_simd_clamp: + { + LLVMValueRef v = arg0.value; + LLVMValueRef min = arg1.value; + LLVMValueRef max = arg2.value; + + if (is_float) { + v = LLVMBuildSelect(p->builder, LLVMBuildFCmp(p->builder, LLVMRealOLT, v, min, ""), min, v, ""); + res.value = LLVMBuildSelect(p->builder, LLVMBuildFCmp(p->builder, LLVMRealOGT, v, max, ""), max, v, ""); + } else if (is_signed) { + v = LLVMBuildSelect(p->builder, LLVMBuildICmp(p->builder, LLVMIntSLT, v, min, ""), min, v, ""); + res.value = LLVMBuildSelect(p->builder, LLVMBuildICmp(p->builder, LLVMIntSGT, v, max, ""), max, v, ""); + } else { + v = LLVMBuildSelect(p->builder, LLVMBuildICmp(p->builder, LLVMIntULT, v, min, ""), min, v, ""); + res.value = LLVMBuildSelect(p->builder, LLVMBuildICmp(p->builder, LLVMIntUGT, v, max, ""), max, v, ""); + } + return res; + } + + case BuiltinProc_simd_to_bits: + { + res.value = LLVMBuildBitCast(p->builder, arg0.value, lb_type(m, tv.type), ""); + return res; + } + + } + GB_PANIC("Unhandled simd intrinsic: '%.*s'", LIT(builtin_procs[builtin_id].name)); + + return {}; +} + lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValue const &tv, BuiltinProcId id) { ast_node(ce, CallExpr, expr); + if (BuiltinProc__simd_begin < id && id < BuiltinProc__simd_end) { + return lb_build_builtin_simd_proc(p, expr, tv, id); + } + switch (id) { case BuiltinProc_DIRECTIVE: { ast_node(bd, BasicDirective, ce->proc); @@ -1532,6 +2001,31 @@ lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValue const &tv, return res; } + case BuiltinProc_fused_mul_add: + { + Type *type = tv.type; + lbValue x = lb_emit_conv(p, lb_build_expr(p, ce->args[0]), type); + lbValue y = lb_emit_conv(p, lb_build_expr(p, ce->args[1]), type); + lbValue z = lb_emit_conv(p, lb_build_expr(p, ce->args[2]), type); + + + char const *name = "llvm.fma"; + LLVMTypeRef types[1] = {lb_type(p->module, type)}; 
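The fused_mul_add lowering resolves the overloaded llvm.fma declaration for whatever type it is instantiated with, so one intrinsic serves scalar floats and #simd float vectors alike. A usage sketch (illustrative, assuming the three-argument signature declared in core/intrinsics):

```odin
package fma_example

import "core:intrinsics"

mul_add :: proc(a, b, c: f32) -> f32 {
	return intrinsics.fused_mul_add(a, b, c) // a*b + c with a single rounding
}

mul_add_lanes :: proc(a, b, c: #simd[8]f32) -> #simd[8]f32 {
	return intrinsics.fused_mul_add(a, b, c) // same intrinsic, vector llvm.fma
}
```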
@@ -1532,6 +2001,31 @@ lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValue const &tv,
 		return res;
 	}
 
+	case BuiltinProc_fused_mul_add:
+		{
+			Type *type = tv.type;
+			lbValue x = lb_emit_conv(p, lb_build_expr(p, ce->args[0]), type);
+			lbValue y = lb_emit_conv(p, lb_build_expr(p, ce->args[1]), type);
+			lbValue z = lb_emit_conv(p, lb_build_expr(p, ce->args[2]), type);
+
+
+			char const *name = "llvm.fma";
+			LLVMTypeRef types[1] = {lb_type(p->module, type)};
+			unsigned id = LLVMLookupIntrinsicID(name, gb_strlen(name));
+			GB_ASSERT_MSG(id != 0, "Unable to find %s.%s", name, LLVMPrintTypeToString(types[0]));
+			LLVMValueRef ip = LLVMGetIntrinsicDeclaration(p->module->mod, id, types, gb_count_of(types));
+
+			LLVMValueRef args[3] = {};
+			args[0] = x.value;
+			args[1] = y.value;
+			args[2] = z.value;
+
+			lbValue res = {};
+			res.value = LLVMBuildCall(p->builder, ip, args, gb_count_of(args), "");
+			res.type = type;
+			return res;
+		}
+
 	case BuiltinProc_mem_copy:
 		{
 			lbValue dst = lb_build_expr(p, ce->args[0]);
@@ -1614,6 +2108,7 @@ lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValue const &tv,
 		return {};
 
 	case BuiltinProc_volatile_store:
+	case BuiltinProc_non_temporal_store:
 	case BuiltinProc_atomic_store:
 	case BuiltinProc_atomic_store_explicit: {
 		lbValue dst = lb_build_expr(p, ce->args[0]);
@@ -1622,6 +2117,13 @@ lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValue const &tv,
 		LLVMValueRef instr = LLVMBuildStore(p->builder, val.value, dst.value);
 		switch (id) {
+		case BuiltinProc_non_temporal_store:
+			{
+				unsigned kind_id = LLVMGetMDKindIDInContext(p->module->ctx, "nontemporal", 11);
+				LLVMMetadataRef node = LLVMValueAsMetadata(LLVMConstInt(lb_type(p->module, t_u32), 1, false));
+				LLVMSetMetadata(instr, kind_id, LLVMMetadataAsValue(p->module->ctx, node));
+			}
+			break;
 		case BuiltinProc_volatile_store:        LLVMSetVolatile(instr, true); break;
 		case BuiltinProc_atomic_store:          LLVMSetOrdering(instr, LLVMAtomicOrderingSequentiallyConsistent); break;
 		case BuiltinProc_atomic_store_explicit: LLVMSetOrdering(instr, llvm_atomic_ordering_from_odin(ce->args[2])); break;
@@ -1633,12 +2135,21 @@ lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValue const &tv,
 	}
 
 	case BuiltinProc_volatile_load:
+	case BuiltinProc_non_temporal_load:
 	case BuiltinProc_atomic_load:
 	case BuiltinProc_atomic_load_explicit: {
 		lbValue dst = lb_build_expr(p, ce->args[0]);
 		LLVMValueRef instr = LLVMBuildLoad(p->builder, dst.value, "");
 		switch (id) {
+		case BuiltinProc_non_temporal_load:
+			{
+				unsigned kind_id = LLVMGetMDKindIDInContext(p->module->ctx, "nontemporal", 11);
+				LLVMMetadataRef node = LLVMValueAsMetadata(LLVMConstInt(lb_type(p->module, t_u32), 1, false));
+				LLVMSetMetadata(instr, kind_id, LLVMMetadataAsValue(p->module->ctx, node));
+			}
+			break;
 		case BuiltinProc_volatile_load:        LLVMSetVolatile(instr, true); break;
 		case BuiltinProc_atomic_load:          LLVMSetOrdering(instr, LLVMAtomicOrderingSequentiallyConsistent); break;
 		case BuiltinProc_atomic_load_explicit: LLVMSetOrdering(instr, llvm_atomic_ordering_from_odin(ce->args[1])); break;
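`fused_mul_add` maps directly to `llvm.fma`, and the non-temporal builtins are ordinary loads and stores tagged with `!nontemporal` metadata, leaving instruction selection (e.g. MOVNT on x86) to the target. A hedged sketch combining the two in a streaming kernel; `axpy_stream` is a hypothetical helper that assumes `len(dst)` is a multiple of four and that both slices are suitably aligned for 16-byte vector access:

```odin
package fma_stream_example

import "core:intrinsics"

// Computes dst[i] = a*x[i] + dst[i] in 4-wide chunks. fused_mul_add
// lowers to a single llvm.fma per chunk; non_temporal_store hints that
// the written cache lines will not be re-read soon, so they may bypass
// the cache instead of evicting hot data.
axpy_stream :: proc(dst, x: []f32, a: f32) {
	va: #simd[4]f32 = {a, a, a, a}
	for i := 0; i < len(dst); i += 4 {
		px := cast(^#simd[4]f32)&x[i]
		pd := cast(^#simd[4]f32)&dst[i]
		r  := intrinsics.fused_mul_add(va, px^, pd^)
		intrinsics.non_temporal_store(pd, r)
	}
}
```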
@@ -2232,6 +2743,47 @@ lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValue const &tv,
 		return res;
 	}
 
+
+	case BuiltinProc_x86_cpuid:
+		{
+			Type *param_types[2] = {t_u32, t_u32};
+			Type *type = alloc_type_proc_from_types(param_types, gb_count_of(param_types), tv.type, false, ProcCC_None);
+			LLVMTypeRef func_type = LLVMGetElementType(lb_type(p->module, type));
+			LLVMValueRef the_asm = llvm_get_inline_asm(
+				func_type,
+				str_lit("cpuid"),
+				str_lit("={ax},={bx},={cx},={dx},{ax},{cx}"),
+				true
+			);
+			GB_ASSERT(the_asm != nullptr);
+
+			LLVMValueRef args[2] = {};
+			args[0] = lb_emit_conv(p, lb_build_expr(p, ce->args[0]), t_u32).value;
+			args[1] = lb_emit_conv(p, lb_build_expr(p, ce->args[1]), t_u32).value;
+			lbValue res = {};
+			res.type = tv.type;
+			res.value = LLVMBuildCall2(p->builder, func_type, the_asm, args, gb_count_of(args), "");
+			return res;
+		}
+	case BuiltinProc_x86_xgetbv:
+		{
+			Type *type = alloc_type_proc_from_types(&t_u32, 1, tv.type, false, ProcCC_None);
+			LLVMTypeRef func_type = LLVMGetElementType(lb_type(p->module, type));
+			LLVMValueRef the_asm = llvm_get_inline_asm(
+				func_type,
+				str_lit("xgetbv"),
+				str_lit("={ax},={dx},{cx}"),
+				true
+			);
+			GB_ASSERT(the_asm != nullptr);
+
+			LLVMValueRef args[1] = {};
+			args[0] = lb_emit_conv(p, lb_build_expr(p, ce->args[0]), t_u32).value;
+			lbValue res = {};
+			res.type = tv.type;
+			res.value = LLVMBuildCall2(p->builder, func_type, the_asm, args, gb_count_of(args), "");
+			return res;
+		}
 	}
 
 	GB_PANIC("Unhandled built-in procedure %.*s", LIT(builtin_procs[id].name));
diff --git a/src/llvm_backend_utility.cpp b/src/llvm_backend_utility.cpp
index 037171637..52d3a17cf 100644
--- a/src/llvm_backend_utility.cpp
+++ b/src/llvm_backend_utility.cpp
@@ -201,6 +201,11 @@ lbValue lb_emit_transmute(lbProcedure *p, lbValue value, Type *t) {
 		return res;
 	}
 
+	if (is_type_simd_vector(src) && is_type_simd_vector(dst)) {
+		res.value = LLVMBuildBitCast(p->builder, value.value, lb_type(p->module, t), "");
+		return res;
+	}
+
 	if (lb_is_type_aggregate(src) || lb_is_type_aggregate(dst)) {
 		lbValue s = lb_address_from_load_or_generate_local(p, value);
 		lbValue d = lb_emit_transmute(p, s, alloc_type_pointer(t));
@@ -480,8 +485,10 @@ lbValue lb_emit_count_ones(lbProcedure *p, lbValue x, Type *type) {
 }
 
 lbValue lb_emit_count_zeros(lbProcedure *p, lbValue x, Type *type) {
-	i64 sz = 8*type_size_of(type);
-	lbValue size = lb_const_int(p->module, type, cast(u64)sz);
+	Type *elem = base_array_type(type);
+	i64 sz = 8*type_size_of(elem);
+	lbValue size = lb_const_int(p->module, elem, cast(u64)sz);
+	size = lb_emit_conv(p, size, type);
 	lbValue count = lb_emit_count_ones(p, x, type);
 	return lb_emit_arith(p, Token_Sub, size, count, type);
 }
diff --git a/src/main.cpp b/src/main.cpp
index 13c8bd74d..ee71b91df 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1376,8 +1376,8 @@ bool parse_build_flags(Array<String> args) {
 			}
 			case BuildFlag_TargetFeatures: {
 				GB_ASSERT(value.kind == ExactValue_String);
-				build_context.target_features = value.value_string;
-				string_to_lower(&build_context.target_features);
+				build_context.target_features_string = value.value_string;
+				string_to_lower(&build_context.target_features_string);
 				break;
 			}
 			case BuildFlag_RelocMode: {
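`x86_cpuid` and `x86_xgetbv` are emitted as inline assembly with the constraint strings shown above, so `cpuid` yields the four registers `(eax, ebx, ecx, edx)` and `xgetbv` yields `(eax, edx)`. A sketch of the intended feature-detection pattern, assuming those tuple shapes; the bit positions are the architectural ones for AVX and XCR0 on x86-64:

```odin
package cpuid_example

import "core:intrinsics"

main :: proc() {
	// Leaf 1: ECX bit 28 reports AVX support.
	_, _, ecx, _ := intrinsics.x86_cpuid(1, 0)
	cpu_has_avx := ecx & (1 << 28) != 0

	os_saves_ymm := false
	if cpu_has_avx {
		// XCR0 (register 0): bits 1 and 2 set means the OS preserves
		// the XMM and YMM register state across context switches.
		xcr0_lo, _ := intrinsics.x86_xgetbv(0)
		os_saves_ymm = xcr0_lo & 0x6 == 0x6
	}
	_ = os_saves_ymm
}
```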
diff --git a/src/parser.cpp b/src/parser.cpp
index d19e249e5..5280fd4b0 100644
--- a/src/parser.cpp
+++ b/src/parser.cpp
@@ -360,6 +360,7 @@ Ast *clone_ast(Ast *node) {
 	case Ast_ArrayType:
 		n->ArrayType.count = clone_ast(n->ArrayType.count);
 		n->ArrayType.elem = clone_ast(n->ArrayType.elem);
+		n->ArrayType.tag = clone_ast(n->ArrayType.tag);
 		break;
 	case Ast_DynamicArrayType:
 		n->DynamicArrayType.elem = clone_ast(n->DynamicArrayType.elem);
@@ -2127,7 +2128,18 @@ Ast *parse_operand(AstFile *f, bool lhs) {
 		Token name = expect_token(f, Token_Ident);
 		if (name.string == "type") {
 			return ast_helper_type(f, token, parse_type(f));
-		} else if (name.string == "soa" || name.string == "simd") {
+		} else if (name.string == "simd") {
+			Ast *tag = ast_basic_directive(f, token, name);
+			Ast *original_type = parse_type(f);
+			Ast *type = unparen_expr(original_type);
+			switch (type->kind) {
+			case Ast_ArrayType: type->ArrayType.tag = tag; break;
+			default:
+				syntax_error(type, "Expected a fixed array type after #%.*s, got %.*s", LIT(name.string), LIT(ast_strings[type->kind]));
+				break;
+			}
+			return original_type;
+		} else if (name.string == "soa") {
 			Ast *tag = ast_basic_directive(f, token, name);
 			Ast *original_type = parse_type(f);
 			Ast *type = unparen_expr(original_type);
diff --git a/src/parser.hpp b/src/parser.hpp
index dc294b6ce..a648828fb 100644
--- a/src/parser.hpp
+++ b/src/parser.hpp
@@ -411,7 +411,6 @@ AST_KIND(_ExprBegin, "", bool) \
 		Token ellipsis; \
 		ProcInlining inlining; \
 		bool optional_ok_one; \
-		i32 builtin_id; \
 		void *sce_temp_data; \
 	}) \
 	AST_KIND(FieldValue, "field value", struct { Token eq; Ast *field, *value; }) \
diff --git a/src/string.cpp b/src/string.cpp
index 616761265..44eccd2d2 100644
--- a/src/string.cpp
+++ b/src/string.cpp
@@ -157,6 +157,15 @@ int string_compare(String const &x, String const &y) {
 	return 0;
 }
 
+isize string_index_byte(String const &s, u8 x) {
+	for (isize i = 0; i < s.len; i++) {
+		if (s.text[i] == x) {
+			return i;
+		}
+	}
+	return -1;
+}
+
 GB_COMPARE_PROC(string_cmp_proc) {
 	String x = *(String *)a;
 	String y = *(String *)b;
diff --git a/src/types.cpp b/src/types.cpp
index c79b8e652..ad83e0568 100644
--- a/src/types.cpp
+++ b/src/types.cpp
@@ -261,6 +261,7 @@ struct TypeProc {
 	TYPE_KIND(SimdVector, struct { \
 		i64 count; \
 		Type *elem; \
+		Type *generic_count; \
 	}) \
 	TYPE_KIND(RelativePointer, struct { \
 		Type *pointer_type; \
@@ -362,6 +363,9 @@ enum : int {
 	MATRIX_ELEMENT_COUNT_MIN = 1,
 	MATRIX_ELEMENT_COUNT_MAX = 16,
 	MATRIX_ELEMENT_MAX_SIZE = MATRIX_ELEMENT_COUNT_MAX * (2 * 8), // complex128
+
+	SIMD_ELEMENT_COUNT_MIN = 1,
+	SIMD_ELEMENT_COUNT_MAX = 64,
 };
 
 
@@ -1085,10 +1089,11 @@ Type *alloc_type_bit_set() {
 
 
 
-Type *alloc_type_simd_vector(i64 count, Type *elem) {
+Type *alloc_type_simd_vector(i64 count, Type *elem, Type *generic_count=nullptr) {
 	Type *t = alloc_type(Type_SimdVector);
 	t->SimdVector.count = count;
 	t->SimdVector.elem = elem;
+	t->SimdVector.generic_count = generic_count;
 	return t;
 }
@@ -1593,6 +1598,8 @@ i64 get_array_type_count(Type *t) {
 		return bt->Array.count;
 	} else if (bt->kind == Type_EnumeratedArray) {
 		return bt->EnumeratedArray.count;
+	} else if (bt->kind == Type_SimdVector) {
+		return bt->SimdVector.count;
 	}
 	GB_ASSERT(is_type_array_like(t));
 	return -1;
@@ -1932,11 +1939,14 @@ bool is_type_valid_vector_elem(Type *t) {
 			return false;
 		}
 		if (is_type_integer(t)) {
-			return true;
+			return !is_type_integer_128bit(t);
 		}
 		if (is_type_float(t)) {
 			return true;
 		}
+		if (is_type_boolean(t)) {
+			return true;
+		}
 	}
 	return false;
 }
@@ -2078,6 +2088,11 @@ bool is_type_polymorphic(Type *t, bool or_specialized=false) {
 			return true;
 		}
 		return is_type_polymorphic(t->Array.elem, or_specialized);
+	case Type_SimdVector:
+		if (t->SimdVector.generic_count != nullptr) {
+			return true;
+		}
+		return is_type_polymorphic(t->SimdVector.elem, or_specialized);
 	case Type_DynamicArray:
 		return is_type_polymorphic(t->DynamicArray.elem, or_specialized);
 	case Type_Slice:
@@ -2291,6 +2306,9 @@ bool is_type_comparable(Type *t) {
 			}
 		}
 		return true;
+
+	case Type_SimdVector:
+		return true;
 	}
 	return false;
 }
@@ -3446,7 +3464,7 @@ i64 type_align_of_internal(Type *t, TypePath *path) {
 
 	case Type_SimdVector: {
 		// IMPORTANT TODO(bill): Figure out the alignment of vector types
-		return gb_clamp(next_pow2(type_size_of_internal(t, path)), 1, build_context.max_align);
+		return gb_clamp(next_pow2(type_size_of_internal(t, path)), 1, build_context.max_align*2);
 	}
 
 	case Type_Matrix:
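Taken together, the type-system changes above mean: lane types may be booleans, floats, or integers other than 128-bit ones; lane counts are bounded by `SIMD_ELEMENT_COUNT_MIN` and `SIMD_ELEMENT_COUNT_MAX`; the new `generic_count` field lets `#simd` types be polymorphic in their count; and equally sized vectors can be transmuted via a plain bitcast. A hedged user-side sketch (the helper name is illustrative):

```odin
package simd_types_example

import "core:intrinsics"

// Valid lane types now include booleans; 128-bit integers are rejected.
Mask16 :: #simd[16]bool
F32x8  :: #simd[8]f32
// Bad :: #simd[4]i128   // error: not a valid vector element type

// generic_count allows the lane count (and element type) to be
// polymorphic parameters.
lane_zeros :: proc(v: #simd[$N]$T) -> #simd[N]T {
	// Lane-wise: the bit-width constant is built from the element
	// type and splatted (see the lb_emit_count_zeros change above).
	return intrinsics.count_zeros(v)
}

main :: proc() {
	v: #simd[4]u32 = {1, 3, 4, 255}
	z := lane_zeros(v) // lanes: {31, 30, 31, 24}

	// transmute between equally sized #simd types is a plain bitcast.
	w := transmute(#simd[8]u16)v
	_ = z; _ = w
}
```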