diff options
| author | gingerBill <bill@gingerbill.org> | 2022-05-27 23:07:33 +0100 |
|---|---|---|
| committer | gingerBill <bill@gingerbill.org> | 2022-05-27 23:07:33 +0100 |
| commit | 4db533ff71e9f11ff21e845509790a355020a944 (patch) | |
| tree | 1a908324de8642ad04a8de202322e0ed501db7ee | |
| parent | 026540040df94309280d6991eba6b2dcc303bd76 (diff) | |
Add ssse3 support
| -rw-r--r-- | core/simd/x86/ssse3.odin | 124 |
1 files changed, 124 insertions, 0 deletions
diff --git a/core/simd/x86/ssse3.odin b/core/simd/x86/ssse3.odin
new file mode 100644
index 000000000..920dddd85
--- /dev/null
+++ b/core/simd/x86/ssse3.odin
@@ -0,0 +1,124 @@
+//+build i386, amd64
+package simd_x86 // SSSE3 `_mm_*` wrappers: most transmute their `__m128i` operands to typed lanes and forward to a private LLVM binding declared at the bottom of this file.
+
+import "core:intrinsics"
+import "core:simd"
+_ :: simd
+
+_mm_abs_epi8 :: #force_inline proc "c" (a: __m128i) -> __m128i { // Views `a` as i8x16, calls `pabsb128`, and transmutes its u8x16 result back to `__m128i`.
+	return transmute(__m128i)pabsb128(transmute(i8x16)a)
+}
+_mm_abs_epi16 :: #force_inline proc "c" (a: __m128i) -> __m128i { // Views `a` as i16x8, calls `pabsw128`, and transmutes its u16x8 result back to `__m128i`.
+	return transmute(__m128i)pabsw128(transmute(i16x8)a)
+}
+_mm_abs_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i { // Views `a` as i32x4, calls `pabsd128`, and transmutes its u32x4 result back to `__m128i`.
+	return transmute(__m128i)pabsd128(transmute(i32x4)a)
+}
+_mm_shuffle_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { // Forwards both operands, viewed as u8x16, to `pshufb128`.
+	return transmute(__m128i)pshufb128(transmute(u8x16)a, transmute(u8x16)b)
+}
+_mm_alignr_epi8 :: #force_inline proc "c" (a, b: __m128i, $IMM8: u32) -> __m128i { // `IMM8` is a compile-time (`$`) parameter; the result is built with `simd.shuffle`, not an LLVM binding.
+	shift :: IMM8
+
+	// If palignr is shifting the pair of vectors more than the size of two
+	// lanes, emit zero.
+	if shift > 32 {
+		return _mm_set1_epi8(0)
+	}
+	a, b := a, b // Local copies of the parameters; they are reassigned just below.
+	if shift > 16 {
+		a, b = _mm_set1_epi8(0), a // For shifts above 16: the old `a` takes the place of `b`, and `a` becomes `_mm_set1_epi8(0)`.
+	}
+
+	return transmute(__m128i)simd.shuffle( // Selects 16 consecutive indices starting at `shift` (or `shift - 16` when `shift > 16`); presumably indices 0-15 pick from the first operand and 16-31 from the second - TODO confirm against `core:simd`.
+		transmute(i8x16)b,
+		transmute(i8x16)a,
+		0  when shift > 32 else shift - 16 + 0  when shift > 16 else shift + 0, // The `shift > 32` arm is only a placeholder index: that case has already returned above.
+		1  when shift > 32 else shift - 16 + 1  when shift > 16 else shift + 1,
+		2  when shift > 32 else shift - 16 + 2  when shift > 16 else shift + 2,
+		3  when shift > 32 else shift - 16 + 3  when shift > 16 else shift + 3,
+		4  when shift > 32 else shift - 16 + 4  when shift > 16 else shift + 4,
+		5  when shift > 32 else shift - 16 + 5  when shift > 16 else shift + 5,
+		6  when shift > 32 else shift - 16 + 6  when shift > 16 else shift + 6,
+		7  when shift > 32 else shift - 16 + 7  when shift > 16 else shift + 7,
+		8  when shift > 32 else shift - 16 + 8  when shift > 16 else shift + 8,
+		9  when shift > 32 else shift - 16 + 9  when shift > 16 else shift + 9,
+		10 when shift > 32 else shift - 16 + 10 when shift > 16 else shift + 10,
+		11 when shift > 32 else shift - 16 + 11 when shift > 16 else shift + 11,
+		12 when shift > 32 else shift - 16 + 12 when shift > 16 else shift + 12,
+		13 when shift > 32 else shift - 16 + 13 when shift > 16 else shift + 13,
+		14 when shift > 32 else shift - 16 + 14 when shift > 16 else shift + 14,
+		15 when shift > 32 else shift - 16 + 15 when shift > 16 else shift + 15,
+	)
+}
+
+
+_mm_hadd_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { // Forwards both operands, viewed as i16x8, to `phaddw128`.
+	return transmute(__m128i)phaddw128(transmute(i16x8)a, transmute(i16x8)b)
+}
+_mm_hadds_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { // Forwards both operands, viewed as i16x8, to `phaddsw128`.
+	return transmute(__m128i)phaddsw128(transmute(i16x8)a, transmute(i16x8)b)
+}
+_mm_hadd_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { // Forwards both operands, viewed as i32x4, to `phaddd128`.
+	return transmute(__m128i)phaddd128(transmute(i32x4)a, transmute(i32x4)b)
+}
+_mm_hsub_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { // Forwards both operands, viewed as i16x8, to `phsubw128`.
+	return transmute(__m128i)phsubw128(transmute(i16x8)a, transmute(i16x8)b)
+}
+_mm_hsubs_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { // Forwards both operands, viewed as i16x8, to `phsubsw128`.
+	return transmute(__m128i)phsubsw128(transmute(i16x8)a, transmute(i16x8)b)
+}
+_mm_hsub_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { // Forwards both operands, viewed as i32x4, to `phsubd128`.
+	return transmute(__m128i)phsubd128(transmute(i32x4)a, transmute(i32x4)b)
+}
+_mm_maddubs_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { // Mixed views: `a` is passed as u8x16 and `b` as i8x16; `pmaddubsw128` returns i16x8.
+	return transmute(__m128i)pmaddubsw128(transmute(u8x16)a, transmute(i8x16)b)
+}
+_mm_mulhrs_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { // Forwards both operands, viewed as i16x8, to `pmulhrsw128`.
+	return transmute(__m128i)pmulhrsw128(transmute(i16x8)a, transmute(i16x8)b)
+}
+_mm_sign_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { // Forwards both operands, viewed as i8x16, to `psignb128`.
+	return transmute(__m128i)psignb128(transmute(i8x16)a, transmute(i8x16)b)
+}
+_mm_sign_epi16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { // Forwards both operands, viewed as i16x8, to `psignw128`.
+	return transmute(__m128i)psignw128(transmute(i16x8)a, transmute(i16x8)b)
+}
+_mm_sign_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { // Forwards both operands, viewed as i32x4, to `psignd128`.
+	return transmute(__m128i)psignd128(transmute(i32x4)a, transmute(i32x4)b)
+}
+
+
+
+@(default_calling_convention="c")
+@(private)
+foreign _ { // Private bindings: each proc below is tied by `link_name` to an `llvm.x86.ssse3.*` intrinsic.
+	@(link_name = "llvm.x86.ssse3.pabs.b.128")
+	pabsb128 :: proc(a: i8x16) -> u8x16 ---
+	@(link_name = "llvm.x86.ssse3.pabs.w.128")
+	pabsw128 :: proc(a: i16x8) -> u16x8 ---
+	@(link_name = "llvm.x86.ssse3.pabs.d.128")
+	pabsd128 :: proc(a: i32x4) -> u32x4 ---
+	@(link_name = "llvm.x86.ssse3.pshuf.b.128")
+	pshufb128 :: proc(a, b: u8x16) -> u8x16 ---
+	@(link_name = "llvm.x86.ssse3.phadd.w.128")
+	phaddw128 :: proc(a, b: i16x8) -> i16x8 ---
+	@(link_name = "llvm.x86.ssse3.phadd.sw.128")
+	phaddsw128 :: proc(a, b: i16x8) -> i16x8 ---
+	@(link_name = "llvm.x86.ssse3.phadd.d.128")
+	phaddd128 :: proc(a, b: i32x4) -> i32x4 ---
+	@(link_name = "llvm.x86.ssse3.phsub.w.128")
+	phsubw128 :: proc(a, b: i16x8) -> i16x8 ---
+	@(link_name = "llvm.x86.ssse3.phsub.sw.128")
+	phsubsw128 :: proc(a, b: i16x8) -> i16x8 ---
+	@(link_name = "llvm.x86.ssse3.phsub.d.128")
+	phsubd128 :: proc(a, b: i32x4) -> i32x4 ---
+	@(link_name = "llvm.x86.ssse3.pmadd.ub.sw.128")
+	pmaddubsw128 :: proc(a: u8x16, b: i8x16) -> i16x8 ---
+	@(link_name = "llvm.x86.ssse3.pmul.hr.sw.128")
+	pmulhrsw128 :: proc(a, b: i16x8) -> i16x8 ---
+	@(link_name = "llvm.x86.ssse3.psign.b.128")
+	psignb128 :: proc(a, b: i8x16) -> i8x16 ---
+	@(link_name = "llvm.x86.ssse3.psign.w.128")
+	psignw128 :: proc(a, b: i16x8) -> i16x8 ---
+	@(link_name = "llvm.x86.ssse3.psign.d.128")
+	psignd128 :: proc(a, b: i32x4) -> i32x4 ---}
\ No newline at end of file |