From 12dd0cb72a586a99129280c78697089caab0500a Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Fri, 9 Aug 2024 17:39:19 -0400
Subject: Simplify and make `simd_util` cross-platform

This new algorithm uses a Scalar->Vector->Scalar iteration loop which
requires no masking off of any incomplete data chunks.

Also, the width was reduced to 32 bytes instead of 64, as I found this
to be about as fast as the previous 64-byte x86 version.
---
 core/bytes/bytes.odin | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

(limited to 'core/bytes')

diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin
index dcd4931e2..136c98f6b 100644
--- a/core/bytes/bytes.odin
+++ b/core/bytes/bytes.odin
@@ -309,14 +309,8 @@ index_byte :: proc(s: []byte, c: byte) -> int {
 	// NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a
 	// significant speedup when compiling in either Size or Speed mode.
 	// The SIMD version is usually 2-3x slower without optimizations on.
-	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
-		// SIMD's benefits are noticeable only past a certain threshold of data.
-		// For small data, use the plain old algorithm.
-		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
-			return simd_util.index_byte(s, c)
-		} else {
-			return _index_byte(s, c)
-		}
+	when ODIN_OPTIMIZATION_MODE > .Minimal {
+		return #force_inline simd_util.index_byte(s, c)
 	} else {
 		return _index_byte(s, c)
 	}
@@ -333,12 +327,8 @@ last_index_byte :: proc(s: []byte, c: byte) -> int {
 		return -1
 	}
 
-	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
-		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
-			return simd_util.last_index_byte(s, c)
-		} else {
-			return _last_index_byte(s, c)
-		}
+	when ODIN_OPTIMIZATION_MODE > .Minimal {
+		return #force_inline simd_util.last_index_byte(s, c)
 	} else {
 		return _last_index_byte(s, c)
 	}
-- 
cgit v1.2.3