From 12dd0cb72a586a99129280c78697089caab0500a Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Fri, 9 Aug 2024 17:39:19 -0400
Subject: Simplify and make `simd_util` cross-platform

This new algorithm uses a Scalar->Vector->Scalar iteration loop, which
does not need to mask off any incomplete data chunks.

Also, the width was reduced from 64 bytes to 32, as I found this to be
about as fast as the previous 64-byte x86 version.
---
 core/bytes/bytes.odin | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

(limited to 'core/bytes')

diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin
index dcd4931e2..136c98f6b 100644
--- a/core/bytes/bytes.odin
+++ b/core/bytes/bytes.odin
@@ -309,14 +309,8 @@ index_byte :: proc(s: []byte, c: byte) -> int {
 	// NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a
 	// significant speedup when compiling in either Size or Speed mode.
 	// The SIMD version is usually 2-3x slower without optimizations on.
-	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
-		// SIMD's benefits are noticeable only past a certain threshold of data.
-		// For small data, use the plain old algorithm.
-		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
-			return simd_util.index_byte(s, c)
-		} else {
-			return _index_byte(s, c)
-		}
+	when ODIN_OPTIMIZATION_MODE > .Minimal {
+		return #force_inline simd_util.index_byte(s, c)
 	} else {
 		return _index_byte(s, c)
 	}
@@ -333,12 +327,8 @@ last_index_byte :: proc(s: []byte, c: byte) -> int {
 		return -1
 	}
 
-	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
-		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
-			return simd_util.last_index_byte(s, c)
-		} else {
-			return _last_index_byte(s, c)
-		}
+	when ODIN_OPTIMIZATION_MODE > .Minimal {
+		return #force_inline simd_util.last_index_byte(s, c)
 	} else {
 		return _last_index_byte(s, c)
 	}
-- 
cgit v1.2.3