author     Jeroen van Rijn <Kelimion@users.noreply.github.com>    2025-05-31 20:24:21 +0200
committer  Jeroen van Rijn <Kelimion@users.noreply.github.com>    2025-05-31 20:24:21 +0200
commit     890e923051a90362b6c4a5e7aaaeafc00d533424 (patch)
tree       90b32ba2e69372594edd111946933ee8d342d994
parent     d52aa3f2c22362f417e440e988e186f683449261 (diff)
Vectorize `strings.prefix_length`.
Also add `strings.common_prefix`.
-rw-r--r--  base/runtime/internal.odin                       68
-rw-r--r--  core/strings/strings.odin                        64
-rw-r--r--  tests/benchmark/all.odin                          1
-rw-r--r--  tests/benchmark/strings/benchmark_strings.odin  132
-rw-r--r--  tests/core/strings/test_core_strings.odin        51
5 files changed, 298 insertions, 18 deletions
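
In short: `strings.prefix_length` now defers to a new, vectorized `runtime.memory_prefix_length` and then walks back to the nearest rune boundary, and the new `strings.common_prefix` simply slices `a` by that length. A minimal usage sketch, in the style of the doc-comment examples added below:

	import "core:fmt"
	import "core:strings"

	prefix_example :: proc() {
		fmt.println(strings.prefix_length("testing", "test")) // 4
		fmt.println(strings.common_prefix("testing", "te"))   // "te"
		fmt.println(strings.common_prefix("testing", "est"))  // ""
	}
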
diff --git a/base/runtime/internal.odin b/base/runtime/internal.odin
index a35dbff8a..3faf3e178 100644
--- a/base/runtime/internal.odin
+++ b/base/runtime/internal.odin
@@ -405,6 +405,74 @@ memory_compare_zero :: proc "contextless" (a: rawptr, n: int) -> int #no_bounds_
return 0
}
+memory_prefix_length :: proc "contextless" (x, y: rawptr, n: int) -> (idx: int) #no_bounds_check {
+ switch {
+ case x == y: return n
+ case x == nil: return 0
+ case y == nil: return 0
+ }
+ a, b := cast([^]byte)x, cast([^]byte)y
+
+ n := uint(n)
+ i := uint(0)
+ m := uint(0)
+
+ when HAS_HARDWARE_SIMD {
+ when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+ m = n / 32 * 32
+ for /**/; i < m; i += 32 {
+ load_a := intrinsics.unaligned_load(cast(^#simd[32]u8)&a[i])
+ load_b := intrinsics.unaligned_load(cast(^#simd[32]u8)&b[i])
+ comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+ if intrinsics.simd_reduce_or(comparison) != 0 {
+ sentinel: #simd[32]u8 = u8(0xFF)
+ indices := intrinsics.simd_indices(#simd[32]u8)
+ index_select := intrinsics.simd_select(comparison, indices, sentinel)
+ index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
+ return i + index_reduce
+ }
+ }
+ }
+ }
+
+ m = (n-i) / 16 * 16
+ for /**/; i < m; i += 16 {
+ load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)&a[i])
+ load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)&b[i])
+ comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+ if intrinsics.simd_reduce_or(comparison) != 0 {
+ sentinel: #simd[16]u8 = u8(0xFF)
+ indices := intrinsics.simd_indices(#simd[16]u8)
+ index_select := intrinsics.simd_select(comparison, indices, sentinel)
+ index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
+ return int(i + index_reduce)
+ }
+ }
+
+ // 64-bit SIMD is faster than using a `uintptr` to detect a difference and then
+ // re-iterating with the byte-by-byte loop, at least on AMD64.
+ m = (n-i) / 8 * 8
+ for /**/; i < m; i += 8 {
+ load_a := intrinsics.unaligned_load(cast(^#simd[8]u8)&a[i])
+ load_b := intrinsics.unaligned_load(cast(^#simd[8]u8)&b[i])
+ comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+ if intrinsics.simd_reduce_or(comparison) != 0 {
+ sentinel: #simd[8]u8 = u8(0xFF)
+ indices := intrinsics.simd_indices(#simd[8]u8)
+ index_select := intrinsics.simd_select(comparison, indices, sentinel)
+ index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
+ return int(i + index_reduce)
+ }
+ }
+
+ for /**/; i < n; i += 1 {
+ if a[i] ~ b[i] != 0 {
+ return int(i)
+ }
+ }
+ return int(n)
+}
+
string_eq :: proc "contextless" (lhs, rhs: string) -> bool {
x := transmute(Raw_String)lhs
y := transmute(Raw_String)rhs
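
The three vector loops above share one pattern: load a chunk from each buffer, compare the lanes, and when anything differs, turn the lane mask into the index of the first mismatch. Pulled out into a standalone sketch of the 16-byte step (the helper name `first_mismatch_16` is mine, not part of the patch):

	import "base:intrinsics"

	// Index of the first differing byte within one 16-byte chunk,
	// or (16, false) when the chunk is identical in both buffers.
	first_mismatch_16 :: proc "contextless" (a, b: [^]byte) -> (idx: int, found: bool) {
		load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)&a[0])
		load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)&b[0])
		ne     := intrinsics.simd_lanes_ne(load_a, load_b)      // 0xFF where lanes differ, 0 where equal
		if intrinsics.simd_reduce_or(ne) == 0 {
			return 16, false
		}
		sentinel: #simd[16]u8 = u8(0xFF)                         // larger than any lane index (0..15)
		indices  := intrinsics.simd_indices(#simd[16]u8)         // {0, 1, ..., 15}
		selected := intrinsics.simd_select(ne, indices, sentinel)
		return int(intrinsics.simd_reduce_min(selected)), true
	}

The AVX2 and 8-byte paths only change the chunk width; the sentinel is what lets `simd_reduce_min` report the lowest differing lane in a single reduction.
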
diff --git a/core/strings/strings.odin b/core/strings/strings.odin
index e15754dff..59e70ed15 100644
--- a/core/strings/strings.odin
+++ b/core/strings/strings.odin
@@ -2,6 +2,7 @@
package strings
import "base:intrinsics"
+import "base:runtime"
import "core:bytes"
import "core:io"
import "core:mem"
@@ -458,6 +459,7 @@ equal_fold :: proc(u, v: string) -> (res: bool) {
return s == t
}
+
/*
Returns the prefix length common between strings `a` and `b`
@@ -488,30 +490,58 @@ Output:
0
*/
-prefix_length :: proc(a, b: string) -> (n: int) {
- _len := min(len(a), len(b))
-
- // Scan for matches including partial codepoints.
- #no_bounds_check for n < _len && a[n] == b[n] {
- n += 1
- }
+prefix_length :: proc "contextless" (a, b: string) -> (n: int) {
+ RUNE_ERROR :: '\ufffd'
+ RUNE_SELF :: 0x80
+ UTF_MAX :: 4
- // Now scan to ignore partial codepoints.
- if n > 0 {
- s := a[:n]
- n = 0
- for {
- r0, w := utf8.decode_rune(s[n:])
- if r0 != utf8.RUNE_ERROR {
- n += w
- } else {
- break
+ n = runtime.memory_prefix_length(raw_data(a), raw_data(b), min(len(a), len(b)))
+ lim := max(n - UTF_MAX + 1, 0)
+ for l := n; l > lim; l -= 1 {
+ r, _ := runtime.string_decode_rune(a[l - 1:])
+ if r != RUNE_ERROR {
+ if l > 0 && (a[l - 1] & 0xc0 == 0xc0) {
+ return l - 1
}
+ return l
}
}
return
}
/*
+Returns the common prefix between strings `a` and `b`
+
+Inputs:
+- a: The first input string
+- b: The second input string
+
+Returns:
+- The common prefix of `a` and `b` (a slice of `a`)
+
+Example:
+
+ import "core:fmt"
+ import "core:strings"
+
+ common_prefix_example :: proc() {
+ fmt.println(strings.common_prefix("testing", "test"))
+ fmt.println(strings.common_prefix("testing", "te"))
+ fmt.println(strings.common_prefix("telephone", "te"))
+ fmt.println(strings.common_prefix("testing", "est"))
+ }
+
+Output:
+
+ "test"
+ "te"
+ "te"
+ ""
+
+*/
+common_prefix :: proc(a, b: string) -> string {
+ return a[:prefix_length(a, b)]
+}
+/*
Determines if a string `s` starts with a given `prefix`
Inputs:
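
Because `runtime.memory_prefix_length` works on raw bytes, the decode-and-back-up loop in the new `prefix_length` is what keeps the result on a rune boundary rather than in the middle of a codepoint. A small sketch of the intended behaviour (the owl/duck pair is an illustration of mine, not a case from the patch, though the owl does appear in the tests):

	import "core:fmt"
	import "core:strings"

	boundary_example :: proc() {
		// "🦉" is F0 9F A6 89 and "🦆" is F0 9F A6 86: three common bytes, but no common rune.
		fmt.println(strings.prefix_length("🦉", "🦆"))                   // 0, not 3
		fmt.println(strings.common_prefix("Hellope! 🦉", "Hellope! 🦆")) // "Hellope! "
	}
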
diff --git a/tests/benchmark/all.odin b/tests/benchmark/all.odin
index a48872cc6..30640ac87 100644
--- a/tests/benchmark/all.odin
+++ b/tests/benchmark/all.odin
@@ -4,3 +4,4 @@ package benchmarks
@(require) import "crypto"
@(require) import "hash"
@(require) import "text/regex"
+@(require) import "strings"
\ No newline at end of file
diff --git a/tests/benchmark/strings/benchmark_strings.odin b/tests/benchmark/strings/benchmark_strings.odin
new file mode 100644
index 000000000..9f5a76f0f
--- /dev/null
+++ b/tests/benchmark/strings/benchmark_strings.odin
@@ -0,0 +1,132 @@
+package benchmark_strings
+
+import "base:intrinsics"
+import "base:runtime"
+import "core:fmt"
+import "core:log"
+import "core:testing"
+import "core:strings"
+import "core:text/table"
+import "core:time"
+import "core:unicode/utf8"
+
+RUNS_PER_SIZE :: 2500
+
+sizes := [?]int {
+ 7, 8, 9,
+ 15, 16, 17,
+ 31, 32, 33,
+ 63, 64, 65,
+ 95, 96, 97,
+ 128,
+ 256,
+ 512,
+ 1024,
+ 4096,
+}
+
+// These are the normal, unoptimized algorithms.
+
+plain_prefix_length :: proc "contextless" (a, b: string) -> (n: int) {
+ _len := min(len(a), len(b))
+
+ // Scan for matches including partial codepoints.
+ #no_bounds_check for n < _len && a[n] == b[n] {
+ n += 1
+ }
+
+ // Now scan to ignore partial codepoints.
+ if n > 0 {
+ s := a[:n]
+ n = 0
+ for {
+ r0, w := utf8.decode_rune(s[n:])
+ if r0 != utf8.RUNE_ERROR {
+ n += w
+ } else {
+ break
+ }
+ }
+ }
+ return
+}
+
+run_trial_size_prefix :: proc(p: proc "contextless" (string, string) -> $R, suffix: string, size: int, idx: int, runs: int, loc := #caller_location) -> (timing: time.Duration) {
+ left := make([]u8, size)
+ right := make([]u8, size)
+ defer {
+ delete(left)
+ delete(right)
+ }
+
+ if len(suffix) > 0 {
+ copy(left [idx:], suffix)
+ copy(right[idx:], suffix)
+
+ } else {
+ right[idx] = 'A'
+ }
+
+ accumulator: int
+
+ watch: time.Stopwatch
+
+ time.stopwatch_start(&watch)
+ for _ in 0..<runs {
+ result := p(string(left[:size]), string(right[:size]))
+ accumulator += result
+ }
+ time.stopwatch_stop(&watch)
+ timing = time.stopwatch_duration(watch)
+
+ log.debug(accumulator)
+ return
+}
+
+run_trial_size :: proc {
+ run_trial_size_prefix,
+}
+
+bench_table_size :: proc(algo_name: string, plain, simd: $P, suffix := "") {
+ string_buffer := strings.builder_make()
+ defer strings.builder_destroy(&string_buffer)
+
+ tbl: table.Table
+ table.init(&tbl)
+ defer table.destroy(&tbl)
+
+ table.aligned_header_of_values(&tbl, .Right, "Algorithm", "Size", "Iterations", "Scalar", "SIMD", "SIMD Relative (%)", "SIMD Relative (x)")
+
+ for size in sizes {
+ // Place the non-zero byte somewhere in the middle.
+ needle_index := size / 2
+
+ plain_timing := run_trial_size(plain, suffix, size, needle_index, RUNS_PER_SIZE)
+ simd_timing := run_trial_size(simd, suffix, size, needle_index, RUNS_PER_SIZE)
+
+ _plain := fmt.tprintf("%8M", plain_timing)
+ _simd := fmt.tprintf("%8M", simd_timing)
+ _relp := fmt.tprintf("%.3f %%", f64(simd_timing) / f64(plain_timing) * 100.0)
+ _relx := fmt.tprintf("%.3f x", 1 / (f64(simd_timing) / f64(plain_timing)))
+
+ table.aligned_row_of_values(
+ &tbl,
+ .Right,
+ algo_name,
+ size, RUNS_PER_SIZE, _plain, _simd, _relp, _relx)
+ }
+
+ builder_writer := strings.to_writer(&string_buffer)
+
+ fmt.sbprintln(&string_buffer)
+ table.write_plain_table(builder_writer, &tbl)
+
+ my_table_string := strings.to_string(string_buffer)
+ log.info(my_table_string)
+}
+
+@test
+benchmark_memory_procs :: proc(t: ^testing.T) {
+ bench_table_size("prefix_length ascii", plain_prefix_length, strings.prefix_length)
+ bench_table_size("prefix_length unicode", plain_prefix_length, strings.prefix_length, "🦉")
+}
\ No newline at end of file
diff --git a/tests/core/strings/test_core_strings.odin b/tests/core/strings/test_core_strings.odin
index 7b5f7fbb4..140689468 100644
--- a/tests/core/strings/test_core_strings.odin
+++ b/tests/core/strings/test_core_strings.odin
@@ -1,9 +1,10 @@
package test_core_strings
+import "base:runtime"
import "core:mem"
import "core:strings"
import "core:testing"
-import "base:runtime"
+import "core:unicode/utf8"
@test
test_index_any_small_string_not_found :: proc(t: ^testing.T) {
@@ -218,3 +219,51 @@ test_builder_to_cstring :: proc(t: ^testing.T) {
testing.expect(t, err == .Out_Of_Memory)
}
}
+
+@test
+test_prefix_length :: proc(t: ^testing.T) {
+ prefix_length :: proc "contextless" (a, b: string) -> (n: int) {
+ _len := min(len(a), len(b))
+
+ // Scan for matches including partial codepoints.
+ #no_bounds_check for n < _len && a[n] == b[n] {
+ n += 1
+ }
+
+ // Now scan to ignore partial codepoints.
+ if n > 0 {
+ s := a[:n]
+ n = 0
+ for {
+ r0, w := utf8.decode_rune(s[n:])
+ if r0 != utf8.RUNE_ERROR {
+ n += w
+ } else {
+ break
+ }
+ }
+ }
+ return
+ }
+
+ cases := [][2]string{
+ {"Hellope, there!", "Hellope, world!"},
+ {"Hellope, there!", "Foozle"},
+ {"Hellope, there!", "Hell"},
+ {"Hellope! 🦉", "Hellope! 🦉"},
+ }
+
+ for v in cases {
+ p_scalar := prefix_length(v[0], v[1])
+ p_simd := strings.prefix_length(v[0], v[1])
+ testing.expect_value(t, p_simd, p_scalar)
+
+ s := v[0]
+ for len(s) > 0 {
+ p_scalar = prefix_length(v[0], s)
+ p_simd = strings.prefix_length(v[0], s)
+ testing.expect_value(t, p_simd, p_scalar)
+ s = s[:len(s) - 1]
+ }
+ }
+}
\ No newline at end of file