From dd5b7852ce569027e87d77f46601210aa4180947 Mon Sep 17 00:00:00 2001 From: Barinzaya Date: Mon, 5 May 2025 15:13:10 -0400 Subject: Added alternate reduce-add/reduce-mul intrinsics. The new reduce_add/reduce_mul procs perform the corresponding arithmetic reduction in orders other than sequential order. These alternative orders can often offer better SIMD hardware utilization. Two different orders are added: pair-wise (operating on pairs of adjacent elements) and bisection-wise (operating element-wise on the first and last N/2 elements of the vector). --- src/llvm_backend_proc.cpp | 66 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'src/llvm_backend_proc.cpp') diff --git a/src/llvm_backend_proc.cpp b/src/llvm_backend_proc.cpp index 7bd8dea59..14157455e 100644 --- a/src/llvm_backend_proc.cpp +++ b/src/llvm_backend_proc.cpp @@ -1495,6 +1495,38 @@ gb_internal lbValue lb_build_builtin_simd_proc(lbProcedure *p, Ast *expr, TypeAn res.value = LLVMBuildInsertElement(p->builder, arg0.value, arg2.value, arg1.value, ""); return res; + case BuiltinProc_simd_reduce_add_bisect: + case BuiltinProc_simd_reduce_mul_bisect: + { + GB_ASSERT(arg0.type->kind == Type_SimdVector); + i64 num_elems = arg0.type->SimdVector.count; + + LLVMValueRef *indices = gb_alloc_array(temporary_allocator(), LLVMValueRef, num_elems); + for (i64 i = 0; i < num_elems; i++) { + indices[i] = lb_const_int(m, t_uint, cast(u64)i).value; + } + + switch (builtin_id) { + case BuiltinProc_simd_reduce_add_bisect: op_code = is_float ? LLVMFAdd : LLVMAdd; break; + case BuiltinProc_simd_reduce_mul_bisect: op_code = is_float ? 
LLVMFMul : LLVMMul; break; + } + + LLVMValueRef remaining = arg0.value; + i64 num_remaining = num_elems; + + while (num_remaining > 1) { + num_remaining /= 2; + LLVMValueRef left_indices = LLVMConstVector(&indices[0], cast(unsigned)num_remaining); + LLVMValueRef left_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, left_indices, ""); + LLVMValueRef right_indices = LLVMConstVector(&indices[num_remaining], cast(unsigned)num_remaining); + LLVMValueRef right_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, right_indices, ""); + remaining = LLVMBuildBinOp(p->builder, op_code, left_value, right_value, ""); + } + + res.value = LLVMBuildExtractElement(p->builder, remaining, indices[0], ""); + return res; + } + case BuiltinProc_simd_reduce_add_ordered: case BuiltinProc_simd_reduce_mul_ordered: { @@ -1527,6 +1559,40 @@ gb_internal lbValue lb_build_builtin_simd_proc(lbProcedure *p, Ast *expr, TypeAn res.value = lb_call_intrinsic(p, name, args, cast(unsigned)args_count, types, gb_count_of(types)); return res; } + + case BuiltinProc_simd_reduce_add_pairs: + case BuiltinProc_simd_reduce_mul_pairs: + { + GB_ASSERT(arg0.type->kind == Type_SimdVector); + i64 num_elems = arg0.type->SimdVector.count; + + LLVMValueRef *indices = gb_alloc_array(temporary_allocator(), LLVMValueRef, num_elems); + for (i64 i = 0; i < num_elems/2; i++) { + indices[i] = lb_const_int(m, t_uint, cast(u64)(2*i)).value; + indices[i+num_elems/2] = lb_const_int(m, t_uint, cast(u64)(2*i+1)).value; + } + + switch (builtin_id) { + case BuiltinProc_simd_reduce_add_pairs: op_code = is_float ? LLVMFAdd : LLVMAdd; break; + case BuiltinProc_simd_reduce_mul_pairs: op_code = is_float ? 
LLVMFMul : LLVMMul; break; + } + + LLVMValueRef remaining = arg0.value; + i64 num_remaining = num_elems; + + while (num_remaining > 1) { + num_remaining /= 2; + LLVMValueRef left_indices = LLVMConstVector(&indices[0], cast(unsigned)num_remaining); + LLVMValueRef left_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, left_indices, ""); + LLVMValueRef right_indices = LLVMConstVector(&indices[num_elems/2], cast(unsigned)num_remaining); + LLVMValueRef right_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, right_indices, ""); + remaining = LLVMBuildBinOp(p->builder, op_code, left_value, right_value, ""); + } + + res.value = LLVMBuildExtractElement(p->builder, remaining, indices[0], ""); + return res; + } + case BuiltinProc_simd_reduce_min: case BuiltinProc_simd_reduce_max: case BuiltinProc_simd_reduce_and: -- cgit v1.2.3