From af63eff8d738b0c1d6869510b14e5abde84a2c48 Mon Sep 17 00:00:00 2001 From: Andrea Piseri Date: Sun, 16 Apr 2023 15:01:30 +0200 Subject: improve code generation for `intrinsics.unaligned_load/store` on `#simd` types the default implementation calls memcpy on an `alloca` constant, which seems to heavily confuse the optimizer and produces overall suboptimal code. Introducing this specialization simplifies the intermediate representation produced, resulting in more efficient code. --- src/llvm_backend_proc.cpp | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'src/llvm_backend_proc.cpp') diff --git a/src/llvm_backend_proc.cpp b/src/llvm_backend_proc.cpp index 02748663b..5e709b0bf 100644 --- a/src/llvm_backend_proc.cpp +++ b/src/llvm_backend_proc.cpp @@ -2363,9 +2363,15 @@ gb_internal lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValu { lbValue dst = lb_build_expr(p, ce->args[0]); lbValue src = lb_build_expr(p, ce->args[1]); - src = lb_address_from_load_or_generate_local(p, src); Type *t = type_deref(dst.type); - lb_mem_copy_non_overlapping(p, dst, src, lb_const_int(p->module, t_int, type_size_of(t)), false); + + if (is_type_simd_vector(t)) { + LLVMValueRef store = LLVMBuildStore(p->builder, src.value, dst.value); + LLVMSetAlignment(store, 1); + } else { + src = lb_address_from_load_or_generate_local(p, src); + lb_mem_copy_non_overlapping(p, dst, src, lb_const_int(p->module, t_int, type_size_of(t)), false); + } return {}; } @@ -2373,9 +2379,17 @@ gb_internal lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValu { lbValue src = lb_build_expr(p, ce->args[0]); Type *t = type_deref(src.type); - lbAddr dst = lb_add_local_generated(p, t, false); - lb_mem_copy_non_overlapping(p, dst.addr, src, lb_const_int(p->module, t_int, type_size_of(t)), false); - return lb_addr_load(p, dst); + if (is_type_simd_vector(t)) { + lbValue res = {}; + res.type = t; + res.value = LLVMBuildLoad2(p->builder, lb_type(p->module, t), src.value, ""); + LLVMSetAlignment(res.value, 1); + return res; + } else { + lbAddr dst = lb_add_local_generated(p, t, false); + lb_mem_copy_non_overlapping(p, dst.addr, src, lb_const_int(p->module, t_int, type_size_of(t)), false); + return lb_addr_load(p, dst); + } } case BuiltinProc_atomic_add: -- cgit v1.2.3