diff options
| author | gingerBill <gingerBill@users.noreply.github.com> | 2025-09-27 09:58:28 +0100 |
|---|---|---|
| committer | gingerBill <gingerBill@users.noreply.github.com> | 2025-09-27 09:58:28 +0100 |
| commit | ac01d1b5bf0050e8929756e046db425c92b8b1dd (patch) | |
| tree | 71c421e22e79b2d77194a5f97dd054e1de86187d | |
| parent | 0eaf3ee7cd1b02fd694697fe581c143b0a87c3a3 (diff) | |
Add `runtime.conditional_mem_zero` to improve `heap_allocator` performance on non-Windows systems
| -rw-r--r-- | base/runtime/heap_allocator.odin | 10 | ||||
| -rw-r--r-- | base/runtime/internal.odin | 50 |
2 files changed, 56 insertions, 4 deletions
diff --git a/base/runtime/heap_allocator.odin b/base/runtime/heap_allocator.odin index f2c887759..e2667a78c 100644 --- a/base/runtime/heap_allocator.odin +++ b/base/runtime/heap_allocator.odin @@ -71,10 +71,12 @@ heap_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode, new_memory = aligned_alloc(new_size, new_alignment, p, old_size, zero_memory) or_return - // NOTE: heap_resize does not zero the new memory, so we do it - if zero_memory && new_size > old_size { - new_region := raw_data(new_memory[old_size:]) - intrinsics.mem_zero(new_region, new_size - old_size) + when ODIN_OS != .Windows { + // NOTE: heap_resize does not zero the new memory, so we do it + if zero_memory && new_size > old_size { + new_region := raw_data(new_memory[old_size:]) + conditional_mem_zero(new_region, new_size - old_size) + } } return } diff --git a/base/runtime/internal.odin b/base/runtime/internal.odin index 8af083d07..50be890f7 100644 --- a/base/runtime/internal.odin +++ b/base/runtime/internal.odin @@ -230,6 +230,56 @@ non_zero_mem_resize :: proc(ptr: rawptr, old_size, new_size: int, alignment: int return _mem_resize(ptr, old_size, new_size, alignment, allocator, false, loc) } +conditional_mem_zero :: proc "contextless" (data: rawptr, n_: int) #no_bounds_check { + // When acquiring memory from the OS for the first time it's likely that the + // OS already gives the zero page mapped multiple times for the request. The + // actual allocation does not have physical pages allocated to it until those + // pages are written to which causes a page-fault. This is often called COW + // (Copy on Write) + // + // You do not want to actually zero out memory in this case because it would + // cause a bunch of page faults decreasing the speed of allocations and + // increase the amount of actual resident physical memory used. + // + // Instead a better technique is to check if memory is zeroed before zeroing + // it. 
This turns out to be an important optimization in practice, saving + // nearly half (or more) the amount of physical memory used by an application. + // This is why every implementation of calloc in libc does this optimization. + // + // It may seem counter-intuitive but most allocations in an application are + // wasted and never used. When you consider something like a [dynamic]T which + // always doubles in capacity on resize but you rarely ever actually use the + // full capacity of a dynamic array it means you have a lot of resident waste + // if you actually zeroed the remainder of the memory. + // + // Keep in mind the OS is already guaranteed to give you zeroed memory by + // mapping in this zero page multiple times so in the best case there is no + // need to actually zero anything. As for testing all this memory for a zero + // value, it costs nothing because the same zero page is used for the + // whole allocation and will exist in L1 cache for the entire zero checking + // process. + + if n_ <= 0 { + return + } + n := uint(n_) + + n_words := n / size_of(uintptr) + n_bytes := n % size_of(uintptr) + p_words := ([^]uintptr)(data)[:n_words] + p_bytes := ([^]byte)(data)[size_of(uintptr) * n_words:n] + for &p_word in p_words { + if p_word != 0 { + p_word = 0 + } + } + for &p_byte in p_bytes { + if p_byte != 0 { + p_byte = 0 + } + } +} + memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool { switch { case n == 0: return true |