author    gingerBill <gingerBill@users.noreply.github.com>  2025-09-27 09:58:28 +0100
committer gingerBill <gingerBill@users.noreply.github.com>  2025-09-27 09:58:28 +0100
commit    ac01d1b5bf0050e8929756e046db425c92b8b1dd (patch)
tree      71c421e22e79b2d77194a5f97dd054e1de86187d /base
parent    0eaf3ee7cd1b02fd694697fe581c143b0a87c3a3 (diff)
Add `runtime.conditional_mem_zero` to improve `heap_allocator` performance on non-Windows systems
Diffstat (limited to 'base')
-rw-r--r--  base/runtime/heap_allocator.odin | 10
-rw-r--r--  base/runtime/internal.odin       | 50
2 files changed, 56 insertions, 4 deletions
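
Before the diffs themselves, a minimal sketch of the check-before-zero idea behind the new runtime.conditional_mem_zero helper. This is illustrative only and not part of the commit; the procedure name zero_if_needed and the 1 MiB buffer are made up for the example, but the pattern (read first, store only when a value is non-zero) is the one the diff below implements word-at-a-time.

package main

import "core:fmt"

// Zero a buffer without dirtying pages that are already zero: an untouched
// allocation is typically backed by the OS's shared copy-on-write zero page,
// and skipping the store avoids the page fault that would make it resident.
zero_if_needed :: proc(buf: []byte) {
	for &b in buf {
		if b != 0 {
			b = 0
		}
	}
}

main :: proc() {
	buf := make([]byte, 1<<20)
	defer delete(buf)
	buf[0] = 1 // dirty a single page so there is something to clear
	zero_if_needed(buf)
	fmt.println("conditionally zeroed", len(buf), "bytes")
}
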
diff --git a/base/runtime/heap_allocator.odin b/base/runtime/heap_allocator.odin
index f2c887759..e2667a78c 100644
--- a/base/runtime/heap_allocator.odin
+++ b/base/runtime/heap_allocator.odin
@@ -71,10 +71,12 @@ heap_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode,
 		new_memory = aligned_alloc(new_size, new_alignment, p, old_size, zero_memory) or_return
-		// NOTE: heap_resize does not zero the new memory, so we do it
-		if zero_memory && new_size > old_size {
-			new_region := raw_data(new_memory[old_size:])
-			intrinsics.mem_zero(new_region, new_size - old_size)
+		when ODIN_OS != .Windows {
+			// NOTE: heap_resize does not zero the new memory, so we do it
+			if zero_memory && new_size > old_size {
+				new_region := raw_data(new_memory[old_size:])
+				conditional_mem_zero(new_region, new_size - old_size)
+			}
 		}
 		return
 	}
diff --git a/base/runtime/internal.odin b/base/runtime/internal.odin
index 8af083d07..50be890f7 100644
--- a/base/runtime/internal.odin
+++ b/base/runtime/internal.odin
@@ -230,6 +230,56 @@ non_zero_mem_resize :: proc(ptr: rawptr, old_size, new_size: int, alignment: int
 	return _mem_resize(ptr, old_size, new_size, alignment, allocator, false, loc)
 }
+conditional_mem_zero :: proc "contextless" (data: rawptr, n_: int) #no_bounds_check {
+	// When acquiring memory from the OS for the first time, it is likely that
+	// the OS satisfies the request by mapping the zero page multiple times. The
+	// allocation does not get physical pages of its own until those pages are
+	// written to, which causes a page fault. This is often called COW
+	// (Copy on Write).
+	//
+	// You do not want to actually zero out memory in this case because it would
+	// cause a bunch of page faults, decreasing the speed of allocations and
+	// increasing the amount of actual resident physical memory used.
+	//
+	// Instead, a better technique is to check whether memory is zeroed before
+	// zeroing it. This turns out to be an important optimization in practice,
+	// saving nearly half (or more) of the physical memory used by an
+	// application. This is why every implementation of calloc in libc does this
+	// optimization.
+	//
+	// It may seem counter-intuitive, but most allocated memory in an application
+	// is wasted and never used. Consider something like a [dynamic]T, which
+	// doubles its capacity on resize: the full capacity is rarely ever used, so
+	// eagerly zeroing the remainder of the memory would leave a lot of resident
+	// waste.
+	//
+	// Keep in mind the OS is already guaranteed to give you zeroed memory by
+	// mapping in this zero page multiple times, so in the best case there is no
+	// need to actually zero anything. Testing all this memory for a zero value
+	// costs effectively nothing, because the same zero page backs the whole
+	// allocation and stays in L1 cache for the entire zero-checking process.
+
+	if n_ <= 0 {
+		return
+	}
+	n := uint(n_)
+
+	n_words := n / size_of(uintptr)
+	n_bytes := n % size_of(uintptr)
+	p_words := ([^]uintptr)(data)[:n_words]
+	p_bytes := ([^]byte)(data)[size_of(uintptr) * n_words:][:n_bytes]
+	for &p_word in p_words {
+		if p_word != 0 {
+			p_word = 0
+		}
+	}
+	for &p_byte in p_bytes {
+		if p_byte != 0 {
+			p_byte = 0
+		}
+	}
+}
+
 memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
 	switch {
 	case n == 0: return true