| author | Laytan Laats <laytanlaats@hotmail.com> | 2026-01-14 19:03:15 +0100 |
|---|---|---|
| committer | Laytan Laats <laytanlaats@hotmail.com> | 2026-01-14 19:03:15 +0100 |
| commit | 5fcf210c916135a66283b65cd7785cbbe091ebee | |
| tree | c16ad89b2ca8fe370d1bf60f97135ffff813ed8a | /core/sys/linux |
| parent | 792e6c75ee08ba62ca5578ec3ec273e90a94668b | |
uring: move package from nbio/uring to sys/linux/uring
Diffstat (limited to 'core/sys/linux')
| -rw-r--r-- | core/sys/linux/uring/doc.odin | 88 |
| -rw-r--r-- | core/sys/linux/uring/ops.odin | 847 |
| -rw-r--r-- | core/sys/linux/uring/uring.odin | 294 |
3 files changed, 1229 insertions, 0 deletions
diff --git a/core/sys/linux/uring/doc.odin b/core/sys/linux/uring/doc.odin
new file mode 100644
index 000000000..8b9ae5ee8
--- /dev/null
+++ b/core/sys/linux/uring/doc.odin
@@ -0,0 +1,88 @@
/*
Wrapper/convenience package over the raw io_uring syscalls, providing help with setup, creation, and operating the ring.

The following example shows a simple `cat` program implemented using the package.

Example:
	package main

	import "base:runtime"

	import "core:fmt"
	import "core:os"
	import "core:sys/linux"
	import "core:sys/linux/uring"

	Request :: struct {
		path:       cstring,
		buffer:     []byte,
		completion: linux.IO_Uring_CQE,
	}

	main :: proc() {
		if len(os.args) < 2 {
			fmt.eprintfln("Usage: %s [file name] <[file name] ...>", os.args[0])
			os.exit(1)
		}

		requests := make_soa(#soa []Request, len(os.args)-1)
		defer delete(requests)

		ring: uring.Ring
		params := uring.DEFAULT_PARAMS
		err := uring.init(&ring, &params)
		fmt.assertf(err == nil, "uring.init: %v", err)
		defer uring.destroy(&ring)

		for &request, i in requests {
			request.path = runtime.args__[i+1]
			// sets up a read request and adds it to the ring buffer.
			submit_read_request(request.path, &request.buffer, &ring)
		}

		ulen := u32(len(requests))

		// submit the requests and wait for them to complete right away.
		n, serr := uring.submit(&ring, ulen)
		fmt.assertf(serr == nil, "uring.submit: %v", serr)
		assert(n == ulen)

		// copy the completed requests out of the ring buffer.
		cn := uring.copy_cqes_ready(&ring, requests.completion[:ulen])
		assert(cn == ulen)

		for request in requests {
			// check the result of the request.
			fmt.assertf(request.completion.res >= 0, "read %q failed: %v", request.path, linux.Errno(-request.completion.res))
			// print it out.
			fmt.print(string(request.buffer))

			delete(request.buffer)
		}
	}

	submit_read_request :: proc(path: cstring, buffer: ^[]byte, ring: ^uring.Ring) {
		fd, err := linux.open(path, {})
		fmt.assertf(err == nil, "open(%q): %v", path, err)

		file_sz := get_file_size(fd)

		buffer^ = make([]byte, file_sz)

		_, ok := uring.read(ring, 0, fd, buffer^, 0)
		assert(ok, "could not get read sqe")
	}

	get_file_size :: proc(fd: linux.Fd) -> uint {
		st: linux.Stat
		err := linux.fstat(fd, &st)
		fmt.assertf(err == nil, "fstat: %v", err)

		if linux.S_ISREG(st.mode) {
			return uint(st.size)
		}

		panic("not a regular file")
	}
*/
package uring

diff --git a/core/sys/linux/uring/ops.odin b/core/sys/linux/uring/ops.odin
new file mode 100644
index 000000000..7b3392e98
--- /dev/null
+++ b/core/sys/linux/uring/ops.odin
@@ -0,0 +1,847 @@
package uring

import "core:sys/linux"

// Do not perform any I/O. This is useful for testing the performance of the uring implementation itself.
nop :: proc(ring: ^Ring, user_data: u64) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .NOP
	sqe.user_data = user_data

	ok = true
	return
}

// Vectored read operation, see also readv(2).
readv :: proc(ring: ^Ring, user_data: u64, fd: linux.Fd, iovs: []linux.IO_Vec, off: u64) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .READV
	sqe.fd = fd
	sqe.addr = cast(u64)uintptr(raw_data(iovs))
	sqe.len = u32(len(iovs))
	sqe.off = off
	sqe.user_data = user_data

	ok = true
	return
}
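
// A minimal end-to-end sketch using `nop` from above: queue a single no-op, submit it,
// and wait for its completion. Hypothetical caller-side code, not part of this file;
// it only uses procedures defined in this package (the `example_` name is illustrative)
// and assumes `import "core:sys/linux"` and `import "core:sys/linux/uring"`.
example_nop_roundtrip :: proc() {
	ring: uring.Ring
	params := uring.DEFAULT_PARAMS
	err := uring.init(&ring, &params)
	assert(err == nil)
	defer uring.destroy(&ring)

	// Acquire an SQE for a no-op; user_data tags the completion.
	_, ok := uring.nop(&ring, 42)
	assert(ok, "submission queue is full")

	// Submit and wait for one completion in the same enter call.
	n, serr := uring.submit(&ring, 1)
	assert(serr == nil && n == 1)

	// Copy the CQE out; res is 0 for a successful NOP.
	cqe: [1]linux.IO_Uring_CQE
	got := uring.copy_cqes_ready(&ring, cqe[:])
	assert(got == 1 && cqe[0].user_data == 42 && cqe[0].res == 0)
}
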
// Vectored write operation, see also writev(2).
writev :: proc(ring: ^Ring, user_data: u64, fd: linux.Fd, iovs: []linux.IO_Vec, off: u64) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .WRITEV
	sqe.fd = fd
	sqe.addr = cast(u64)uintptr(raw_data(iovs))
	sqe.len = u32(len(iovs))
	sqe.off = off
	sqe.user_data = user_data

	ok = true
	return
}

read_fixed :: proc() {
	unimplemented()
}

write_fixed :: proc() {
	unimplemented()
}

/*
File sync. See also fsync(2).

Optionally off and len can be used to specify a range within the file to be synced rather than syncing the entire file, which is the default behavior.

Note that, while I/O is initiated in the order in which it appears in the submission queue, completions are unordered.
For example, an application which places a write I/O followed by an fsync in the submission queue cannot expect the fsync to apply to the write.
The two operations execute in parallel, so the fsync may complete before the write is issued to the storage.
The same is also true for previously issued writes that have not completed prior to the fsync.
To enforce ordering one may utilize linked SQEs (a sketch follows further below), IOSQE_IO_DRAIN,
or wait for the arrival of CQEs of requests which have to be ordered before a given request before submitting its SQE.
*/
fsync :: proc(ring: ^Ring, user_data: u64, fd: linux.Fd, flags: linux.IO_Uring_Fsync_Flags) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .FSYNC
	sqe.fsync_flags = flags
	sqe.fd = fd
	sqe.user_data = user_data

	ok = true
	return
}

/*
Poll the fd specified in the submission queue entry for the events specified in the poll_events field.

Unlike poll or epoll without EPOLLONESHOT, by default this interface always works in one shot mode.
That is, once the poll operation is completed, it will have to be resubmitted.

If IORING_POLL_ADD_MULTI is set in the SQE len field, then the poll will work in multi shot mode instead.
That means it'll repeatedly trigger when the requested event becomes true, and hence multiple CQEs can be generated from this single SQE.
The CQE flags field will have IORING_CQE_F_MORE set on completion if the application should expect further CQE entries from the original request.
If this flag isn't set on completion, then the poll request has been terminated and no further events will be generated.
This mode is available since 5.13.

This command works like an async poll(2) and the completion event result is the returned mask of events.

Without IORING_POLL_ADD_MULTI, and for the initial poll operation with IORING_POLL_ADD_MULTI, the operation is level triggered:
if there is data ready or events pending at the time of submission, a corresponding CQE will be posted.
Potential further completions beyond the first caused by IORING_POLL_ADD_MULTI are edge triggered.
*/
poll_add :: proc(ring: ^Ring, user_data: u64, fd: linux.Fd, events: linux.Fd_Poll_Events, flags: linux.IO_Uring_Poll_Add_Flags) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .POLL_ADD
	sqe.fd = fd
	sqe.poll_events = events
	sqe.poll_flags = flags
	sqe.user_data = user_data

	ok = true
	return
}
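
// Usage sketch for poll_add (hypothetical caller-side code, not part of this file):
// a one-shot poll for readability; the CQE's res holds the returned event mask.
// Assumes the Fd_Poll_Events member for POLLIN is named .IN and, for multishot mode,
// that the Poll_Add_Flags bit is named .ADD_MULTI; check core:sys/linux for the exact names.
example_poll_readable :: proc(ring: ^uring.Ring, sock: linux.Fd) {
	// One-shot by default; pass {.ADD_MULTI} (name assumed) instead of {} for multishot.
	_, ok := uring.poll_add(ring, 7, sock, {.IN}, {})
	assert(ok, "submission queue is full")

	// Submit and wait for the poll to trigger.
	_, err := uring.submit(ring, 1)
	assert(err == nil)

	cqe: [1]linux.IO_Uring_CQE
	if uring.copy_cqes_ready(ring, cqe[:]) == 1 {
		// cqe[0].res is the ready-events mask, or a negative errno on failure.
		assert(cqe[0].res >= 0)
	}
}
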
/*
Remove an existing poll request.

If found, the res field of the struct io_uring_cqe will contain 0.
If not found, res will contain -ENOENT, or -EALREADY if the poll request was in the process of completing already.
*/
poll_remove :: proc(ring: ^Ring, user_data: u64, fd: linux.Fd, events: linux.Fd_Poll_Events) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .POLL_REMOVE
	sqe.fd = fd
	sqe.poll_events = events
	sqe.user_data = user_data

	ok = true
	return
}

/*
Update the events of an existing poll request.

The request will update an existing poll request with the mask of events passed in with this request.
The lookup is based on the user_data field of the original SQE submitted.

Updating an existing poll is available since 5.13.
*/
poll_update_events :: proc(ring: ^Ring, user_data: u64, orig_user_data: u64, fd: linux.Fd, events: linux.Fd_Poll_Events) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .POLL_REMOVE
	sqe.fd = fd
	sqe.addr = orig_user_data
	sqe.poll_events = events
	sqe.user_data = user_data
	sqe.poll_flags = {.UPDATE_EVENTS}

	ok = true
	return
}

/*
Update the user data of an existing poll request.

The request will update the user_data of an existing poll request based on the value passed.

Updating an existing poll is available since 5.13.
*/
poll_update_user_data :: proc(ring: ^Ring, user_data: u64, orig_user_data: u64, new_user_data: u64, fd: linux.Fd) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .POLL_REMOVE
	sqe.fd = fd
	sqe.off = orig_user_data
	sqe.addr = new_user_data
	sqe.user_data = user_data
	sqe.poll_flags = {.UPDATE_USER_DATA}

	ok = true
	return
}

/*
Add, remove or modify entries in the interest list of epoll(7).

See epoll_ctl(2) for details of the system call.

Available since 5.6.
*/
epoll_ctl :: proc(ring: ^Ring, user_data: u64, epfd: linux.Fd, op: linux.EPoll_Ctl_Opcode, fd: linux.Fd, event: ^linux.EPoll_Event) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .EPOLL_CTL
	sqe.fd = epfd
	sqe.off = u64(fd)
	sqe.epoll_ctl_op = op
	sqe.addr = cast(u64)uintptr(event)
	sqe.user_data = user_data

	ok = true
	return
}

sync_file_range :: proc() {
	unimplemented()
}

/*
Issue the equivalent of a sendmsg(2) system call.

See also sendmsg(2) for the general description of the related system call.

poll_first: If set, uring will assume the socket is currently full and attempting to send data will be unsuccessful.
For this case, uring will arm internal poll and trigger a send of the data when there is enough space available.
This initial send attempt can be wasteful for the case where the socket is expected to be full;
setting this flag bypasses the initial send attempt and goes straight to arming poll.
If poll does indicate that data can be sent, the operation will proceed.

Available since 5.3.
*/
sendmsg :: proc(ring: ^Ring, user_data: u64, fd: linux.Fd, msghdr: ^linux.Msg_Hdr, flags: linux.Socket_Msg, poll_first := false) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .SENDMSG
	sqe.fd = fd
	sqe.addr = cast(u64)uintptr(msghdr)
	sqe.msg_flags = flags
	sqe.user_data = user_data
	sqe.sq_send_recv_flags = {.RECVSEND_POLL_FIRST} if poll_first else {}

	ok = true
	return
}
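
// Ordering sketch for the fsync note above (hypothetical caller-side code, not part of
// this file): link a write to the fsync that follows, so the fsync starts only after
// the write completes. Assumes the SQE's generic flags field is named `flags` with an
// `.IO_LINK` bit (IOSQE_IO_LINK); check core:sys/linux for the exact names.
example_write_then_fsync :: proc(ring: ^uring.Ring, fd: linux.Fd, data: []byte) {
	wsqe, wok := uring.write(ring, 1, fd, data, 0)
	assert(wok)
	wsqe.flags += {.IO_LINK} // Field and bit names assumed; links to the next SQE.

	_, fok := uring.fsync(ring, 2, fd, {})
	assert(fok)

	// Submit both and wait for both completions.
	_, err := uring.submit(ring, 2)
	assert(err == nil)
}
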
/*
Works just like sendmsg, but receives instead of sends.

poll_first: If set, uring will assume the socket is currently empty and attempting to receive data will be unsuccessful.
For this case, uring will arm internal poll and trigger a receive of the data when the socket has data to be read.
This initial receive attempt can be wasteful for the case where the socket is expected to be empty;
setting this flag bypasses the initial receive attempt and goes straight to arming poll.
If poll does indicate that data is ready to be received, the operation will proceed.

Available since 5.3.
*/
recvmsg :: proc(ring: ^Ring, user_data: u64, fd: linux.Fd, msghdr: ^linux.Msg_Hdr, flags: linux.Socket_Msg, poll_first := false) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .RECVMSG
	sqe.fd = fd
	sqe.addr = cast(u64)uintptr(msghdr)
	sqe.msg_flags = flags
	sqe.user_data = user_data
	sqe.sq_send_recv_flags = {.RECVSEND_POLL_FIRST} if poll_first else {}

	ok = true
	return
}

/*
Issue the equivalent of a send(2) system call.

See also send(2) for the general description of the related system call.

poll_first: If set, uring will assume the socket is currently full and attempting to send data will be unsuccessful.
For this case, uring will arm internal poll and trigger a send of the data when there is enough space available.
This initial send attempt can be wasteful for the case where the socket is expected to be full;
setting this flag bypasses the initial send attempt and goes straight to arming poll.
If poll does indicate that data can be sent, the operation will proceed.

Available since 5.6.
*/
send :: proc(ring: ^Ring, user_data: u64, sockfd: linux.Fd, buf: []byte, flags: linux.Socket_Msg, poll_first := false) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .SEND
	sqe.fd = sockfd
	sqe.addr = cast(u64)uintptr(raw_data(buf))
	sqe.len = u32(len(buf))
	sqe.msg_flags = flags
	sqe.user_data = user_data
	sqe.sq_send_recv_flags = {.RECVSEND_POLL_FIRST} if poll_first else {}

	ok = true
	return
}

sendto :: proc(ring: ^Ring, user_data: u64, sockfd: linux.Fd, buf: []byte, flags: linux.Socket_Msg, dest: ^$T, poll_first := false) -> (sqe: ^linux.IO_Uring_SQE, ok: bool)
	where T == linux.Sock_Addr_In || T == linux.Sock_Addr_In6 || T == linux.Sock_Addr_Un || T == linux.Sock_Addr_Any {

	sqe = send(ring, user_data, sockfd, buf, flags, poll_first) or_return
	sqe.addr2 = u64(uintptr(dest))
	sqe.addr_len = u16(size_of(T))

	ok = true
	return
}

/*
Works just like send, but receives instead of sends.

poll_first: If set, uring will assume the socket is currently empty and attempting to receive data will be unsuccessful.
For this case, uring will arm internal poll and trigger a receive of the data when the socket has data to be read.
This initial receive attempt can be wasteful for the case where the socket is expected to be empty;
setting this flag bypasses the initial receive attempt and goes straight to arming poll.
If poll does indicate that data is ready to be received, the operation will proceed.

Available since 5.6.
*/
recv :: proc(ring: ^Ring, user_data: u64, sockfd: linux.Fd, buf: []byte, flags: linux.Socket_Msg, poll_first := false) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .RECV
	sqe.fd = sockfd
	sqe.addr = cast(u64)uintptr(raw_data(buf))
	sqe.len = u32(len(buf))
	sqe.msg_flags = flags
	sqe.user_data = user_data
	sqe.sq_send_recv_flags = {.RECVSEND_POLL_FIRST} if poll_first else {}

	ok = true
	return
}
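
// Usage sketch for recv (hypothetical caller-side code, not part of this file): queue
// a receive with poll_first to skip the initial attempt on a socket expected to be
// empty. Only names defined in this package and core:sys/linux's types are used.
example_recv_poll_first :: proc(ring: ^uring.Ring, sock: linux.Fd, buf: []byte) {
	_, ok := uring.recv(ring, 3, sock, buf, {}, poll_first = true)
	assert(ok, "submission queue is full")

	_, err := uring.submit(ring, 1)
	assert(err == nil)

	cqe: [1]linux.IO_Uring_CQE
	if uring.copy_cqes_ready(ring, cqe[:]) == 1 {
		// res is the number of bytes received, or a negative errno.
		assert(cqe[0].res >= 0)
	}
}
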
/*
Register a timeout operation.

The timeout will complete when either the timeout expires, or after the specified number of
events complete (if `count` is greater than `0`).

`flags` may be `0` for a relative timeout, or `IORING_TIMEOUT_ABS` for an absolute timeout.

The completion event result will be `-ETIME` if the timeout completed through expiration,
`0` if the timeout completed after the specified number of events, or `-ECANCELED` if the
timeout was removed before it expired.

uring timeouts use the `CLOCK.MONOTONIC` clock source.
*/
timeout :: proc(ring: ^Ring, user_data: u64, ts: ^linux.Time_Spec, count: u32, flags: linux.IO_Uring_Timeout_Flags) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .TIMEOUT
	sqe.fd = -1
	sqe.addr = cast(u64)uintptr(ts)
	sqe.len = 1
	sqe.off = u64(count)
	sqe.timeout_flags = flags
	sqe.user_data = user_data

	ok = true
	return
}

/*
Remove an existing timeout operation.

The timeout is identified by its `user_data`.

The completion event result will be `0` if the timeout was found and cancelled successfully,
`-EBUSY` if the timeout was found but expiration was already in progress, or
`-ENOENT` if the timeout was not found.
*/
timeout_remove :: proc(ring: ^Ring, user_data: u64, timeout_user_data: u64, flags: linux.IO_Uring_Timeout_Flags) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .TIMEOUT_REMOVE
	sqe.fd = -1
	sqe.addr = timeout_user_data
	sqe.timeout_flags = flags
	sqe.user_data = user_data

	ok = true
	return
}

/*
Issue the equivalent of an accept4(2) system call.

See also accept4(2) for the general description of the related system call.

If the file_index field is set to a positive number, the file won't be installed into the normal file table as usual
but will be placed into the fixed file table at index file_index - 1.
In this case, instead of returning a file descriptor, the result will contain either 0 on success or an error.
If the index points to a valid empty slot, the installation is guaranteed to not fail.
If there is already a file in the slot, it will be replaced, similar to IORING_OP_FILES_UPDATE.
Please note that only uring has access to such files and no other syscall can use them. See IOSQE_FIXED_FILE and IORING_REGISTER_FILES.

Available since 5.5.
*/
accept :: proc(ring: ^Ring, user_data: u64, sockfd: linux.Fd, addr: ^$T, addr_len: ^i32, flags: linux.Socket_FD_Flags, file_index: u32 = 0) -> (sqe: ^linux.IO_Uring_SQE, ok: bool)
where T == linux.Sock_Addr_In || T == linux.Sock_Addr_In6 || T == linux.Sock_Addr_Un || T == linux.Sock_Addr_Any {

	sqe = get_sqe(ring) or_return
	sqe.opcode = .ACCEPT
	sqe.fd = sockfd
	sqe.addr = cast(u64)uintptr(addr)
	sqe.off = cast(u64)uintptr(addr_len)
	sqe.accept_flags = flags
	sqe.user_data = user_data
	sqe.file_index = file_index

	ok = true
	return
}
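
// Usage sketch for accept (hypothetical caller-side code, not part of this file): all
// type names come from this package and this diff; the kernel fills in the address on
// completion, and res carries the accepted socket's descriptor.
example_accept :: proc(ring: ^uring.Ring, listen_fd: linux.Fd) {
	addr: linux.Sock_Addr_Any
	addr_len := i32(size_of(linux.Sock_Addr_Any))

	_, ok := uring.accept(ring, 4, listen_fd, &addr, &addr_len, {})
	assert(ok, "submission queue is full")

	_, err := uring.submit(ring, 1)
	assert(err == nil)

	cqe: [1]linux.IO_Uring_CQE
	if uring.copy_cqes_ready(ring, cqe[:]) == 1 && cqe[0].res >= 0 {
		client := linux.Fd(cqe[0].res) // The accepted connection's file descriptor.
		_ = client
	}
}
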
/*
Attempt to cancel an already issued request.

The request is identified by its user data.

The cancellation request will complete with one of the following result codes.

If found, the res field of the cqe will contain 0.
If not found, res will contain -ENOENT.

If found and cancellation was attempted, the res field will contain -EALREADY.
In this case, the request may or may not terminate.
In general, requests that are interruptible (like socket IO) will get canceled, while disk IO requests cannot be canceled if already started.

Available since 5.5.
*/
async_cancel :: proc(ring: ^Ring, orig_user_data: u64, user_data: u64) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .ASYNC_CANCEL
	sqe.addr = orig_user_data
	sqe.user_data = user_data

	ok = true
	return
}

/*
Adds a link timeout operation.

You need to set linux.IOSQE_IO_LINK in the flags of the target operation
and then call this method right after the target operation.
See https://lwn.net/Articles/803932/ for details.

If the dependent request finishes before the linked timeout, the timeout
is canceled. If the timeout finishes before the dependent request, the
dependent request will be canceled.

The completion event result of the link_timeout will be
`-ETIME` if the timeout finishes before the dependent request
(in this case, the completion event result of the dependent request will
be `-ECANCELED`), or
`-EALREADY` if the dependent request finishes before the linked timeout.

Available since 5.5.
*/
link_timeout :: proc(ring: ^Ring, user_data: u64, ts: ^linux.Time_Spec, flags: linux.IO_Uring_Timeout_Flags) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring, 0) or_return
	sqe.opcode = .LINK_TIMEOUT
	sqe.fd = -1
	sqe.addr = cast(u64)uintptr(ts)
	sqe.len = 1
	sqe.timeout_flags = flags
	sqe.user_data = user_data

	ok = true
	return
}

/*
Issue the equivalent of a connect(2) system call.

See also connect(2) for the general description of the related system call.

Available since 5.5.
*/
connect :: proc(ring: ^Ring, user_data: u64, sockfd: linux.Fd, addr: ^$T) -> (sqe: ^linux.IO_Uring_SQE, ok: bool)
where T == linux.Sock_Addr_In || T == linux.Sock_Addr_In6 || T == linux.Sock_Addr_Un || T == linux.Sock_Addr_Any {

	sqe = get_sqe(ring) or_return
	sqe.opcode = .CONNECT
	sqe.fd = sockfd
	sqe.addr = cast(u64)uintptr(addr)
	sqe.off = size_of(T)
	sqe.user_data = user_data

	ok = true
	return
}

fallocate :: proc() {
	unimplemented()
}

fadvise :: proc() {
	unimplemented()
}

/*
Issue the equivalent of a madvise(2) system call.

See also madvise(2) for the general description of the related system call.

Available since 5.6.
*/
madvise :: proc(ring: ^Ring, user_data: u64, addr: rawptr, size: u32, advise: linux.MAdvice) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .MADVISE
	sqe.addr = u64(uintptr(addr))
	sqe.len = size
	sqe.fadvise_advice = cast(u32)transmute(int)advise
	sqe.user_data = user_data

	ok = true
	return
}
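
// Usage sketch for link_timeout (hypothetical caller-side code, not part of this file),
// following the doc above: mark the target op as linked, then add the timeout right
// after it. Assumes the SQE's generic flags field/bit are named `flags`/`.IO_LINK`
// (IOSQE_IO_LINK) and that linux.Time_Spec has a `time_sec` field; check
// core:sys/linux for the exact names.
example_read_with_deadline :: proc(ring: ^uring.Ring, fd: linux.Fd, buf: []byte) {
	rsqe, rok := uring.read(ring, 5, fd, buf, 0)
	assert(rok)
	rsqe.flags += {.IO_LINK} // Names assumed; links the read to the timeout below.

	ts := linux.Time_Spec{time_sec = 1} // 1 second deadline; field name assumed.
	_, tok := uring.link_timeout(ring, 6, &ts, {})
	assert(tok)

	// If the read takes longer than a second it completes with -ECANCELED,
	// and the link_timeout completes with -ETIME.
	_, err := uring.submit(ring, 2)
	assert(err == nil)
}
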
/*
Issue the equivalent of an openat(2) system call.

See also openat(2) for the general description of the related system call.

Available since 5.6.

If the file_index is set to a positive number,
the file won't be installed into the normal file table as usual but will be placed into the fixed file table at index file_index - 1.
In this case, instead of returning a file descriptor, the result will contain either 0 on success or an error.
If the index points to a valid empty slot, the installation is guaranteed to not fail.
If there is already a file in the slot, it will be replaced, similar to IORING_OP_FILES_UPDATE.
Please note that only uring has access to such files and no other syscall can use them.
See IOSQE_FIXED_FILE and IORING_REGISTER_FILES.

Direct descriptors (file_index) are available since 5.15.
*/
openat :: proc(ring: ^Ring, user_data: u64, dirfd: linux.Fd, path: cstring, mode: linux.Mode, flags: linux.Open_Flags, file_index: u32 = 0) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .OPENAT
	sqe.fd = dirfd
	sqe.addr = cast(u64)transmute(uintptr)path
	sqe.len = transmute(u32)mode
	sqe.open_flags = flags
	sqe.user_data = user_data
	sqe.file_index = file_index

	ok = true
	return
}

openat2 :: proc() {
	unimplemented()
}

/*
Issue the equivalent of a close(2) system call.

See also close(2) for the general description of the related system call.

Available since 5.6.

If the file_index field is set to a positive number, this command can be used to close files that were
direct opened through IORING_OP_OPENAT, IORING_OP_OPENAT2, or IORING_OP_ACCEPT using the uring specific direct descriptors.
Note that only one of the descriptor fields may be set.
The direct close feature is available since the 5.15 kernel, where direct descriptors were introduced.
*/
close :: proc(ring: ^Ring, user_data: u64, fd: linux.Fd, file_index: u32 = 0) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .CLOSE
	sqe.fd = fd
	sqe.user_data = user_data
	sqe.file_index = file_index

	ok = true
	return
}

/*
Issue the equivalent of a statx(2) system call.

See also statx(2) for the general description of the related system call.

Available since 5.6.
*/
statx :: proc(ring: ^Ring, user_data: u64, dirfd: linux.Fd, pathname: cstring, flags: linux.FD_Flags, mask: linux.Statx_Mask, buf: ^linux.Statx) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .STATX
	sqe.fd = dirfd
	sqe.addr = cast(u64)transmute(uintptr)pathname
	sqe.statx_flags = flags
	sqe.statx_mask = mask
	sqe.statx = buf
	sqe.user_data = user_data

	ok = true
	return
}

/*
Issue the equivalent of a pread(2) system call.

If offset is set to -1, the offset will use (and advance) the file position, like the read(2) system call.
This is the non-vectored version of the IORING_OP_READV opcode.
See also read(2) for the general description of the related system call.

Available since 5.6.
*/
read :: proc(ring: ^Ring, user_data: u64, fd: linux.Fd, buf: []u8, offset: u64) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .READ
	sqe.fd = fd
	sqe.addr = cast(u64)uintptr(raw_data(buf))
	sqe.len = u32(len(buf))
	sqe.off = offset
	sqe.user_data = user_data

	ok = true
	return
}

/*
Issue the equivalent of a pwrite(2) system call.

If offset is set to -1, the offset will use (and advance) the file position, like the write(2) system call.
This is the non-vectored version of the IORING_OP_WRITEV opcode.
See also write(2) for the general description of the related system call.

Available since 5.6.
*/
write :: proc(ring: ^Ring, user_data: u64, fd: linux.Fd, buf: []u8, offset: u64) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .WRITE
	sqe.fd = fd
	sqe.addr = cast(u64)uintptr(raw_data(buf))
	sqe.len = u32(len(buf))
	sqe.off = offset
	sqe.user_data = user_data

	ok = true
	return
}
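
// Usage sketch for the offset = -1 behavior described above (hypothetical caller-side
// code, not part of this file): since the offset parameter is a u64, -1 is passed via
// a transmute; the read then uses and advances the file position.
example_read_at_file_pos :: proc(ring: ^uring.Ring, fd: linux.Fd, buf: []byte) {
	// -1 as u64 selects the current file position (and advances it).
	_, ok := uring.read(ring, 1, fd, buf, transmute(u64)i64(-1))
	assert(ok, "submission queue is full")

	_, err := uring.submit(ring)
	assert(err == nil)
}
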
/*
Issue the equivalent of a splice(2) system call.

A sentinel value of -1 is used to pass the equivalent of a NULL for the offsets to splice(2).

Please note that one of the file descriptors must refer to a pipe.
See also splice(2) for the general description of the related system call.

Available since 5.7.
*/
splice :: proc(ring: ^Ring, user_data: u64, fd_in: linux.Fd, off_in: i64, fd_out: linux.Fd, off_out: i64, len: u32, flags: linux.IO_Uring_Splice_Flags) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .SPLICE
	sqe.splice_fd_in = fd_in
	sqe.splice_off_in = cast(u64)off_in
	sqe.fd = fd_out
	sqe.off = cast(u64)off_out
	sqe.len = len
	sqe.splice_flags = flags
	sqe.user_data = user_data

	ok = true
	return
}

/*
Issue the equivalent of a tee(2) system call.

Please note that both of the file descriptors must refer to a pipe.
See also tee(2) for the general description of the related system call.

Available since 5.8.
*/
tee :: proc(ring: ^Ring, user_data: u64, fd_in: linux.Fd, fd_out: linux.Fd, len: u32, flags: linux.IO_Uring_Splice_Flags) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .TEE
	sqe.splice_fd_in = fd_in
	sqe.fd = fd_out
	sqe.len = len
	sqe.splice_flags = flags
	sqe.user_data = user_data

	ok = true
	return
}

/*
This command is an alternative to using IORING_REGISTER_FILES_UPDATE, which works in an async fashion, like the rest of the uring commands.

Note that the array of file descriptors pointed to in addr must remain valid until this operation has completed.

Available since 5.6.
*/
files_update :: proc(ring: ^Ring, user_data: u64, fds: []linux.Fd, off: u64) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .FILES_UPDATE
	sqe.addr = cast(u64)uintptr(raw_data(fds))
	sqe.len = cast(u32)len(fds)
	sqe.off = off
	sqe.user_data = user_data

	ok = true
	return
}

provide_buffers :: proc() {
	unimplemented()
}

remove_buffers :: proc() {
	unimplemented()
}

/*
Issue the equivalent of a shutdown(2) system call.

Available since 5.11.
*/
shutdown :: proc(ring: ^Ring, user_data: u64, fd: linux.Fd, how: linux.Shutdown_How) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .SHUTDOWN
	sqe.fd = fd
	sqe.shutdown_how = how
	sqe.user_data = user_data

	ok = true
	return
}

renameat :: proc() {
	unimplemented()
}

unlinkat :: proc() {
	unimplemented()
}

mkdirat :: proc() {
	unimplemented()
}

symlinkat :: proc() {
	unimplemented()
}

linkat :: proc() {
	unimplemented()
}

msg_ring :: proc() {
	unimplemented()
}
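
// Usage sketch for splice (hypothetical caller-side code, not part of this file): move
// up to 4096 bytes from a pipe into a file, using -1 as the NULL-offset sentinel for
// the pipe side as described in the doc above.
example_splice_pipe_to_file :: proc(ring: ^uring.Ring, pipe_r: linux.Fd, file: linux.Fd) {
	_, ok := uring.splice(ring, 8, pipe_r, -1, file, 0, 4096, {})
	assert(ok, "submission queue is full")

	// The CQE's res will hold the number of bytes spliced.
	_, err := uring.submit(ring, 1)
	assert(err == nil)
}
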
/*
Issue the equivalent of a socket(2) system call.

See also socket(2) for the general description of the related system call.

Available since 5.19.

If the file_index field is set to a positive number, the file won't be installed into the normal file
table as usual but will be placed into the fixed file table at index file_index - 1.
In this case, instead of returning a file descriptor, the result will contain either 0 on success or an error.
If the index points to a valid empty slot, the installation is guaranteed to not fail.
If there is already a file in the slot, it will be replaced, similar to IORING_OP_FILES_UPDATE.
Please note that only uring has access to such files and no other syscall can use them.
See IOSQE_FIXED_FILE and IORING_REGISTER_FILES.
*/
socket :: proc(ring: ^Ring, user_data: u64, domain: linux.Address_Family, socktype: linux.Socket_Type, protocol: linux.Protocol, file_index: u32 = 0) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .SOCKET
	sqe.user_data = user_data
	sqe.fd = cast(linux.Fd)domain
	sqe.off = cast(u64)socktype
	sqe.len = cast(u32)protocol
	sqe.rw_flags = {}
	sqe.file_index = file_index

	ok = true
	return
}

uring_cmd :: proc() {
	unimplemented()
}

send_zc :: proc() {
	unimplemented()
}

sendmsg_zc :: proc() {
	unimplemented()
}

waitid :: proc() {
	unimplemented()
}

setxattr :: proc() {
	unimplemented()
}

getxattr :: proc() {
	unimplemented()
}

fsetxattr :: proc() {
	unimplemented()
}

fgetxattr :: proc() {
	unimplemented()
}

/*
Issues the equivalent of the bind(2) system call.

Available since 6.11.
*/
bind :: proc(ring: ^Ring, user_data: u64, sock: linux.Fd, addr: ^$T) -> (sqe: ^linux.IO_Uring_SQE, ok: bool)
	where
	T == linux.Sock_Addr_In ||
	T == linux.Sock_Addr_In6 ||
	T == linux.Sock_Addr_Un ||
	T == linux.Sock_Addr_Any
{
	sqe = get_sqe(ring) or_return
	sqe.opcode = .BIND
	sqe.user_data = user_data
	sqe.fd = sock
	sqe.addr = cast(u64)uintptr(addr)
	sqe.addr2 = size_of(T)

	ok = true
	return
}

/*
Issues the equivalent of the listen(2) system call.

fd must contain the file descriptor of the socket; the backlog, i.e. the maximum number
of pending queued connections, is carried in the SQE's addr field (set from `backlog` here).

Available since 6.11.
*/
listen :: proc(ring: ^Ring, user_data: u64, fd: linux.Fd, backlog: u64) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sqe = get_sqe(ring) or_return
	sqe.opcode = .LISTEN
	sqe.user_data = user_data
	sqe.fd = fd
	sqe.addr = backlog

	ok = true
	return
}

ftruncate :: proc() {
	unimplemented()
}

read_multishot :: proc() {
	unimplemented()
}

futex_wait :: proc() {
	unimplemented()
}

futex_wake :: proc() {
	unimplemented()
}

futex_waitv :: proc() {
	unimplemented()
}

fixed_fd_install :: proc() {
	unimplemented()
}

fixed_file :: proc() {
	unimplemented()
}
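
// Usage sketch for bind/listen (hypothetical caller-side code, not part of this
// package): set up a listening TCP socket entirely through the ring. Assumes
// Sock_Addr_In's family field is named `sin_family` with an `.INET` member; check
// core:sys/linux for the exact names. A robust program would also check each CQE's res.
example_listen :: proc(ring: ^uring.Ring, sock: linux.Fd) {
	addr: linux.Sock_Addr_In
	addr.sin_family = .INET // Field/member names assumed; port 0 lets the kernel pick.

	_, bok := uring.bind(ring, 1, sock, &addr)
	assert(bok)
	_, err := uring.submit(ring, 1) // Wait for the bind to complete before listening.
	assert(err == nil)

	_, lok := uring.listen(ring, 2, sock, backlog = 128)
	assert(lok)
	_, err2 := uring.submit(ring, 1)
	assert(err2 == nil)
}
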
diff --git a/core/sys/linux/uring/uring.odin b/core/sys/linux/uring/uring.odin
new file mode 100644
index 000000000..8e4315e6a
--- /dev/null
+++ b/core/sys/linux/uring/uring.odin
@@ -0,0 +1,294 @@
package uring

import "core:math"
import "core:sync"
import "core:sys/linux"

DEFAULT_THREAD_IDLE_MS :: 1000
DEFAULT_ENTRIES :: 32
MAX_ENTRIES :: 4096

Ring :: struct {
	fd:       linux.Fd,
	sq:       Submission_Queue,
	cq:       Completion_Queue,
	flags:    linux.IO_Uring_Setup_Flags,
	features: linux.IO_Uring_Features,
}

DEFAULT_PARAMS :: linux.IO_Uring_Params {
	sq_thread_idle = DEFAULT_THREAD_IDLE_MS,
}

// Initialize and set up a uring, `entries` must be a power of 2 between 1 and 4096.
init :: proc(ring: ^Ring, params: ^linux.IO_Uring_Params, entries: u32 = DEFAULT_ENTRIES) -> (err: linux.Errno) {
	assert(entries <= MAX_ENTRIES, "too many entries")
	assert(entries != 0, "entries must be positive")
	assert(math.is_power_of_two(int(entries)), "entries must be a power of two")

	fd := linux.io_uring_setup(entries, params) or_return
	defer if err != nil { linux.close(fd) }

	if .SINGLE_MMAP not_in params.features {
		// NOTE: Could support this, but currently isn't.
		err = .ENOSYS
		return
	}

	assert(.CQE32 not_in params.flags, "unsupported flag")  // NOTE: Could support this by making IO_Uring generic.
	assert(.SQE128 not_in params.flags, "unsupported flag") // NOTE: Could support this by making IO_Uring generic.

	sq := submission_queue_make(fd, params) or_return

	ring.fd = fd
	ring.sq = sq
	ring.cq = completion_queue_make(fd, params, &sq)
	ring.flags = params.flags
	ring.features = params.features

	return
}

destroy :: proc(ring: ^Ring) {
	assert(ring.fd >= 0)
	submission_queue_destroy(&ring.sq)
	linux.close(ring.fd)
	ring.fd = -1
}

// Returns a pointer to a vacant submission queue entry, or nil if the submission queue is full.
// NOTE: `extra` lets you make sure there is space left for related entries; it defaults to 1 so
// a link timeout op can always be added after another.
get_sqe :: proc(ring: ^Ring, extra: int = 1) -> (sqe: ^linux.IO_Uring_SQE, ok: bool) {
	sq := &ring.sq
	head: u32 = sync.atomic_load_explicit(sq.head, .Acquire)
	next := sq.sqe_tail + 1

	if int(next - head) > len(sq.sqes)-extra {
		sqe = nil
		ok = false
		return
	}

	sqe = &sq.sqes[sq.sqe_tail & sq.mask]
	sqe^ = {}

	sq.sqe_tail = next
	ok = true
	return
}

free_space :: proc(ring: ^Ring) -> int {
	sq := &ring.sq
	head := sync.atomic_load_explicit(sq.head, .Acquire)
	next := sq.sqe_tail + 1
	free := len(sq.sqes) - int(next - head)
	assert(free >= 0)
	return free
}

// Sync internal state with kernel ring state on the submission queue side.
// Returns the number of all pending events in the submission queue.
// The rationale is to determine whether an enter call is needed.
flush_sq :: proc(ring: ^Ring) -> (n_pending: u32) {
	sq := &ring.sq
	to_submit := sq.sqe_tail - sq.sqe_head
	if to_submit != 0 {
		tail := sq.tail^
		i: u32 = 0
		for ; i < to_submit; i += 1 {
			sq.array[tail & sq.mask] = sq.sqe_head & sq.mask
			tail += 1
			sq.sqe_head += 1
		}
		sync.atomic_store_explicit(sq.tail, tail, .Release)
	}
	n_pending = sq_ready(ring)
	return
}

// Returns true if we are not using an SQ thread (thus nobody submits but us),
// or if IORING_SQ_NEED_WAKEUP is set and the SQ thread must be explicitly awakened.
// For the latter case, we set the SQ thread wakeup flag.
// Matches the implementation of sq_ring_needs_enter() in liburing.
sq_ring_needs_enter :: proc(ring: ^Ring, flags: ^linux.IO_Uring_Enter_Flags) -> bool {
	assert(flags^ == {})
	if .SQPOLL not_in ring.flags { return true }
	if .NEED_WAKEUP in sync.atomic_load_explicit(ring.sq.flags, .Relaxed) {
		flags^ += {.SQ_WAKEUP}
		return true
	}
	return false
}

// Submits the submission queue entries acquired via get_sqe().
// Returns the number of entries submitted.
// Optionally wait for a number of events by setting `wait_nr`, and/or set a maximum wait time by setting `timeout`.
submit :: proc(ring: ^Ring, wait_nr: u32 = 0, timeout: ^linux.Time_Spec = nil) -> (n_submitted: u32, err: linux.Errno) {
	n_submitted = flush_sq(ring)
	flags: linux.IO_Uring_Enter_Flags
	if sq_ring_needs_enter(ring, &flags) || wait_nr > 0 {
		if wait_nr > 0 || .IOPOLL in ring.flags {
			flags += {.GETEVENTS}
		}

		flags += {.EXT_ARG}
		ext: linux.IO_Uring_Getevents_Arg
		ext.ts = timeout

		n_submitted_: int
		n_submitted_, err = linux.io_uring_enter2(ring.fd, n_submitted, wait_nr, flags, &ext)
		assert(n_submitted_ >= 0)
		n_submitted = u32(n_submitted_)
	}
	return
}
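
// Batching sketch (hypothetical caller-side code, not part of this file): queue several
// SQEs, then pay for a single io_uring_enter and wait for all of them, mirroring the
// submit doc above. Uses only procedures defined in this package.
example_batched_submit :: proc(ring: ^uring.Ring, fd: linux.Fd, bufs: [][]byte) {
	queued: u32
	for buf, i in bufs {
		if uring.free_space(ring) == 0 { break } // Ring is full; submit what we have.
		_, ok := uring.read(ring, u64(i), fd, buf, 0)
		if !ok { break }
		queued += 1
	}

	// One syscall submits the whole batch and waits for all completions.
	n, err := uring.submit(ring, queued)
	assert(err == nil && n == queued)
}
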
// Returns the number of submission queue entries in the submission queue.
sq_ready :: proc(ring: ^Ring) -> u32 {
	// Always use the shared ring state (i.e. head and not sqe_head) to avoid going out of sync,
	// see https://github.com/axboe/liburing/issues/92.
	return ring.sq.sqe_tail - sync.atomic_load_explicit(ring.sq.head, .Acquire)
}

// Returns the number of completion queue entries in the completion queue (yet to consume).
cq_ready :: proc(ring: ^Ring) -> (n_ready: u32) {
	return sync.atomic_load_explicit(ring.cq.tail, .Acquire) - ring.cq.head^
}

// Copies as many CQEs as are ready, and that can fit into the destination `cqes` slice.
// If none are available, enters into the kernel to wait for at most `wait_nr` CQEs.
// Returns the number of CQEs copied, advancing the CQ ring.
// Provides all the wait/peek methods found in liburing, but with batching and a single method.
// TODO: allow for timeout.
copy_cqes :: proc(ring: ^Ring, cqes: []linux.IO_Uring_CQE, wait_nr: u32) -> (n_copied: u32, err: linux.Errno) {
	n_copied = copy_cqes_ready(ring, cqes)
	if n_copied > 0 { return }
	if wait_nr > 0 || cq_ring_needs_flush(ring) {
		_ = linux.io_uring_enter(ring.fd, 0, wait_nr, {.GETEVENTS}, nil) or_return
		n_copied = copy_cqes_ready(ring, cqes)
	}
	return
}

copy_cqes_ready :: proc(ring: ^Ring, cqes: []linux.IO_Uring_CQE) -> (n_copied: u32) {
	n_ready := cq_ready(ring)
	n_copied = min(u32(len(cqes)), n_ready)
	head := ring.cq.head^
	tail := head + n_copied
	shift := u32(.CQE32 in ring.flags)

	i := 0
	for head != tail {
		cqes[i] = ring.cq.cqes[(head & ring.cq.mask) << shift]
		head += 1
		i += 1
	}
	cq_advance(ring, n_copied)
	return
}

cq_ring_needs_flush :: proc(ring: ^Ring) -> bool {
	return .CQ_OVERFLOW in sync.atomic_load_explicit(ring.sq.flags, .Relaxed)
}

// Only for advanced use cases that implement custom completion queue methods.
// If you use copy_cqes() or copy_cqe() you must not call cqe_seen() or cq_advance().
// Must be called exactly once after a zero-copy CQE has been processed by your application.
// Not idempotent, calling more than once will result in other CQEs being lost.
// Matches the implementation of cqe_seen() in liburing.
cqe_seen :: proc(ring: ^Ring) {
	cq_advance(ring, 1)
}

// Only for advanced use cases that implement custom completion queue methods.
// Matches the implementation of cq_advance() in liburing.
cq_advance :: proc(ring: ^Ring, count: u32) {
	if count == 0 { return }
	sync.atomic_store_explicit(ring.cq.head, ring.cq.head^ + count, .Release)
}

Submission_Queue :: struct {
	head:      ^u32,
	tail:      ^u32,
	mask:      u32,
	flags:     ^linux.IO_Uring_Submission_Queue_Flags,
	dropped:   ^u32,
	array:     []u32,
	sqes:      []linux.IO_Uring_SQE,
	mmap:      []u8,
	mmap_sqes: []u8,

	// We use `sqe_head` and `sqe_tail` in the same way as liburing:
	// We increment `sqe_tail` (but not `tail`) for each call to `get_sqe()`.
	// We then set `tail` to `sqe_tail` once, only when these events are actually submitted.
	// This allows us to amortize the cost of the atomic store to `tail` across multiple SQEs.
	sqe_head: u32,
	sqe_tail: u32,
}
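
// Drain-loop sketch for copy_cqes above (hypothetical caller-side code, not part of
// this file): batch completions out of the ring, blocking for at least one CQE on each
// iteration until all outstanding operations have completed.
example_drain_completions :: proc(ring: ^uring.Ring, pending: ^int) {
	cqes: [16]linux.IO_Uring_CQE
	for pending^ > 0 {
		// Wait for at least one CQE, copying out up to 16 at a time.
		n, err := uring.copy_cqes(ring, cqes[:], 1)
		assert(err == nil)

		for cqe in cqes[:n] {
			assert(cqe.res >= 0, "operation failed")
			pending^ -= 1
		}
	}
}
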
submission_queue_make :: proc(fd: linux.Fd, params: ^linux.IO_Uring_Params) -> (sq: Submission_Queue, err: linux.Errno) {
	assert(fd >= 0, "uninitialized queue fd")
	assert(.SINGLE_MMAP in params.features, "unsupported feature") // NOTE: Could support this, but currently isn't.

	sq_size := params.sq_off.array + params.sq_entries * size_of(u32)
	cq_size := params.cq_off.cqes + params.cq_entries * size_of(linux.IO_Uring_CQE)
	size := max(sq_size, cq_size)

	// PERF: .POPULATE commits all pages right away, is that desired?

	cqe_map := cast([^]byte)(linux.mmap(0, uint(size), {.READ, .WRITE}, {.SHARED, .POPULATE}, fd, linux.IORING_OFF_SQ_RING) or_return)
	defer if err != nil { linux.munmap(cqe_map, uint(size)) }

	size_sqes := params.sq_entries * size_of(linux.IO_Uring_SQE)
	sqe_map := cast([^]byte)(linux.mmap(0, uint(size_sqes), {.READ, .WRITE}, {.SHARED, .POPULATE}, fd, linux.IORING_OFF_SQES) or_return)

	array := cast([^]u32)cqe_map[params.sq_off.array:]
	sqes := cast([^]linux.IO_Uring_SQE)sqe_map

	sq.head = cast(^u32)&cqe_map[params.sq_off.head]
	sq.tail = cast(^u32)&cqe_map[params.sq_off.tail]
	sq.mask = (cast(^u32)&cqe_map[params.sq_off.ring_mask])^
	sq.flags = cast(^linux.IO_Uring_Submission_Queue_Flags)&cqe_map[params.sq_off.flags]
	sq.dropped = cast(^u32)&cqe_map[params.sq_off.dropped]
	sq.array = array[:params.sq_entries]
	sq.sqes = sqes[:params.sq_entries]
	sq.mmap = cqe_map[:size]
	sq.mmap_sqes = sqe_map[:size_sqes]

	return
}

submission_queue_destroy :: proc(sq: ^Submission_Queue) -> (err: linux.Errno) {
	err = linux.munmap(raw_data(sq.mmap), uint(len(sq.mmap)))
	err2 := linux.munmap(raw_data(sq.mmap_sqes), uint(len(sq.mmap_sqes)))
	if err == nil { err = err2 }
	return
}

Completion_Queue :: struct {
	head:     ^u32,
	tail:     ^u32,
	mask:     u32,
	overflow: ^u32,
	cqes:     []linux.IO_Uring_CQE,
}

completion_queue_make :: proc(fd: linux.Fd, params: ^linux.IO_Uring_Params, sq: ^Submission_Queue) -> Completion_Queue {
	assert(fd >= 0, "uninitialized queue fd")
	assert(.SINGLE_MMAP in params.features, "required feature SINGLE_MMAP not supported")

	mmap := sq.mmap
	cqes := cast([^]linux.IO_Uring_CQE)&mmap[params.cq_off.cqes]

	return(
		{
			head = cast(^u32)&mmap[params.cq_off.head],
			tail = cast(^u32)&mmap[params.cq_off.tail],
			mask = (cast(^u32)&mmap[params.cq_off.ring_mask])^,
			overflow = cast(^u32)&mmap[params.cq_off.overflow],
			cqes = cqes[:params.cq_entries],
		} \
	)
}