aboutsummaryrefslogtreecommitdiff
path: root/core/sys/linux
diff options
context:
space:
mode:
authorLaytan <laytanlaats@hotmail.com>2025-11-10 20:03:57 +0100
committerLaytan <laytanlaats@hotmail.com>2025-11-10 20:03:57 +0100
commit3db76bc367ebe7f5cb67447d90af64ebdd603abb (patch)
treed3b7eb842f77df84a016c4debe8adf56d7acb9b3 /core/sys/linux
parentad9dc4d31b2589cbf680e2b8209012e7e9bddd73 (diff)
sys/linux: add io uring API
Diffstat (limited to 'core/sys/linux')
-rw-r--r--core/sys/linux/bits.odin262
-rw-r--r--core/sys/linux/constants.odin10
-rw-r--r--core/sys/linux/sys.odin45
-rw-r--r--core/sys/linux/types.odin205
4 files changed, 516 insertions, 6 deletions
diff --git a/core/sys/linux/bits.odin b/core/sys/linux/bits.odin
index 64cdd2208..12ae949ef 100644
--- a/core/sys/linux/bits.odin
+++ b/core/sys/linux/bits.odin
@@ -1964,3 +1964,265 @@ RISCV_HWProbe_Misaligned_Scalar_Perf :: enum {
UNSUPPORTED,
}
+IO_Uring_Enter_Flags_Bits :: enum {
+ GETEVENTS,
+ SQ_WAKEUP,
+ SQ_WAIT,
+ EXT_ARG, // Available since Linux 5.11
+ REGISTERED_RING,
+}
+
+IO_Uring_Register_Opcode :: enum uint {
+ REGISTER_BUFFERS = 0,
+ UNREGISTER_BUFFERS = 1,
+ REGISTER_FILES = 2,
+ UNREGISTER_FILES = 3,
+ REGISTER_EVENTFD = 4,
+ UNREGISTER_EVENTFD = 5,
+ REGISTER_FILES_UPDATE = 6,
+ REGISTER_EVENTFD_ASYNC = 7,
+ REGISTER_PROBE = 8,
+ REGISTER_PERSONALITY = 9,
+ UNREGISTER_PERSONALITY = 10,
+ REGISTER_RESTRICTIONS = 11,
+ REGISTER_ENABLE_RINGS = 12,
+ /* extended with tagging */
+ REGISTER_FILES2 = 13,
+ REGISTER_FILES_UPDATE2 = 14,
+ REGISTER_BUFFERS2 = 15,
+ REGISTER_BUFFERS_UPDATE = 16,
+ /* set/clear io-wq thread affinities */
+ REGISTER_IOWQ_AFF = 17,
+ UNREGISTER_IOWQ_AFF = 18,
+ /* set/get max number of io-wq workers */
+ REGISTER_IOWQ_MAX_WORKERS = 19,
+ /* register/unregister io_uring fd with the ring */
+ REGISTER_RING_FDS = 20,
+ UNREGISTER_RING_FDS = 21,
+ /* register ring based provide buffer group */
+ REGISTER_PBUF_RING = 22,
+ UNREGISTER_PBUF_RING = 23,
+ /* sync cancelation API */
+ REGISTER_SYNC_CANCEL = 24,
+ /* register a range of fixed file slots for automatic slot allocation */
+ REGISTER_FILE_ALLOC_RANGE = 25,
+ /* this goes last */
+ REGISTER_LAST,
+ /* flag added to the opcode to use a registered ring fd */
+ REGISTER_USE_REGISTERED_RING = 1 << 31,
+}
+
+IO_Uring_Setup_Flags_Bits :: enum {
+ // io_context is polled.
+ IOPOLL,
+ // SQ poll thread.
+ SQPOLL,
+ // sq_thread_cpu is valid.
+ SQ_AFF,
+ // app defines CQ size.
+ CQSIZE,
+ // clamp SQ/CQ ring sizes.
+ CLAMP,
+ // attach to existing wq.
+ ATTACH_WQ,
+ // start with ring disabled.
+ R_DISABLED,
+ // continue submit on error.
+ SUBMIT_ALL,
+ // Cooperative task running. When requests complete, they often require
+ // forcing the submitter to transition to the kernel to complete. If this
+ // flag is set, work will be done when the task transitions anyway, rather
+ // than force an inter-processor interrupt reschedule. This avoids interrupting
+ // a task running in userspace, and saves an IPI.
+ COOP_TASKRUN,
+ // Used together with COOP_TASKRUN (or DEFER_TASKRUN): get notified if task work
+ // is available for running and a kernel transition would be needed to run it.
+ // This sets IORING_SQ_TASKRUN in the sq ring flags. Not valid on its own.
+ TASKRUN_FLAG,
+ // SQEs are 128 bytes.
+ SQE128,
+ // CQEs are 32 bytes.
+ CQE32,
+ // Only one task is allowed to submit requests
+ SINGLE_ISSUER,
+ // Defer running task work to get events.
+ // Rather than running bits of task work whenever the task transitions
+ // try to do it just before it is needed.
+ DEFER_TASKRUN,
+}
+
+IO_Uring_Features_Bits :: enum {
+ SINGLE_MMAP,
+ NODROP,
+ SUBMIT_STABLE,
+ RW_CUR_POS,
+ CUR_PERSONALITY,
+ FAST_POLL,
+ POLL_32BITS,
+ SQPOLL_NONFIXED,
+ EXT_ARG,
+ NATIVE_WORKERS,
+ RSRC_TAGS,
+}
+
+IO_Uring_CQE_Flags_Bits :: enum {
+ // If set, the upper 16 bits are the buffer ID.
+ BUFFER,
+ // If set, parent SQE will generate more CQE entries.
+ MORE,
+ // If set, more data to read after socket recv.
+ SOCK_NONEMPTY,
+ // Set for notification CQEs. Can be used to distinguish them from sends.
+ NOTIF,
+}
+
+IO_Uring_OP :: enum u8 {
+ NOP,
+ READV,
+ WRITEV,
+ FSYNC,
+ READ_FIXED,
+ WRITE_FIXED,
+ POLL_ADD,
+ POLL_REMOVE,
+ SYNC_FILE_RANGE,
+ SENDMSG,
+ RECVMSG,
+ TIMEOUT,
+ TIMEOUT_REMOVE,
+ ACCEPT,
+ ASYNC_CANCEL,
+ LINK_TIMEOUT,
+ CONNECT,
+ FALLOCATE,
+ OPENAT,
+ CLOSE,
+ FILES_UPDATE,
+ STATX,
+ READ,
+ WRITE,
+ FADVISE,
+ MADVISE,
+ SEND,
+ RECV,
+ OPENAT2,
+ EPOLL_CTL,
+ SPLICE,
+ PROVIDE_BUFFERS,
+ REMOVE_BUFFERS,
+ TEE,
+ SHUTDOWN,
+ RENAMEAT,
+ UNLINKAT,
+ MKDIRAT,
+ SYMLINKAT,
+ LINKAT,
+ MSG_RING,
+ FSETXATTR,
+ SETXATTR,
+ FGETXATTR,
+ GETXATTR,
+ SOCKET,
+ URING_CMD,
+ SEND_ZC,
+ SENDMSG_ZC,
+ READ_MULTISHOT,
+ WAITID,
+ FUTEX_WAIT,
+ FUTEX_WAKE,
+ FUTEX_WAITV,
+ FIXED_FD_INSTALL,
+ FTRUNCATE,
+ BIND,
+ LISTEN,
+}
+
+IO_Uring_SQE_Flags_Bits :: enum {
+ // Use fixed fileset.
+ FIXED_FILE,
+ // Issue after inflight IO.
+ IO_DRAIN,
+ // Links next sqe.
+ IO_LINK,
+ // Like LINK, but stronger.
+ IO_HARDLINK,
+ // Always go async.
+ ASYNC,
+ // Select buffer from sq.buf_group.
+ BUFFER_SELECT,
+ // Don't post CQE if request succeeded.
+ CQE_SKIP_SUCCESS,
+}
+
+IO_Uring_Poll_Add_Flags_Bits :: enum {
+ ADD_MULTI,
+ UPDATE_EVENTS,
+ UPDATE_USER_DATA,
+ ADD_LEVEL,
+}
+
+IO_Uring_Fsync_Flags_Bits :: enum {
+ DATASYNC,
+}
+
+IO_Uring_Timeout_Flags_Bits :: enum {
+ ABS,
+ UPDATE,
+ BOOTTIME,
+ REALTIME,
+ LINK_TIMEOUT_UPDATE,
+ ETIME_SUCCESS,
+}
+
+IO_Uring_Cmd_Flags_Bits :: enum {
+ // use registered buffer; pass this flag along with setting sqe.buf_index.
+ FIXED,
+}
+
+IO_Uring_Splice_Flags_Bits :: enum {
+ MOVE,
+ NONBLOCK,
+ MORE,
+ GIFT,
+ F_FD_IN_FIXED = 31,
+}
+
+IO_Uring_Accept_Flags_Bits :: enum {
+ MULTISHOT,
+}
+
+IO_Uring_Send_Recv_Flags_Bits :: enum {
+ /*
+ If set, instead of first attempting to send
+ or receive and arm poll if that yields an
+ -EAGAIN result, arm poll upfront and skip
+ the initial transfer attempt.
+ */
+ RECVSEND_POLL_FIRST,
+ /*
+ Multishot recv. Sets IORING_CQE_F_MORE if
+ the handler will continue to report
+ CQEs on behalf of the same SQE.
+ */
+ RECV_MULTISHOT,
+ /*
+ Use registered buffers, the index is stored in
+ the buf_index field.
+ */
+ RECVSEND_FIXED_BUF,
+ /*
+ If set, SEND[MSG]_ZC should report
+ the zerocopy usage in cqe.res
+ for the IORING_CQE_F_NOTIF cqe.
+ 0 is reported if zerocopy was actually possible.
+ IORING_NOTIF_USAGE_ZC_COPIED if data was copied
+ (at least partially).
+ */
+ SEND_ZC_REPORT_USAGE,
+}
+
+IO_Uring_Submission_Queue_Flags_Bits :: enum {
+ NEED_WAKEUP,
+ CQ_OVERFLOW,
+ TASKRUN,
+}
diff --git a/core/sys/linux/constants.odin b/core/sys/linux/constants.odin
index ceab17f6d..c80777025 100644
--- a/core/sys/linux/constants.odin
+++ b/core/sys/linux/constants.odin
@@ -395,3 +395,13 @@ MAP_HUGE_16GB :: transmute(Map_Flags)(u32(34) << MAP_HUGE_SHIFT)
/* Get window size */
TIOCGWINSZ :: 0x5413
+
+IORING_TIMEOUT_CLOCK_MASK :: IO_Uring_Timeout_Flags{.BOOTTIME, .REALTIME}
+IORING_TIMEOUT_UPDATE_MASK :: IO_Uring_Timeout_Flags{.UPDATE, .LINK_TIMEOUT_UPDATE}
+
+IORING_OFF_SQ_RING :: 0
+IORING_OFF_CQ_RING :: 0x8000000
+IORING_OFF_SQES :: 0x10000000
+IORING_OFF_PBUF_RING :: 0x80000000
+IORING_OFF_PBUF_SHIFT :: 16
+IORING_OFF_MMAP_MASK :: 0xf8000000
diff --git a/core/sys/linux/sys.odin b/core/sys/linux/sys.odin
index deb22726f..cfd586a66 100644
--- a/core/sys/linux/sys.odin
+++ b/core/sys/linux/sys.odin
@@ -510,7 +510,7 @@ sendfile :: proc "contextless" (out_fd: Fd, in_fd: Fd, offset: ^i64, count: uint
Available since Linux 2.0.
*/
socket :: proc "contextless" (domain: Address_Family, socktype: Socket_Type, sockflags: Socket_FD_Flags, protocol: Protocol) -> (Fd, Errno) {
- sock_type_flags: int = cast(int) socktype | transmute(int) sockflags
+ sock_type_flags: int = cast(int) socktype | cast(int) transmute(i32) sockflags
ret := syscall(SYS_socket, domain, sock_type_flags, protocol)
return errno_unwrap(ret, Fd)
}
@@ -543,7 +543,7 @@ where
T == Sock_Addr_Any
{
addr_len: i32 = size_of(T)
- ret := syscall(SYS_accept4, sock, addr, &addr_len, transmute(int) sockflags)
+ ret := syscall(SYS_accept4, sock, addr, &addr_len, transmute(i32) sockflags)
return errno_unwrap(ret, Fd)
}
@@ -2927,11 +2927,46 @@ statx :: proc "contextless" (dir: Fd, pathname: cstring, flags: FD_Flags, mask:
// TODO(flysand): pidfd_send_signal
-// TODO(flysand): io_uring_setup
+/*
+ Setup a context for performing asynchronous I/O.
+
+ Available since Linux 5.1
+*/
+io_uring_setup :: proc "contextless" (entries: u32, params: ^IO_Uring_Params) -> (Fd, Errno) {
+ ret := syscall(SYS_io_uring_setup, entries, params)
+ return errno_unwrap(ret, Fd)
+}
-// TODO(flysand): io_uring_enter
+/*
+ Initiate and/or complete I/O using the shared submission and completion queues.
-// TODO(flysand): io_uring_register
+ Available since Linux 5.1
+*/
+io_uring_enter :: proc "contextless" (fd: Fd, to_submit: u32, min_complete: u32, flags: IO_Uring_Enter_Flags, sig: ^Sig_Set) -> (int, Errno) {
+ ret := syscall(SYS_io_uring_enter, fd, to_submit, min_complete, transmute(u32)flags, sig, size_of(Sig_Set) if sig != nil else 0)
+ return errno_unwrap(ret, int)
+}
+
+/*
+ Initiate and/or complete I/O using the shared submission and completion queues.
+
+ Available since Linux 5.11
+*/
+io_uring_enter2 :: proc "contextless" (fd: Fd, to_submit: u32, min_complete: u32, flags: IO_Uring_Enter_Flags, arg: ^IO_Uring_Getevents_Arg) -> (int, Errno) {
+ assert_contextless(.EXT_ARG in flags)
+ ret := syscall(SYS_io_uring_enter, fd, to_submit, min_complete, transmute(u32)flags, arg, size_of(IO_Uring_Getevents_Arg))
+ return errno_unwrap(ret, int)
+}
+
+/*
+ Register files or user buffers for asynchronous I/O.
+
+ The raw syscall returns a negative errno on failure. Some opcodes return a
+ positive value on success; this wrapper reports that as success but does
+ not expose the value.
+
+ Available since Linux 5.1
+*/
+io_uring_register :: proc "contextless" (fd: Fd, opcode: IO_Uring_Register_Opcode, arg: rawptr, nr_args: u32) -> Errno {
+ ret := syscall(SYS_io_uring_register, fd, opcode, arg, nr_args)
+ // Only a negative return is an error. Negating a positive success value
+ // would produce a bogus, non-zero Errno and make success look like failure.
+ if ret < 0 {
+  return Errno(-ret)
+ }
+ return .NONE
+}
// TODO(flysand): open_tree
diff --git a/core/sys/linux/types.odin b/core/sys/linux/types.odin
index 38b413cdd..a2819803c 100644
--- a/core/sys/linux/types.odin
+++ b/core/sys/linux/types.odin
@@ -763,7 +763,7 @@ Sig_Action :: struct($T: typeid) {
Note, on linux these are technically passed by OR'ing together
with Socket_Type, our wrapper does this under the hood.
*/
-Socket_FD_Flags :: bit_set[Socket_FD_Flags_Bits; int]
+Socket_FD_Flags :: bit_set[Socket_FD_Flags_Bits; i32]
/*
Address family for the socket.
@@ -1488,3 +1488,206 @@ RISCV_HWProbe :: struct {
raw: u64,
},
}
+
+IO_Uring_Params :: struct {
+ sq_entries: u32,
+ cq_entries: u32,
+ flags: IO_Uring_Setup_Flags,
+ sq_thread_cpu: u32,
+ sq_thread_idle: u32,
+ features: IO_Uring_Features,
+ wq_fd: u32,
+ resv: [3]u32,
+ sq_off: IO_SQ_Ring_Offsets,
+ cq_off: IO_CQ_Ring_Offsets,
+}
+
+IO_Uring_Setup_Flags :: bit_set[IO_Uring_Setup_Flags_Bits; u32]
+
+IO_Uring_Features :: bit_set[IO_Uring_Features_Bits; u32]
+
+IO_SQ_Ring_Offsets :: struct {
+ head: u32,
+ tail: u32,
+ ring_mask: u32,
+ ring_entries: u32,
+ flags: u32,
+ dropped: u32,
+ array: u32,
+ resv1: u32,
+ user_addr: u64,
+}
+
+IO_CQ_Ring_Offsets :: struct {
+ head: u32,
+ tail: u32,
+ ring_mask: u32,
+ ring_entries: u32,
+ overflow: u32,
+ cqes: u32,
+ flags: u32,
+ resv1: u32,
+ user_addr: u64,
+}
+
+IO_Uring_Enter_Flags :: bit_set[IO_Uring_Enter_Flags_Bits; u32]
+
+IO_Uring_Getevents_Arg :: struct #min_field_align(8) {
+ sigmask: ^Sig_Set,
+ sigmask_sz: u32,
+ // pad: u32,
+ ts: ^Time_Spec,
+}
+#assert(align_of(IO_Uring_Getevents_Arg) == 8)
+
+IO_Uring_Rsrc_Register :: struct($T: typeid) {
+ nr: u32,
+ resv: u32,
+ resv2: u64,
+ using _: struct #min_field_align(8) {
+ data: [^]T,
+ tags: [^]u64,
+ },
+}
+
+IO_Uring_Rsrc_Update2 :: struct($T: typeid) {
+ offset: u32,
+ resv: u32,
+ using _: struct #min_field_align(8) {
+ data: [^]T,
+ tags: [^]u64,
+ },
+ nr: u32,
+ resv2: u32,
+}
+
+// The completion queue entry when the .CQE32 flag is not set on setup.
+IO_Uring_CQE :: struct {
+ // The user_data value of the submission (SQE), passed back unchanged.
+ user_data: u64,
+ // result code for this event.
+ res: i32,
+ flags: IO_Uring_CQE_Flags,
+}
+#assert(size_of(IO_Uring_CQE) == 16)
+
+// The completion queue entry when the .CQE32 flag is set on setup.
+IO_Uring_CQE32 :: struct {
+ using _: IO_Uring_CQE,
+ pad: u64,
+ pad2: u64,
+}
+#assert(size_of(IO_Uring_CQE32) == 32)
+
+IO_Uring_CQE_Flags :: bit_set[IO_Uring_CQE_Flags_Bits; u32]
+IO_Uring_SQE_Flags :: bit_set[IO_Uring_SQE_Flags_Bits; u8]
+
+// The submission queue entry when the .SQE128 flag is not set on setup.
+IO_Uring_SQE :: struct {
+ opcode: IO_Uring_OP,
+ flags: IO_Uring_SQE_Flags,
+ using __ioprio: struct #raw_union {
+ ioprio: u16,
+ sq_accept_flags: IO_Uring_Accept_Flags,
+ sq_send_recv_flags: IO_Uring_Send_Recv_Flags,
+ },
+ fd: Fd,
+ using __offset: struct #raw_union {
+ // Offset into file.
+ off: u64,
+ addr2: u64,
+ using _: struct {
+ cmd_op: u32,
+ __pad1: u32,
+ },
+ statx: ^Statx,
+ },
+ using __iovecs: struct #raw_union {
+ // Pointer to buffer or iovecs.
+ addr: u64,
+ splice_off_in: u64,
+ using _: struct {
+ level: u32,
+ optname: u32,
+ },
+ },
+ using __len: struct #raw_union {
+ // Buffer size or number of iovecs.
+ len: u32,
+ poll_flags: IO_Uring_Poll_Add_Flags,
+ statx_mask: Statx_Mask,
+ epoll_ctl_op: EPoll_Ctl_Opcode,
+ shutdown_how: Shutdown_How,
+ },
+ using __contents: struct #raw_union {
+ rw_flags: i32,
+ fsync_flags: IO_Uring_Fsync_Flags,
+ // compatibility.
+ poll_events: Fd_Poll_Events,
+ // word-reversed for BE.
+ poll32_events: u32,
+ sync_range_flags: u32,
+ msg_flags: Socket_Msg,
+ timeout_flags: IO_Uring_Timeout_Flags,
+ accept_flags: Socket_FD_Flags,
+ cancel_flags: u32,
+ open_flags: Open_Flags,
+ statx_flags: FD_Flags,
+ fadvise_advice: u32,
+ splice_flags: IO_Uring_Splice_Flags,
+ rename_flags: u32,
+ unlink_flags: u32,
+ hardlink_flags: u32,
+ xattr_flags: u32,
+ msg_ring_flags: u32,
+ uring_cmd_flags: IO_Uring_Cmd_Flags,
+ },
+ // Data to be passed back at completion time.
+ user_data: u64,
+ using __buffer: struct #raw_union {
+ // Index into fixed buffers, if used.
+ buf_index: u16,
+ // For grouped buffer selection.
+ buf_group: u16,
+ },
+ // Personality to use, if used.
+ personality: u16,
+ using _: struct #raw_union {
+ splice_fd_in: Fd,
+ file_index: u32,
+ using _: struct {
+ addr_len: u16,
+ __pad3: [1]u16,
+ },
+ },
+ using __: struct #raw_union {
+ using _: struct {
+ addr3: u64,
+ __pad2: [1]u64,
+ },
+ },
+}
+#assert(size_of(IO_Uring_SQE) == 64)
+
+// The submission queue entry when the .SQE128 flag is set on setup.
+IO_Uring_SQE128 :: struct {
+ using _: IO_Uring_SQE,
+ cmd: [64]byte,
+}
+#assert(size_of(IO_Uring_SQE128) == 128)
+
+IO_Uring_Poll_Add_Flags :: bit_set[IO_Uring_Poll_Add_Flags_Bits; u32]
+
+IO_Uring_Fsync_Flags :: bit_set[IO_Uring_Fsync_Flags_Bits; u32]
+
+IO_Uring_Timeout_Flags :: bit_set[IO_Uring_Timeout_Flags_Bits; u32]
+
+IO_Uring_Cmd_Flags :: bit_set[IO_Uring_Cmd_Flags_Bits; u32]
+
+IO_Uring_Splice_Flags :: bit_set[IO_Uring_Splice_Flags_Bits; u32]
+
+IO_Uring_Accept_Flags :: bit_set[IO_Uring_Accept_Flags_Bits; u16]
+
+IO_Uring_Send_Recv_Flags :: bit_set[IO_Uring_Send_Recv_Flags_Bits; u16]
+
+IO_Uring_Submission_Queue_Flags :: bit_set[IO_Uring_Submission_Queue_Flags_Bits; u32]