// lph-11/odin-http/nbio/_io_uring/sys.odin

#+build linux
package io_uring
import "base:intrinsics"
//odinfmt:disable
SYS_io_uring_setup: uintptr : 425
SYS_io_uring_enter: uintptr : 426
SYS_io_uring_register: uintptr : 427
//odinfmt:enable
NSIG :: 65
sigset_t :: [1024 / 32]u32
io_uring_params :: struct {
	sq_entries:     u32,
	cq_entries:     u32,
	flags:          u32,
	sq_thread_cpu:  u32,
	sq_thread_idle: u32,
	features:       u32,
	wq_fd:          u32,
	resv:           [3]u32,
	sq_off:         io_sqring_offsets,
	cq_off:         io_cqring_offsets,
}
#assert(size_of(io_uring_params) == 120)
io_sqring_offsets :: struct {
	head:         u32,
	tail:         u32,
	ring_mask:    u32,
	ring_entries: u32,
	flags:        u32,
	dropped:      u32,
	array:        u32,
	resv1:        u32,
	user_addr:    u64,
}
io_cqring_offsets :: struct {
	head:         u32,
	tail:         u32,
	ring_mask:    u32,
	ring_entries: u32,
	overflow:     u32,
	cqes:         u32,
	flags:        u32,
	resv1:        u32,
	user_addr:    u64,
}
// Submission queue entry.
io_uring_sqe :: struct {
	opcode: IORING_OP, // u8
	flags:  u8,        /* IOSQE_ flags */
	ioprio: u16,       /* ioprio for the request */
	fd:     i32,       /* file descriptor to do IO on */
	using __offset: struct #raw_union {
		off:   u64, /* offset into file */
		addr2: u64,
		using _: struct {
			cmd_op: u32,
			__pad1: u32,
		},
	},
	using __iovecs: struct #raw_union {
		addr:          u64, /* pointer to buffer or iovecs */
		splice_off_in: u64,
	},
	len: u32, /* buffer size or number of iovecs */
	using __contents: struct #raw_union {
		rw_flags:         i32,
		fsync_flags:      u32,
		poll_events:      u16, /* compatibility */
		poll32_events:    u32, /* word-reversed for BE */
		sync_range_flags: u32,
		msg_flags:        u32,
		timeout_flags:    u32,
		accept_flags:     u32,
		cancel_flags:     u32,
		open_flags:       u32,
		statx_flags:      u32,
		fadvise_advice:   u32,
		splice_flags:     u32,
		rename_flags:     u32,
		unlink_flags:     u32,
		hardlink_flags:   u32,
		xattr_flags:      u32,
		msg_ring_flags:   u32,
		uring_cmd_flags:  u32,
	},
	user_data: u64, /* data to be passed back at completion time */
	/* pack this to avoid bogus arm OABI complaints */
	using __buffer: struct #raw_union {
		/* index into fixed buffers, if used */
		buf_index: u16,
		/* for grouped buffer selection */
		buf_group: u16,
	},
	/* personality to use, if used */
	personality: u16,
	using _: struct #raw_union {
		splice_fd_in: i32,
		file_index:   u32,
		using _: struct {
			addr_len: u16,
			__pad3:   [1]u16,
		},
	},
	using __: struct #raw_union {
		using _: struct {
			addr3:  u64,
			__pad2: [1]u64,
		},
		/*
		 * If the ring is initialized with IORING_SETUP_SQE128, then
		 * this field is used for 80 bytes of arbitrary command data
		 * NOTE: This is currently not supported.
		 */
		// cmd: [^]u8,
	},
}
#assert(size_of(io_uring_sqe) == 64)
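
// A minimal sketch (not part of the original bindings) of filling an SQE for a
// plain read; field names are the ones declared above, buffer and fd are
// caller-supplied. An offset of max(u64) (i.e. -1) reads at the current file
// position on kernels reporting IORING_FEAT_RW_CUR_POS.
@(private)
_example_prep_read :: proc "contextless" (sqe: ^io_uring_sqe, fd: i32, buf: []u8, user_data: u64) {
	sqe^ = {} // zero every field first, including the unions
	sqe.opcode    = .READ
	sqe.fd        = fd
	sqe.addr      = u64(uintptr(raw_data(buf))) // pointer to the destination buffer
	sqe.len       = u32(len(buf))
	sqe.off       = max(u64)                    // read at the current file position
	sqe.user_data = user_data                   // echoed back verbatim in the CQE
}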
// Completion queue entry.
io_uring_cqe :: struct {
	user_data: u64, /* sq.data submission passed back */
	res:       i32, /* result code for this event */
	flags:     u32,
	/*
	 * If the ring is initialized with IORING_SETUP_CQE32, then this field
	 * contains 16-bytes of padding, doubling the size of the CQE.
	 * NOTE: This is currently not supported.
	 */
	// big_cqe: [^]u64,
}
#assert(size_of(io_uring_cqe) == 16)
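
// A sketch of the convention mirrored from the kernel: a negative res is a
// negated errno value, anything else is the operation's result (e.g. bytes
// transferred for reads and writes).
@(private)
_example_cqe_result :: proc "contextless" (cqe: ^io_uring_cqe) -> (res: i32, errno: i32) {
	if cqe.res < 0 {
		return 0, -cqe.res // e.g. 11 == EAGAIN
	}
	return cqe.res, 0
}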
/*
* sqe.flags
*/
/* use fixed fileset */
IOSQE_FIXED_FILE: u32 : (1 << 0)
/* issue after inflight IO */
IOSQE_IO_DRAIN: u32 : (1 << 1)
/* links next sqe */
IOSQE_IO_LINK: u32 : (1 << 2)
/* like LINK, but stronger */
IOSQE_IO_HARDLINK: u32 : (1 << 3)
/* always go async */
IOSQE_ASYNC: u32 : (1 << 4)
/* select buffer from sq.buf_group */
IOSQE_BUFFER_SELECT: u32 : (1 << 5)
/* don't post CQE if request succeeded */
IOSQE_CQE_SKIP_SUCCESS: u32 : (1 << 6)
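
// A sketch of chaining with IOSQE_IO_LINK: the fsync below only runs after the
// preceding write completes, and is completed with -ECANCELED if the write
// fails. Both SQEs are assumed to be freshly zeroed submission slots.
@(private)
_example_link_write_fsync :: proc "contextless" (write_sqe, fsync_sqe: ^io_uring_sqe) {
	write_sqe.flags |= u8(IOSQE_IO_LINK)          // link the next SQE behind this one
	fsync_sqe.opcode      = .FSYNC
	fsync_sqe.fd          = write_sqe.fd
	fsync_sqe.fsync_flags = IORING_FSYNC_DATASYNC // fdatasync(2) semantics
}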
/*
* io_uring_setup() flags
*/
IORING_SETUP_IOPOLL: u32 : (1 << 0) /* io_context is polled */
IORING_SETUP_SQPOLL: u32 : (1 << 1) /* SQ poll thread */
IORING_SETUP_SQ_AFF: u32 : (1 << 2) /* sq_thread_cpu is valid */
IORING_SETUP_CQSIZE: u32 : (1 << 3) /* app defines CQ size */
IORING_SETUP_CLAMP: u32 : (1 << 4) /* clamp SQ/CQ ring sizes */
IORING_SETUP_ATTACH_WQ: u32 : (1 << 5) /* attach to existing wq */
IORING_SETUP_R_DISABLED: u32 : (1 << 6) /* start with ring disabled */
IORING_SETUP_SUBMIT_ALL: u32 : (1 << 7) /* continue submit on error */
// Cooperative task running. When requests complete, they often require
// forcing the submitter to transition to the kernel to complete. If this
// flag is set, work will be done when the task transitions anyway, rather
// than force an inter-processor interrupt reschedule. This avoids interrupting
// a task running in userspace, and saves an IPI.
IORING_SETUP_COOP_TASKRUN: u32 : (1 << 8)
// If COOP_TASKRUN is set, get notified if task work is available for
// running and a kernel transition would be needed to run it. This sets
// IORING_SQ_TASKRUN in the sq ring flags. Not valid with IORING_SETUP_SQPOLL.
IORING_SETUP_TASKRUN_FLAG: u32 : (1 << 9)
IORING_SETUP_SQE128: u32 : (1 << 10) /* SQEs are 128 byte */
IORING_SETUP_CQE32: u32 : (1 << 11) /* CQEs are 32 byte */
// Only one task is allowed to submit requests
IORING_SETUP_SINGLE_ISSUER: u32 : (1 << 12)
// Defer running task work to get events.
// Rather than running bits of task work whenever the task transitions
// try to do it just before it is needed.
IORING_SETUP_DEFER_TASKRUN: u32 : (1 << 13)
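
// A sketch of combining setup flags: ask for a larger CQ than SQ (CQSIZE makes
// the kernel honor cq_entries) and opt into cooperative task running. The
// kernel fills in the remaining fields on return from io_uring_setup.
@(private)
_example_setup_params :: proc "contextless" () -> (params: io_uring_params) {
	params.flags      = IORING_SETUP_CQSIZE | IORING_SETUP_COOP_TASKRUN
	params.cq_entries = 4096
	return
}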
/*
* sqe.uring_cmd_flags
* IORING_URING_CMD_FIXED use registered buffer; pass this flag
* along with setting sqe.buf_index.
*/
IORING_URING_CMD_FIXED: u32 : (1 << 0)
/*
* sqe.fsync_flags
*/
IORING_FSYNC_DATASYNC: u32 : (1 << 0)
/*
* sqe.timeout_flags
*/
IORING_TIMEOUT_ABS: u32 : (1 << 0)
IORING_TIMEOUT_UPDATE: u32 : (1 << 1)
IORING_TIMEOUT_BOOTTIME: u32 : (1 << 2)
IORING_TIMEOUT_REALTIME: u32 : (1 << 3)
IORING_LINK_TIMEOUT_UPDATE: u32 : (1 << 4)
IORING_TIMEOUT_ETIME_SUCCESS: u32 : (1 << 5)
IORING_TIMEOUT_CLOCK_MASK: u32 : (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
IORING_TIMEOUT_UPDATE_MASK: u32 : (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
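
// A sketch of what the clock mask encodes: at most one clock source may be
// selected on a timeout SQE; the kernel rejects combinations.
@(private)
_example_timeout_clock_ok :: proc "contextless" (timeout_flags: u32) -> bool {
	clock := timeout_flags & IORING_TIMEOUT_CLOCK_MASK
	return clock == 0 || clock == IORING_TIMEOUT_BOOTTIME || clock == IORING_TIMEOUT_REALTIME
}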
/*
* sq_ring.flags
*/
IORING_SQ_NEED_WAKEUP: u32 : (1 << 0) /* needs io_uring_enter wakeup */
IORING_SQ_CQ_OVERFLOW: u32 : (1 << 1) /* CQ ring is overflown */
IORING_SQ_TASKRUN: u32 : (1 << 2) /* task should enter the kernel */
/*
* sqe.splice_flags
* extends splice(2) flags
*/
SPLICE_F_FD_IN_FIXED: u32 : (1 << 31) /* the last bit of __u32 */
/*
 * POLL_ADD flags. Note that since sqe.poll_events is the flag space, the
 * command flags for POLL_ADD are stored in sqe.len.
 *
 * IORING_POLL_ADD_MULTI   Multishot poll. Sets IORING_CQE_F_MORE if the poll
 *                         handler will continue to report CQEs on behalf of
 *                         the same SQE.
 * IORING_POLL_UPDATE      Update existing poll request, matching sqe.addr
 *                         as the old user_data field.
 *
 * IORING_POLL_LEVEL       Level triggered poll.
 */
IORING_POLL_ADD_MULTI: u32 : (1 << 0)
IORING_POLL_UPDATE_EVENTS: u32 : (1 << 1)
IORING_POLL_UPDATE_USER_DATA: u32 : (1 << 2)
IORING_POLL_ADD_LEVEL: u32 : (1 << 3)
IORing_Poll_Bits :: enum {
	ADD_MULTI,
	UPDATE_EVENTS,
	UPDATE_USER_DATA,
	ADD_LEVEL,
}
IORing_Poll_Flags :: bit_set[IORing_Poll_Bits; u32]
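
// A sketch showing that the bit_set mirrors the constants above bit for bit,
// so it can be transmuted to the raw u32 the kernel expects.
@(private)
_example_poll_flags :: proc "contextless" () -> u32 {
	// == IORING_POLL_ADD_MULTI | IORING_POLL_ADD_LEVEL
	return transmute(u32)IORing_Poll_Flags{.ADD_MULTI, .ADD_LEVEL}
}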
/*
 * send/sendmsg and recv/recvmsg flags (sq.ioprio)
 *
 * IORING_RECVSEND_POLL_FIRST   If set, instead of first attempting to send
 *                              or receive and arm poll if that yields an
 *                              -EAGAIN result, arm poll upfront and skip
 *                              the initial transfer attempt.
 *
 * IORING_RECV_MULTISHOT        Multishot recv. Sets IORING_CQE_F_MORE if
 *                              the handler will continue to report
 *                              CQEs on behalf of the same SQE.
 *
 * IORING_RECVSEND_FIXED_BUF    Use registered buffers, the index is stored in
 *                              the buf_index field.
 *
 * IORING_SEND_ZC_REPORT_USAGE  If set, SEND[MSG]_ZC should report
 *                              the zerocopy usage in cqe.res
 *                              for the IORING_CQE_F_NOTIF cqe.
 *                              0 is reported if zerocopy was actually possible.
 *                              IORING_NOTIF_USAGE_ZC_COPIED if data was copied
 *                              (at least partially).
 */
IORING_RECVSEND_POLL_FIRST: u32 : (1 << 0)
IORING_RECV_MULTISHOT: u32 : (1 << 1)
IORING_RECVSEND_FIXED_BUF: u32 : (1 << 2)
IORING_SEND_ZC_REPORT_USAGE: u32 : (1 << 3)
/*
* cqe.res for IORING_CQE_F_NOTIF if
* IORING_SEND_ZC_REPORT_USAGE was requested
*
* It should be treated as a flag, all other
* bits of cqe.res should be treated as reserved!
*/
IORING_NOTIF_USAGE_ZC_COPIED: u32 : (1 << 31)
/*
* accept flags stored in sq.ioprio
*/
IORING_ACCEPT_MULTISHOT: u32 : (1 << 0)
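
// A sketch of a multishot accept: these flags travel in the otherwise unused
// ioprio field of the SQE, not in the __contents union. One SQE then produces
// a CQE (with IORING_CQE_F_MORE set) per accepted connection.
@(private)
_example_multishot_accept :: proc "contextless" (sqe: ^io_uring_sqe, listen_fd: i32) {
	sqe^ = {}
	sqe.opcode = .ACCEPT
	sqe.fd     = listen_fd
	sqe.ioprio = u16(IORING_ACCEPT_MULTISHOT)
}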
/*
* IORING_OP_MSG_RING command types, stored in sq.addr
*/
IORING_MSG :: enum {
	DATA,    /* pass sq.len as 'res' and off as user_data */
	SEND_FD, /* send a registered fd to another ring */
}
/*
 * IORING_OP_MSG_RING flags (sq.msg_ring_flags)
 *
 * IORING_MSG_RING_CQE_SKIP   Don't post a CQE to the target ring. Not
 *                            applicable for IORING_MSG_DATA, obviously.
 */
IORING_MSG_RING_CQE_SKIP: u32 : (1 << 0)
/* Pass through the flags from sq.file_index to cqe.flags */
IORING_MSG_RING_FLAGS_PASS: u32 : (1 << 1)
IORING_OP :: enum u8 {
	NOP,
	READV,
	WRITEV,
	FSYNC,
	READ_FIXED,
	WRITE_FIXED,
	POLL_ADD,
	POLL_REMOVE,
	SYNC_FILE_RANGE,
	SENDMSG,
	RECVMSG,
	TIMEOUT,
	TIMEOUT_REMOVE,
	ACCEPT,
	ASYNC_CANCEL,
	LINK_TIMEOUT,
	CONNECT,
	FALLOCATE,
	OPENAT,
	CLOSE,
	FILES_UPDATE,
	STATX,
	READ,
	WRITE,
	FADVISE,
	MADVISE,
	SEND,
	RECV,
	OPENAT2,
	EPOLL_CTL,
	SPLICE,
	PROVIDE_BUFFERS,
	REMOVE_BUFFERS,
	TEE,
	SHUTDOWN,
	RENAMEAT,
	UNLINKAT,
	MKDIRAT,
	SYMLINKAT,
	LINKAT,
	MSG_RING,
	FSETXATTR,
	SETXATTR,
	FGETXATTR,
	GETXATTR,
	SOCKET,
	URING_CMD,
	SEND_ZC,
	SENDMSG_ZC,
	/* this goes last, obviously */
	LAST,
}
/*
* sys_io_uring_register() opcodes and arguments.
*/
IORING_REGISTER :: enum u32 {
	REGISTER_BUFFERS          = 0,
	UNREGISTER_BUFFERS        = 1,
	REGISTER_FILES            = 2,
	UNREGISTER_FILES          = 3,
	REGISTER_EVENTFD          = 4,
	UNREGISTER_EVENTFD        = 5,
	REGISTER_FILES_UPDATE     = 6,
	REGISTER_EVENTFD_ASYNC    = 7,
	REGISTER_PROBE            = 8,
	REGISTER_PERSONALITY      = 9,
	UNREGISTER_PERSONALITY    = 10,
	REGISTER_RESTRICTIONS     = 11,
	REGISTER_ENABLE_RINGS     = 12,
	/* extended with tagging */
	REGISTER_FILES2           = 13,
	REGISTER_FILES_UPDATE2    = 14,
	REGISTER_BUFFERS2         = 15,
	REGISTER_BUFFERS_UPDATE   = 16,
	/* set/clear io-wq thread affinities */
	REGISTER_IOWQ_AFF         = 17,
	UNREGISTER_IOWQ_AFF       = 18,
	/* set/get max number of io-wq workers */
	REGISTER_IOWQ_MAX_WORKERS = 19,
	/* register/unregister io_uring fd with the ring */
	REGISTER_RING_FDS         = 20,
	UNREGISTER_RING_FDS       = 21,
	/* register ring based provide buffer group */
	REGISTER_PBUF_RING        = 22,
	UNREGISTER_PBUF_RING      = 23,
	/* sync cancelation API */
	REGISTER_SYNC_CANCEL      = 24,
	/* register a range of fixed file slots for automatic slot allocation */
	REGISTER_FILE_ALLOC_RANGE = 25,
	/* this goes last */
	REGISTER_LAST,
	/* flag added to the opcode to use a registered ring fd */
	REGISTER_USE_REGISTERED_RING = 1 << 31,
}
IORING_FEAT_SINGLE_MMAP: u32 : (1 << 0)
IORING_FEAT_NODROP: u32 : (1 << 1)
IORING_FEAT_SUBMIT_STABLE: u32 : (1 << 2)
IORING_FEAT_RW_CUR_POS: u32 : (1 << 3)
IORING_FEAT_CUR_PERSONALITY: u32 : (1 << 4)
IORING_FEAT_FAST_POLL: u32 : (1 << 5)
IORING_FEAT_POLL_32BITS: u32 : (1 << 6)
IORING_FEAT_SQPOLL_NONFIXED: u32 : (1 << 7)
IORING_FEAT_EXT_ARG: u32 : (1 << 8)
IORING_FEAT_NATIVE_WORKERS: u32 : (1 << 9)
IORING_FEAT_RSRC_TAGS: u32 : (1 << 10)
/*
* cqe.flags
*
 * IORING_CQE_F_BUFFER         If set, the upper 16 bits are the buffer ID
 * IORING_CQE_F_MORE           If set, parent SQE will generate more CQE entries
 * IORING_CQE_F_SOCK_NONEMPTY  If set, more data to read after socket recv
 * IORING_CQE_F_NOTIF          Set for notification CQEs. Can be used to
 *                             distinguish them from sends.
*/
IORING_CQE_F_BUFFER: u32 : (1 << 0)
IORING_CQE_F_MORE: u32 : (1 << 1)
IORING_CQE_F_SOCK_NONEMPTY: u32 : (1 << 2)
IORING_CQE_F_NOTIF: u32 : (1 << 3)
IORING_CQE :: enum {
	BUFFER_SHIFT = 16,
}
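
// A sketch of decoding a buffer-select completion: when IORING_CQE_F_BUFFER is
// set, the chosen buffer ID occupies the upper 16 bits of cqe.flags.
@(private)
_example_cqe_buffer_id :: proc "contextless" (cqe: ^io_uring_cqe) -> (id: u16, ok: bool) {
	if cqe.flags & IORING_CQE_F_BUFFER == 0 {
		return
	}
	return u16(cqe.flags >> u32(IORING_CQE.BUFFER_SHIFT)), true
}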
/*
* cq_ring->flags
*/
// disable eventfd notifications
IORING_CQ_EVENTFD_DISABLED: u32 : (1 << 0)
/*
* io_uring_enter(2) flags
*/
IORING_ENTER_GETEVENTS: u32 : (1 << 0)
IORING_ENTER_SQ_WAKEUP: u32 : (1 << 1)
IORING_ENTER_SQ_WAIT: u32 : (1 << 2)
IORING_ENTER_EXT_ARG: u32 : (1 << 3)
IORING_ENTER_REGISTERED_RING: u32 : (1 << 4)
/*
* Magic offsets for the application to mmap the data it needs
*/
IORING_OFF_SQ_RING: uintptr : 0
IORING_OFF_CQ_RING: u64 : 0x8000000
IORING_OFF_SQES: uintptr : 0x10000000
IORING_OFF_PBUF_RING: u64 : 0x80000000
IORING_OFF_PBUF_SHIFT :: 16
IORING_OFF_MMAP_MASK: u64 : 0xf8000000
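
// A sketch of the pbuf-ring offset encoding (mirroring liburing): the buffer
// group ID is shifted into the mmap offset and IORING_OFF_PBUF_RING tags the
// mapping as a provided-buffer ring.
@(private)
_example_pbuf_mmap_offset :: proc "contextless" (bgid: u16) -> u64 {
	return IORING_OFF_PBUF_RING | (u64(bgid) << IORING_OFF_PBUF_SHIFT)
}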
sys_io_uring_setup :: proc "contextless" (entries: u32, params: ^io_uring_params) -> int {
	return int(intrinsics.syscall(SYS_io_uring_setup, uintptr(entries), uintptr(params)))
}
sys_io_uring_enter :: proc "contextless" (
	fd: u32,
	to_submit: u32,
	min_complete: u32,
	flags: u32,
	sig: ^sigset_t,
) -> int {
	return int(
		intrinsics.syscall(
			SYS_io_uring_enter,
			uintptr(fd),
			uintptr(to_submit),
			uintptr(min_complete),
			uintptr(flags),
			uintptr(sig),
			NSIG / 8 if sig != nil else 0,
		),
	)
}
sys_io_uring_register :: proc "contextless" (fd: u32, opcode: IORING_REGISTER, arg: rawptr, nr_args: u32) -> int {
	return int(intrinsics.syscall(SYS_io_uring_register, uintptr(fd), uintptr(opcode), uintptr(arg), uintptr(nr_args)))
}
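
// A usage sketch tying the syscalls together (hypothetical, reduced error
// handling): create a ring, then enter it. A real caller must first mmap the
// SQ ring, CQ ring, and SQE array using the IORING_OFF_SQ_RING,
// IORING_OFF_CQ_RING, and IORING_OFF_SQES offsets above; with
// IORING_FEAT_SINGLE_MMAP the SQ and CQ rings share a single mapping.
@(private)
_example_ring_lifecycle :: proc "contextless" () -> bool {
	params: io_uring_params
	fd := sys_io_uring_setup(8, &params) // 8 SQ entries, rounded up to a power of two
	if fd < 0 {
		return false // negative return is -errno
	}
	single_mmap := params.features & IORING_FEAT_SINGLE_MMAP != 0
	_ = single_mmap // would decide between one and two ring mappings here
	// Submit nothing and reap any already-posted completions without
	// blocking (min_complete = 0).
	return sys_io_uring_enter(u32(fd), 0, 0, IORING_ENTER_GETEVENTS, nil) >= 0
}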