#+build linux
package io_uring

import "base:intrinsics"

//odinfmt:disable
SYS_io_uring_setup:    uintptr : 425
SYS_io_uring_enter:    uintptr : 426
SYS_io_uring_register: uintptr : 427
//odinfmt:enable

NSIG :: 65

sigset_t :: [1024 / 32]u32

io_uring_params :: struct {
	sq_entries:     u32,
	cq_entries:     u32,
	flags:          u32,
	sq_thread_cpu:  u32,
	sq_thread_idle: u32,
	features:       u32,
	wq_fd:          u32,
	resv:           [3]u32,
	sq_off:         io_sqring_offsets,
	cq_off:         io_cqring_offsets,
}
#assert(size_of(io_uring_params) == 120)

io_sqring_offsets :: struct {
	head:         u32,
	tail:         u32,
	ring_mask:    u32,
	ring_entries: u32,
	flags:        u32,
	dropped:      u32,
	array:        u32,
	resv1:        u32,
	user_addr:    u64,
}

io_cqring_offsets :: struct {
	head:         u32,
	tail:         u32,
	ring_mask:    u32,
	ring_entries: u32,
	overflow:     u32,
	cqes:         u32,
	flags:        u32,
	resv1:        u32,
	user_addr:    u64,
}

// Submission queue entry.
io_uring_sqe :: struct {
	opcode: IORING_OP, // u8
	flags:  u8,  /* IOSQE_ flags */
	ioprio: u16, /* ioprio for the request */
	fd:     i32, /* file descriptor to do IO on */
	using __offset: struct #raw_union {
		off:   u64, /* offset into file */
		addr2: u64,
		using _: struct {
			cmd_op: u32,
			__pad1: u32,
		},
	},
	using __iovecs: struct #raw_union {
		addr:          u64, /* pointer to buffer or iovecs */
		splice_off_in: u64,
	},
	len: u32, /* buffer size or number of iovecs */
	using __contents: struct #raw_union {
		rw_flags:         i32,
		fsync_flags:      u32,
		poll_events:      u16, /* compatibility */
		poll32_events:    u32, /* word-reversed for BE */
		sync_range_flags: u32,
		msg_flags:        u32,
		timeout_flags:    u32,
		accept_flags:     u32,
		cancel_flags:     u32,
		open_flags:       u32,
		statx_flags:      u32,
		fadvise_advice:   u32,
		splice_flags:     u32,
		rename_flags:     u32,
		unlink_flags:     u32,
		hardlink_flags:   u32,
		xattr_flags:      u32,
		msg_ring_flags:   u32,
		uring_cmd_flags:  u32,
	},
	user_data: u64, /* data to be passed back at completion time */
	/* pack this to avoid bogus arm OABI complaints */
	using __buffer: struct #raw_union {
		/* index into fixed buffers, if used */
		buf_index: u16,
		/* for grouped buffer selection */
		buf_group: u16,
	},
	/* personality to use, if used */
	personality: u16,
	using _: struct #raw_union {
		splice_fd_in: i32,
		file_index:   u32,
		using _: struct {
			addr_len: u16,
			__pad3:   [1]u16,
		},
	},
	using __: struct #raw_union {
		using _: struct {
			addr3:  u64,
			__pad2: [1]u64,
		},
		/*
		 * If the ring is initialized with IORING_SETUP_SQE128, then
		 * this field is used for 80 bytes of arbitrary command data.
		 * NOTE: This is currently not supported.
		 */
		// cmd: [^]u8,
	},
}
#assert(size_of(io_uring_sqe) == 64)
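
/*
 * Illustrative sketch (not part of the original bindings): filling an SQE
 * for a plain read into `buf` at `offset`. The union fields are reached
 * through the `using` names above.
 */
sqe_prep_read :: proc "contextless" (sqe: ^io_uring_sqe, fd: i32, buf: []u8, offset: u64, user_data: u64) {
	sqe^ = {} // zero the whole entry, unions included
	sqe.opcode = .READ
	sqe.fd = fd
	sqe.off = offset                       // __offset union: file offset
	sqe.addr = u64(uintptr(raw_data(buf))) // __iovecs union: destination buffer
	sqe.len = u32(len(buf))
	sqe.user_data = user_data
}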

// Completion queue entry.
io_uring_cqe :: struct {
	user_data: u64, /* sq.user_data submission passed back */
	res:       i32, /* result code for this event */
	flags:     u32,
	/*
	 * If the ring is initialized with IORING_SETUP_CQE32, then this field
	 * contains 16-bytes of padding, doubling the size of the CQE.
	 * NOTE: This is currently not supported.
	 */
	// big_cqe: [^]u64,
}
#assert(size_of(io_uring_cqe) == 16)

/*
 * sqe.flags
 */
/* use fixed fileset */
IOSQE_FIXED_FILE: u32 : (1 << 0)
/* issue after inflight IO */
IOSQE_IO_DRAIN: u32 : (1 << 1)
/* links next sqe */
IOSQE_IO_LINK: u32 : (1 << 2)
/* like LINK, but stronger */
IOSQE_IO_HARDLINK: u32 : (1 << 3)
/* always go async */
IOSQE_ASYNC: u32 : (1 << 4)
/* select buffer from sq.buf_group */
IOSQE_BUFFER_SELECT: u32 : (1 << 5)
/* don't post CQE if request succeeded */
IOSQE_CQE_SKIP_SUCCESS: u32 : (1 << 6)
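
/*
 * Illustrative sketch (not part of the original bindings): IOSQE flags are
 * plain bits in the u8 sqe.flags field. Marking an SQE as linked means the
 * SQE in the next ring slot only starts after this one completes.
 */
sqe_link_to_next :: proc "contextless" (sqe: ^io_uring_sqe) {
	sqe.flags |= u8(IOSQE_IO_LINK) // chain the following sqe to this one
}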

/*
 * io_uring_setup() flags
 */
IORING_SETUP_IOPOLL:     u32 : (1 << 0) /* io_context is polled */
IORING_SETUP_SQPOLL:     u32 : (1 << 1) /* SQ poll thread */
IORING_SETUP_SQ_AFF:     u32 : (1 << 2) /* sq_thread_cpu is valid */
IORING_SETUP_CQSIZE:     u32 : (1 << 3) /* app defines CQ size */
IORING_SETUP_CLAMP:      u32 : (1 << 4) /* clamp SQ/CQ ring sizes */
IORING_SETUP_ATTACH_WQ:  u32 : (1 << 5) /* attach to existing wq */
IORING_SETUP_R_DISABLED: u32 : (1 << 6) /* start with ring disabled */
IORING_SETUP_SUBMIT_ALL: u32 : (1 << 7) /* continue submit on error */
// Cooperative task running. When requests complete, they often require
// forcing the submitter to transition to the kernel to complete. If this
// flag is set, work will be done when the task transitions anyway, rather
// than forcing an inter-processor interrupt reschedule. This avoids
// interrupting a task running in userspace and saves an IPI.
IORING_SETUP_COOP_TASKRUN: u32 : (1 << 8)
// If COOP_TASKRUN is set, get notified if task work is available for
// running and a kernel transition would be needed to run it. This sets
// IORING_SQ_TASKRUN in the sq ring flags. Only valid in combination with
// COOP_TASKRUN (or DEFER_TASKRUN).
IORING_SETUP_TASKRUN_FLAG: u32 : (1 << 9)
IORING_SETUP_SQE128: u32 : (1 << 10) /* SQEs are 128 byte */
IORING_SETUP_CQE32:  u32 : (1 << 11) /* CQEs are 32 byte */
// Only one task is allowed to submit requests.
IORING_SETUP_SINGLE_ISSUER: u32 : (1 << 12)
// Defer running task work to get events.
// Rather than running bits of task work whenever the task transitions,
// try to do it just before it is needed.
IORING_SETUP_DEFER_TASKRUN: u32 : (1 << 13)
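
/*
 * Illustrative sketch (not part of the kernel header): requesting a larger
 * completion ring than the default via IORING_SETUP_CQSIZE; cq_entries is
 * only honored when that flag is set.
 */
params_with_big_cq :: proc "contextless" (cq_entries: u32) -> io_uring_params {
	params: io_uring_params
	params.flags = IORING_SETUP_CQSIZE
	params.cq_entries = cq_entries
	return params
}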

/*
 * sqe.uring_cmd_flags
 *
 * IORING_URING_CMD_FIXED	use registered buffer; pass this flag
 *				along with setting sqe.buf_index.
 */
IORING_URING_CMD_FIXED: u32 : (1 << 0)

/*
 * sqe.fsync_flags
 */
IORING_FSYNC_DATASYNC: u32 : (1 << 0)

/*
 * sqe.timeout_flags
 */
IORING_TIMEOUT_ABS:           u32 : (1 << 0)
IORING_TIMEOUT_UPDATE:        u32 : (1 << 1)
IORING_TIMEOUT_BOOTTIME:      u32 : (1 << 2)
IORING_TIMEOUT_REALTIME:      u32 : (1 << 3)
IORING_LINK_TIMEOUT_UPDATE:   u32 : (1 << 4)
IORING_TIMEOUT_ETIME_SUCCESS: u32 : (1 << 5)
IORING_TIMEOUT_CLOCK_MASK:    u32 : (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
IORING_TIMEOUT_UPDATE_MASK:   u32 : (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)

/*
 * sq_ring.flags
 */
IORING_SQ_NEED_WAKEUP: u32 : (1 << 0) /* needs io_uring_enter wakeup */
IORING_SQ_CQ_OVERFLOW: u32 : (1 << 1) /* CQ ring is overflown */
IORING_SQ_TASKRUN:     u32 : (1 << 2) /* task should enter the kernel */

/*
 * sqe.splice_flags
 * extends splice(2) flags
 */
SPLICE_F_FD_IN_FIXED: u32 : (1 << 31) /* the last bit of __u32 */

/*
 * POLL_ADD flags. Note that since sqe.poll_events is the flag space, the
 * command flags for POLL_ADD are stored in sqe.len.
 *
 * IORING_POLL_ADD_MULTI	Multishot poll. Sets IORING_CQE_F_MORE if
 *				the poll handler will continue to report
 *				CQEs on behalf of the same SQE.
 *
 * IORING_POLL_UPDATE		Update existing poll request, matching
 *				sqe.addr as the old user_data field.
 *
 * IORING_POLL_LEVEL		Level triggered poll.
 */
IORING_POLL_ADD_MULTI:        u32 : (1 << 0)
IORING_POLL_UPDATE_EVENTS:    u32 : (1 << 1)
IORING_POLL_UPDATE_USER_DATA: u32 : (1 << 2)
IORING_POLL_ADD_LEVEL:        u32 : (1 << 3)

IORing_Poll_Bits :: enum {
	ADD_MULTI,
	UPDATE_EVENTS,
	UPDATE_USER_DATA,
	ADD_LEVEL,
}

IORing_Poll_Flags :: bit_set[IORing_Poll_Bits; u32]
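
/*
 * Illustrative sketch (not part of the original bindings): the bit_set is
 * laid out so each bit position matches the corresponding IORING_POLL_*
 * constant, e.g. IORing_Poll_Flags{.ADD_MULTI} has the same representation
 * as IORING_POLL_ADD_MULTI.
 */
poll_flags_to_u32 :: proc "contextless" (flags: IORing_Poll_Flags) -> u32 {
	return transmute(u32)flags // same bits, reinterpreted for the kernel
}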

/*
 * send/sendmsg and recv/recvmsg flags (sq.ioprio)
 *
 * IORING_RECVSEND_POLL_FIRST	If set, instead of first attempting to send
 *				or receive and arm poll if that yields an
 *				-EAGAIN result, arm poll upfront and skip
 *				the initial transfer attempt.
 *
 * IORING_RECV_MULTISHOT	Multishot recv. Sets IORING_CQE_F_MORE if
 *				the handler will continue to report
 *				CQEs on behalf of the same SQE.
 *
 * IORING_RECVSEND_FIXED_BUF	Use registered buffers, the index is stored in
 *				the buf_index field.
 *
 * IORING_SEND_ZC_REPORT_USAGE
 *				If set, SEND[MSG]_ZC should report
 *				the zerocopy usage in cqe.res
 *				for the IORING_CQE_F_NOTIF cqe.
 *				0 is reported if zerocopy was actually possible.
 *				IORING_NOTIF_USAGE_ZC_COPIED if data was copied
 *				(at least partially).
 */
IORING_RECVSEND_POLL_FIRST:  u32 : (1 << 0)
IORING_RECV_MULTISHOT:       u32 : (1 << 1)
IORING_RECVSEND_FIXED_BUF:   u32 : (1 << 2)
IORING_SEND_ZC_REPORT_USAGE: u32 : (1 << 3)

/*
 * cqe.res for IORING_CQE_F_NOTIF if
 * IORING_SEND_ZC_REPORT_USAGE was requested.
 *
 * It should be treated as a flag; all other
 * bits of cqe.res should be treated as reserved!
 */
IORING_NOTIF_USAGE_ZC_COPIED: u32 : (1 << 31)

/*
 * accept flags stored in sq.ioprio
 */
IORING_ACCEPT_MULTISHOT: u32 : (1 << 0)

/*
 * IORING_OP_MSG_RING command types, stored in sq.addr
 */
IORING_MSG :: enum {
	DATA,    /* pass sq.len as 'res' and off as user_data */
	SEND_FD, /* send a registered fd to another ring */
}

/*
 * IORING_OP_MSG_RING flags (sq.msg_ring_flags)
 *
 * IORING_MSG_RING_CQE_SKIP	Don't post a CQE to the target ring. Not
 *				applicable for IORING_MSG_DATA, obviously.
 */
IORING_MSG_RING_CQE_SKIP: u32 : (1 << 0)
/* Pass through the flags from sq.file_index to cqe.flags */
IORING_MSG_RING_FLAGS_PASS: u32 : (1 << 1)

IORING_OP :: enum u8 {
	NOP,
	READV,
	WRITEV,
	FSYNC,
	READ_FIXED,
	WRITE_FIXED,
	POLL_ADD,
	POLL_REMOVE,
	SYNC_FILE_RANGE,
	SENDMSG,
	RECVMSG,
	TIMEOUT,
	TIMEOUT_REMOVE,
	ACCEPT,
	ASYNC_CANCEL,
	LINK_TIMEOUT,
	CONNECT,
	FALLOCATE,
	OPENAT,
	CLOSE,
	FILES_UPDATE,
	STATX,
	READ,
	WRITE,
	FADVISE,
	MADVISE,
	SEND,
	RECV,
	OPENAT2,
	EPOLL_CTL,
	SPLICE,
	PROVIDE_BUFFERS,
	REMOVE_BUFFERS,
	TEE,
	SHUTDOWN,
	RENAMEAT,
	UNLINKAT,
	MKDIRAT,
	SYMLINKAT,
	LINKAT,
	/* this goes last, obviously */
	LAST,
}

/*
 * sys_io_uring_register() opcodes and arguments.
 */
IORING_REGISTER :: enum u32 {
	REGISTER_BUFFERS       = 0,
	UNREGISTER_BUFFERS     = 1,
	REGISTER_FILES         = 2,
	UNREGISTER_FILES       = 3,
	REGISTER_EVENTFD       = 4,
	UNREGISTER_EVENTFD     = 5,
	REGISTER_FILES_UPDATE  = 6,
	REGISTER_EVENTFD_ASYNC = 7,
	REGISTER_PROBE         = 8,
	REGISTER_PERSONALITY   = 9,
	UNREGISTER_PERSONALITY = 10,
	REGISTER_RESTRICTIONS  = 11,
	REGISTER_ENABLE_RINGS  = 12,
	/* extended with tagging */
	REGISTER_FILES2         = 13,
	REGISTER_FILES_UPDATE2  = 14,
	REGISTER_BUFFERS2       = 15,
	REGISTER_BUFFERS_UPDATE = 16,
	/* set/clear io-wq thread affinities */
	REGISTER_IOWQ_AFF   = 17,
	UNREGISTER_IOWQ_AFF = 18,
	/* set/get max number of io-wq workers */
	REGISTER_IOWQ_MAX_WORKERS = 19,
	/* register/unregister io_uring fd with the ring */
	REGISTER_RING_FDS   = 20,
	UNREGISTER_RING_FDS = 21,
	/* register ring based provide buffer group */
	REGISTER_PBUF_RING   = 22,
	UNREGISTER_PBUF_RING = 23,
	/* sync cancelation API */
	REGISTER_SYNC_CANCEL = 24,
	/* register a range of fixed file slots for automatic slot allocation */
	REGISTER_FILE_ALLOC_RANGE = 25,
	/* this goes last */
	REGISTER_LAST,
	/* flag added to the opcode to use a registered ring fd */
	REGISTER_USE_REGISTERED_RING = 1 << 31,
}
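
/*
 * Illustrative sketch (not part of the original bindings): registering an
 * eventfd for completion notifications. `efd` is assumed to come from an
 * eventfd(2) binding elsewhere; the kernel reads the i32 behind the pointer.
 * Returns 0 on success or a negative errno.
 */
register_eventfd :: proc "contextless" (ring_fd: u32, efd: i32) -> int {
	efd := efd // copy so we can take the parameter's address
	return sys_io_uring_register(ring_fd, .REGISTER_EVENTFD, &efd, 1)
}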

IORING_FEAT_SINGLE_MMAP:     u32 : (1 << 0)
IORING_FEAT_NODROP:          u32 : (1 << 1)
IORING_FEAT_SUBMIT_STABLE:   u32 : (1 << 2)
IORING_FEAT_RW_CUR_POS:      u32 : (1 << 3)
IORING_FEAT_CUR_PERSONALITY: u32 : (1 << 4)
IORING_FEAT_FAST_POLL:       u32 : (1 << 5)
IORING_FEAT_POLL_32BITS:     u32 : (1 << 6)
IORING_FEAT_SQPOLL_NONFIXED: u32 : (1 << 7)
IORING_FEAT_EXT_ARG:         u32 : (1 << 8)
IORING_FEAT_NATIVE_WORKERS:  u32 : (1 << 9)
IORING_FEAT_RSRC_TAGS:       u32 : (1 << 10)

/*
 * cqe.flags
 *
 * IORING_CQE_F_BUFFER		If set, the upper 16 bits are the buffer ID
 * IORING_CQE_F_MORE		If set, parent SQE will generate more CQE entries
 * IORING_CQE_F_SOCK_NONEMPTY	If set, more data to read after socket recv
 * IORING_CQE_F_NOTIF		Set for notification CQEs. Can be used to
 *				distinguish them from sends.
 */
IORING_CQE_F_BUFFER:        u32 : (1 << 0)
IORING_CQE_F_MORE:          u32 : (1 << 1)
IORING_CQE_F_SOCK_NONEMPTY: u32 : (1 << 2)
IORING_CQE_F_NOTIF:         u32 : (1 << 3)

IORING_CQE :: enum {
	BUFFER_SHIFT = 16,
}
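
/*
 * Illustrative sketch (not part of the original bindings): when
 * IORING_CQE_F_BUFFER is set, the selected buffer ID sits in the upper 16
 * bits of cqe.flags.
 */
cqe_buffer_id :: proc "contextless" (cqe: ^io_uring_cqe) -> (id: u16, ok: bool) {
	if cqe.flags & IORING_CQE_F_BUFFER == 0 {
		return 0, false // no provided buffer was used for this completion
	}
	return u16(cqe.flags >> u32(IORING_CQE.BUFFER_SHIFT)), true
}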

/*
 * cq_ring.flags
 */
// disable eventfd notifications
IORING_CQ_EVENTFD_DISABLED: u32 : (1 << 0)

/*
 * io_uring_enter(2) flags
 */
IORING_ENTER_GETEVENTS:       u32 : (1 << 0)
IORING_ENTER_SQ_WAKEUP:       u32 : (1 << 1)
IORING_ENTER_SQ_WAIT:         u32 : (1 << 2)
IORING_ENTER_EXT_ARG:         u32 : (1 << 3)
IORING_ENTER_REGISTERED_RING: u32 : (1 << 4)

/*
 * Magic offsets for the application to mmap the data it needs
 */
IORING_OFF_SQ_RING:   uintptr : 0
IORING_OFF_CQ_RING:   u64 : 0x8000000
IORING_OFF_SQES:      uintptr : 0x10000000
IORING_OFF_PBUF_RING: u64 : 0x80000000
IORING_OFF_PBUF_SHIFT :: 16
IORING_OFF_MMAP_MASK: u64 : 0xf8000000
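
/*
 * Illustrative sketch (not part of the original bindings): mapping the SQ
 * ring with IORING_OFF_SQ_RING. The SYS_mmap/PROT_*/MAP_* values below are
 * the x86-64 ones, hardcoded only for this example; real code would use a
 * proper mmap binding (e.g. core:sys/linux). The raw syscall result is a
 * negative errno reinterpreted as uintptr on failure.
 */
mmap_sq_ring :: proc "contextless" (fd: u32, params: ^io_uring_params) -> rawptr {
	SYS_mmap: uintptr : 9 // x86-64 only
	PROT_READ    :: 0x1
	PROT_WRITE   :: 0x2
	MAP_SHARED   :: 0x01
	MAP_POPULATE :: 0x8000
	// The SQ index array is the last thing in the SQ ring mapping.
	size := uintptr(params.sq_off.array) + uintptr(params.sq_entries) * size_of(u32)
	ptr := intrinsics.syscall(
		SYS_mmap,
		0, // let the kernel pick the address
		size,
		PROT_READ | PROT_WRITE,
		MAP_SHARED | MAP_POPULATE,
		uintptr(fd),
		IORING_OFF_SQ_RING,
	)
	return rawptr(ptr)
}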

sys_io_uring_setup :: proc "contextless" (entries: u32, params: ^io_uring_params) -> int {
	return int(intrinsics.syscall(SYS_io_uring_setup, uintptr(entries), uintptr(params)))
}
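
/*
 * Illustrative sketch (not part of the original bindings): creating a ring
 * and checking a feature bit afterwards. A negative return value is -errno.
 */
setup_example :: proc "contextless" () -> (fd: int, single_mmap: bool) {
	params: io_uring_params
	fd = sys_io_uring_setup(8, &params) // entry count is rounded up to a power of two
	if fd < 0 {
		return fd, false
	}
	return fd, params.features & IORING_FEAT_SINGLE_MMAP != 0
}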

sys_io_uring_enter :: proc "contextless" (
	fd: u32,
	to_submit: u32,
	min_complete: u32,
	flags: u32,
	sig: ^sigset_t,
) -> int {
	return int(
		intrinsics.syscall(
			SYS_io_uring_enter,
			uintptr(fd),
			uintptr(to_submit),
			uintptr(min_complete),
			uintptr(flags),
			uintptr(sig),
			NSIG / 8 if sig != nil else 0,
		),
	)
}
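
/*
 * Illustrative sketch (not part of the original bindings): submitting one
 * SQE and blocking until at least one CQE arrives.
 */
submit_and_wait_one :: proc "contextless" (ring_fd: u32) -> int {
	return sys_io_uring_enter(ring_fd, 1, 1, IORING_ENTER_GETEVENTS, nil)
}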

sys_io_uring_register :: proc "contextless" (fd: u32, opcode: IORING_REGISTER, arg: rawptr, nr_args: u32) -> int {
	return int(intrinsics.syscall(SYS_io_uring_register, uintptr(fd), uintptr(opcode), uintptr(arg), uintptr(nr_args)))
}