315 lines
8.1 KiB
Groff
315 lines
8.1 KiB
Groff
.\" Copyright (C) 2019 Jens Axboe <axboe@kernel.dk>
|
|
.\" Copyright (C) 2019 Jon Corbet <corbet@lwn.net>
|
|
.\" Copyright (C) 2019 Red Hat, Inc.
|
|
.\"
|
|
.\" %%%LICENSE_START(LGPL_V2.1)
|
|
.\" This file is distributed according to the GNU Lesser General Public License.
|
|
.\" %%%LICENSE_END
|
|
.\"
|
|
.TH IO_URING_SETUP 2 2019-01-29 "Linux" "Linux Programmer's Manual"
|
|
.SH NAME
|
|
io_uring_setup \- set up a context for performing asynchronous I/O
|
|
.SH SYNOPSIS
|
|
.nf
|
|
.BR "#include <linux/io_uring.h>"
|
|
.PP
|
|
.BI "int io_uring_setup(u32 " entries ", struct io_uring_params *" p);
|
|
.fi
|
|
.PP
|
|
.SH DESCRIPTION
|
|
.PP
|
|
The io_uring_setup() system call sets up a submission queue (SQ) and
|
|
completion queue (CQ) with at least
|
|
.I entries
|
|
entries, and returns a file descriptor which can be used to perform
|
|
subsequent operations on the io_uring instance. The submission and
|
|
completion queues are shared between userspace and the kernel, which
|
|
eliminates the need to copy data when initiating and completing I/O.
|
|
|
|
.I p
|
|
is used by the application to pass options to the kernel, and by the
|
|
kernel to convey information about the ring buffers.
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
struct io_uring_params {
|
|
__u32 sq_entries;
|
|
__u32 cq_entries;
|
|
__u32 flags;
|
|
__u32 sq_thread_cpu;
|
|
__u32 sq_thread_idle;
|
|
__u32 resv[5];
|
|
struct io_sqring_offsets sq_off;
|
|
struct io_cqring_offsets cq_off;
|
|
};
|
|
.EE
|
|
.in
|
|
.PP
|
|
The
|
|
.IR flags ", " sq_thread_cpu
|
|
and
|
|
.I sq_thread_idle
|
|
fields are used to configure the io_uring instance.
|
|
.I flags
|
|
is a bit mask of 0 or more of the following values ORed
|
|
together:
|
|
.TP
|
|
.BR IORING_SETUP_IOPOLL
|
|
Perform busy-waiting for an I/O completion, as opposed to getting
|
|
notifications via an asynchronous IRQ (Interrupt Request). The file
|
|
system (if any) and block device must support polling in order for
|
|
this to work. Busy-waiting provides lower latency, but may consume
|
|
more CPU resources than interrupt driven I/O. Currently, this feature
|
|
is usable only on a file descriptor opened using the
|
|
.B O_DIRECT
|
|
flag. When a read or write is submitted to a polled context, the
|
|
application must poll for completions on the CQ ring by calling
|
|
.BR io_uring_enter (2).
|
|
It is illegal to mix and match polled and non-polled I/O on an io_uring
|
|
instance.
|
|
|
|
.TP
|
|
.BR IORING_SETUP_SQPOLL
|
|
When this flag is specified, a kernel thread is created to perform
|
|
submission queue polling. An io_uring instance configured in this way
|
|
enables an application to issue I/O without ever context switching
|
|
into the kernel. By using the submission queue to fill in new
|
|
submission queue entries and watching for completions on the
|
|
completion queue, the application can submit and reap I/Os without
|
|
doing a single system call.
|
|
|
|
If the kernel thread is idle for more than
|
|
.I sq_thread_idle
|
|
milliseconds, it will set the
|
|
.B IORING_SQ_NEED_WAKEUP
|
|
bit in the
|
|
.I flags
|
|
field of the
|
|
.IR "struct io_sq_ring" .
|
|
When this happens, the application must call
|
|
.BR io_uring_enter (2)
|
|
to wake the kernel thread. If I/O is kept busy, the kernel thread
|
|
will never sleep. An application making use of this feature will need
|
|
to guard the
|
|
.BR io_uring_enter (2)
|
|
call with the following code sequence:
|
|
|
|
.in +4n
|
|
.EX
|
|
read_barrier();
|
|
if (*sq_ring->flags & IORING_SQ_NEED_WAKEUP)
|
|
io_uring_enter(fd, 0, 0, IORING_ENTER_SQ_WAKEUP);
|
|
.EE
|
|
.in
|
|
|
|
where
|
|
.I sq_ring
|
|
is a submission queue ring set up using the
|
|
.I struct io_sqring_offsets
|
|
described below.
|
|
.IP
|
|
To successfully use this feature, the application must register a set of files
|
|
to be used for IO through
|
|
.BR io_uring_register (2)
|
|
using the
|
|
.B IORING_REGISTER_FILES
|
|
opcode. Failure to do so will result in submitted IO being errored with
|
|
.BR EBADF .
|
|
.TP
|
|
.BR IORING_SETUP_SQ_AFF
|
|
If this flag is specified, then the poll thread will be bound to the
|
|
CPU specified in the
|
|
.I sq_thread_cpu
|
|
field of the
|
|
.IR "struct io_uring_params" .
|
|
This flag is only meaningful when
|
|
.B IORING_SETUP_SQPOLL
|
|
is specified.
|
|
.PP
|
|
If no flags are specified, the io_uring instance is set up for
|
|
interrupt driven I/O. I/O may be submitted using
|
|
.BR io_uring_enter (2)
|
|
and can be reaped by polling the completion queue.
|
|
|
|
The
|
|
.I resv
|
|
array must be initialized to zero.
|
|
|
|
The rest of the fields in the
|
|
.I struct io_uring_params
|
|
are filled in by the kernel, and provide the information necessary to
|
|
memory map the submission queue, completion queue, and the array of
|
|
submission queue entries.
|
|
.I sq_entries
|
|
specifies the number of submission queue entries allocated.
|
|
.I sq_off
|
|
describes the offsets of various ring buffer fields:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
struct io_sqring_offsets {
|
|
__u32 head;
|
|
__u32 tail;
|
|
__u32 ring_mask;
|
|
__u32 ring_entries;
|
|
__u32 flags;
|
|
__u32 dropped;
|
|
__u32 array;
|
|
__u32 resv[3];
|
|
};
|
|
.EE
|
|
.in
|
|
.PP
|
|
Taken together,
|
|
.I sq_entries
|
|
and
|
|
.I sq_off
|
|
provide all of the information necessary for accessing the submission
|
|
queue ring buffer and the submission queue entry array. The
|
|
submission queue can be mapped with a call like:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
ptr = mmap(0, sq_off.array + sq_entries * sizeof(__u32),
|
|
PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE,
|
|
ring_fd, IORING_OFF_SQ_RING);
|
|
.EE
|
|
.in
|
|
.PP
|
|
Where sq_off is the
|
|
.I io_sqring_offsets
|
|
structure, and
|
|
.I ring_fd
|
|
is the file descriptor returned from
|
|
.BR io_uring_setup (2).
|
|
The addition of
|
|
.I sq_off.array
|
|
to the length of the region accounts for the fact that the ring
|
|
is located at the end of the data structure. As an example, the ring
|
|
buffer head pointer can be accessed by adding
|
|
.I sq_off.head
|
|
to the address returned from
|
|
.BR mmap (2):
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
head = ptr + sq_off.head;
|
|
.EE
|
|
.in
|
|
|
|
The
|
|
.I flags
|
|
field is used by the kernel to communicate state information to the
|
|
application. Currently, it is used to inform the application when a
|
|
call to
|
|
.BR io_uring_enter (2)
|
|
is necessary. See the documentation for the
|
|
.B IORING_SETUP_SQPOLL
|
|
flag above.
|
|
The
|
|
.I dropped
|
|
member is incremented for each invalid submission queue entry
|
|
encountered in the ring buffer.
|
|
|
|
The head and tail track the ring buffer state. The tail is
|
|
incremented by the application when submitting new I/O, and the head
|
|
is incremented by the kernel when the I/O has been successfully
|
|
submitted. Determining the index of the head or tail into the ring is
|
|
accomplished by applying a mask:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
index = tail & ring_mask;
|
|
.EE
|
|
.in
|
|
.PP
|
|
The array of submission queue entries is mapped with:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
sqentries = mmap(0, sq_entries * sizeof(struct io_uring_sqe),
|
|
PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE,
|
|
ring_fd, IORING_OFF_SQES);
|
|
.EE
|
|
.in
|
|
.PP
|
|
The completion queue is described by
|
|
.I cq_entries
|
|
and
|
|
.IR cq_off ,
|
|
shown here:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
struct io_cqring_offsets {
|
|
__u32 head;
|
|
__u32 tail;
|
|
__u32 ring_mask;
|
|
__u32 ring_entries;
|
|
__u32 overflow;
|
|
__u32 cqes;
|
|
__u32 resv[4];
|
|
};
|
|
.EE
|
|
.in
|
|
.PP
|
|
The completion queue is simpler, since the entries are not separated
|
|
from the queue itself, and can be mapped with:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
ptr = mmap(0, cq_off.cqes + cq_entries * sizeof(struct io_uring_cqe),
|
|
PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE, ring_fd,
|
|
IORING_OFF_CQ_RING);
|
|
.EE
|
|
.in
|
|
.PP
|
|
Closing the fd returned by io_uring_setup(2) will free all resources
|
|
associated with the io_uring context.
|
|
.PP
|
|
.SH RETURN VALUE
|
|
io_uring_setup() returns a new file descriptor on success. The
|
|
application may then provide the file descriptor in a subsequent mmap
|
|
call to map the submission and completion queues, or to the
|
|
io_uring_register or io_uring_enter system calls.
|
|
|
|
On error, \-1 is returned and \fIerrno\fP is set appropriately.
|
|
.PP
|
|
.SH ERRORS
|
|
.TP
|
|
.BR EFAULT
|
|
\fIp\fP is outside your accessible address space.
|
|
.TP
|
|
.BR EINVAL
|
|
The resv array contains non-zero data, p.flags contains an unsupported
|
|
flag,
|
|
.I entries
|
|
is out of bounds, or
|
|
.B IORING_SETUP_SQ_AFF
|
|
was specified, but
|
|
.B IORING_SETUP_SQPOLL
|
|
was not.
|
|
.TP
|
|
.BR EMFILE
|
|
The per-process limit on the number of open file descriptors has been
|
|
reached (see the description of
|
|
.B RLIMIT_NOFILE
|
|
in
|
|
.BR getrlimit (2)).
|
|
.TP
|
|
.BR ENFILE
|
|
The system-wide limit on the total number of open files has been
|
|
reached.
|
|
.TP
|
|
.BR ENOMEM
|
|
Insufficient kernel resources are available.
|
|
.TP
|
|
.BR EPERM
|
|
.B IORING_SETUP_SQPOLL
|
|
was specified, but the effective user ID of the caller did not have sufficient
|
|
privileges.
|
|
.SH SEE ALSO
|
|
.BR io_uring_register (2),
|
|
.BR io_uring_enter (2)
|