A deeper understanding of the libaio interface

Keywords: Linux iOS

Most people know how to use the libaio interface, but how does it interact with the kernel, and what mechanism does the kernel use underneath? Let's follow the main code path together.

Entry point: the system calls

Required header files

#include <errno.h>
#include <sys/syscall.h>
#include <unistd.h>

Main functions:

/* Actual syscalls */
/*
 * io_setup - thin wrapper around the io_setup system call.
 *
 * Passes maxevents and ctxp straight through to syscall() under the
 * __NR_io_setup number and returns whatever syscall() returns.
 */
int io_setup(int maxevents, io_context_t *ctxp) {
    return syscall(__NR_io_setup, maxevents, ctxp);
}

/*
 * io_destroy - thin wrapper around the io_destroy system call.
 *
 * Passes ctx straight through to syscall() under the __NR_io_destroy
 * number and returns whatever syscall() returns.
 */
int io_destroy(io_context_t ctx) {
    return syscall(__NR_io_destroy, ctx);
}

/*
 * io_submit - thin wrapper around the io_submit system call.
 *
 * Passes ctx, nr and the ios array straight through to syscall() under the
 * __NR_io_submit number and returns whatever syscall() returns.
 */
int io_submit(io_context_t ctx, long nr, struct iocb *ios[]) {
    return syscall(__NR_io_submit, ctx, nr, ios);
}

/*
 * io_cancel - thin wrapper around the io_cancel system call.
 *
 * Passes ctx, iocb and evt straight through to syscall() under the
 * __NR_io_cancel number and returns whatever syscall() returns.
 */
int io_cancel(io_context_t ctx, struct iocb *iocb, struct io_event *evt) {
    return syscall(__NR_io_cancel, ctx, iocb, evt);
}

(Open question to follow up on: how does io_getevents() work?)

Implementation of system call

So what kernel code do the system call numbers above correspond to? First, some background on how system calls enter the kernel:

The x86 architecture has quite a few different ways to jump into
kernel code. Most of these entry points are registered in
arch/x86/kernel/traps.c and implemented in arch/x86/entry/entry_64.S
for 64-bit, arch/x86/entry/entry_32.S for 32-bit and finally
arch/x86/entry/entry_64_compat.S which implements the 32-bit compatibility
syscall entry points and thus provides for 32-bit processes the
ability to execute syscalls when running on 64-bit kernels.

The IDT vector assignments are listed in arch/x86/include/asm/irq_vectors.h.

Some of these entries are:

  • system_call: syscall instruction from 64-bit code.

  • entry_INT80_compat: int 0x80 from 32-bit or 64-bit code; compat syscall
    either way.

  • entry_INT80_compat, ia32_sysenter: syscall and sysenter from 32-bit
    code.

  • interrupt: An array of entries. Every IDT vector that doesn't
    explicitly point somewhere else gets set to the corresponding
    value in interrupts. These point to a whole array of
    magically-generated functions that make their way to do_IRQ with
    the interrupt number as a parameter.

  • APIC interrupts: Various special-purpose interrupts for things
    like TLB shootdown.

  • Architecturally-defined exceptions like divide_error.

    Next, take a look at the system call numbers used by the asynchronous I/O system calls, as defined in arch/ia64/include/uapi/asm/unistd.h:

    /* System call numbers assigned to the AIO (io_*) and epoll calls. */
    #define __NR_io_setup                   1238
    #define __NR_io_destroy                 1239
    #define __NR_io_getevents               1240
    #define __NR_io_submit                  1241
    #define __NR_io_cancel                  1242
    #define __NR_epoll_create               1243
    #define __NR_epoll_ctl                  1244
    #define __NR_epoll_wait                 1245

So how is a system call number mapped to the function that implements it?

The common entry point

         * Interrupts are off on entry.
         * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
         * it is too small to ever cause noticeable irq latency.

         * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it
         * is not required to switch CR3.
        movq    %rsp, PER_CPU_VAR(rsp_scratch)
        movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp

        /* Construct struct pt_regs on stack */
        pushq   $__USER_DS                      /* pt_regs->ss */
        pushq   PER_CPU_VAR(rsp_scratch)        /* pt_regs->sp */
        pushq   %r11                            /* pt_regs->flags */
        pushq   $__USER_CS                      /* pt_regs->cs */
        pushq   %rcx                            /* pt_regs->ip */
        pushq   %rax                            /* pt_regs->orig_ax */



        /* IRQs are off. */
        movq    %rax, %rdi
        movq    %rsp, %rsi
        call    do_syscall_64           /* returns with IRQs disabled */

        TRACE_IRQS_IRETQ                /* we're about to change IF */

Looking up the handler function in the system call table

#ifdef CONFIG_X86_64
__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
        struct thread_info *ti;

        ti = current_thread_info();
        if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
                nr = syscall_trace_enter(regs);

         * NB: Native and x32 syscalls are dispatched from the same
         * table.  The only functional difference is the x32 bit in
         * regs->orig_ax, which changes the behavior of some syscalls.
        nr &= __SYSCALL_MASK;
        if (likely(nr < NR_syscalls)) {
                nr = array_index_nospec(nr, NR_syscalls);
                regs->ax = sys_call_table[nr](regs);


Where is sys_call_table defined? In include/uapi/asm-generic/unistd.h:

/*
 * Table entries for the five AIO system calls. Each #define assigns a
 * syscall number; the macro line that follows it names the handler
 * function(s) associated with that number.
 *
 * NOTE(review): __SYSCALL and __SC_COMP are defined outside this excerpt.
 * Presumably the third argument of __SC_COMP is the 32-bit compat handler;
 * confirm against the header before relying on it.
 */
#define __NR_io_setup 0
__SC_COMP(__NR_io_setup, sys_io_setup, compat_sys_io_setup)
#define __NR_io_destroy 1
__SYSCALL(__NR_io_destroy, sys_io_destroy)
#define __NR_io_submit 2
__SC_COMP(__NR_io_submit, sys_io_submit, compat_sys_io_submit)
#define __NR_io_cancel 3
__SYSCALL(__NR_io_cancel, sys_io_cancel)
#define __NR_io_getevents 4
__SC_COMP(__NR_io_getevents, sys_io_getevents, compat_sys_io_getevents)

Calling the function that does the actual work: sys_io_submit in fs/aio.c

/* sys_io_submit:
 *      Queue the nr iocbs pointed to by iocbpp for processing.  Returns
 *      the number of iocbs queued.  May return -EINVAL if the aio_context
 *      specified by ctx_id is invalid, if nr is < 0, if the iocb at
 *      *iocbpp[0] is not properly initialized, if the operation specified
 *      is invalid for the file descriptor in the iocb.  May fail with
 *      -EFAULT if any of the data structures point to invalid data.  May
 *      fail with -EBADF if the file descriptor specified in the first
 *      iocb is invalid.  May fail with -EAGAIN if insufficient resources
 *      are available to queue any iocbs.  Will return 0 if nr is 0.  Will
 *      fail with -ENOSYS if not implemented.
SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
                struct iocb __user * __user *, iocbpp)
        struct kioctx *ctx;
        long ret = 0;
        int i = 0;
        struct blk_plug plug;

        if (unlikely(nr < 0))
                return -EINVAL;

        ctx = lookup_ioctx(ctx_id);
        if (unlikely(!ctx)) {
                pr_debug("EINVAL: invalid context id\n");
                return -EINVAL;

        if (nr > ctx->nr_events)
                nr = ctx->nr_events;

        for (i = 0; i < nr; i++) {
                struct iocb __user *user_iocb;

                if (unlikely(get_user(user_iocb, iocbpp + i))) {
                        ret = -EFAULT;

                ret = io_submit_one(ctx, user_iocb, false);
                if (ret)

Reference documents:


Posted by slamMan on Tue, 07 Jan 2020 06:26:34 -0800