【io_uring】内核源码分析(更新中)
文章目录
当前内容基于 Linux Kernel v5.4.121
io_uring
之前介绍过,io_uring 只增加了三个 Linux 系统调用,分别是 io_uring_setup、io_uring_enter 和 io_uring_register。
它们的入口都在 Linux 内核源码的 fs/io_uring.c 文件中,下面将逐个分析。
系统调用 io_uring_setup
io_uring_setup
的作用在用户库源码分析中有过介绍,主要是初始化 io_uring
结构体
io_uring_setup
/* * Sets up an aio uring context, and returns the fd. Applications asks for a * ring size, we return the actual sq/cq ring sizes (among other things) in the * params structure passed in. */ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) { struct io_uring_params p; long ret; int i; // 用户态拷贝到内核态 if (copy_from_user(&p, params, sizeof(p))) return -EFAULT; // 确认保留区域没有被赋值 for (i = 0; i < ARRAY_SIZE(p.resv); i++) { if (p.resv[i]) return -EINVAL; } // 检查 flags 参数 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF)) return -EINVAL; // 分配内存空间,创建 workqueue,创建 fd 等 ret = io_uring_create(entries, &p); if (ret < 0) return ret; // 内核态拷贝回用户态 if (copy_to_user(params, &p, sizeof(p))) return -EFAULT; return ret; } SYSCALL_DEFINE2(io_uring_setup, u32, entries, struct io_uring_params __user *, params) { return io_uring_setup(entries, params); }
可以看到 io_uring_setup
的核心函数是 io_uring_create
io_uring_create
/*
 * Build a ring context for the requested number of SQ entries and fill in
 * the ring geometry (sizes and field offsets) in @p.
 *
 * @entries: requested SQ size; must be non-zero and at most
 *           IORING_MAX_ENTRIES, and is rounded up to a power of two.
 * @p:       kernel copy of the setup parameters; sq_entries, cq_entries,
 *           sq_off, cq_off and features are written here.
 *
 * Returns the value of io_uring_get_fd() on success, or a negative errno:
 * -EINVAL for a bad @entries, -ENOMEM when the context or the credentials
 * cannot be obtained, or whatever error a setup helper reports.
 */
static int io_uring_create(unsigned entries, struct io_uring_params *p)
{
	struct user_struct *user = NULL;
	struct io_ring_ctx *ctx;
	bool account_mem;
	int ret;

	if (!entries || entries > IORING_MAX_ENTRIES)
		return -EINVAL;

	/*
	 * Use twice as many entries for the CQ ring. It's possible for the
	 * application to drive a higher depth than the size of the SQ ring,
	 * since the sqes are only used at submission time. This allows for
	 * some flexibility in overcommitting a bit.
	 */
	p->sq_entries = roundup_pow_of_two(entries);
	p->cq_entries = 2 * p->sq_entries;

	user = get_uid(current_user());
	/* Accounting is only needed when the caller lacks CAP_IPC_LOCK. */
	account_mem = !capable(CAP_IPC_LOCK);

	if (account_mem) {
		/*
		 * Charge the ring pages to the user; presumably this counts
		 * against the user's locked-memory limit - confirm in
		 * io_account_mem().
		 */
		ret = io_account_mem(user,
				ring_pages(p->sq_entries, p->cq_entries));
		if (ret) {
			free_uid(user);
			return ret;
		}
	}

	ctx = io_ring_ctx_alloc(p);
	if (!ctx) {
		/* No context exists yet, so undo accounting and uid ref here. */
		if (account_mem)
			io_unaccount_mem(user, ring_pages(p->sq_entries,
								p->cq_entries));
		free_uid(user);
		return -ENOMEM;
	}
	ctx->compat = in_compat_syscall();
	/*
	 * From here on the uid reference and the accounting decision are
	 * recorded in ctx; the err path below only tears down ctx, so it is
	 * presumably responsible for releasing them - verify in
	 * io_ring_ctx_wait_and_kill().
	 */
	ctx->account_mem = account_mem;
	ctx->user = user;

	ctx->creds = get_current_cred();
	if (!ctx->creds) {
		ret = -ENOMEM;
		goto err;
	}

	/* Allocate the io_rings structure and the SQE array. */
	ret = io_allocate_scq_urings(ctx, p);
	if (ret)
		goto err;

	/*
	 * Set up the workqueues and, when IORING_SETUP_SQPOLL is requested,
	 * the kernel thread that polls the SQ.
	 */
	ret = io_sq_offload_start(ctx, p);
	if (ret)
		goto err;

	/* Report where each SQ ring field lives inside struct io_rings. */
	memset(&p->sq_off, 0, sizeof(p->sq_off));
	p->sq_off.head = offsetof(struct io_rings, sq.head);
	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
	/* The SQ index array offset is computed from the live pointers. */
	p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;

	/* Same for the CQ ring fields. */
	memset(&p->cq_off, 0, sizeof(p->cq_off));
	p->cq_off.head = offsetof(struct io_rings, cq.head);
	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
	p->cq_off.cqes = offsetof(struct io_rings, cqes);

	/*
	 * Install ring fd as the very last thing, so we don't risk someone
	 * having closed it before we finish setup
	 */
	/* Create the fd through which user space reaches ctx. */
	ret = io_uring_get_fd(ctx);
	if (ret < 0)
		goto err;

	p->features = IORING_FEAT_SINGLE_MMAP;
	return ret;
err:
	io_ring_ctx_wait_and_kill(ctx);
	return ret;
}
- io_ring_ctx_alloc 主要用来申请空间,初始化列表头、互斥锁、自旋锁等结构
- io_allocate_scq_urings 用来初始化整个 struct io_rings *rings,包括 SQ、CQ 头尾指针的初始化,以及 SQE、CQE 的初始化
  - 不同的是,SQ、CQ 头尾指针以及 CQE 都在 struct io_rings *rings 结构体中
  - 而 SQE 则是在 struct io_ring_ctx *ctx 结构体中
- io_sq_offload_start 会根据用户通过 io_uring_setup 传递的 flags 来配置 io_uring 的运行方式,后续详细展开
- io_uring_get_fd 将 struct io_ring_ctx *ctx 暴露给用户态访问
io_allocate_scq_urings
来初始化整个struct io_rings *rings
,包括SQ
、CQ
头尾指针的初始化,以及SQE
、CQE
的初始化
io_sq_offload_start
/*
 * Start the submission-side helpers for a ring context: optionally a kernel
 * thread running io_sq_thread, plus two workqueues stored in ctx->sqo_wq[].
 *
 * @ctx: ring context being set up.
 * @p:   setup parameters; flags, sq_thread_idle and sq_thread_cpu are read.
 *
 * Returns 0 on success. On failure returns -EPERM (SQPOLL without
 * CAP_SYS_ADMIN), -EINVAL (bad or offline CPU, or SQ_AFF without SQPOLL),
 * -ENOMEM (workqueue allocation failed) or the error from thread creation.
 */
static int io_sq_offload_start(struct io_ring_ctx *ctx,
			       struct io_uring_params *p)
{
	int ret;

	/* Take a reference on the caller's mm and remember it in the context. */
	mmgrab(current->mm);
	ctx->sqo_mm = current->mm;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		/* IORING_SETUP_SQPOLL creates a kernel thread that polls the SQ. */
		ret = -EPERM;
		if (!capable(CAP_SYS_ADMIN))
			goto err;

		/* An idle time of zero falls back to HZ jiffies. */
		ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
		if (!ctx->sq_thread_idle)
			ctx->sq_thread_idle = HZ;

		if (p->flags & IORING_SETUP_SQ_AFF) {
			/* The thread is created on the CPU the caller asked for. */
			int cpu = p->sq_thread_cpu;

			ret = -EINVAL;
			if (cpu >= nr_cpu_ids)
				goto err;
			if (!cpu_online(cpu))
				goto err;

			ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
							ctx, cpu,
							"io_uring-sq");
		} else {
			ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
							"io_uring-sq");
		}
		if (IS_ERR(ctx->sqo_thread)) {
			ret = PTR_ERR(ctx->sqo_thread);
			/* Clear the error pointer before the shared unwind runs. */
			ctx->sqo_thread = NULL;
			goto err;
		}
		wake_up_process(ctx->sqo_thread);
	} else if (p->flags & IORING_SETUP_SQ_AFF) {
		/* Can't have SQ_AFF without SQPOLL */
		ret = -EINVAL;
		goto err;
	}

	/* Do QD, or 2 * CPUS, whatever is smallest */
	ctx->sqo_wq[0] = alloc_workqueue("io_ring-wq",
			WQ_UNBOUND | WQ_FREEZABLE,
			min(ctx->sq_entries - 1, 2 * num_online_cpus()));
	if (!ctx->sqo_wq[0]) {
		ret = -ENOMEM;
		goto err;
	}

	/*
	 * This is for buffered writes, where we want to limit the parallelism
	 * due to file locking in file systems. As "normal" buffered writes
	 * should parallelize on writeout quite nicely, limit us to having 2
	 * pending. This avoids massive contention on the inode when doing
	 * buffered async writes.
	 */
	/* Cap the buffered-write workqueue depth, presumably to cut lock contention. */
	ctx->sqo_wq[1] = alloc_workqueue("io_ring-write-wq",
						WQ_UNBOUND | WQ_FREEZABLE, 2);
	if (!ctx->sqo_wq[1]) {
		ret = -ENOMEM;
		goto err;
	}

	return 0;
err:
	/* Shared unwind: undo whatever was started, then drop the mm reference. */
	io_finish_async(ctx);
	mmdrop(ctx->sqo_mm);
	ctx->sqo_mm = NULL;
	return ret;
}
当 flags
中配置了 IORING_SETUP_SQPOLL
时,将启动一个单独的内核线程 io_sq_thread
,而当 IORING_SETUP_SQ_AFF
字段也配置时,将根据 sq_thread_cpu
字段,在指定的 CPU 上启用内核线程 io_sq_thread
同时该函数还会创建两个工作队列 ctx->sqo_wq[2]
分别名为 io_ring-wq
和 io_ring-write-wq
- io_ring-wq 主要处理读 IO,以及 direct 写 IO
- io_ring-write-wq 主要处理 buffer 写 IO
系统调用 io_uring_enter
/*
 * io_uring_enter: submit up to @to_submit SQEs on ring @fd and, when
 * IORING_ENTER_GETEVENTS is set, wait for (or poll for) up to @min_complete
 * completions.
 *
 * Returns the submitted count when it is non-zero; otherwise the wait/poll
 * result, or -EINVAL (unknown flag), -EBADF (bad fd), -EOPNOTSUPP (fd is not
 * an io_uring file) or -ENXIO (context reference could not be taken).
 */
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
		u32, min_complete, u32, flags, const sigset_t __user *, sig,
		size_t, sigsz)
{
	struct io_ring_ctx *ctx;
	struct fd ring;
	int submitted = 0;
	long ret;

	if ((flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP)) != 0)
		return -EINVAL;

	ring = fdget(fd);
	if (!ring.file)
		return -EBADF;

	/* Only files backed by io_uring_fops carry a ring context. */
	if (ring.file->f_op != &io_uring_fops) {
		ret = -EOPNOTSUPP;
		goto out_fput;
	}

	ctx = ring.file->private_data;
	if (!percpu_ref_tryget(&ctx->refs)) {
		ret = -ENXIO;
		goto out_fput;
	}

	ret = 0;

	/*
	 * For SQ polling, the thread will do all submissions and completions.
	 * Just return the requested submit count, and wake the thread if
	 * we were asked to.
	 */
	if (ctx->flags & IORING_SETUP_SQPOLL) {
		if (flags & IORING_ENTER_SQ_WAKEUP)
			wake_up(&ctx->sqo_wait);
		submitted = to_submit;
	} else if (to_submit) {
		/* Never try to submit more than the SQ ring can hold. */
		to_submit = min(to_submit, ctx->sq_entries);

		mutex_lock(&ctx->uring_lock);
		submitted = io_ring_submit(ctx, to_submit);
		mutex_unlock(&ctx->uring_lock);

		/* A short submission skips the completion wait entirely. */
		if (submitted != to_submit)
			goto out;
	}

	if (flags & IORING_ENTER_GETEVENTS) {
		unsigned nr_events = 0;

		min_complete = min(min_complete, ctx->cq_entries);

		/* IOPOLL rings poll for completions; others wait on the CQ ring. */
		if (ctx->flags & IORING_SETUP_IOPOLL)
			ret = io_iopoll_check(ctx, &nr_events, min_complete);
		else
			ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
	}

out:
	percpu_ref_put(&ctx->refs);
out_fput:
	fdput(ring);
	/* A non-zero submit count takes precedence over the wait result. */
	if (submitted)
		return submitted;
	return ret;
}
TODO
系统调用 io_uring_register
/*
 * io_uring_register: run registration operation @opcode with @arg/@nr_args
 * against the ring behind @fd, under the ring's uring_lock.
 *
 * Returns the result of __io_uring_register(), -EBADF when @fd is not an
 * open file, or -EOPNOTSUPP when the file is not an io_uring file.
 */
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	struct fd ring;
	long ret = -EOPNOTSUPP;

	ring = fdget(fd);
	if (!ring.file)
		return -EBADF;

	/* Only files backed by io_uring_fops carry a ring context. */
	if (ring.file->f_op == &io_uring_fops) {
		ctx = ring.file->private_data;

		mutex_lock(&ctx->uring_lock);
		ret = __io_uring_register(ctx, opcode, arg, nr_args);
		mutex_unlock(&ctx->uring_lock);
	}

	fdput(ring);
	return ret;
}
TODO
内核线程 io_sq_thread
TODO
IOPOLL
模式
启用
当 io_uring_setup
初始化时 flags
配置了 IORING_SETUP_IOPOLL
字段后将开启 IOPOLL
模式
限制
开启此选项必须保证后续只用 O_DIRECT
打开文件并且文件系统的 file_operations
中注册了 iopoll
函数,否则 IO 将下发失败
调用栈
开启后内核将调用注册的 iopoll
函数来主动轮询设备驱动确认 IO 是否完成
对 f_op->iopoll
函数调用关系进行了分析
主要有三条调用路线(所有调用逻辑都会判断是否在初始化时配置了 IORING_SETUP_IOPOLL
):
- io_uring 销毁时需要调用
- 系统调用 io_uring_enter 将会触发,用于轮询 IO 完成情况,直到达到指定的 wait_nr 数量的 IO 完成后才会退出轮询
- 当初始化时同时配置了 IORING_SETUP_SQPOLL 时,由 io_sq_thread 内核线程触发,当存在未完成的 IO 时调用,用于更新 IO 完成情况(io_do_iopoll 的参数 min = 0,即每次调用无论是否有新完成的 IO 都会退出轮询,不会阻塞线程)
本文作者: ywang_wnlo
本文链接: https://ywang-wnlo.github.io/posts/4f0d345c.html
版权声明: 本博客所有文章除特别声明外,均采用 BY-NC-SA 许可协议。转载请注明出处!
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 单元测试从入门到精通
· 上周热点回顾(3.3-3.9)
· winform 绘制太阳,地球,月球 运作规律