linux nvme的sendfile流程
在nvme的硬盘上使用sendfile系统调用,到底需要经过哪些流程?
do_sendfile--->do_splice_direct-->splice_direct_to_actor--->do_splice_to 对于xfs,其实就是xfs_file_splice_read
xfs_file_splice_read--->generic_file_splice_read--->__generic_file_splice_read--->mapping->a_ops->readpage--->xfs_vm_readpage-->mpage_readpage--->submit_bio
在splice_direct_to_actor函数中,有一个while循环:每一轮先执行do_splice_to(把文件数据读入内部pipe),返回之后再调用actor回调,也就是direct_splice_actor,由它执行do_splice_from-->generic_splice_sendpage-->splice_from_pipe-->__splice_from_pipe-->
splice_from_pipe_feed-->pipe_to_sendpage-->sock_sendpage-->kernel_sendpage-->inet_sendpage-->udp_sendpage(我用的是udp)
堆栈如下:
0xffffffff816093ed : inet_sendpage+0x6d/0xe0 [kernel]
0xffffffff8156b0bb : kernel_sendpage+0x1b/0x30 [kernel]
0xffffffff8156b0f7 : sock_sendpage+0x27/0x30 [kernel]
0xffffffff812329c3 : pipe_to_sendpage+0x63/0xa0 [kernel]
0xffffffff812328be : splice_from_pipe_feed+0x7e/0x120 [kernel]
0xffffffff81232e8e : __splice_from_pipe+0x6e/0x90 [kernel]
0xffffffff8123483e : splice_from_pipe+0x5e/0x90 [kernel]
0xffffffff81234905 : generic_splice_sendpage+0x15/0x20 [kernel]
0xffffffff8123368d : do_splice_from+0xad/0xf0 [kernel]
0xffffffff812336f0 : direct_splice_actor+0x20/0x30 [kernel]
0xffffffff81233424 : splice_direct_to_actor+0xd4/0x200 [kernel]
0xffffffff812335b2 : do_splice_direct+0x62/0x90 [kernel]
0xffffffff81203518 : do_sendfile+0x1d8/0x3c0 [kernel]
0xffffffff81204b6e : SyS_sendfile64+0x5e/0xb0 [kernel]
0xffffffff816b78c9 : system_call_fastpath+0x16/0x1b [kernel]
流程真长啊。
在2.6的内核中,generic_make_request会先调用__generic_make_request,然后__generic_make_request再调用q->make_request_fn 这个回调函数,
在3.10的内核中,generic_make_request 会直接回调 q->make_request_fn,针对nvme,多队列的这种情况,使用的是 blk_mq_make_request(blk_mq_requeue_work 只是处理requeue_list的工作队列函数,并不是make_request_fn).
submit_bio-->generic_make_request--->q->make_request_fn--->blk_mq_make_request
任务的执行:blk_mq_make_request--->blk_mq_run_hw_queue,blk_mq_map_request等。
static struct request *blk_mq_map_request(struct request_queue *q,
struct bio *bio,
struct blk_map_ctx *data)
{
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
struct request *rq;
int rw = bio_data_dir(bio);
struct blk_mq_alloc_data alloc_data;
blk_queue_enter_live(q);
ctx = blk_mq_get_ctx(q);
/*
 * Return the software queue context (ctx) of @q for the CPU id reported by
 * get_cpu().
 *
 * Software queues are assumed to be per-cpu here; that choice is hardcoded
 * for now, although per-node queues would also be possible. The ctx objects
 * are persistent, so the returned pointer stays usable even if the task
 * later runs on another CPU -- which also means the ctx is not guaranteed
 * to belong to the CPU the caller is currently executing on.
 *
 * NOTE(review): get_cpu() is expected to leave preemption disabled, so the
 * caller presumably has to pair this with the matching put helper -- confirm
 * against the tree this excerpt was taken from.
 */
static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
{
	unsigned int cpu = get_cpu();

	return __blk_mq_get_ctx(q, cpu);
}
/*
 * Return the software queue context of @q that belongs to CPU @cpu.
 *
 * This is a plain per_cpu_ptr() lookup on q->queue_ctx; @cpu is used exactly
 * as passed in by the caller.
 */
static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
						  unsigned int cpu)
{
	struct blk_mq_ctx *ctx = per_cpu_ptr(q->queue_ctx, cpu);

	return ctx;
}
在nvme中,如何把bio所进入的software queue(每个cpu一个的ctx),映射到对应的hardware queue(进而对应nvme的sq)呢?利用的是blk_mq_map_queue函数:
static struct blk_mq_ops nvme_mq_admin_ops = {
.queue_rq = nvme_queue_rq,------------------指定blk-mq向驱动提交request的函数
.complete = nvme_complete_rq,---------------完成队列处理
.map_queue = blk_mq_map_queue,--------------映射函数,将software queue和hardware queue对应
.init_hctx = nvme_admin_init_hctx,----------hardware Queue创建时调用,将NVMe Queue与Hardware Queue绑定
.exit_hctx = nvme_admin_exit_hctx,
.init_request = nvme_admin_init_request,----在分配Request时调用
.timeout = nvme_timeout,--------------------发生timeout时的调用
};
static struct blk_mq_ops nvme_mq_ops = {
.queue_rq = nvme_queue_rq,
.complete = nvme_complete_rq,
.map_queue = blk_mq_map_queue,----------映射函数
.init_hctx = nvme_init_hctx,
.init_request = nvme_init_request,
.timeout = nvme_timeout,
};
queue_rq指定blk-mq向驱动提交request的函数,map_queue定义如何将software queue和hardware queue对应,init_hctx是hardware Queue创建时调用(可以在这里将NVMe Queue与Hardware Queue绑定),init_request是在分配Request时调用,timeout是发生timeout时的调用。