linux驱动移植-linux块设备驱动blk-mq
在上一节我们介绍了块设备驱动的I/O读写流程,以及块设备的注册/卸载、通用磁盘的申请/删除/添加相关的API。
此外还有一部分相关重要的内容没有介绍,那就是通用块层request_queue、bio等相关的内容。
一、通用块层核心数据结构
1.1 request_queue
请求队列是由struct request_queue表示的,每一个gendisk对象都有一个request_queue对象,保存对该gendisk对象的所有请求。定义在include/linux/blkdev.h:
/*
 * struct request_queue - request queue of a block device.
 *
 * Each gendisk object has one request_queue that holds the requests issued
 * to that gendisk. Quoted from include/linux/blkdev.h.
 */
struct request_queue {
    /*
     * Together with queue_head for cacheline sharing
     */
    struct list_head    queue_head;
    struct request      *last_merge;
    struct elevator_queue   *elevator;  /* I/O scheduler (elevator) in use */

    struct blk_queue_stats  *stats;
    struct rq_qos       *rq_qos;

    make_request_fn     *make_request_fn;   /* callback invoked on bio submission */
    dma_drain_needed_fn *dma_drain_needed;

    const struct blk_mq_ops *mq_ops;    /* driver-provided blk-mq operations */

    /* sw queues */
    struct blk_mq_ctx __percpu  *queue_ctx;
    unsigned int        nr_queues;

    unsigned int        queue_depth;

    /* hw dispatch queues */
    struct blk_mq_hw_ctx    **queue_hw_ctx;
    unsigned int        nr_hw_queues;

    struct backing_dev_info *backing_dev_info;

    /*
     * The queue owner gets to use this for whatever they like.
     * ll_rw_blk doesn't touch it.
     */
    void            *queuedata;

    /*
     * various queue flags, see QUEUE_* below
     */
    unsigned long       queue_flags;
    /*
     * Number of contexts that have called blk_set_pm_only(). If this
     * counter is above zero then only RQF_PM and RQF_PREEMPT requests are
     * processed.
     */
    atomic_t        pm_only;

    /*
     * ida allocated id for this queue. Used to index queues from
     * ioctx.
     */
    int         id;

    /*
     * queue needs bounce pages for pages above this limit
     */
    gfp_t           bounce_gfp;

    spinlock_t      queue_lock;

    /*
     * queue kobject
     */
    struct kobject kobj;

    /*
     * mq queue kobject
     */
    struct kobject *mq_kobj;

#ifdef CONFIG_BLK_DEV_INTEGRITY
    struct blk_integrity integrity;
#endif /* CONFIG_BLK_DEV_INTEGRITY */

#ifdef CONFIG_PM
    struct device       *dev;
    int         rpm_status;
    unsigned int        nr_pending;
#endif

    /*
     * queue settings
     */
    unsigned long       nr_requests;    /* Max # of requests */

    unsigned int        dma_drain_size;
    void            *dma_drain_buffer;
    unsigned int        dma_pad_mask;
    unsigned int        dma_alignment;

    unsigned int        rq_timeout;
    int         poll_nsec;

    struct blk_stat_callback    *poll_cb;
    struct blk_rq_stat  poll_stat[BLK_MQ_POLL_STATS_BKTS];

    struct timer_list   timeout;
    struct work_struct  timeout_work;

    struct list_head    icq_list;
#ifdef CONFIG_BLK_CGROUP
    DECLARE_BITMAP      (blkcg_pols, BLKCG_MAX_POLS);
    struct blkcg_gq     *root_blkg;
    struct list_head    blkg_list;
#endif

    struct queue_limits limits;

#ifdef CONFIG_BLK_DEV_ZONED
    /*
     * Zoned block device information for request dispatch control.
     * nr_zones is the total number of zones of the device. This is always
     * 0 for regular block devices. seq_zones_bitmap is a bitmap of nr_zones
     * bits which indicates if a zone is conventional (bit clear) or
     * sequential (bit set). seq_zones_wlock is a bitmap of nr_zones
     * bits which indicates if a zone is write locked, that is, if a write
     * request targeting the zone was dispatched. All three fields are
     * initialized by the low level device driver (e.g. scsi/sd.c).
     * Stacking drivers (device mappers) may or may not initialize
     * these fields.
     *
     * Reads of this information must be protected with blk_queue_enter() /
     * blk_queue_exit(). Modifying this information is only allowed while
     * no requests are being processed. See also blk_mq_freeze_queue() and
     * blk_mq_unfreeze_queue().
     */
    unsigned int        nr_zones;
    unsigned long       *seq_zones_bitmap;
    unsigned long       *seq_zones_wlock;
#endif /* CONFIG_BLK_DEV_ZONED */

    /*
     * sg stuff
     */
    unsigned int        sg_timeout;
    unsigned int        sg_reserved_size;
    int         node;
#ifdef CONFIG_BLK_DEV_IO_TRACE
    struct blk_trace    *blk_trace;
    struct mutex        blk_trace_mutex;
#endif
    /*
     * for flush operations
     */
    struct blk_flush_queue  *fq;

    struct list_head    requeue_list;
    spinlock_t      requeue_lock;
    struct delayed_work requeue_work;

    struct mutex        sysfs_lock;

    /*
     * for reusing dead hctx instance in case of updating
     * nr_hw_queues
     */
    struct list_head    unused_hctx_list;
    spinlock_t      unused_hctx_lock;

    int         mq_freeze_depth;

#if defined(CONFIG_BLK_DEV_BSG)
    struct bsg_class_device bsg_dev;
#endif

#ifdef CONFIG_BLK_DEV_THROTTLING
    /* Throttle data */
    struct throtl_data *td;
#endif
    struct rcu_head     rcu_head;
    wait_queue_head_t   mq_freeze_wq;
    /*
     * Protect concurrent access to q_usage_counter by
     * percpu_ref_kill() and percpu_ref_reinit().
     */
    struct mutex        mq_freeze_lock;
    struct percpu_ref   q_usage_counter;

    struct blk_mq_tag_set   *tag_set;   /* tag set this queue was initialised from */
    struct list_head    tag_set_list;
    struct bio_set      bio_split;

#ifdef CONFIG_BLK_DEBUG_FS
    struct dentry       *debugfs_dir;
    struct dentry       *sched_debugfs_dir;
    struct dentry       *rqos_debugfs_dir;
#endif

    bool            mq_sysfs_init_done;

    size_t          cmd_size;

    struct work_struct  release_work;

#define BLK_MAX_WRITE_HINTS 5
    u64         write_hints[BLK_MAX_WRITE_HINTS];
};
其部分参数含义如下:
- queue_head:通过list_head可以用来构建一个request_queue类型的双向链表;
- make_request_fn:设置bio提交时的回调函数,一般设置为blk_mq_make_request;
- last_merge:指向队列中首次可能合并的请求描述符;
- elevator:指向elevator对象的指针(电梯算法),决定了I/O调度层使用的I/O调度算法;
- mq_ops:块设备驱动mq的操作集合,用于抽象块设备驱动的行为;
- requeue_list:请求队列中保存的request双向链表头节点;
- tag_set:标签集,这个后面介绍;
- tag_set_list:保存blk_mq_tag_set双向链表的头节点;
- queue_ctx:软件队列,这是一个per cpu变量,软件队列数量等于CPU数量;
- nr_queues:软件队列的数量,等于CPU的数量;
- queue_hw_ctx:硬件队列;
1.2 request
request表示经过I/O调度之后的针对一个gendisk的一个请求,是request_queue的一个节点,多个request构成了一个request_queue。request定义在include/linux/blkdev.h:
/*
 * struct request - one I/O request for a gendisk, as produced by I/O
 * scheduling; it links one or more bios (bio .. biotail).
 * Quoted from include/linux/blkdev.h.
 *
 * Try to put the fields that are referenced together in the same cacheline.
 *
 * If you modify this structure, make sure to update blk_rq_init() and
 * especially blk_mq_rq_ctx_init() to take care of the added fields.
 */
struct request {
    struct request_queue *q;        /* queue this request belongs to */
    struct blk_mq_ctx *mq_ctx;
    struct blk_mq_hw_ctx *mq_hctx;

    unsigned int cmd_flags;     /* op and common flags */
    req_flags_t rq_flags;

    int internal_tag;

    /* the following two fields are internal, NEVER access directly */
    unsigned int __data_len;    /* total data len */
    int tag;
    sector_t __sector;      /* sector cursor */

    struct bio *bio;        /* head of the bio list */
    struct bio *biotail;        /* tail of the bio list */

    struct list_head queuelist;

    /*
     * The hash is used inside the scheduler, and killed once the
     * request reaches the dispatch list. The ipi_list is only used
     * to queue the request for softirq completion, which is long
     * after the request has been unhashed (and even removed from
     * the dispatch list).
     */
    union {
        struct hlist_node hash; /* merge hash */
        struct list_head ipi_list;
    };

    /*
     * The rb_node is only used inside the io scheduler, requests
     * are pruned when moved to the dispatch queue. So let the
     * completion_data share space with the rb_node.
     */
    union {
        struct rb_node rb_node; /* sort/lookup */
        struct bio_vec special_vec;
        void *completion_data;
        int error_count; /* for legacy drivers, don't use */
    };

    /*
     * Three pointers are available for the IO schedulers, if they need
     * more they have to dynamically allocate it. Flush requests are
     * never put on the IO scheduler. So let the flush fields share
     * space with the elevator data.
     */
    union {
        struct {
            struct io_cq        *icq;
            void            *priv[2];
        } elv;

        struct {
            unsigned int        seq;
            struct list_head    list;
            rq_end_io_fn        *saved_end_io;
        } flush;
    };

    struct gendisk *rq_disk;
    struct hd_struct *part;
    /* Time that I/O was submitted to the kernel. */
    u64 start_time_ns;
    /* Time that I/O was submitted to the device. */
    u64 io_start_time_ns;

#ifdef CONFIG_BLK_WBT
    unsigned short wbt_flags;
#endif
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
    unsigned short throtl_size;
#endif

    /*
     * Number of scatter-gather DMA addr+len pairs after
     * physical address coalescing is performed.
     */
    unsigned short nr_phys_segments;

#if defined(CONFIG_BLK_DEV_INTEGRITY)
    unsigned short nr_integrity_segments;
#endif

    unsigned short write_hint;
    unsigned short ioprio;

    unsigned int extra_len; /* length of alignment and padding */

    enum mq_rq_state state;
    refcount_t ref;

    unsigned int timeout;
    unsigned long deadline;

    union {
        struct __call_single_data csd;
        u64 fifo_time;
    };

    /*
     * completion callback.
     */
    rq_end_io_fn *end_io;
    void *end_io_data;
};
其部分参数含义如下:
- q:这个request所属的请求队列;
- tag:为这个request分配的tag,本质上就是一个索引号,如果没有为-1;
- __sector:sector_t类型,当前request读取或写入到块设备起始扇区(每个扇区512 字节);
- __data_len:当前request读取或写入到块设备的字节大小;
- mq_ctx:指定这个请求将会发送到的软件队列;
- bio:组成这个request的bio链表的头指针;
- biotail:组成这个request的bio链表的尾指针;
- hash:内核hash表头指针;
- queuelist:通过list_head可以用来构建一个request类型的双向链表;
1个request中包含了一个或多个bio,为什么要有request这个结构呢?它存在的目的就是为了进行io的调度,通过request这个辅助结构,我们来给bio进行某种调度方法的排序,从而最大化地提高磁盘访问速度。
1.3 bio
bio用来描述单一的I/O请求,它记录了一次I/O操作所必需的相关信息,如用于I/O操作的数据缓存位置,I/O操作的块设备起始扇区,是读操作还是写操作等等。
bio结构包含了一个磁盘存储区标识符(存储磁盘起始扇区和扇区数目)和一个或多个描述与I/O操作相关的内存区段(bio_vec数组)。
bio定义在include/linux/blk_types.h:
/*
 * main unit of I/O for the block layer and lower layers (ie drivers and
 * stacking drivers)
 *
 * A bio describes a single I/O request: the target disk and sector range
 * (bi_disk, bi_iter) plus the memory segments involved (bi_io_vec).
 * Quoted from include/linux/blk_types.h.
 */
struct bio {
    struct bio      *bi_next;   /* request queue link */
    struct gendisk      *bi_disk;
    /*
     * NOTE(review): the comment below says "bottom bits req flags, top bits
     * REQ_OP", but REQ_OP_BITS / REQ_OP_MASK in the same header put the
     * operation in the low 8 bits and the flags above them - confirm
     * against the header and prefer the accessors.
     */
    unsigned int        bi_opf;     /* bottom bits req flags,
                         * top bits REQ_OP. Use
                         * accessors.
                         */
    unsigned short      bi_flags;   /* status, etc and bvec pool number */
    unsigned short      bi_ioprio;
    unsigned short      bi_write_hint;
    blk_status_t        bi_status;
    u8          bi_partno;

    /* Number of segments in this BIO after
     * physical address coalescing is performed.
     */
    unsigned int        bi_phys_segments;

    struct bvec_iter    bi_iter;

    atomic_t        __bi_remaining;
    bio_end_io_t        *bi_end_io; /* called when the bio's I/O ends */

    void            *bi_private;
#ifdef CONFIG_BLK_CGROUP
    /*
     * Represents the association of the css and request_queue for the bio.
     * If a bio goes direct to device, it will not have a blkg as it will
     * not have a request_queue associated with it. The reference is put
     * on release of the bio.
     */
    struct blkcg_gq     *bi_blkg;
    struct bio_issue    bi_issue;
#endif
    union {
#if defined(CONFIG_BLK_DEV_INTEGRITY)
        struct bio_integrity_payload *bi_integrity; /* data integrity */
#endif
    };

    unsigned short      bi_vcnt;    /* how many bio_vec's */

    /*
     * Everything starting with bi_max_vecs will be preserved by bio_reset()
     */

    unsigned short      bi_max_vecs;    /* max bvl_vecs we can hold */

    atomic_t        __bi_cnt;   /* pin count */

    struct bio_vec      *bi_io_vec; /* the actual vec list */

    struct bio_set      *bi_pool;

    /*
     * We can inline a number of vecs at the end of the bio, to avoid
     * double allocations for a small number of bio_vecs. This member
     * MUST obviously be kept at the very end of the bio.
     */
    struct bio_vec      bi_inline_vecs[0];
};
其部分参数含义如下:
- bi_next:指向链表中下一个bio;
- bi_disk:指向当前bio发往的磁盘gendisk;
- bi_opf:低8位为请求操作位(REQ_OP_*,由REQ_OP_MASK取出),高24位为请求标志位(REQ_*);
- bi_flags:bio状态等信息;
- bi_iter:磁盘存储区标识符,描述了当前bio_vec被处理的情况;
- bi_phys_segments:合并之后bio中物理段的数目;
- bi_end_io:bio的I/O操作结束时调用的函数;
- bi_private:通用块层和块设备驱动程序的I/O完成方法使用的指针;
- bi_vcnt:bio对象包含bio_vec对象的数目;
- bi_max_vecs:这个bio能承载的最大的bio_vec的数目;
- bi_io_vec:存放段的数组,bio中每个段是由一个bio_vec的数据结构描述的,关于什么是“段”下面会有介绍;
- bi_pool:备用的bio内存池;
- bi_inline_vecs:一般一个bio就一个段,bi_inline_vecs就可满足,省去了再为bi_io_vec分配空间;
一个bio可能有多个bio_vec,多个bio经过I/O调度和合并之后可以形成一个request。
请求操作以及请求标志定义在include/linux/blk_types.h:
/*
 * Operations and flags common to the bio and request structures.
 * We use 8 bits for encoding the operation, and the remaining 24 for flags.
 *
 * The least significant bit of the operation number indicates the data
 * transfer direction:
 *
 *   - if the least significant bit is set transfers are TO the device
 *   - if the least significant bit is not set transfers are FROM the device
 *
 * If a operation does not transfer data the least significant bit has no
 * meaning.
 */
#define REQ_OP_BITS 8
#define REQ_OP_MASK ((1 << REQ_OP_BITS) - 1)    /* selects the low REQ_OP_BITS bits */
#define REQ_FLAG_BITS   24

/* Operation codes; extracted from the combined op/flags word with REQ_OP_MASK. */
enum req_opf {
    /* read sectors from the device */
    REQ_OP_READ     = 0,
    /* write sectors to the device */
    REQ_OP_WRITE        = 1,
    /* flush the volatile write cache */
    REQ_OP_FLUSH        = 2,
    /* discard sectors */
    REQ_OP_DISCARD      = 3,
    /* securely erase sectors */
    REQ_OP_SECURE_ERASE = 5,
    /* reset a zone write pointer */
    REQ_OP_ZONE_RESET   = 6,
    /* write the same sector many times */
    REQ_OP_WRITE_SAME   = 7,
    /* write the zero filled sector many times */
    REQ_OP_WRITE_ZEROES = 9,

    /* SCSI passthrough using struct scsi_request */
    REQ_OP_SCSI_IN      = 32,
    REQ_OP_SCSI_OUT     = 33,
    /* Driver private requests */
    REQ_OP_DRV_IN       = 34,
    REQ_OP_DRV_OUT      = 35,

    REQ_OP_LAST,
};

/* Bit positions of the flags; the first flag sits right above the op bits. */
enum req_flag_bits {
    __REQ_FAILFAST_DEV =    /* no driver retries of device errors */
        REQ_OP_BITS,
    __REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */
    __REQ_FAILFAST_DRIVER,  /* no driver retries of driver errors */
    __REQ_SYNC,     /* request is sync (sync write or read) */
    __REQ_META,     /* metadata io request */
    __REQ_PRIO,     /* boost priority in cfq */
    __REQ_NOMERGE,      /* don't touch this for merging */
    __REQ_IDLE,     /* anticipate more IO after this one */
    __REQ_INTEGRITY,    /* I/O includes block integrity payload */
    __REQ_FUA,      /* forced unit access */
    __REQ_PREFLUSH,     /* request for cache flush */
    __REQ_RAHEAD,       /* read ahead, can fail anytime */
    __REQ_BACKGROUND,   /* background IO */
    __REQ_NOWAIT,       /* Don't wait if request will block */

    /* command specific flags for REQ_OP_WRITE_ZEROES: */
    __REQ_NOUNMAP,      /* do not free blocks when zeroing */

    __REQ_HIPRI,

    /* for driver use */
    __REQ_DRV,
    __REQ_SWAP,     /* swapping request. */
    __REQ_NR_BITS,      /* stops here */
};

/* Flag masks: one bit each, at the position given by enum req_flag_bits. */
#define REQ_FAILFAST_DEV    (1ULL << __REQ_FAILFAST_DEV)
#define REQ_FAILFAST_TRANSPORT  (1ULL << __REQ_FAILFAST_TRANSPORT)
#define REQ_FAILFAST_DRIVER (1ULL << __REQ_FAILFAST_DRIVER)
#define REQ_SYNC        (1ULL << __REQ_SYNC)
#define REQ_META        (1ULL << __REQ_META)
#define REQ_PRIO        (1ULL << __REQ_PRIO)
#define REQ_NOMERGE     (1ULL << __REQ_NOMERGE)
#define REQ_IDLE        (1ULL << __REQ_IDLE)
#define REQ_INTEGRITY       (1ULL << __REQ_INTEGRITY)
#define REQ_FUA         (1ULL << __REQ_FUA)
#define REQ_PREFLUSH        (1ULL << __REQ_PREFLUSH)
#define REQ_RAHEAD      (1ULL << __REQ_RAHEAD)
#define REQ_BACKGROUND      (1ULL << __REQ_BACKGROUND)
#define REQ_NOWAIT      (1ULL << __REQ_NOWAIT)
#define REQ_NOUNMAP     (1ULL << __REQ_NOUNMAP)
#define REQ_HIPRI       (1ULL << __REQ_HIPRI)

#define REQ_DRV         (1ULL << __REQ_DRV)
#define REQ_SWAP        (1ULL << __REQ_SWAP)

#define REQ_FAILFAST_MASK \
    (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)

#define REQ_NOMERGE_FLAGS \
    (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA)
1.4 bio_vec
bio_vec描述指定page中的一块连续的区域,在bio中描述的就是一个page中的一个"段"(segment)。
bio_vec定义在include/linux/bvec.h:
/*
 * struct bio_vec - one contiguous region ("segment") inside a single page.
 *
 * was unsigned short, but we might as well be ready for > 64kB I/O pages
 */
struct bio_vec {
    struct page *bv_page;   /* page that holds the segment */
    unsigned int    bv_len;     /* segment length in bytes */
    unsigned int    bv_offset;  /* offset of the data within the page */
};
其部分参数含义如下:
- bv_page:指向段所在页框的页描述符(struct page)的指针;
- bv_len:段的字节长度;
- bv_offset:页框中数据的偏移量;
从上面几个参数,我们不难猜出bio段就是描述所有读或写的数据在内存中的位置。
1.5 bvec_iter
用于记录当前bio_vec被处理的情况,用于遍历bio,定义在include/linux/bvec.h:
/*
 * struct bvec_iter - cursor recording how far a bio's bio_vec array has been
 * processed; used when iterating over a bio.
 */
struct bvec_iter {
    sector_t        bi_sector;  /* device address in 512 byte
                           sectors */
    unsigned int        bi_size;    /* residual I/O count */

    unsigned int        bi_idx;     /* current index into bvl_vec */

    unsigned int            bi_bvec_done;   /* number of bytes completed in
                           current bvec */
};
其部分参数含义如下:
- bi_sector:I/O 请求的块设备起始扇区(每个扇区512 字节);
- bi_size:待传输的字节大小;
- bi_idx:bio_vec中当前索引,遍历bio_vec使用;
- bi_bvec_done:当前bio_vec中已经处理完成的字节数,遍历bio_vec使用;
1.6 request_queue、request、bio关系结构图
下面用一幅关系图来描述request_queue、request、bio之间的关系:
在上图中我们可以看到:
- 每一个块设备都对应一个通用磁盘gendisk结构;
- 每一个gendisk都有一个工作队列request_queue,保存着若干个准备就绪的request;
- 每一个request代表着块设备可以处理的一次任务单元,一个request由一个bio或者多个扇区相连的bio组成;
- bio代表了一次I/O请求,代表一个块设备的一个扇区或者多个连续扇区的数据请求,扇区是块设备的最小访问单元,bio是文件系统发给Block Layer(块设备层)的,至于系统调用(sys_read、sys_write)到bio的生成过程不在我们这一节的讨论范围内,有兴趣的可以参考IO子系统全流程介绍;
- 每个进程有新的bio到来时:
- 会先检查能不能合并到当前进程plug list里面的某个request中;
- 如果定义了I/O调度器,然后检查能不能合并到I/O调度器队列的某个request中;
- 如果没有定义I/O调度器,然后检查能不能合并到请求队列的某个request中;
- 如果可以就不必申请新的request了,只需要在已经存在的request.bio链表上新增成员就可以了,具体是放在链表头,还是链表尾取决于磁盘的相对位置;如果不行再变成request,然后放到plug list中;然后在等待特定时机释放plug list中的request到I/O调度器,这样可以提高bio处理的效率(如果没有plug,那么每个进程在将request放到请求队列时就会竞争request_queue的锁,有个plug list之后,就可以等到当前进程堆积到一定量的request时再推送到请求队列,就可以一定程度上减少锁的竞争次数);
- I/O调度器会在内部创建自定义的各种队列来存放plug list释放出来的request,然后I/O调度器对这个所有的request进行一个调度,按照特定的规则再分发给request_queue,让块设备处理;同时有了I/O调度器,我们就可以实现所有request的重新排序甚至合并,还可以对不同进程的request进行不同的优先级控制,目前linux支持的scheduler有:CFQ,deadline、Noop等;
注:两个bio描述了一段连续的磁盘空间,如果两个bio在磁盘物理地址正好是相邻的,组合起来也刚好是一段连续的磁盘空间,对于这种情况实际上也只需要给磁盘发送一次请求就够了,不需要将两个bio分别单独发给磁盘,因此可以将这两个bio合并为一个request,相邻的bio通过bio.bi_next构建成一个链表,request.bio记录链表头,request.biotail记录链表尾;
我们对物理存储设备的操作不外乎就是将RAM中的数据写入到物理存储设备,或者将物理存储设备中的数据读取到RAM中去处理。
- 数据传输三个要求:数据源、数据长度以及数据目的地,就是你要从物理存储设备的哪个地址开始读取、读取到 RAM 中的哪个地址处、读取的数据长度是多少。
- bi_iter这个结构体成员变量就是用于描述物理存储设备的地址信息,比如要操作的扇区地址。
- bi_io_vec指向 bio_vec数组首地址,bio_vec数组就是内存信息,比如页地址、页偏移以及长度。
一个bio可能有多个bio_vec(bio段),这些bio段可能在内存上不连续(位于不同的页),但它们在磁盘上对应的位置是连续的;因此我们可以推断出,具体包含多少个bio段取决于物理存储设备这一段连续的磁盘空间映射了多少个不连续的内存空间,一般上层构建bio的时候都是只有一个bio段。
在块I/O操作期间bio的内容一直保持更新,例如,块设备驱动在一次分散聚集DMA操作中不能一次完成全部数据的传送,那么bio的bi_iter.bi_idx就会更新来指向待传送的第一个bio段。
二、Multi-Queue Block IO Queueing Mechanism (blk-mq)
2.1 blk-sq框架
Linux上传统的块设备层(Block Layer)和IO调度器(如完全公平算法)主要是针对HDD(hard disk drives)设计的。我们知道,HDD设备的随机IO性能很差,吞吐量大约是几百IOPS(IOs per second),延迟在毫秒级,所以当时IO性能的瓶颈在硬件,而不是内核。
Linux上传统块设备使用单队列blk-sq(block single queue)架构,如图所示:
简单来说,块设备层负责管理从用户进程到存储设备的I/O请求,一方面为上层提供访问不同存储设备的统一接口,隐藏存储设备的复杂性和多样性;另一方面,为存储设备驱动程序提供通用服务,让这些驱动程序以最适合的方式接收来自上层的I/O请求。Linux Block Layer主要提供以下几个方面的功能:
- bio的提交和完成处理,上层通过bio来描述单一的块设备I/O请求;
- bio会被合并或者直接转换为request请求;
- request请求合并、排序等;
- I/O调度:如预期算法、最后期限算法、完全公平算法等;
- I/O记账:如统计提交到块设备的I/O总量,I/O延迟等信息;
由于采用单队列(每个块设备1个请求队列)的设计,传统的Block Layer对多核体系的可扩展性(scalability)不佳。
随着高速SSD(Solid State Disk)和没有机械部件的非易失性存储器的发展,支持硬件多队列的存储器件越来越常见(比如NVMe SSD),可以高并发随机访问,百万级甚至千万级IOPS的数据访问已成为一大趋势,传统的块设备层已无法满足这么高的IOPS需求,逐渐成为系统IO性能的瓶颈。多核体系中blk-sq的软件开销主要来自三个方面:
- 请求队列锁竞争:blk-sq使用spinlock(q->queue_lock)来同步对请求队列的访问,每次调度器队列中插入或删除request请求,必须先获取此锁;request排序和调度操作时,也必须先获取此锁。这一系列操作继续之前,必须先获得请求队列锁,在高IOPS场景(多个线程同时提交I/O请求)下,势必引起剧烈的锁竞争,带来不可忽视的软件开销;
- 硬件中断:高的IOPS意味着高的中断数量。在多数情况下,完成一次I/O需要两次中断,一次是存储器件触发的硬件中断,另一次是IPI核间中断用于触发其他CPU上的软中断。
- 远端内存访问:如果提交I/O请求的CPU不是接收硬件中断的CPU且这两个CPU没有共享缓存,那么获取请求队列锁的过程中还存在远端内存访问问题;
为了适配现代存储设备(高速SSD等)高IOPS、低延迟的I/O特征,新的块设备层框架Block multi-queue(blk-mq)应运而生。
2.2 blk-mq框架
多队列blk-mq队列框架如下:
blk-mq中使用了两层队列,将单个请求队列锁的竞争分散到多个队列中,极大的提高了Block Layer并发处理IO的能力。两层队列的设计分工明确:
- 软件队列(对应内核struct blk_mq_ctx数据结构):blk-mq中为每个CPU分配一个软件队列(soft context dispatch queue),由于每个CPU有单独的队列,所以不同CPU上的I/O操作可以同时进行,而不存在CPU之间的锁竞争问题(需要注意的是:同一个CPU上的进程间依然存在锁竞争的问题);
- 硬件队列(对应内核struct blk_mq_hw_ctx数据结构,更准确的说是硬件派发队列):blk-mq为存储器件的每个硬件队列(目前多数存储器件只有1个)分配一个硬件派发队列(hard context dispatch queue),负责存放软件队列往这个硬件队列派发的I/O请求。在存储设备驱动初始化时,blk-mq会通过固定的映射关系将一个或多个软件队列映射(map)到一个硬件派发队列(同时保证映射到每个硬件队列的软件队列数量基本一致),之后这些软件队列上的I/O请求会往存储器件对应的硬件队列上派发。
blk-mq架构解决了blk-sq架构中请求队列锁竞争和远端内存访问问题,极大的提高了Block Layer的IOPS吞吐量。
2.3 I/O队列流程图
当进程对块设备进行读写操作时,系统调用经过文件系统会生成bio,所有的bio都由submit_bio函数提交到Block Layer,bio的处理大致经过以下队列(下图只是一个示意图,request的分发流程还要结合具体情境分析,比如:request也有可能直接派发到块设备驱动层):
注:值得注意的是在2013年之后的版本中,plug机制已经不能满足硬件需求了,kernel又提供了新的机制来替代它,所以在linux 5.2.8版本中plug并不是必须的,例如sys_read中使用了plug机制,但是sys_write已经不再使用plug机制。具体是否使用取决于代码作者是否在调用submit_bio函数前后调用了blk_start_plug和blk_finish_plug两个函数对struct blk_plug进行初始化;
2.3.1 进程私有的plug list
队列中存放的是request,引入这个缓冲队列的目的是为了性能。进程提交一个bio后,短时间内很可能还会有新的bio,这些bio变成request后被暂存在plug list中,因为这个队列只有本进程能操作,所以不用加锁就可以进行bio merge操作。
2.3.2 调度器队列
multi-queue的调度器有mq-deadline、bfq、kyber。每个调度器都实现了专门的数据结构管理(链表、红黑树等),这里统一以elevator queues称呼。
系统中的调度器队列可能有很多,调度器需要决定先处理哪个队列以及队列中的哪个request。
2.3.3 blk_mq_ctx软件队列
对于multi-queue,linux内核将blk_mq_ctx定义为per_cpu变量,每个request仅可以加到本cpu的blk_mq_ctx链表上。
在另一方面,I/O请求分为读和写,在nvme设备中读和写请求共用一个queue时,写请求会将读请求阻塞,因此linux内核总结出三种模式供request进行选择:default模式、只读模式、poll轮询模式,ctx和hctx都遵循这个标准。
2.3.4 blk_mq_hw_ctx硬件队列
存储器件空闲时,其设备驱动程序主动从调度器中拉取一个request存在软件队列中,blk-mq会通过固定的映射关系将一个或多个软件队列(ctx)映射(map)到一个硬件队列(hctx),之后这些软件队列上的request会往对应的硬件队列上派发,最后硬件派发队列中的request按照先进先出顺序被封装成cmd下发给器件。
三、 blk-mq数据结构
blk-mq代码在Linux-3.13(2014)内核中合入主线,在Linux-3.16中成为内核的一个完整特性,在Linux-5.0内核中,blk-sq代码(包括基于blk-sq的I/O调度器)已被完全移除,MQ成为Linux Block layer的默认选项。下面基于Linux-5.2.8内核介绍blk-mq代码和关键数据结构。
3.1 blk_mq_tag_set
blk_mq_tag_set,包含了一个新的块设备(物理设备)向Block Layer注册时需要的所有重要信息,抽象了存储器件的I/O特征,定义在include/linux/blk-mq.h:
/*
 * struct blk_mq_tag_set - the information a block device supplies when it
 * registers with the block layer; it can be shared between request queues.
 * Quoted from include/linux/blk-mq.h.
 */
struct blk_mq_tag_set {
    /*
     * map[] holds ctx -> hctx mappings, one map exists for each type
     * that the driver wishes to support. There are no restrictions
     * on maps being of the same size, and it's perfectly legal to
     * share maps between types.
     */
    struct blk_mq_queue_map map[HCTX_MAX_TYPES];
    unsigned int        nr_maps;    /* nr entries in map[] */
    const struct blk_mq_ops *ops;
    unsigned int        nr_hw_queues;   /* nr hw queues across maps */
    unsigned int        queue_depth;    /* max hw supported */
    unsigned int        reserved_tags;
    unsigned int        cmd_size;   /* per-request extra data */
    int         numa_node;
    unsigned int        timeout;
    unsigned int        flags;      /* BLK_MQ_F_* */
    void            *driver_data;

    /* array of struct blk_mq_tags pointers, allocated by blk_mq_alloc_tag_set() */
    struct blk_mq_tags  **tags;

    struct mutex        tag_list_lock;
    struct list_head    tag_list;
};
其中部分参数如下:
- map: 每个数组成员代表一种类型的硬件队列,每个元素内部又维护着一个数组mq_map,用于保存软件队列(ctx)到硬件队列(hctx)的映射表,mq_map数组的下标为cpu编号,数组元素为cpu编号所对应的硬件队列号;
- nr_maps:map中元素的数量,它的范围在[1, HCTX_MAX_TYPES]之间;
- ops:块设备驱动mq的操作集合,用于抽象块设备驱动的行为;
- nr_hw_queues:块设备的硬件队列数量,目前多数块设备是1,nvme可能超过1;
- queue_depth:每个硬件队列的深度(包含预留的个数reserved_tags);
- reserved_tags:每个硬件队列预留的元素个数;
- cmd_size:块设备驱动为每个request分配的额外的空间大小,一般用于存放设备驱动payload数据;
- numa_node:块设备连接的NUMA(Non Uniform Memory Access Architecture)节点,分配request内存时使用,避免远程内存访问问题;
- timeout:请求处理的超时时间,单位是jiffies,例如ufs默认是30s;
- flags:0个或者多个BLK_MQ_F*标志;
- driver_data:块设备驱动私有数据;
- tags:tag sets,每个硬件队列都有一个blk_mq_tags结构体,一共具有nr_hw_queues个元素;
- tag_list_lock:互斥锁,用于同步访问tag_list;
- tag_list:通过list_head可以用来构建一个blk_mq_tag_set类型的双向链表;
blk_mq_queue_map用于描述软硬队列之间的映射关系:
/*
 * struct blk_mq_queue_map - mapping from software queues (ctx) to hardware
 * queues (hctx) for one hardware queue type.
 */
struct blk_mq_queue_map {
    unsigned int *mq_map;       /* indexed by cpu number; value is the hw queue number */
    unsigned int nr_queues;
    unsigned int queue_offset;
};
在blk_mq_tag_set中定义了一个blk_mq_queue_map数组,每个数组元素代表一种硬件队列类型,主要的硬件列类型包括三种:
- HCTX_TYPE_DEFAULT(默认模式)
- HCTX_TYPE_READ(只读模式)
- HCTX_TYPE_POLL(poll轮询模式)
3.2 blk_mq_tags
blk_mq_tags主要是管理struct request的分配, blk_mq_tags与硬件队列blk_mq_hw_ctx一一对应,定义在block/blk-mq-tag.h:
/*
 * Tag address space map.
 *
 * Manages the allocation of struct request for a hardware queue.
 * Quoted from block/blk-mq-tag.h.
 */
struct blk_mq_tags {
    unsigned int nr_tags;
    unsigned int nr_reserved_tags;

    atomic_t active_queues;

    struct sbitmap_queue bitmap_tags;   /* bitmap for the non-reserved tags */
    struct sbitmap_queue breserved_tags;    /* bitmap for the reserved tags */

    struct request **rqs;
    struct request **static_rqs;
    struct list_head page_list;
};
其中部分参数如下:
- nr_tags:每个硬件队列的深度(包含预留的个数reserved_tags);
- nr_reserved_tags:每个硬件队列预留的tag个数;
- active_queues:活跃队列数量,blk-mq中一个tag set可以是多个request queue共享的,记录当前活跃队列数量的目的是为了均匀分配tag到每个request queue;
- bitmap_tags:tag的位图;每个bit代表一个tag标记,用于标示硬件队列中的request;1为已分配,0为未分配;bitmap_tags管理static_rqs[nr_reserved_tags ~ (nr_tags-1)]这nr_tags-nr_reserved_tags个request;
- breserved_tags:保留tag的位图,每个bit代表一个tag标记,用于标示硬件队列中的request;1为已分配,0为未分配;breserved_tags管理static_rqs[0 ~ (nr_reserved_tags-1)]这nr_reserved_tags个request;
- rqs:struct request *类型数组,数组长度为nr_tags;
- static_rqs:struct request *类型数组,数组长度为nr_tags;数组元素在blk_mq_alloc_rqs()中根据硬队列深度真实分配了队列的request;
- page_list:用于链接分配出的page;
什么是tag呢?tag是用来为request打标签的,只有一个request被分配了一个tag,这个request才能进行真正的I/O传输。
一个硬件队列的深度为nr_tags,也就是该硬件队列最多包含nr_tags个request,这些request都是事前已经分配好的,并且保存在static_rqs数组中。
每当一个bio被提交,如果被转换成request的话,需要进行如下步骤:
- 首先从bitmap_tags或者breserved_tags分配一个tag;
- 然后根据tag索引,获取static_rqs[tag]作为当前的请求,并初始化该请求成员;
- 设置rqs[tag]=static_rqs[tag];
3.3 blk_mq_ctx
blk_mq_ctx用来表示软件队列,更准确的说是软件队列上下文,与CPU数量相同,blk_mq_ctx定义在block/blk-mq.h:
/**
 * struct blk_mq_ctx - State for a software queue facing the submitting CPUs
 *
 * Quoted from block/blk-mq.h.
 */
struct blk_mq_ctx {
    struct {
        spinlock_t      lock;
        /* one list head per hardware queue type */
        struct list_head    rq_lists[HCTX_MAX_TYPES];
    } ____cacheline_aligned_in_smp;

    unsigned int        cpu;
    unsigned short      index_hw[HCTX_MAX_TYPES];
    /* one hardware queue pointer per hardware queue type */
    struct blk_mq_hw_ctx    *hctxs[HCTX_MAX_TYPES];

    /* incremented at dispatch time */
    unsigned long       rq_dispatched[2];
    unsigned long       rq_merged;

    /* incremented at completion time */
    unsigned long       ____cacheline_aligned_in_smp rq_completed[2];

    struct request_queue    *queue;
    struct blk_mq_ctxs      *ctxs;
    struct kobject      kobj;
} ____cacheline_aligned_in_smp;
其中部分参数如下:
- rq_lists:双向链表头节点数组,长度为HCTX_MAX_TYPES,每一个元素都是双向链表头节点,数组依次存放HCTX_TYPE_DEFAULT、HCTX_TYPE_READ、HCTX_TYPE_POLL类型的软件队列的头节点(每种类型的软件队列本质上是由request组成的双向链表);
- cpu:当前cpu索引号;
- hctxs:指针数组类型,数组长度为硬件队列类型数量,每个元素都是一个struct blk_mq_hw_ctx指针;依次指向HCTX_TYPE_DEFAULT、HCTX_TYPE_READ、HCTX_TYPE_POLL类型的硬件队列(每种类型的硬件队列本质上是由request组成的双向链表);
- queue:struct request_queue类型,这个变量会被初始化为blk_mq_init_queue()函数分配的request_queue;
3.4 blk_mq_hw_ctx
blk_mq_hw_ctx用来表示硬件队列,更准确的说是硬件队列上下文,每个blk_mq_hw_ctx是和blk_mq_tags一一对应,blk_mq_hw_ctx定义在include/linux/blk-mq.h:
/**
 * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware block device
 *
 * Quoted from include/linux/blk-mq.h.
 */
struct blk_mq_hw_ctx {
    struct {
        spinlock_t      lock;
        struct list_head    dispatch;   /* requests dispatched to this hw queue */
        unsigned long       state;      /* BLK_MQ_S_* flags */
    } ____cacheline_aligned_in_smp;

    struct delayed_work run_work;
    cpumask_var_t       cpumask;
    int         next_cpu;
    int         next_cpu_batch;

    unsigned long       flags;      /* BLK_MQ_F_* flags */

    void            *sched_data;
    struct request_queue    *queue;
    struct blk_flush_queue  *fq;

    void            *driver_data;

    struct sbitmap      ctx_map;

    struct blk_mq_ctx   *dispatch_from;
    unsigned int        dispatch_busy;

    unsigned short      type;
    unsigned short      nr_ctx;
    struct blk_mq_ctx   **ctxs;

    spinlock_t      dispatch_wait_lock;
    wait_queue_entry_t  dispatch_wait;
    atomic_t        wait_index;

    struct blk_mq_tags  *tags;
    struct blk_mq_tags  *sched_tags;

    unsigned long       queued;
    unsigned long       run;
#define BLK_MQ_MAX_DISPATCH_ORDER   7
    unsigned long       dispatched[BLK_MQ_MAX_DISPATCH_ORDER];

    unsigned int        numa_node;
    unsigned int        queue_num;

    atomic_t        nr_active;

    struct hlist_node   cpuhp_dead;
    struct kobject      kobj;

    unsigned long       poll_considered;
    unsigned long       poll_invoked;
    unsigned long       poll_success;

#ifdef CONFIG_BLK_DEBUG_FS
    struct dentry       *debugfs_dir;
    struct dentry       *sched_debugfs_dir;
#endif

    struct list_head    hctx_list;

    /* Must be the last member - see also blk_mq_hw_ctx_size(). */
    struct srcu_struct  srcu[0];
};
其中部分参数如下:
- dispatch:双向链表头节点,用于保存派发到硬件队列的所有request;
- dispatch_busy:用来描述当前块设备驱动器(也就是磁盘硬件状态)的状态是否繁忙,0表示不繁忙;
- tags:用来保存硬件队列对应的blk_mq_tags(针对无调度算法);
- sched_tags:用来保存硬队列对应的blk_mq_tags(针对有调度算法);
- queue:struct request_queue类型,这个变量会被初始化为blk_mq_init_queue()函数分配的request_queue;
- ctxs:struct blk_mq_ctx **类型,指向struct blk_mq_ctx *类型的数组,数组长度为CPU个数;
- queue_num:硬件队列索引号;
- numa_node:存储适配器已连接到的NUMA节点;
由于每一个硬件队列对应一个blk_mq_tags,在blk_mq_tags:
- static_rqs这个指针数组保存了nr_tags个request指针;
- breserved_tags管理static_rqs[0 ~ (nr_reserved_tags-1)]这nr_reserved_tags个request;
- bitmap_tags管理static_rqs[nr_reserved_tags ~ (nr_tags-1)]这nr_tags-nr_reserved_tags个request;
3.5 blk_mq_ops
blk_mq_ops定义块设备驱动mq的操作集合,用于抽象块设备驱动的行为,定义在include/linux/blk-mq.h:
/*
 * struct blk_mq_ops - callbacks a block driver provides to blk-mq; they
 * abstract the driver's behaviour. Quoted from include/linux/blk-mq.h.
 */
struct blk_mq_ops {
    /*
     * Queue request
     */
    queue_rq_fn     *queue_rq;

    /*
     * If a driver uses bd->last to judge when to submit requests to
     * hardware, it must define this function. In case of errors that
     * make us stop issuing further requests, this hook serves the
     * purpose of kicking the hardware (which the last request otherwise
     * would have done).
     */
    commit_rqs_fn       *commit_rqs;

    /*
     * Reserve budget before queue request, once .queue_rq is
     * run, it is driver's responsibility to release the
     * reserved budget. Also we have to handle failure case
     * of .get_budget for avoiding I/O deadlock.
     */
    get_budget_fn       *get_budget;
    put_budget_fn       *put_budget;

    /*
     * Called on request timeout
     */
    timeout_fn      *timeout;

    /*
     * Called to poll for completion of a specific tag.
     */
    poll_fn         *poll;

    complete_fn     *complete;

    /*
     * Called when the block layer side of a hardware queue has been
     * set up, allowing the driver to allocate/init matching structures.
     * Ditto for exit/teardown.
     */
    init_hctx_fn        *init_hctx;
    exit_hctx_fn        *exit_hctx;

    /*
     * Called for every command allocated by the block layer to allow
     * the driver to set up driver specific data.
     *
     * Tag greater than or equal to queue_depth is for setting up
     * flush request.
     *
     * Ditto for exit/teardown.
     */
    init_request_fn     *init_request;
    exit_request_fn     *exit_request;
    /* Called from inside blk_get_request() */
    void (*initialize_rq_fn)(struct request *rq);

    /*
     * If set, returns whether or not this queue currently is busy
     */
    busy_fn         *busy;

    /* optional driver override for building the ctx -> hctx mapping */
    map_queues_fn       *map_queues;

#ifdef CONFIG_BLK_DEBUG_FS
    /*
     * Used by the debugfs implementation to show driver-specific
     * information about a request.
     */
    void (*show_rq)(struct seq_file *m, struct request *rq);
#endif
};
其中部分参数定义:
- queue_rq:Queue a new request from block IO. 块设备驱动注册的queue_rq函数,将request请求发往块设备驱动;
- commit_rqs:If a driver uses bd->last to judge when to submit requests to hardware, it must define this function. In case of errors that make us stop issuing further requests, this hook serves the purpose of kicking the hardware (which the last request otherwise would have done).
- queue_rqs:Queue a list of new requests. Driver is guaranteed that each request belongs to the same queue. If the driver doesn’t empty the rqlist completely, then the rest will be queued individually by the block layer upon return.(注:上面给出的Linux 5.2.8版本struct blk_mq_ops中没有该成员,它来自更新版本的内核)
- get_budget:Reserve budget before queue request, once .queue_rq is run, it is driver’s responsibility to release the reserved budget. Also we have to handle failure case of .get_budget for avoiding I/O deadlock.
- put_budget:Release the reserved budget.
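- set_rq_budget_token:store rq’s budget token(注:上面给出的Linux 5.2.8版本struct blk_mq_ops中没有该成员,它来自更新版本的内核)
- get_rq_budget_token:retrieve rq’s budget token(注:上面给出的Linux 5.2.8版本struct blk_mq_ops中没有该成员,它来自更新版本的内核)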
- timeout:Called on request timeout.
- poll:Called to poll for completion of a specific tag.
- complete:Mark the request as complete.
- init_hctx:Called when the block layer side of a hardware queue has been set up, allowing the driver to allocate/init matching structures.
- exit_hctx:Ditto for exit/teardown.
- init_request:Called for every command allocated by the block layer to allow the driver to set up driver specific data.Tag greater than or equal to queue_depth is for setting up flush request.
- exit_request:Ditto for exit/teardown.
- cleanup_rq:Called before freeing one request which isn’t completed yet, and usually for freeing the driver private data.(注:上面给出的Linux 5.2.8版本struct blk_mq_ops中没有该成员,它来自更新版本的内核)
- busy:If set, returns whether or not this queue currently is busy.
- map_queues:This allows drivers specify their own queue mapping by overriding the setup-time function that builds the mq_map. 自定义ctx到hctx的映射函数。
- show_rq:Used by the debugfs implementation to show driver-specific information about a request.
3.6 数据结构关系
由于blk-mq相关数据结构比较多,全部了解也是很难的,这里我大概绘制了各个数据结构之间的关系:
四、blk-mq API
4.1 blk_mq_init_queue
基于blk-mq的块设备驱动初始化时,通过调用blk_mq_init_queue初始化请求队列,其定义在block/blk-mq.c,例如,scsi-mq驱动中,每次添加scsi设备(scsi_device)时都会调用blk_mq_init_queue接口来初始化scsi设备的请求队列。
/*
 * Allocate a request queue on the tag set's NUMA node and initialise it
 * for blk-mq use.
 *
 * Returns the value produced by blk_mq_init_allocated_queue() (which may be
 * an ERR_PTR), or ERR_PTR(-ENOMEM) when the queue itself cannot be allocated.
 */
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
    struct request_queue *raw_queue;
    struct request_queue *ready_queue;

    raw_queue = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
    if (!raw_queue)
        return ERR_PTR(-ENOMEM);

    ready_queue = blk_mq_init_allocated_queue(set, raw_queue);
    if (!IS_ERR(ready_queue))
        return ready_queue;

    /* Initialisation failed: tear down the queue allocated above. */
    blk_cleanup_queue(raw_queue);
    return ready_queue;
}
函数的主要流程如下:
- 调用blk_alloc_queue_node分配请求队列的内存,分配的内存节点与设备连接的NUMA节点一致,避免远端内存访问问题;
- 调用blk_mq_init_allocated_queue来初始化上一步已分配的请求队列request_queue,期间会分配软件队列和硬件队列并初始化,并进一步建立软件队列和硬件队列的映射关系;
4.2 blk_mq_init_sq_queue
blk_mq_init_sq_queue函数用于初始化一个请求队列,其定义在block/blk-mq.c:
/* * Helper for setting up a queue with mq ops, given queue depth, and * the passed in mq ops flags. */ struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int queue_depth, unsigned int set_flags) { struct request_queue *q; int ret; memset(set, 0, sizeof(*set)); // 清空set set->ops = ops; // 设置块设备驱动行为 set->nr_hw_queues = 1; // 设置硬件队列数量为1 set->nr_maps = 1; // map元素的数量只有一个,也就是只使用default类型的队列 set->queue_depth = queue_depth; // 设置硬件队列的深度 set->numa_node = NUMA_NO_NODE; // -1 set->flags = set_flags; // BLK_MQ_F*标志 ret = blk_mq_alloc_tag_set(set); // 这个函数比较复杂,下面介绍 if (ret) return ERR_PTR(ret); q = blk_mq_init_queue(set); // 动态分配请求队列,并初始化 if (IS_ERR(q)) { blk_mq_free_tag_set(set); return q; } return q; }
blk_mq_init_sq_queue函数有四个参数:
- set:可以在请求队列之间共享的tag set,描述了一个新的块设备(物理设备)向Block Layer注册时需要的所有重要信息;
- ops:实现块驱动程序行为的回调函数;
- queue_depth:硬件队列深度;
- set_flags:设置标志;
4.3 blk_mq_alloc_tag_set
blk_mq_alloc_tag_set分配的不是blk_mq_tag_set ,而是为全体硬队列分配blk_mq_tags指针数组,每个硬队列对应一个blk_mq_tags指针,函数定义在block/blk-mq.c:
/* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the * requested depth down, if it's too large. In that case, the set * value will be stored in set->queue_depth. */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) { int i, ret; BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); if (!set->nr_hw_queues) // 如果不存在硬件队列 return -EINVAL; if (!set->queue_depth) // 如果硬件队列深度为0 return -EINVAL; if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) // queue_depth < reserved_tags + 1 return -EINVAL; if (!set->ops->queue_rq) // 未指定块设备操作行为函数queue_rq return -EINVAL; if (!set->ops->get_budget ^ !set->ops->put_budget) // 同时指定 return -EINVAL; if (set->queue_depth > BLK_MQ_MAX_DEPTH) { // > 10240 pr_info("blk-mq: reduced tag depth to %u\n", BLK_MQ_MAX_DEPTH); set->queue_depth = BLK_MQ_MAX_DEPTH; } if (!set->nr_maps) // ctx->hctx映射表为空 set->nr_maps = 1; else if (set->nr_maps > HCTX_MAX_TYPES) return -EINVAL; /* * If a crashdump is active, then we are potentially in a very * memory constrained environment. Limit us to 1 queue and * 64 tags to prevent using too much memory. */ if (is_kdump_kernel()) { // crashdump激活时 set->nr_hw_queues = 1; set->nr_maps = 1; set->queue_depth = min(64U, set->queue_depth); } /* * There is no use for more h/w queues than cpus if we just have * a single map */ if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) // 硬件队列大于CPU数量(等于软件队列数量) set->nr_hw_queues = nr_cpu_ids; set->tags = kcalloc_node(nr_hw_queues(set), sizeof(struct blk_mq_tags *), // 分配硬件队列数量个struct blk_mq_tags * GFP_KERNEL, set->numa_node); if (!set->tags) return -ENOMEM; ret = -ENOMEM; for (i = 0; i < set->nr_maps; i++) { set->map[i].mq_map = kcalloc_node(nr_cpu_ids, // 初始化ctx->hctx映射表,mq_map长度为CPU个数 sizeof(set->map[i].mq_map[0]), GFP_KERNEL, set->numa_node); if (!set->map[i].mq_map) goto out_free_mq_map; set->map[i].nr_queues = is_kdump_kernel() ? 
1 : set->nr_hw_queues; // 设置硬件队列数量 } ret = blk_mq_update_queue_map(set); // 初始化ctx -> hctx映射表 if (ret) goto out_free_mq_map; ret = blk_mq_alloc_rq_maps(set); // 为每个硬件队列分配blk_mq_tags并初始化rqs if (ret) goto out_free_mq_map; mutex_init(&set->tag_list_lock); INIT_LIST_HEAD(&set->tag_list); return 0; out_free_mq_map: for (i = 0; i < set->nr_maps; i++) { kfree(set->map[i].mq_map); set->map[i].mq_map = NULL; } kfree(set->tags); set->tags = NULL; return ret; }
主要流程如下:
- 设置硬件队列数量(nr_hw_queues)和映射表数量(nr_maps);
- 调用kcalloc_node根据硬件队列数量扩展tags数组,数组长度为硬件队列个数,数组元素为struct blk_mq_tags *类型;
- 调用blk_mq_update_queue_map更新映射表(mq_map数组),数组下标为cpu编号,数组元素为cpu编号所对应的硬队列号(map: cpu id->hw queue id);
- 调用blk_mq_alloc_rq_maps为每个硬件队列分配blk_mq_tags并初始化tags->rqs、tags->static_rqs;
4.3.1 blk_mq_update_queue_map
/*
 * (Re)build the mapping tables of a tag set: use the driver's map_queues
 * callback when one is provided (and we are not a kdump kernel), otherwise
 * fall back to blk_mq_map_queues() on the default map.
 */
static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
{
    if (set->ops->map_queues && !is_kdump_kernel()) {
        int i;

        /*
         * transport .map_queues is usually done in the following
         * way:
         *
         * for (queue = 0; queue < set->nr_hw_queues; queue++) {
         *     mask = get_cpu_mask(queue)
         *     for_each_cpu(cpu, mask)
         *         set->map[x].mq_map[cpu] = queue;
         * }
         *
         * When we need to remap, the table has to be cleared for
         * killing stale mapping since one CPU may not be mapped
         * to any hw queue.
         */
        for (i = 0; i < set->nr_maps; i++)
            blk_mq_clear_mq_map(&set->map[i]);  /* clear the old mapping */

        return set->ops->map_queues(set);       /* driver-specific mapping */
    } else {
        /* without a driver callback only a single map is supported */
        BUG_ON(set->nr_maps > 1);
        return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
    }
}
如果没有重写map_queues,实际上这里只建立了HCTX_TYPE_DEFAULT类型软件队列到硬件队列的映射。
4.3.2 blk_mq_alloc_rq_maps
/*
 * Allocate the tags structure and the requests of hardware queue @hctx_idx
 * and store the result in set->tags[hctx_idx]. Returns true on success;
 * on failure set->tags[hctx_idx] is left NULL.
 */
static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
{
    int ret = 0;

    /* blk_mq_tags plus its rqs / static_rqs pointer arrays */
    set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
                    set->queue_depth, set->reserved_tags);
    if (!set->tags[hctx_idx])
        return false;

    /* the requests themselves, stored in set->tags[hctx_idx]->static_rqs[] */
    ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
                set->queue_depth);
    if (!ret)
        return true;

    blk_mq_free_rq_map(set->tags[hctx_idx]);
    set->tags[hctx_idx] = NULL;
    return false;
}

/*
 * Run __blk_mq_alloc_rq_map() for every hardware queue of the set; on the
 * first failure release the maps that were already allocated.
 */
static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
    int i;

    for (i = 0; i < set->nr_hw_queues; i++)     /* one pass per hardware queue */
        if (!__blk_mq_alloc_rq_map(set, i))
            goto out_unwind;

    return 0;

out_unwind:
    /* index i failed and cleaned up after itself; unwind 0 .. i-1 */
    while (--i >= 0)
        blk_mq_free_rq_map(set->tags[i]);

    return -ENOMEM;
}

/*
 * Allocate the request maps associated with this tag_set. Note that this
 * may reduce the depth asked for, if memory is tight. set->queue_depth
 * will be updated to reflect the allocated depth.
 */
static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
    unsigned int depth;
    int err;

    depth = set->queue_depth;   /* remember the requested depth for the log */
    do {
        err = __blk_mq_alloc_rq_maps(set);
        if (!err)               /* success */
            break;

        /* retry with half the depth, but never below the minimum */
        set->queue_depth >>= 1;
        if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
            err = -ENOMEM;
            break;
        }
    } while (set->queue_depth);

    if (!set->queue_depth || err) {
        pr_err("blk-mq: failed to allocate request map\n");
        return -ENOMEM;
    }

    if (depth != set->queue_depth)
        pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
                        depth, set->queue_depth);

    return 0;
}
4.3.3 blk_mq_alloc_rq_map
blk_mq_alloc_rq_map:分配blk_mq_tags结构体并初始化,根据队列深度set->queue_depth来分配bitmap,每个bit代表一个tag标记,用于标示硬件队列中的request。根据tags->nr_tags分配struct request *指针数组;
/*
 * Allocate a struct blk_mq_tags for hardware queue @hctx_idx together with
 * its two request-pointer arrays (rqs and static_rqs, @nr_tags entries
 * each). Only the pointer arrays are allocated here, not the requests.
 * Returns NULL on any allocation failure.
 */
struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
                    unsigned int hctx_idx,      /* hardware queue index */
                    unsigned int nr_tags,       /* number of tags (callers in this file pass the queue depth) */
                    unsigned int reserved_tags)
{
    struct blk_mq_tags *tags;
    int node;

    /* pick the NUMA node of the hardware queue, else the set's node */
    node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
    if (node == NUMA_NO_NODE)
        node = set->numa_node;

    tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
                BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
    if (!tags)
        return NULL;

    /* nr_tags zeroed struct request pointers */
    tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
                 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
                 node);
    if (!tags->rqs) {
        blk_mq_free_tags(tags);
        return NULL;
    }

    /* nr_tags zeroed struct request pointers */
    tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *),
                    GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
                    node);
    if (!tags->static_rqs) {
        kfree(tags->rqs);
        blk_mq_free_tags(tags);
        return NULL;
    }

    return tags;
}
4.3.4 blk_mq_alloc_rqs
blk_mq_alloc_rqs:根据队列深度depth分配request, 分配的request指针最终保存到tags->static_rqs[i]。注意此处分配request时,同时也分配了driver payload的空间用于存放cmd;
上面blk_mq_alloc_rq_map只是分配了struct request *指针数组,此处blk_mq_alloc_rqs根据硬队列深度真实分配了队列的request:
/*
 * Allocate @depth requests (each followed by set->cmd_size bytes of driver
 * payload) for hardware queue @hctx_idx, carving them out of page
 * allocations and recording each one in tags->static_rqs[]. The backing
 * pages are linked on tags->page_list. Returns 0 or -ENOMEM.
 */
int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
             unsigned int hctx_idx, unsigned int depth)
{
    unsigned int i, j, entries_per_page, max_order = 4;
    size_t rq_size, left;
    int node;

    node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
    if (node == NUMA_NO_NODE)
        node = set->numa_node;

    INIT_LIST_HEAD(&tags->page_list);

    /*
     * rq_size is the size of the request plus driver payload, rounded
     * to the cacheline size
     */
    rq_size = round_up(sizeof(struct request) + set->cmd_size,
                cache_line_size());
    left = rq_size * depth;     /* bytes still to be allocated */

    /* i is advanced inside the inner loop, once per request set up */
    for (i = 0; i < depth; ) {
        int this_order = max_order;
        struct page *page;
        int to_do;
        void *p;

        /* do not allocate a larger order than the remaining bytes need */
        while (this_order && left < order_to_size(this_order - 1))
            this_order--;

        /* try progressively smaller orders until one succeeds */
        do {
            page = alloc_pages_node(node,
                GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
                this_order);
            if (page)
                break;
            if (!this_order--)
                break;
            /* give up once a chunk could not hold a single request */
            if (order_to_size(this_order) < rq_size)
                break;
        } while (1);

        if (!page)
            goto fail;

        /* remember the order in page->private and track the page */
        page->private = this_order;
        list_add_tail(&page->lru, &tags->page_list);

        p = page_address(page);
        /*
         * Allow kmemleak to scan these pages as they contain pointers
         * to additional allocations like via ops->init_request().
         */
        kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
        entries_per_page = order_to_size(this_order) / rq_size;
        to_do = min(entries_per_page, depth - i);
        left -= to_do * rq_size;
        for (j = 0; j < to_do; j++) {
            struct request *rq = p;

            tags->static_rqs[i] = rq;
            if (blk_mq_init_request(set, rq, hctx_idx, node)) {
                tags->static_rqs[i] = NULL;
                goto fail;
            }

            p += rq_size;
            i++;
        }
    }
    return 0;

fail:
    blk_mq_free_rqs(set, tags, hctx_idx);
    return -ENOMEM;
}
4.4 blk_mq_init_allocated_queue
blk_mq_init_queue先分配请求队列request_queue,然后调用blk_mq_init_allocated_queue对这个已分配的请求队列进行初始化,期间会分配软件队列和硬件队列并初始化,并进一步建立软件队列和硬件队列的映射关系;
/*
 * Initialise an already allocated request_queue @q for blk-mq use with tag
 * set @set: allocate the software queues, set up the hardware queues,
 * install blk_mq_make_request as the make_request function and map software
 * queues to hardware queues. Returns @q on success or an ERR_PTR().
 */
struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
                          struct request_queue *q)
{
    /* mark the queue as mq asap */
    q->mq_ops = set->ops;

    q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
                         blk_mq_poll_stats_bkt,
                         BLK_MQ_POLL_STATS_BKTS, q);
    if (!q->poll_cb)
        goto err_exit;

    if (blk_mq_alloc_ctxs(q))
        goto err_poll;

    /* init q->mq_kobj and sw queues' kobjects */
    blk_mq_sysfs_init(q);

    /* array of hardware queue context pointers */
    q->nr_queues = nr_hw_queues(set);
    q->queue_hw_ctx = kcalloc_node(q->nr_queues, sizeof(*(q->queue_hw_ctx)),
                        GFP_KERNEL, set->numa_node);
    if (!q->queue_hw_ctx)
        goto err_sys_init;

    INIT_LIST_HEAD(&q->unused_hctx_list);
    spin_lock_init(&q->unused_hctx_lock);

    /* allocate and initialise the hardware queue contexts */
    blk_mq_realloc_hw_ctxs(set, q);
    if (!q->nr_hw_queues)
        goto err_hctxs;

    /* request timeout handling; defaults to 30 * HZ when set->timeout is 0 */
    INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
    blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);

    q->tag_set = set;

    q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
    if (set->nr_maps > HCTX_TYPE_POLL &&
        set->map[HCTX_TYPE_POLL].nr_queues)
        blk_queue_flag_set(QUEUE_FLAG_POLL, q);

    q->sg_reserved_size = INT_MAX;

    INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
    INIT_LIST_HEAD(&q->requeue_list);
    spin_lock_init(&q->requeue_lock);

    /* bios submitted to this queue are handled by blk_mq_make_request */
    blk_queue_make_request(q, blk_mq_make_request);

    /*
     * Do this after blk_queue_make_request() overrides it...
     */
    q->nr_requests = set->queue_depth;

    /*
     * Default to classic polling
     */
    q->poll_nsec = BLK_MQ_POLL_CLASSIC;

    blk_mq_init_cpu_queues(q, set->nr_hw_queues);   /* init the per-cpu software queues */
    blk_mq_add_queue_tag_set(set, q);
    blk_mq_map_swqueue(q);                          /* map software queues to hardware queues */

    if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
        int ret;

        ret = elevator_init_mq(q);
        if (ret)
            return ERR_PTR(ret);
    }

    return q;

err_hctxs:
    kfree(q->queue_hw_ctx);
err_sys_init:
    blk_mq_sysfs_deinit(q);
err_poll:
    blk_stat_free_callback(q->poll_cb);
    q->poll_cb = NULL;
err_exit:
    q->mq_ops = NULL;
    return ERR_PTR(-ENOMEM);
}
其流程图大致如下:
这个初始化过程主要包含下面几步:
- 设置队列的mq_ops(q->mq_ops)为set->ops ;
- blk_mq_realloc_hw_ctxs:创建set->nr_hw_queues个blk_mq_hw_ctx,取地址赋值给q->queue_hw_ctx(struct blk_mq_hw_ctx*数组类型)每个成员,同时初始化每一个blk_mq_hw_ctx:
- hctx->nr_active=0;
- hctx->run_work=blk_mq_run_work_fn;
- hctx->ctxs=kmalloc_array_node(nr_cpu_ids, sizeof(void *),gfp, node);为每个CPU分配软件队列blk_mq_ctx指针;
- hctx->nr_ctx=0;
- hctx->queue = q;
- 初始化自旋锁hctx->dispatch_wait_lock;
- 初始化自旋锁hctx->lock;
- 初始化双向链表头hctx->hctx_list;
- 初始化双向链表头&hctx->dispatch;
- hctx->queue_num=hctx_idx;
- hctx->tags=set->tags[hctx_idx]; 每个硬件队列对应一个blk_mq_tags,
- ...
- 设置request超时时间,初始化timeout_work(处理函数是blk_mq_timeout_work);
- 设置队列的make_request回调为blk_mq_make_request (bio的提交时会用到);
- blk_mq_init_cpu_queues:遍历每个possible CPU,通过per_cpu_ptr取得q->queue_ctx(per-cpu的struct blk_mq_ctx,此前已由blk_mq_alloc_ctxs分配)中该CPU对应的blk_mq_ctx,并对每一个blk_mq_ctx进行初始化:
- ctx->cpu=i;
- 初始化双向链表头数组ctx->rq_lists;
- ctx->queue = q;
- ...
- 关联request_queue和块设备的tag set,初始化q->tag_set=set;
- blk_mq_map_swqueue:更新软件队列(ctx)到硬件派发队列(hctx)的映射关系(map: ctx->hctx);与blk_mq_map_queues不同,blk_mq_map_queues是通过map数组的mq_map数组通过索引和数组元素记录软硬队列的映射关系,其中数组索引为CPU编号,数组元素为硬队列编号,且映射关系保存在set->map[i]->mq_map中;blk_mq_map_swqueue是基于blk_mq_map_queues创建的映射关系,进一步将软队列描述符指针保存在硬队列描述符,将软队列映射到硬队列的映射号index_hw保存在软队列描述符,它也与软队列索引号相同;
4.4.1 blk_mq_realloc_hw_ctxs
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { int i, j, end; struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; /* protect against switching io scheduler */ mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { // 循环次数,硬件队列数 int node; struct blk_mq_hw_ctx *hctx; node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i); /* * If the hw queue has been mapped to another numa node, * we need to realloc the hctx. If allocation fails, fallback * to use the previous one. */ if (hctxs[i] && (hctxs[i]->numa_node == node)) continue; hctx = blk_mq_alloc_and_init_hctx(set, q, i, node); // 初始化硬件队列 if (hctx) { if (hctxs[i]) blk_mq_exit_hctx(q, set, hctxs[i], i); hctxs[i] = hctx; } else { if (hctxs[i]) pr_warn("Allocate new hctx on node %d fails,\ fallback to previous one on node %d\n", node, hctxs[i]->numa_node); else break; } } /* * Increasing nr_hw_queues fails. Free the newly allocated * hctxs and keep the previous q->nr_hw_queues. */ if (i != set->nr_hw_queues) { j = q->nr_hw_queues; end = i; } else { j = i; end = q->nr_hw_queues; q->nr_hw_queues = set->nr_hw_queues; } for (; j < end; j++) { struct blk_mq_hw_ctx *hctx = hctxs[j]; if (hctx) { if (hctx->tags) blk_mq_free_map_and_requests(set, j); blk_mq_exit_hctx(q, set, hctx, j); hctxs[j] = NULL; } } mutex_unlock(&q->sysfs_lock); }
4.4.2 blk_mq_init_cpu_queues
/*
 * Initialise the per-cpu software queue (struct blk_mq_ctx) of every
 * possible CPU for @q, and give hardware queues without a NUMA node the
 * local node of a CPU mapped to them when there is more than one hw queue.
 */
static void blk_mq_init_cpu_queues(struct request_queue *q,
                   unsigned int nr_hw_queues)
{
    struct blk_mq_tag_set *set = q->tag_set;
    unsigned int i, j;

    for_each_possible_cpu(i) {
        struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
        struct blk_mq_hw_ctx *hctx;
        int k;

        __ctx->cpu = i;                     /* CPU this software queue belongs to */
        spin_lock_init(&__ctx->lock);
        for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
            INIT_LIST_HEAD(&__ctx->rq_lists[k]);    /* one request list per hctx type */

        __ctx->queue = q;                   /* back pointer to the request queue */

        /*
         * Set local node, IFF we have more than one hw queue. If
         * not, we remain on the home node of the device
         */
        for (j = 0; j < set->nr_maps; j++) {
            hctx = blk_mq_map_queue_type(q, j, i);
            if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
                hctx->numa_node = local_memory_node(cpu_to_node(i));
        }
    }
}
4.5 submit_bio
我们之前说过当进程对块设备进行读写操作时,系统调用经过文件系统会生成bio,所有的bio都由submit_bio函数提交到Block Layer,submit_bio定义在block/blk-core.c:
/**
 * submit_bio - submit a bio to the block device layer for I/O
 * @bio: The &struct bio which describes the I/O
 *
 * submit_bio() is very similar in purpose to generic_make_request(), and
 * uses that function to do most of the work. Both are fairly rough
 * interfaces; @bio must be presetup and ready for I/O.
 *
 */
blk_qc_t submit_bio(struct bio *bio)
{
    /*
     * If it's a regular read/write or a barrier with data attached,
     * go through the normal accounting stuff before submission.
     */
    if (bio_has_data(bio)) {
        unsigned int count;

        /* count is the number of sectors accounted for this bio */
        if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
            count = queue_logical_block_size(bio->bi_disk->queue) >> 9;
        else
            count = bio_sectors(bio);

        /* page-out accounting for writes, page-in plus task read accounting otherwise */
        if (op_is_write(bio_op(bio))) {
            count_vm_events(PGPGOUT, count);
        } else {
            task_io_account_read(bio->bi_iter.bi_size);
            count_vm_events(PGPGIN, count);
        }

        /* optional debug trace of every submitted bio */
        if (unlikely(block_dump)) {
            char b[BDEVNAME_SIZE];
            printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
            current->comm, task_pid_nr(current),
                op_is_write(bio_op(bio)) ? "WRITE" : "READ",
                (unsigned long long)bio->bi_iter.bi_sector,
                bio_devname(bio, b), count);
        }
    }

    /* the actual submission is done here */
    return generic_make_request(bio);
}
整个处理流程如下图所示:
4.5.1 generic_make_request
generic_make_request函数被用于块设备的I/O请求中,其参数bio指针,描述了I/O需要做的事情,该函数定义在block/blk-core.c:
/** * generic_make_request - hand a buffer to its device driver for I/O * @bio: The bio describing the location in memory and on the device. * * generic_make_request() is used to make I/O requests of block * devices. It is passed a &struct bio, which describes the I/O that needs * to be done. * * generic_make_request() does not return any status. The * success/failure status of the request, along with notification of * completion, is delivered asynchronously through the bio->bi_end_io * function described (one day) else where. * * The caller of generic_make_request must make sure that bi_io_vec * are set to describe the memory buffer, and that bi_dev and bi_sector are * set to describe the device address, and the * bi_end_io and optionally bi_private are set to describe how * completion notification should be signaled. * * generic_make_request and the drivers it calls may use bi_next if this * bio happens to be merged with someone else, and may resubmit the bio to * a lower device by calling into generic_make_request recursively, which * means the bio should NOT be touched after the call to ->make_request_fn. */ blk_qc_t generic_make_request(struct bio *bio) { /* * bio_list_on_stack[0] contains bios submitted by the current * make_request_fn. * bio_list_on_stack[1] contains bios that were submitted before * the current make_request_fn, but that haven't been processed * yet. */ struct bio_list bio_list_on_stack[2]; blk_qc_t ret = BLK_QC_T_NONE; if (!generic_make_request_checks(bio)) // 判断当前bio是否有效 goto out; /* * We only want one ->make_request_fn to be active at a time, else * stack usage with stacked devices could be a problem. So use * current->bio_list to keep a list of requests submited by a * make_request_fn function. current->bio_list is also used as a * flag to say if generic_make_request is currently active in this * task or not. If it is NULL, then no make_request is active. 
If * it is non-NULL, then a make_request is active, and new requests * should be added at the tail */ if (current->bio_list) { // 如果使用了bio_list,将不会立即处理 bio_list_add(¤t->bio_list[0], bio); // 追加到bio_list尾部 goto out; } /* following loop may be a bit non-obvious, and so deserves some * explanation. * Before entering the loop, bio->bi_next is NULL (as all callers * ensure that) so we have a list with a single bio. * We pretend that we have just taken it off a longer list, so * we assign bio_list to a pointer to the bio_list_on_stack, * thus initialising the bio_list of new bios to be * added. ->make_request() may indeed add some more bios * through a recursive call to generic_make_request. If it * did, we find a non-NULL value in bio_list and re-enter the loop * from the top. In this case we really did just take the bio * of the top of the list (no pretending) and so remove it from * bio_list, and call into ->make_request() again. */ BUG_ON(bio->bi_next); bio_list_init(&bio_list_on_stack[0]); // 初始化0号链表头结点、和尾结点为NULL current->bio_list = bio_list_on_stack; // 初始化当前进程bio_list,就是一个空链表 do { // 实际上这里只会执行一次 struct request_queue *q = bio->bi_disk->queue; // 获取通用磁盘gendisk请求队列 blk_mq_req_flags_t flags = bio->bi_opf & REQ_NOWAIT ? 
BLK_MQ_REQ_NOWAIT : 0; if (likely(blk_queue_enter(q, flags) == 0)) { // 大概率发生 struct bio_list lower, same; // 局部变量 /* Create a fresh bio_list for all subordinate requests */ bio_list_on_stack[1] = bio_list_on_stack[0]; // 设置1号链表值,由于0号链表实际上没有元素,所以是一个空链表 bio_list_init(&bio_list_on_stack[0]); // 清空0号链表,也是一个空链表 ret = q->make_request_fn(q, bio); // 回调请求队列request_queue的make_request_fn函数 blk_queue_exit(q); /* sort new bios into those for a lower level * and those for the same level 下面代码实际上是先对0号链表元素排序 依次为lower、same,然后再将1号链表链接到0号链表尾部 */ bio_list_init(&lower); bio_list_init(&same); while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) // 链表头节点出列,不为NULL if (q == bio->bi_disk->queue) // 相同的请求队列 bio_list_add(&same, bio); // 追加到same链表尾部 else bio_list_add(&lower, bio); /* now assemble so we handle the lowest level first */ bio_list_merge(&bio_list_on_stack[0], &lower); // 合并lower链表到bio_list_on_statck[0]链表 bio_list_merge(&bio_list_on_stack[0], &same); bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); } else { if (unlikely(!blk_queue_dying(q) && // 大概率不会发生 (bio->bi_opf & REQ_NOWAIT))) bio_wouldblock_error(bio); else bio_io_error(bio); } bio = bio_list_pop(&bio_list_on_stack[0]); // 链表头节点出列,NULL } while (bio); current->bio_list = NULL; /* deactivate */ out: return ret; }
该函数本身不返回请求的处理结果:请求的成功/失败状态以及完成通知,都是通过bio->bi_end_io回调函数异步传递的,也就是bio的I/O操作结束时会调用该函数。
我们来看一下bio_list结构:
/*
 * BIO list management for use by remapping drivers (e.g. DM or MD) and loop.
 *
 * A bio_list anchors a singly-linked list of bios chained through the bi_next
 * member of the bio.  The bio_list also caches the last list member to allow
 * fast access to the tail.
 */
struct bio_list {
    struct bio *head;   /* first bio of the list */
    struct bio *tail;   /* last bio of the list (cached) */
};
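上面这段代码中的do{}while()循环主要是为堆叠块设备(例如DM、MD)以及bio被拆分的情况服务的:如果make_request_fn在执行过程中又调用generic_make_request提交了新的bio,这些bio不会被递归处理,而是先挂到current->bio_list上,随后在循环中按"先下层设备、后同层设备"的顺序依次取出处理。如果make_request_fn执行期间没有再提交新的bio,bio_list_on_stack[0]始终为空,循环体就只会执行一次,即只调用一次q->make_request_fn(q, bio)回调函数。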
对于blk-mq,这个函数是在blk_mq_init_allocated_queue函数中设定的,被设置为blk_mq_make_request函数。由于这个函数过于复杂,我们单启一个小节来说。
4.6 blk_cleanup_queue
blk_cleanup_queue用于清除申请的请求队列,定义在block/blk-core.c文件中:
/**
 * blk_cleanup_queue - shutdown a request queue
 * @q: request queue to shutdown
 *
 * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and
 * put it.  All future requests will be failed immediately with -ENODEV.
 */
void blk_cleanup_queue(struct request_queue *q)
{
    /* mark @q DYING, no new request or merges will be allowed afterwards */
    mutex_lock(&q->sysfs_lock);
    blk_set_queue_dying(q);

    blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
    blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
    blk_queue_flag_set(QUEUE_FLAG_DYING, q);
    mutex_unlock(&q->sysfs_lock);

    /*
     * Drain all requests queued before DYING marking. Set DEAD flag to
     * prevent that q->request_fn() gets invoked after draining finished.
     */
    blk_freeze_queue(q);

    rq_qos_exit(q);

    blk_queue_flag_set(QUEUE_FLAG_DEAD, q);

    /* for synchronous bio-based driver finish in-flight integrity i/o */
    blk_flush_integrity();

    /* @q won't process any more request, flush async actions */
    del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer);
    blk_sync_queue(q);

    /* blk-mq specific teardown, only for mq queues */
    if (queue_is_mq(q))
        blk_mq_exit_queue(q);

    /*
     * In theory, request pool of sched_tags belongs to request queue.
     * However, the current implementation requires tag_set for freeing
     * requests, so free the pool now.
     *
     * Queue has become frozen, there can't be any in-queue requests, so
     * it is safe to free requests now.
     */
    mutex_lock(&q->sysfs_lock);
    if (q->elevator)
        blk_mq_sched_free_requests(q);
    mutex_unlock(&q->sysfs_lock);

    percpu_ref_exit(&q->q_usage_counter);

    /* @q is and will stay empty, shutdown and put */
    blk_put_queue(q);
}
五、blk_mq_make_request
我们经过之前的分析,已经了解到通过submit_bio函数提交bio之后,会被blk_mq_make_request处理,blk_mq_make_request定义在block/blk-mq.c文件中:
/*
 * make_request function of blk-mq queues: try to merge @bio into an existing
 * request; otherwise get a request for it and, depending on the situation,
 * add it to the flush machinery, to the task's plug list, issue it directly
 * to the driver, or insert it through the scheduler path.
 * Returns a cookie for the request, or BLK_QC_T_NONE when the bio was
 * merged or no request could be obtained.
 */
static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
    const int is_sync = op_is_sync(bio->bi_opf);        /* synchronous I/O? */
    const int is_flush_fua = op_is_flush(bio->bi_opf);  /* REQ_FUA or REQ_PREFLUSH set? */
    struct blk_mq_alloc_data data = { .flags = 0};
    struct request *rq;
    struct blk_plug *plug;
    struct request *same_queue_rq = NULL;
    blk_qc_t cookie;

    blk_queue_bounce(q, &bio);  /* presumably handles DMA address limits (bounce buffers) - confirm */

    blk_queue_split(q, &bio);   /* presumably splits a bio that exceeds the queue limits - confirm */

    if (!bio_integrity_prep(bio))   /* integrity preparation failed */
        return BLK_QC_T_NONE;

    /* not flush/fua and merging allowed: try to merge into a plugged request */
    if (!is_flush_fua && !blk_queue_nomerges(q) &&
        blk_attempt_plug_merge(q, bio, &same_queue_rq))
        return BLK_QC_T_NONE;

    /* try to merge into the I/O scheduler queue / software queue */
    if (blk_mq_sched_bio_merge(q, bio))
        return BLK_QC_T_NONE;

    rq_qos_throttle(q, bio);    /* QoS throttling */

    data.cmd_flags = bio->bi_opf;
    rq = blk_mq_get_request(q, bio, &data);     /* get a request (and tag) for this bio */
    if (unlikely(!rq)) {
        rq_qos_cleanup(q, bio);
        if (bio->bi_opf & REQ_NOWAIT)
            bio_wouldblock_error(bio);
        return BLK_QC_T_NONE;
    }

    trace_block_getrq(q, bio, bio->bi_opf);

    rq_qos_track(q, rq, bio);

    cookie = request_to_qc_t(data.hctx, rq);

    plug = current->plug;       /* plug of the current task, may be NULL */
    /* how the request is dispatched depends on the situation */
    if (unlikely(is_flush_fua)) {
        blk_mq_put_ctx(data.ctx);
        blk_mq_bio_to_request(rq, bio);     /* attach the bio to the request */

        /* bypass scheduler for flush rq */
        blk_insert_flush(rq);
        blk_mq_run_hw_queue(data.hctx, true);   /* kick the hardware queue */
    } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs)) {
        /*
         * Use plugging if we have a ->commit_rqs() hook as well, as
         * we know the driver uses bd->last in a smart fashion.
         */
        unsigned int request_count = plug->rq_count;
        struct request *last = NULL;

        blk_mq_put_ctx(data.ctx);
        blk_mq_bio_to_request(rq, bio);

        if (!request_count)
            trace_block_plug(q);
        else
            last = list_entry_rq(plug->mq_list.prev);

        /* plug list too long, or its last request too large: flush it first */
        if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
            blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
            blk_flush_plug_list(plug, false);
            trace_block_plug(q);
        }

        blk_add_rq_to_plug(plug, rq);       /* add the request to the plug list */
    } else if (plug && !blk_queue_nomerges(q)) {
        blk_mq_bio_to_request(rq, bio);

        /*
         * We do limited plugging. If the bio can be merged, do that.
         * Otherwise the existing request in the plug list will be
         * issued. So the plug list will have one request at most
         * The plug list might get flushed before this. If that happens,
         * the plug list is empty, and same_queue_rq is invalid.
         */
        if (list_empty(&plug->mq_list))
            same_queue_rq = NULL;
        if (same_queue_rq) {
            /* take the previously plugged request of this queue off the list */
            list_del_init(&same_queue_rq->queuelist);
            plug->rq_count--;
        }
        blk_add_rq_to_plug(plug, rq);       /* plug the new request */
        trace_block_plug(q);

        blk_mq_put_ctx(data.ctx);

        if (same_queue_rq) {
            /* issue the request that was just unplugged directly */
            data.hctx = same_queue_rq->mq_hctx;
            trace_block_unplug(q, 1, true);
            blk_mq_try_issue_directly(data.hctx, same_queue_rq,
                    &cookie);
        }
    } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
            !data.hctx->dispatch_busy)) {
        /*
         * Several hw queues and sync I/O, or no I/O scheduler and the
         * hardware queue is not busy: issue directly.
         */
        blk_mq_put_ctx(data.ctx);
        blk_mq_bio_to_request(rq, bio);
        blk_mq_try_issue_directly(data.hctx, rq, &cookie);
    } else {
        /* everything else goes through the scheduler insert path */
        blk_mq_put_ctx(data.ctx);
        blk_mq_bio_to_request(rq, bio);
        blk_mq_sched_insert_request(rq, false, true, true);
    }

    return cookie;
}
该函数执行流程:
- 首先判断I/O请求是否可以跟其它request合并,如果无法合并再将I/O请求转换为request进一步处理:
- 调用blk_attempt_plug_merge尝试将bio合并到进程plug list的request,如果合并成功直接返回;
- 调用blk_mq_sched_bio_merge函数,如果定义了I/O调度器,尝试将bio合并到I/O调度器队列里的request,否则尝试将bio合并到当前CPU对应的软件队列ctx->rq_list双向链表里的request,如果合并成功直接返回;
- bio无法合并后,调用blk_mq_get_request从硬件队列的blk_mq_tags结构体的tags->bitmap_tags或者tags->breserved_tags分配一个空闲tag(如果获取失败,则启动request异步派发,之后再次尝试分配tag),然后获取tags->static_rqs[tag_offset + tag]并初始化;
- 针对各种不同的情景,开始进行request派发处理:
- 如果是flush fua请求:调用blk_insert_flush函数将request直接插入到flush队列,然后调用blk_mq_run_hw_queue将I/O调度算法队列、软件队列、硬件队列上的request异步派发到块设备驱动
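- 如果使用了进程plug list,并且硬件队列数为1或者块设备驱动实现了commit_rqs回调:如果plug list上的request个数已达到BLK_MAX_REQUEST_COUNT,或者plug list上最后一个request的大小已达到BLK_PLUG_FLUSH_SIZE,先调用blk_flush_plug_list将plug list上的request向下派发;然后调用blk_add_rq_to_plug将request添加到plug->mq_list双向链表上;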
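- 如果使用了进程plug list,并且支持合并:调用blk_add_rq_to_plug先将request添加到plug->mq_list双向链表上;如果此前plug list上已经有一个属于同一请求队列的request(same_queue_rq),则将它从plug list上摘下,并调用blk_mq_try_issue_directly直接派发,因此这种情况下plug list上最多只保留一个request;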
- 如果硬件队列数量>1并且同步I/O,或者不使用I/O调度器且硬件队列不繁忙:调用blk_mq_try_issue_directly将request直接派发到块设备驱动,如果块设备驱动层繁忙,也会执行blk_mq_run_hw_queue将I/O调度算法队列、软件队列、硬件队列上的request同步派发到块设备驱动;
- 其他情况:调用blk_mq_sched_insert_request函数执行I/O调度算法将request插入到I/O调度器队列;
5.1 操作位判定函数
利用op_is_sync,判断当前操作是不是同步:
/*
 * Reads are always treated as synchronous, as are requests with the FUA or
 * PREFLUSH flag.  Other operations may be marked as synchronous using the
 * REQ_SYNC flag.
 *
 * @op: the operation together with its flags (callers in this file pass
 *      bio->bi_opf).
 */
static inline bool op_is_sync(unsigned int op)
{
    /* the operation bits say READ, or any of SYNC / FUA / PREFLUSH is set */
    return (op & REQ_OP_MASK) == REQ_OP_READ ||
        (op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH));
}
同步操作包括:
- 所有读操作;
- 请求带有REQ_SYNC、REQ_FUA、REQ_PREFLUSH;
O_SYNC 为同步I/O标记,保证数据安全写到非易失存储设备。
REQ_PREFLUSH:Explicit cache flushes The REQ_PREFLUSH flag can be OR ed into the r/w flags of a bio submitted from the filesystem and will make sure the volatile cache of the storage device has been flushed before the actual I/O operation is started. This explicitly guarantees that previously completed write requests are on non-volatile storage before the flagged bio starts. In addition the REQ_PREFLUSH flag can be set on an otherwise empty bio structure, which causes only an explicit cache flush without any dependent I/O. It is recommend to use the blkdev_issue_flush() helper for a pure cache flush;
REQ_FUA :Forced Unit Access The REQ_FUA flag can be OR ed into the r/w flags of a bio submitted from the filesystem and will make sure that I/O completion for this request is only signaled after the data has been committed to non-volatile storage。
op_is_flush函数:
/*
 * Check if the bio or request is one that needs special treatment in the
 * flush state machine.
 *
 * Returns true when REQ_FUA or REQ_PREFLUSH is set in @op.
 */
static inline bool op_is_flush(unsigned int op)
{
    return op & (REQ_FUA | REQ_PREFLUSH);
}
5.2 blk_attempt_plug_merge
blk_attempt_plug_merge尝试将bio合并到plug list某个request中:
/**
 * blk_attempt_plug_merge - try to merge with %current's plugged list
 * @q: request_queue new bio is being queued at
 * @bio: new bio being queued
 * @same_queue_rq: pointer to &struct request that gets filled in when
 * another request associated with @q is found on the plug list
 * (optional, may be %NULL)
 *
 * Determine whether @bio being queued on @q can be merged with a request
 * on %current's plugged list.  Returns %true if merge was successful,
 * otherwise %false.
 *
 * Plugging coalesces IOs from the same issuer for the same purpose without
 * going through @q->queue_lock.  As such it's more of an issuing mechanism
 * than scheduling, and the request, while may have elvpriv data, is not
 * added on the elevator at this point.  In addition, we don't have
 * reliable access to the elevator outside queue lock.  Only check basic
 * merging parameters without querying the elevator.
 *
 * Caller must ensure !blk_queue_nomerges(q) beforehand.
 */
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
        struct request **same_queue_rq)
{
    struct blk_plug *plug;
    struct request *rq;
    struct list_head *plug_list;

    plug = current->plug;
    if (!plug)              /* the current task is not plugging */
        return false;

    plug_list = &plug->mq_list;

    /* walk the plug list backwards, i.e. most recently added request first */
    list_for_each_entry_reverse(rq, plug_list, queuelist) {
        bool merged = false;

        if (rq->q == q && same_queue_rq) {
            /*
             * Only blk-mq multiple hardware queues case checks the
             * rq in the same queue, there should be only one such
             * rq in a queue
             **/
            *same_queue_rq = rq;
        }

        /* different queue or not mergeable: try the next request */
        if (rq->q != q || !blk_rq_merge_ok(rq, bio))
            continue;

        /* merge at the back, at the front, or as a discard */
        switch (blk_try_merge(rq, bio)) {
        case ELEVATOR_BACK_MERGE:
            merged = bio_attempt_back_merge(q, rq, bio);
            break;
        case ELEVATOR_FRONT_MERGE:
            merged = bio_attempt_front_merge(q, rq, bio);
            break;
        case ELEVATOR_DISCARD_MERGE:
            merged = bio_attempt_discard_merge(q, rq, bio);
            break;
        default:
            break;
        }

        if (merged)
            return true;
    }

    return false;
}
5.2.1 bio_attempt_back_merge
/*
 * Try to append @bio to the end of request @req.
 * Returns true when the bio was merged, false when ll_back_merge_fn()
 * rejects the merge.
 */
bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
                struct bio *bio)
{
    const int ff = bio->bi_opf & REQ_FAILFAST_MASK;

    if (!ll_back_merge_fn(q, req, bio))
        return false;

    trace_block_bio_backmerge(q, req, bio);

    /* the failfast flags of bio and request differ: mark as mixed merge */
    if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
        blk_rq_set_mixed_merge(req);

    req->biotail->bi_next = bio;                /* link the bio at the tail */
    req->biotail = bio;
    req->__data_len += bio->bi_iter.bi_size;    /* grow the request by the bio's size in bytes */

    blk_account_io_start(req, false);
    return true;
}
5.2.2 bio_attempt_front_merge
/*
 * Try to prepend @bio to the front of request @req.
 * Returns true when the bio was merged, false when ll_front_merge_fn()
 * rejects the merge.
 */
bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
                 struct bio *bio)
{
    const int ff = bio->bi_opf & REQ_FAILFAST_MASK;

    if (!ll_front_merge_fn(q, req, bio))
        return false;

    trace_block_bio_frontmerge(q, req, bio);

    /* the failfast flags of bio and request differ: mark as mixed merge */
    if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
        blk_rq_set_mixed_merge(req);

    bio->bi_next = req->bio;                    /* link the bio at the head */
    req->bio = bio;

    req->__sector = bio->bi_iter.bi_sector;     /* the request now starts at the bio's sector */
    req->__data_len += bio->bi_iter.bi_size;    /* grow the request by the bio's size in bytes */

    blk_account_io_start(req, false);
    return true;
}
5.2.3 bio_attempt_discard_merge
/*
 * Try to append discard @bio to request @req as one more discard segment.
 * When the segment or sector limits would be exceeded, @req is marked
 * no-merge and false is returned.
 */
bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
        struct bio *bio)
{
    unsigned short segments = blk_rq_nr_discard_segments(req);

    /* already at the maximum number of discard segments */
    if (segments >= queue_max_discard_segments(q))
        goto no_merge;
    /* the merged request would exceed the allowed number of sectors */
    if (blk_rq_sectors(req) + bio_sectors(bio) >
        blk_rq_get_max_sectors(req, blk_rq_pos(req)))
        goto no_merge;

    req->biotail->bi_next = bio;                /* link the bio at the tail */
    req->biotail = bio;
    req->__data_len += bio->bi_iter.bi_size;
    req->nr_phys_segments = segments + 1;

    blk_account_io_start(req, false);
    return true;
no_merge:
    req_set_nomerge(q, req);
    return false;
}
5.3 blk_mq_sched_bio_merge
如果定义了I/O调度器,blk_mq_sched_bio_merge函数尝试将bio合并到I/O调度器队列,否则(没有定义I/O调度器)尝试将bio合并到软件队列ctx->rq_lists中。
/*
 * Try to merge @bio into an already queued request: through the I/O
 * scheduler's bio_merge hook when the queue has one, otherwise into the
 * current CPU's software queue list of the matching hctx type.
 * Returns true when the bio was merged.
 */
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{
    struct elevator_queue *e = q->elevator;         /* I/O scheduler, may be NULL */
    struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);     /* software queue of the current CPU */
    struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); /* matching hardware queue */
    bool ret = false;
    enum hctx_type type;

    /* a scheduler with a bio_merge hook decides on its own */
    if (e && e->type->ops.bio_merge) {
        blk_mq_put_ctx(ctx);
        return e->type->ops.bio_merge(hctx, bio);
    }

    type = hctx->type;
    /* only take the lock when merging is wanted and the list is not empty */
    if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
            !list_empty_careful(&ctx->rq_lists[type])) {
        /* default per sw-queue merge */
        spin_lock(&ctx->lock);
        ret = blk_mq_attempt_merge(q, hctx, ctx, bio);
        spin_unlock(&ctx->lock);
    }

    blk_mq_put_ctx(ctx);
    return ret;
}
5.3.1 blk_mq_get_ctx
blk_mq_get_ctx函数根据当前cpu索引号获取对应的软件队列:
/*
 * Return the software queue (struct blk_mq_ctx) of @q that belongs to @cpu.
 */
static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
                       unsigned int cpu)
{
    return per_cpu_ptr(q->queue_ctx, cpu);  /* q->queue_ctx is per-cpu data */
}

/*
 * This assumes per-cpu software queueing queues. They could be per-node
 * as well, for instance. For now this is hardcoded as-is. Note that we don't
 * care about preemption, since we know the ctx's are persistent. This does
 * mean that we can't rely on ctx always matching the currently running CPU.
 */
static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
{
    /* get_cpu() supplies the current CPU number; callers in this file pair this with blk_mq_put_ctx() */
    return __blk_mq_get_ctx(q, get_cpu());
}
5.3.2 blk_mq_map_queue
/*
 * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
 * @q: request queue
 * @flags: request command flags
 * @ctx: software queue (cpu ctx) whose hardware queue is looked up
 *
 * Returns ctx->hctxs[type], where type is HCTX_TYPE_POLL for REQ_HIPRI,
 * HCTX_TYPE_READ for reads and HCTX_TYPE_DEFAULT otherwise.
 */
static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
                             unsigned int flags,
                             struct blk_mq_ctx *ctx)
{
    enum hctx_type type = HCTX_TYPE_DEFAULT;

    /*
     * The caller ensure that if REQ_HIPRI, poll must be enabled.
     */
    if (flags & REQ_HIPRI)
        type = HCTX_TYPE_POLL;
    else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
        type = HCTX_TYPE_READ;

    return ctx->hctxs[type];    /* hardware queue of the selected type */
}
5.3.3 blk_mq_attempt_merge
/*
 * Reverse check our software queue for entries that we could potentially
 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
 * too much time checking for merges.
 *
 * Must be called with ctx->lock held. Returns true when @bio was merged.
 */
static bool blk_mq_attempt_merge(struct request_queue *q,
                 struct blk_mq_hw_ctx *hctx,
                 struct blk_mq_ctx *ctx, struct bio *bio)
{
    enum hctx_type type = hctx->type;

    lockdep_assert_held(&ctx->lock);

    /* try the software queue list that matches the hctx type */
    if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio)) {
        ctx->rq_merged++;   /* per-ctx merge counter */
        return true;
    }

    return false;
}
5.3.4 blk_mq_bio_list_merge
/*
 * Iterate list of requests and see if we can merge this bio with any
 * of them.
 *
 * At most 8 requests are examined, starting from the tail of @list. The
 * result of the first merge attempt (back, front or discard) is returned,
 * whether it succeeded or not.
 */
bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
               struct bio *bio)
{
    struct request *rq;
    int checked = 8;

    /* walk the request list backwards */
    list_for_each_entry_reverse(rq, list, queuelist) {
        bool merged = false;

        if (!checked--)     /* examined enough requests, give up */
            break;

        if (!blk_rq_merge_ok(rq, bio))  /* not mergeable, try the next one */
            continue;

        switch (blk_try_merge(rq, bio)) {
        case ELEVATOR_BACK_MERGE:
            if (blk_mq_sched_allow_merge(q, rq, bio))
                merged = bio_attempt_back_merge(q, rq, bio);
            break;
        case ELEVATOR_FRONT_MERGE:
            if (blk_mq_sched_allow_merge(q, rq, bio))
                merged = bio_attempt_front_merge(q, rq, bio);
            break;
        case ELEVATOR_DISCARD_MERGE:
            merged = bio_attempt_discard_merge(q, rq, bio);
            break;
        default:
            continue;       /* no merge possible with this request */
        }

        return merged;
    }

    return false;
}
5.4 blk_mq_get_request
request是事先在硬件队列的tags或者sched_tags中分配好的,通过blk_mq_get_request获取:
/*
 * Obtain a request for @q by allocating a tag and initialising the request
 * that belongs to it.
 *
 * @q:    request queue
 * @bio:  bio passed through to the scheduler's prepare_request hook
 * @data: allocation context; ->ctx and ->hctx are filled in here when the
 *        caller left them NULL
 *
 * Returns the initialised request, or NULL when blk_mq_get_tag() fails. On
 * failure the ctx obtained here (if any) is put again and blk_queue_exit()
 * undoes the blk_queue_enter_live() taken at entry.
 */
static struct request *blk_mq_get_request(struct request_queue *q,
					  struct bio *bio,
					  struct blk_mq_alloc_data *data)
{
	struct elevator_queue *e = q->elevator; // I/O scheduler queue, may be NULL
	struct request *rq;
	unsigned int tag;
	bool put_ctx_on_error = false;

	blk_queue_enter_live(q);
	data->q = q;
	if (likely(!data->ctx)) {
		// presumably the software queue of the current CPU - confirm
		// against blk_mq_get_ctx()
		data->ctx = blk_mq_get_ctx(q);
		put_ctx_on_error = true; // we own this ctx, drop it on failure
	}
	if (likely(!data->hctx)) // no hardware queue given: look it up via ctx->hctxs
		data->hctx = blk_mq_map_queue(q, data->cmd_flags,
						data->ctx);
	if (data->cmd_flags & REQ_NOWAIT) // propagate the NOWAIT request flag
		data->flags |= BLK_MQ_REQ_NOWAIT;

	if (e) { // an I/O scheduler is attached
		data->flags |= BLK_MQ_REQ_INTERNAL;

		/*
		 * Flush requests are special and go directly to the
		 * dispatch list. Don't include reserved tags in the
		 * limiting, as it isn't useful.
		 */
		if (!op_is_flush(data->cmd_flags) &&
		    e->type->ops.limit_depth &&
		    !(data->flags & BLK_MQ_REQ_RESERVED))
			e->type->ops.limit_depth(data->cmd_flags, data);
	} else {
		blk_mq_tag_busy(data->hctx);
	}

	// allocate a free tag; the tag selects the request set up below
	tag = blk_mq_get_tag(data);
	if (tag == BLK_MQ_TAG_FAIL) { // allocation failed: undo and return NULL
		if (put_ctx_on_error) {
			blk_mq_put_ctx(data->ctx);
			data->ctx = NULL;
		}
		blk_queue_exit(q);
		return NULL;
	}

	// initialise the request for this tag (queues, flags, start time, ...)
	rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags);
	if (!op_is_flush(data->cmd_flags)) {
		rq->elv.icq = NULL;
		if (e && e->type->ops.prepare_request) {
			if (e->type->icq_cache)
				blk_mq_sched_assign_ioc(rq);

			e->type->ops.prepare_request(rq, bio);
			rq->rq_flags |= RQF_ELVPRIV;
		}
	}
	data->hctx->queued++;
	return rq;
}
5.4.1 blk_mq_get_tag
blk_mq_get_tag定义在block/blk-mq-tag.c:
/*
 * Allocate a free tag from the tag set selected by @data.
 *
 * Reserved allocations (BLK_MQ_REQ_RESERVED) use tags->breserved_tags with
 * offset 0; all others use tags->bitmap_tags, offset by nr_reserved_tags.
 * If the first attempt fails and BLK_MQ_REQ_NOWAIT is not set, the function
 * runs the hardware queue, retries, and otherwise sleeps in io_schedule()
 * until a tag can be obtained. After every sleep data->ctx and data->hctx
 * are looked up again.
 *
 * Returns tag + tag_offset on success, or BLK_MQ_TAG_FAIL when a reserved
 * tag is requested but none are configured, or when NOWAIT is set and no
 * tag is free.
 */
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
	// per the article: hctx->sched_tags with a scheduler, hctx->tags
	// without one (helper not shown here)
	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
	struct sbitmap_queue *bt;
	struct sbq_wait_state *ws;
	DEFINE_SBQ_WAIT(wait);
	unsigned int tag_offset;
	bool drop_ctx;
	int tag;

	if (data->flags & BLK_MQ_REQ_RESERVED) { // use a reserved tag
		if (unlikely(!tags->nr_reserved_tags)) { // none configured: bug
			WARN_ON_ONCE(1);
			return BLK_MQ_TAG_FAIL;
		}
		bt = &tags->breserved_tags; // reserved tag bitmap
		tag_offset = 0;
	} else {
		bt = &tags->bitmap_tags; // normal tag bitmap
		tag_offset = tags->nr_reserved_tags; // normal tags follow the reserved ones
	}

	// first attempt on the selected bitmap
	tag = __blk_mq_get_tag(data, bt);
	if (tag != -1) // -1 means no free tag
		goto found_tag;

	if (data->flags & BLK_MQ_REQ_NOWAIT) // caller must not wait: fail now
		return BLK_MQ_TAG_FAIL;

	ws = bt_wait_ptr(bt, data->hctx);
	// remember whether the caller came in without a ctx, so that the one
	// picked up inside the loop is put again before returning
	drop_ctx = data->ctx == NULL;
	do {
		struct sbitmap_queue *bt_prev;

		/*
		 * We're out of tags on this hardware queue, kick any
		 * pending IO submits before going to sleep waiting for
		 * some to complete.
		 */
		blk_mq_run_hw_queue(data->hctx, false); // async == false

		/*
		 * Retry tag allocation after running the hardware queue,
		 * as running the queue may also have found completions.
		 */
		tag = __blk_mq_get_tag(data, bt);
		if (tag != -1) // got one, leave the loop
			break;

		sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);

		// one more try after being queued as a waiter
		tag = __blk_mq_get_tag(data, bt);
		if (tag != -1)
			break;

		if (data->ctx)
			blk_mq_put_ctx(data->ctx);

		bt_prev = bt;
		io_schedule(); // sleep until woken

		sbitmap_finish_wait(bt, ws, &wait);

		// look the queues up again; per the article the task may now
		// be running on a different CPU
		data->ctx = blk_mq_get_ctx(data->q);
		data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
						data->ctx);
		tags = blk_mq_tags_from_data(data);
		if (data->flags & BLK_MQ_REQ_RESERVED) // same selection as above
			bt = &tags->breserved_tags;
		else
			bt = &tags->bitmap_tags;

		/*
		 * If destination hw queue is changed, fake wake up on
		 * previous queue for compensating the wake up miss, so
		 * other allocations on previous queue won't be starved.
		 */
		if (bt != bt_prev)
			sbitmap_queue_wake_up(bt_prev);

		ws = bt_wait_ptr(bt, data->hctx);
	} while (1);

	if (drop_ctx && data->ctx)
		blk_mq_put_ctx(data->ctx);

	sbitmap_finish_wait(bt, ws, &wait);

found_tag:
	// tag + tag_offset is the value callers use as the index into
	// static_rqs[]
	return tag + tag_offset;
}
该函数是分配tag的核心实现:从硬件队列的blk_mq_tags结构体的tags->bitmap_tags或者tags->breserved_tags分配一个空闲tag,一个request必须分配一个tag才能IO传输。
分配失败则启动硬件I/O数据派发,休眠后再尝试从blk_mq_tags结构体的tags->bitmap_tags或者tags->breserved_tags分配一个空闲tag。
需要说明的一点是,tag并不一定一次就能分配成功,当时可能I/O传输进程很多,把tag分配完了。此时会先执行blk_mq_run_hw_queue()把I/O调度算法队列、软件队列、硬件队列上的request派发到块设备驱动,再次尝试分配;如果仍然失败,只能休眠等待,等request传输完成后就会释放tag。当前进程被唤醒后会重新尝试分配tag,但是需要重新获取软件队列和硬件队列,因为休眠后唤醒,进程所在的CPU可能变了。
5.4.2 blk_mq_rq_ctx_init
/*
 * Initialise the pre-allocated request that belongs to @tag.
 *
 * @data: allocation context (queue, ctx, hctx, flags)
 * @tag:  index into tags->static_rqs[]
 * @op:   value stored in rq->cmd_flags
 *
 * With BLK_MQ_REQ_INTERNAL set, @tag is recorded as rq->internal_tag and
 * rq->tag stays -1. Otherwise @tag becomes rq->tag and the request is
 * published in data->hctx->tags->rqs[]. Returns the request with its
 * reference count set to 1.
 */
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
		unsigned int tag, unsigned int op)
{
	struct blk_mq_tags *tags = blk_mq_tags_from_data(data); // tag set for this allocation
	struct request *rq = tags->static_rqs[tag]; // request that belongs to this tag
	req_flags_t rq_flags = 0;

	if (data->flags & BLK_MQ_REQ_INTERNAL) {
		rq->tag = -1;
		rq->internal_tag = tag;
	} else {
		if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
			rq_flags = RQF_MQ_INFLIGHT;
			atomic_inc(&data->hctx->nr_active);
		}
		rq->tag = tag; // record the tag
		rq->internal_tag = -1;
		data->hctx->tags->rqs[rq->tag] = rq; // publish the request under its tag
	}

	/* csd/requeue_work/fifo_time is initialized before use */
	rq->q = data->q; // request queue
	rq->mq_ctx = data->ctx; // software queue
	rq->mq_hctx = data->hctx; // hardware queue
	rq->rq_flags = rq_flags; // request flags
	rq->cmd_flags = op;
	if (data->flags & BLK_MQ_REQ_PREEMPT)
		rq->rq_flags |= RQF_PREEMPT;
	if (blk_queue_io_stat(data->q))
		rq->rq_flags |= RQF_IO_STAT;
	INIT_LIST_HEAD(&rq->queuelist);
	INIT_HLIST_NODE(&rq->hash);
	RB_CLEAR_NODE(&rq->rb_node);
	rq->rq_disk = NULL;
	rq->part = NULL;
	if (blk_mq_need_time_stamp(rq))
		rq->start_time_ns = ktime_get_ns();
	else
		rq->start_time_ns = 0;
	rq->io_start_time_ns = 0;
	rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
	rq->nr_integrity_segments = 0;
#endif
	/* tag was already set */
	rq->extra_len = 0;
	WRITE_ONCE(rq->deadline, 0);

	rq->timeout = 0;

	rq->end_io = NULL;
	rq->end_io_data = NULL;

	// per-ctx counter, indexed by whether the operation is synchronous
	data->ctx->rq_dispatched[op_is_sync(op)]++;
	refcount_set(&rq->ref, 1);
	return rq;
}
5.5 blk_mq_try_issue_directly
blk_mq_try_issue_directly函数定义在block/blk-mq.c文件中,该函数将request直接派发到块设备驱动;如果块设备驱动层繁忙,则会将该request添加到硬件队列hctx->dispatch双向链表,然后再启动硬件队列上的request同步派发到块设备驱动。
/*
 * Try to hand @rq straight to the driver.
 *
 * If the driver reports BLK_STS_RESOURCE or BLK_STS_DEV_RESOURCE, the
 * request is put on hctx->dispatch via blk_mq_request_bypass_insert() and
 * the hardware queue is run. Any other non-OK status ends the request with
 * that status.
 */
static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
		struct request *rq, blk_qc_t *cookie)
{
	blk_status_t ret;
	int srcu_idx;

	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); // may sleep only for BLOCKING hctx

	hctx_lock(hctx, &srcu_idx); // lock

	// bypass_insert == false, last == true
	ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
	if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) // driver busy
		blk_mq_request_bypass_insert(rq, true); // queue on hctx->dispatch and run the hw queue
	else if (ret != BLK_STS_OK) // any other failure status
		blk_mq_end_request(rq, ret); // complete the request with that status

	hctx_unlock(hctx, srcu_idx); // unlock
}
5.5.1 __blk_mq_try_issue_directly
/*
 * Core of the direct-issue path.
 *
 * @bypass_insert: when true, a request that cannot be issued is NOT
 *                 inserted; BLK_STS_RESOURCE is returned instead
 * @last:          passed through to __blk_mq_issue_directly()
 *
 * The request falls back to the "insert" path when the hctx is stopped or
 * the queue is quiesced, when an elevator is attached and @bypass_insert is
 * false, or when no dispatch budget or driver tag can be obtained.
 */
static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
						struct request *rq,
						blk_qc_t *cookie,
						bool bypass_insert, bool last)
{
	struct request_queue *q = rq->q; // request queue of this request
	bool run_queue = true;

	/*
	 * RCU or SRCU read lock is needed before checking quiesced flag.
	 *
	 * When queue is stopped or quiesced, ignore 'bypass_insert' from
	 * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
	 * and avoid driver to try to dispatch again.
	 */
	if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
		run_queue = false;
		bypass_insert = false;
		goto insert;
	}

	if (q->elevator && !bypass_insert)
		goto insert;

	if (!blk_mq_get_dispatch_budget(hctx))
		goto insert;

	if (!blk_mq_get_driver_tag(rq)) { // a driver tag is needed before issuing
		blk_mq_put_dispatch_budget(hctx);
		goto insert;
	}

	// hands the request to the driver's queue_rq callback
	return __blk_mq_issue_directly(hctx, rq, cookie, last);
insert:
	if (bypass_insert)
		return BLK_STS_RESOURCE;

	blk_mq_request_bypass_insert(rq, run_queue);
	return BLK_STS_OK;
}
5.5.2 blk_mq_get_driver_tag
blk_mq_get_driver_tag函数定义在block/blk-mq.c文件中,我们在前面已经调用了blk_mq_get_tag从硬件队列的blk_mq_tags结构体的tags->bitmap_tags或者tags->breserved_tags分配一个空闲的tag,这里为啥又来了一遍呢?
/*
 * Make sure @rq holds a driver tag (rq->tag != -1).
 *
 * If the request already has one, nothing is done. Otherwise a tag is
 * requested with BLK_MQ_REQ_NOWAIT, so this call does not wait for one,
 * and on success the request is published in hctx->tags->rqs[].
 *
 * Returns true when the request has a tag on exit, false otherwise.
 */
bool blk_mq_get_driver_tag(struct request *rq)
{
	struct blk_mq_alloc_data data = {
		.q = rq->q,
		.hctx = rq->mq_hctx,
		.flags = BLK_MQ_REQ_NOWAIT,
		.cmd_flags = rq->cmd_flags,
	};
	bool shared;

	if (rq->tag != -1) // already has a tag, nothing to do
		goto done;

	// keep the allocation in the reserved pool if the internal tag was reserved
	if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
		data.flags |= BLK_MQ_REQ_RESERVED;

	shared = blk_mq_tag_busy(data.hctx);
	rq->tag = blk_mq_get_tag(&data); // allocate a tag
	if (rq->tag >= 0) {
		if (shared) {
			rq->rq_flags |= RQF_MQ_INFLIGHT;
			atomic_inc(&data.hctx->nr_active);
		}
		data.hctx->tags->rqs[rq->tag] = rq; // publish the request under its tag
	}

done:
	return rq->tag != -1;
}
这里首先判定该request是否已经分配了tag,如果分配了将直接返回。否则会重复之前的步骤。
这主要是因为request在派发到块设备驱动时,如果块设备驱动繁忙,派发失败,则会把request加入硬件队列hctx->dispatch链表,然后把request的tag释放掉,则rq->tag=-1,等空闲时再派发该request。一个request必须分配一个tag才能I/O传输。
5.5.3 __blk_mq_issue_directly
__blk_mq_issue_directly是将request直接派发给块设备驱动的核心函数:
/*
 * Hand @rq to the driver through q->mq_ops->queue_rq() and update the hctx
 * busy state according to the result.
 *
 * @cookie: set to the request's cookie on BLK_STS_OK, to BLK_QC_T_NONE on an
 *          error status, and left untouched on the two "resource" (busy)
 *          statuses
 * @last:   forwarded to the driver in blk_mq_queue_data.last
 *
 * Returns the status reported by queue_rq().
 */
static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
					    struct request *rq,
					    blk_qc_t *cookie, bool last)
{
	struct request_queue *q = rq->q; // request queue of this request
	struct blk_mq_queue_data bd = {
		.rq = rq,
		.last = last,
	};
	blk_qc_t new_cookie;
	blk_status_t ret;

	new_cookie = request_to_qc_t(hctx, rq);

	/*
	 * For OK queue, we are done. For error, caller may kill it.
	 * Any other error (busy), just add it to our list as we
	 * previously would have done.
	 */
	ret = q->mq_ops->queue_rq(hctx, &bd); // driver's queue_rq callback
	switch (ret) {
	case BLK_STS_OK: // accepted by the driver
		blk_mq_update_dispatch_busy(hctx, false); // mark the hctx not busy
		*cookie = new_cookie;
		break;
	case BLK_STS_RESOURCE:
	case BLK_STS_DEV_RESOURCE: // driver busy
		blk_mq_update_dispatch_busy(hctx, true); // mark the hctx busy
		// NOTE(review): the article states this releases the driver
		// tag held in rq->tag; not visible here - confirm
		__blk_mq_requeue_request(rq);
		break;
	default:
		blk_mq_update_dispatch_busy(hctx, false); // mark the hctx not busy
		*cookie = BLK_QC_T_NONE;
		break;
	}

	return ret;
}
基本是调用块设备驱动层的函数,将request有关的磁盘传输信息发送给块设备驱动,然后会进行磁盘数据传输。如果遇到块设备驱动硬件忙,则设置硬件队列忙,还释放request的tag。
5.5.4 blk_mq_request_bypass_insert
/*
 * Should only be used carefully, when the caller knows we want to
 * bypass a potential IO scheduler on the target device.
 *
 * Appends @rq to the tail of its hardware queue's dispatch list under
 * hctx->lock and, when @run_queue is true, runs the hardware queue
 * (async == false).
 */
void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
{
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

	spin_lock(&hctx->lock); // hctx->dispatch is protected by hctx->lock
	list_add_tail(&rq->queuelist, &hctx->dispatch); // append to hctx->dispatch
	spin_unlock(&hctx->lock);

	if (run_queue) // kick dispatch of the hardware queue
		blk_mq_run_hw_queue(hctx, false);
}
5.6 blk_mq_run_hw_queue
blk_mq_try_issue_directly()方法的派发是针对单个request的,blk_mq_run_hw_queue()则是派发软件队列ctx->rq_lists、硬件队列hctx->dispatch、I/O调度算法队列上的request的,这是二者最大的区别。
/*
 * Run the hardware queue if the request queue is not quiesced and the hctx
 * has pending work.
 *
 * @async: forwarded to __blk_mq_delay_run_hw_queue() with a delay of 0
 *
 * Returns true when a run was started, false otherwise.
 */
bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
	int srcu_idx;
	bool need_run;

	/*
	 * When queue is quiesced, we may be switching io scheduler, or
	 * updating nr_hw_queues, or other things, and we can't run queue
	 * any more, even __blk_mq_hctx_has_pending() can't be called safely.
	 *
	 * And queue will be rerun in blk_mq_unquiesce_queue() if it is
	 * quiesced.
	 */
	hctx_lock(hctx, &srcu_idx); // lock
	need_run = !blk_queue_quiesced(hctx->queue) && // not quiesced and work is pending
		blk_mq_hctx_has_pending(hctx);
	hctx_unlock(hctx, srcu_idx); // unlock

	if (need_run) {
		__blk_mq_delay_run_hw_queue(hctx, async, 0);
		return true;
	}

	return false;
}
5.6.1 __blk_mq_delay_run_hw_queue
/*
 * Run the hardware queue inline, or schedule hctx->run_work on kblockd.
 *
 * @async: when false and the hctx is not BLK_MQ_F_BLOCKING, the queue is
 *         run inline provided the current CPU is in hctx->cpumask
 * @msecs: delay in milliseconds for the deferred path
 *
 * Does nothing when the hctx is stopped.
 */
static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
					unsigned long msecs)
{
	if (unlikely(blk_mq_hctx_stopped(hctx)))
		return;

	// synchronous path
	if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
		int cpu = get_cpu(); // current CPU number
		if (cpumask_test_cpu(cpu, hctx->cpumask)) {
			__blk_mq_run_hw_queue(hctx);
			put_cpu();
			return;
		}

		put_cpu();
	}

	// queue hctx->run_work on the kblockd workqueue; per the article its
	// handler is blk_mq_run_work_fn, executed by a worker thread
	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
				    msecs_to_jiffies(msecs));
}
关于workqueue相关基础知识我们在linux驱动移植-软中断、tasklet、workqueue 介绍过,这里就不重复介绍了,在之前介绍的linux的usb子系统就利用了工作队列的原理实现了usb设备的热插拔。
blk_mq_run_work_fn函数内部调用了__blk_mq_run_hw_queue函数:
/*
 * Work handler: recover the hctx from its embedded run_work and run the
 * hardware queue, unless the hctx has been stopped in the meantime.
 */
static void blk_mq_run_work_fn(struct work_struct *work)
{
	struct blk_mq_hw_ctx *hctx;

	// run_work is a delayed_work embedded in the hctx; step back to the owner
	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);

	/*
	 * If we are stopped, don't run the queue.
	 */
	if (test_bit(BLK_MQ_S_STOPPED, &hctx->state))
		return;

	__blk_mq_run_hw_queue(hctx);
}
5.6.2 __blk_mq_run_hw_queue
/*
 * Dispatch requests for @hctx inside its hctx_lock()/hctx_unlock() section.
 *
 * Prints a warning and a stack dump when called on a CPU outside
 * hctx->cpumask while hctx->next_cpu is online, and warns once when called
 * from interrupt context.
 */
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	int srcu_idx;

	/*
	 * We should be running this queue from one of the CPUs that
	 * are mapped to it.
	 *
	 * There are at least two related races now between setting
	 * hctx->next_cpu from blk_mq_hctx_next_cpu() and running
	 * __blk_mq_run_hw_queue():
	 *
	 * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(),
	 *   but later it becomes online, then this warning is harmless
	 *   at all
	 *
	 * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(),
	 *   but later it becomes offline, then the warning can't be
	 *   triggered, and we depend on blk-mq timeout handler to
	 *   handle dispatched requests to this hctx
	 */
	if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
		cpu_online(hctx->next_cpu)) {
		printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n",
			raw_smp_processor_id(),
			cpumask_empty(hctx->cpumask) ? "inactive": "active");
		dump_stack();
	}

	/*
	 * We can't run the queue inline with ints disabled. Ensure that
	 * we catch bad users of this early.
	 */
	WARN_ON_ONCE(in_interrupt());

	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);

	hctx_lock(hctx, &srcu_idx); // lock
	blk_mq_sched_dispatch_requests(hctx); // the actual dispatch work
	hctx_unlock(hctx, srcu_idx); // unlock
}
5.6.3 blk_mq_sched_dispatch_requests
blk_mq_sched_dispatch_requests函数定义在block/blk-mq-sched.c,用于派发各种队列的request
/*
 * Dispatch pending requests for @hctx.
 *
 * Order of preference:
 *  1. requests already parked on hctx->dispatch; if those dispatch
 *     successfully, continue with the scheduler or the software queues;
 *  2. otherwise the I/O scheduler, when it provides dispatch_request;
 *  3. otherwise, when hctx->dispatch_busy is set, the software queues via
 *     blk_mq_do_dispatch_ctx();
 *  4. otherwise everything on the software queues, flushed into a local
 *     list and dispatched in one batch.
 *
 * Returns early when the hctx is stopped or the queue is quiesced.
 */
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
	LIST_HEAD(rq_list);

	/* RCU or SRCU read lock is needed before checking quiesced flag */
	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
		return;

	hctx->run++;

	/*
	 * If we have previous entries on our dispatch list, grab them first for
	 * more fair dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) { // unlocked peek: looks non-empty
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list); // move hctx->dispatch onto rq_list
		spin_unlock(&hctx->lock);
	}

	/*
	 * Only ask the scheduler for requests, if we didn't have residual
	 * requests from the dispatch list. This is to avoid the case where
	 * we only ever dispatch a fraction of the requests available because
	 * of low device queue depth. Once we pull requests out of the IO
	 * scheduler, we can no longer merge or sort them. So it's best to
	 * leave them there for as long as we can. Mark the hw queue as
	 * needing a restart in that case.
	 *
	 * We want to dispatch from the scheduler if there was nothing
	 * on the dispatch list or we were able to dispatch from the
	 * dispatch list.
	 */
	if (!list_empty(&rq_list)) { // requests taken over from hctx->dispatch
		// per the article this sets BLK_MQ_S_SCHED_RESTART in hctx->state
		blk_mq_sched_mark_restart_hctx(hctx);
		/*
		 * rq_list now holds the requests that came from hctx->dispatch.
		 * Per the article, blk_mq_dispatch_rq_list() walks them, gets a
		 * free tag from the hctx's blk_mq_tags for each one and hands it
		 * to the driver; if the driver is busy, the remaining requests
		 * are moved back to hctx->dispatch and an asynchronous run is
		 * started. (That helper is not shown here - confirm.)
		 */
		if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
			if (has_sched_dispatch) // an I/O scheduler is in use
				blk_mq_do_dispatch_sched(hctx); // dispatch from the scheduler's queue
			else
				blk_mq_do_dispatch_ctx(hctx); // dispatch from the software queues
		}
	} else if (has_sched_dispatch) { // an I/O scheduler is in use
		blk_mq_do_dispatch_sched(hctx); // dispatch from the scheduler's queue
	} else if (hctx->dispatch_busy) { // the hardware queue is marked busy
		/* dequeue request one by one from sw queue if queue is busy */
		blk_mq_do_dispatch_ctx(hctx);
	} else {
		// pull the requests of the software queues onto rq_list, then
		// dispatch them as one batch
		blk_mq_flush_busy_ctxs(hctx, &rq_list);
		blk_mq_dispatch_rq_list(q, &rq_list, false);
	}
}
通过该函数进行各种各样场景的request派发:
- hctx->dispatch硬件队列dispatch链表上的req派发;
- I/O调度器队列上的request派发(blk_mq_do_dispatch_sched函数);
- 无I/O调度算法时,硬件队列关联的所有软件队列ctx->rq_list上的request的派发(blk_mq_do_dispatch_ctx函数)。
派发最终都是调用blk_mq_dispatch_rq_list(),该函数最终还是执行块设备驱动注册的queue_rq函数,将request请求发往块设备驱动层,块设备驱动不繁忙直接启动request传输,繁忙的话则把剩余的request转移到hctx->dispatch队列,然后启动异步传输。
5.6.4 kblockd_mod_delayed_work_on
kblockd_mod_delayed_work_on定义在block/blk-core.c,该函数将延时工作dwork添加到工作队列kblockd_workqueue,工作队列的作用就是把工作推后,交由一个内核线程worker去执行。
/*
 * Queue @dwork on the kblockd workqueue for @cpu, or modify its delay.
 *
 * Thin wrapper around mod_delayed_work_on() that supplies
 * kblockd_workqueue; @delay is in jiffies. Returns that function's result.
 */
int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
				unsigned long delay)
{
	return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
}
kblockd_workqueue是一个全局变量,是在blk_dev_init函数中初始化的,工作队列创建成功后工作就有了栖身之所,以后只要往工作队列里添加工作就可以异步执行了。
/*
 * Controlling structure to kblockd
 *
 * File-scope workqueue used by kblockd_mod_delayed_work_on().
 */
static struct workqueue_struct *kblockd_workqueue;
/*
 * Block layer initialisation: checks at build time that the request
 * operation and flag bits fit into the cmd_flags / bi_opf fields, creates
 * the "kblockd" workqueue and the request_queue slab cache, and (with
 * CONFIG_DEBUG_FS) the "block" debugfs directory.
 *
 * Panics if the workqueue cannot be created. Always returns 0.
 */
int __init blk_dev_init(void)
{
	BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS));
	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
			FIELD_SIZEOF(struct request, cmd_flags));
	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
			FIELD_SIZEOF(struct bio, bi_opf));

	/* used for unplugging and affects IO latency/throughput - HIGHPRI */
	kblockd_workqueue = alloc_workqueue("kblockd",
					    WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
	if (!kblockd_workqueue)
		panic("Failed to create kblockd\n");

	// slab cache for struct request_queue objects
	blk_requestq_cachep = kmem_cache_create("request_queue",
			sizeof(struct request_queue), 0, SLAB_PANIC, NULL);

#ifdef CONFIG_DEBUG_FS
	blk_debugfs_root = debugfs_create_dir("block", NULL);
#endif

	return 0;
}
mod_delayed_work_on定义在kernel/workqueue.c:
/**
 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
 * @dwork: work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,
 * modify @dwork's timer so that it expires after @delay.  If @delay is
 * zero, @work is guaranteed to be scheduled immediately regardless of its
 * current state.
 *
 * Return: %false if @dwork was idle and queued, %true if @dwork was
 * pending and its timer was modified.
 *
 * This function is safe to call from any context including IRQ handler.
 * See try_to_grab_pending() for details.
 */
bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
			 struct delayed_work *dwork, unsigned long delay)
{
	unsigned long flags;
	int ret;

	// retry for as long as try_to_grab_pending() reports -EAGAIN
	do {
		ret = try_to_grab_pending(&dwork->work, true, &flags);
	} while (unlikely(ret == -EAGAIN));

	// queue the work, then restore the IRQ state saved in flags
	if (likely(ret >= 0)) {
		__queue_delayed_work(cpu, wq, dwork, delay);
		local_irq_restore(flags);
	}

	/* -ENOENT from try_to_grab_pending() becomes %true */
	return ret;
}
5.7 blk_flush_plug_list
blk_flush_plug_list函数用于派发plug->mq_list链表上的request,就是当前进程在plug->mq_list上集聚了很多request,然后一次性全部派发给块设备驱动,它的发起函数是blk_flush_plug_list。
基本原理是:先取出plug->mq_list链表上的request;
- 如果设置了I/O调度器,则把request插入到I/O算法队列;
- 否则,则先执行blk_mq_try_issue_list_directly()将这些request直接派发给块设备驱动。如果块设备驱动繁忙,就先把request添加到软件队列ctx->rq_list链表,等稍后执行blk_mq_run_hw_queue函数,再次尝试同步或者异步派发这些request。
blk_flush_plug_list函数定义在block/blk-core.c:
/*
 * Flush a plug: run the plug callbacks first, then hand any requests
 * collected on plug->mq_list to blk_mq_flush_plug_list().
 *
 * @from_schedule: forwarded unchanged to both callees
 */
void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
	flush_plug_callbacks(plug, from_schedule);

	if (!list_empty(&plug->mq_list))
		blk_mq_flush_plug_list(plug, from_schedule);
}
5.7.1 blk_mq_flush_plug_list
blk_mq_flush_plug_list定义在block/blk-mq.c文件中:
/*
 * Dispatch the requests collected on plug->mq_list.
 *
 * The list is taken over locally and sorted with plug_rq_cmp when it holds
 * more than two requests for multiple queues. It is then split into runs of
 * consecutive requests that share the same (hctx, ctx) pair, and each run
 * is handed to blk_mq_sched_insert_requests().
 */
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
	struct blk_mq_hw_ctx *this_hctx;
	struct blk_mq_ctx *this_ctx;
	struct request_queue *this_q;
	struct request *rq;
	LIST_HEAD(list);
	LIST_HEAD(rq_list);
	unsigned int depth;

	// take over the plugged requests; plug->mq_list is left empty
	list_splice_init(&plug->mq_list, &list);

	if (plug->rq_count > 2 && plug->multiple_queues)
		list_sort(NULL, &list, plug_rq_cmp);

	plug->rq_count = 0;

	this_q = NULL;
	this_hctx = NULL;
	this_ctx = NULL;
	depth = 0;

	while (!list_empty(&list)) {
		rq = list_entry_rq(list.next);
		list_del_init(&rq->queuelist);
		BUG_ON(!rq->q);
		// the (hctx, ctx) pair changed: submit the run collected so far
		if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) {
			if (this_hctx) {
				trace_block_unplug(this_q, depth, !from_schedule);
				blk_mq_sched_insert_requests(this_hctx, this_ctx,
								&rq_list,
								from_schedule);
			}

			this_q = rq->q;
			this_ctx = rq->mq_ctx;
			this_hctx = rq->mq_hctx;
			depth = 0;
		}

		depth++;
		list_add_tail(&rq->queuelist, &rq_list);
	}

	/*
	 * If 'this_hctx' is set, we know we have entries to complete
	 * on 'rq_list'. Do those.
	 */
	if (this_hctx) {
		trace_block_unplug(this_q, depth, !from_schedule);
		blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
						from_schedule);
	}
}
5.7.2 blk_mq_sched_insert_requests
blk_mq_sched_insert_requests函数定义在block/blk-mq-sched.c:
/*
 * Insert a list of requests that all belong to @hctx / @ctx.
 *
 * With a scheduler that provides insert_requests, the list is handed to it.
 * Otherwise, if there is no scheduler, the hctx is not marked busy and
 * @run_queue_async is false, the requests are first issued directly to the
 * driver; whatever remains on @list is inserted with
 * blk_mq_insert_requests(). Unless direct issue emptied the list, the
 * hardware queue is run afterwards.
 *
 * A q_usage_counter reference is held for the duration of the call.
 */
void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
				  struct blk_mq_ctx *ctx,
				  struct list_head *list, bool run_queue_async)
{
	struct elevator_queue *e;
	struct request_queue *q = hctx->queue;

	/*
	 * blk_mq_sched_insert_requests() is called from flush plug
	 * context only, and hold one usage counter to prevent queue
	 * from being released.
	 */
	percpu_ref_get(&q->q_usage_counter);

	e = hctx->queue->elevator;
	if (e && e->type->ops.insert_requests)
		e->type->ops.insert_requests(hctx, list, false);
	else {
		/*
		 * try to issue requests directly if the hw queue isn't
		 * busy in case of 'none' scheduler, and this way may save
		 * us one extra enqueue & dequeue to sw queue.
		 */
		if (!hctx->dispatch_busy && !e && !run_queue_async) {
			blk_mq_try_issue_list_directly(hctx, list);
			if (list_empty(list)) // everything was issued, nothing left to run
				goto out;
		}
		blk_mq_insert_requests(hctx, ctx, list);
	}

	blk_mq_run_hw_queue(hctx, run_queue_async);
 out:
	percpu_ref_put(&q->q_usage_counter);
}
参考文章
[4]深度剖析Linux块设备IO子系统(一)_驱动模型(秒懂)
[7]Multi-Queue Block IO Queueing Mechanism (blk-mq)
[9]Block multi-queue 架构解析(二)流程与机制
[10]Linux内核中块层上的多队列
[11]Multi-queue 架构分析
[12]Linux Block IO: Introducing Multi-queue SSD Access on Multi-core Systems
[13]Linux 块设备之Block Layer层架构演变(推荐)
[14]IO子系统全流程介绍
[15]block多队列分析 - 2. block多队列的初始化
[16]linux内核block层Multi queue多队列核心点分析(更多细节推荐)