Linux3.10.0块IO子系统流程(2)-- 构造、排序、合并请求
Linux块设备可以分为三类,分别针对顺序访问物理设备、随机访问物理设备和逻辑设备(即“栈式设备”):
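| 类型 | make_request_fn | request_fn | 备注 |
| --- | --- | --- | --- |
| SCSI 设备等 | 从bio构造request(经过合并和排序),返回0 | 逐个处理request | 调用blk_init_queue,使用默认的make_request_fn(3.10中为blk_queue_bio,早期内核中为__make_request),提供策略例程 |
| SSD等 | 直接处理bio,返回0 | 无 | 调用blk_alloc_queue,提供make_request_fn |
| RAID或Device Mapper设备 | 重定向bio,返回非零值 | 无 | 调用blk_alloc_queue,提供make_request_fn |

注:表中“返回0/非零值”沿用的是早期内核的约定;在3.10中make_request_fn的返回类型为void(见下文blk_queue_bio的定义)。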
blk_init_queue及其调用的初始化函数的实现如下:
1 struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) 2 { 3 return blk_init_queue_node(rfn, lock, NUMA_NO_NODE); 4 } 5 EXPORT_SYMBOL(blk_init_queue); 6 7 struct request_queue * 8 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) 9 { 10 struct request_queue *uninit_q, *q; 11 uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id); 12 if (!uninit_q) 13 return NULL; 14 q = blk_init_allocated_queue(uninit_q, rfn, lock); 15 if (!q) 16 blk_cleanup_queue(uninit_q); 17 return q; 18 } 19 EXPORT_SYMBOL(blk_init_queue_node); 20 21 struct request_queue * 22 blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, 23 spinlock_t *lock) 24 { 25 if (!q) 26 return NULL; 27 if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) 28 return NULL; 29 q->request_fn = rfn; 30 q->prep_rq_fn = NULL; 31 q->unprep_rq_fn = NULL; 32 q->queue_flags |= QUEUE_FLAG_DEFAULT; 33 /* Override internal queue lock with supplied lock pointer */ 34 if (lock) 35 q->queue_lock = lock; 36 /* 37 * This also sets hw/phys segments, boundary and size 38 */ 39 blk_queue_make_request(q, blk_queue_bio); //使用blk_init_queue会默认绑定blk_queue_bio来处理IO 40 q->sg_reserved_size = INT_MAX; 41 /* init elevator */ 42 if (elevator_init(q, NULL)) // 初始化IO调度 43 return NULL; 44 return q; 45 } 46 EXPORT_SYMBOL(blk_init_allocated_queue);
下面来跟踪blk_queue_bio函数:
1 void blk_queue_bio(struct request_queue *q, struct bio *bio) 2 { 3 const bool sync = !!(bio->bi_rw & REQ_SYNC); 4 struct blk_plug *plug; 5 int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT; 6 struct request *req; 7 unsigned int request_count = 0; 8 /* 9 * low level driver can indicate that it wants pages above a 10 * certain limit bounced to low memory (ie for highmem, or even 11 * ISA dma in theory) 12 */ 13 blk_queue_bounce(q, &bio); // 如果需要,创建反弹缓冲区 14 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { 15 bio_endio(bio, -EIO); 16 return; 17 } 18 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { 19 spin_lock_irq(q->queue_lock); 20 where = ELEVATOR_INSERT_FLUSH; 21 goto get_rq; 22 } 23 /* 24 * Check if we can merge with the plugged list before grabbing any locks 25 * 首先尝试请求合并 26 */ 27 if (attempt_plug_merge(q, bio, &request_count)) 28 return; 29 spin_lock_irq(q->queue_lock); 30 el_ret = elv_merge(q, &req, bio); // 判断是否bio是否可以合并 31 // 如果可以合并的话,分为向前和向后合并 32 if (el_ret == ELEVATOR_BACK_MERGE) { 33 if (bio_attempt_back_merge(q, req, bio)) { 34 elv_bio_merged(q, req, bio); // 请求如果在硬件上允许,则进行合并 35 if (!attempt_back_merge(q, req)) // 合并之后可能两个request可以合并 36 elv_merged_request(q, req, el_ret); 37 goto out_unlock; 38 } 39 } else if (el_ret == ELEVATOR_FRONT_MERGE) { 40 if (bio_attempt_front_merge(q, req, bio)) { 41 elv_bio_merged(q, req, bio); 42 if (!attempt_front_merge(q, req)) 43 elv_merged_request(q, req, el_ret); 44 goto out_unlock; 45 } 46 } 47 // 不能合并就根据bio构造request 48 get_rq: 49 /* 50 * This sync check and mask will be re-done in init_request_from_bio(), 51 * but we need to set it earlier to expose the sync flag to the 52 * rq allocator and io schedulers. 53 */ 54 rw_flags = bio_data_dir(bio); 55 if (sync) 56 rw_flags |= REQ_SYNC; 57 /* 58 * Grab a free request. This is might sleep but can not fail. 59 * Returns with the queue unlocked. 
60 */ 61 req = get_request(q, rw_flags, bio, GFP_NOIO); // 获取一个request 62 if (unlikely(!req)) { 63 bio_endio(bio, -ENODEV); /* @q is dead */ 64 goto out_unlock; 65 } 66 /* 67 * After dropping the lock and possibly sleeping here, our request 68 * may now be mergeable after it had proven unmergeable (above). 69 * We don't worry about that case for efficiency. It won't happen 70 * often, and the elevators are able to handle it. 71 */ 72 init_request_from_bio(req, bio); // 根据bio构造一个request,并添加到IO调度器队列 73 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) 74 req->cpu = raw_smp_processor_id(); 75 plug = current->plug; 76 // 接下来是蓄流/泄流策略 77 if (plug) { 78 /* 79 * If this is the first request added after a plug, fire 80 * of a plug trace. If others have been added before, check 81 * if we have multiple devices in this plug. If so, make a 82 * note to sort the list before dispatch. 83 */ 84 if (list_empty(&plug->list)) 85 trace_block_plug(q); 86 else { 87 if (request_count >= BLK_MAX_REQUEST_COUNT) { 88 blk_flush_plug_list(plug, false); 89 trace_block_plug(q); 90 } 91 } 92 list_add_tail(&req->queuelist, &plug->list); 93 drive_stat_acct(req, 1); 94 } else { 95 spin_lock_irq(q->queue_lock); 96 add_acct_request(q, req, where); // 将请求添加到IO调度队列或请求队列,主要被用来处理屏障请求 97 __blk_run_queue(q); 98 out_unlock: 99 spin_unlock_irq(q->queue_lock); 100 } 101 } 102 EXPORT_SYMBOL_GPL(blk_queue_bio); /* for device mapper only */
第13行,blk_queue_bounce在需要时创建一个反弹缓冲区。通常只有当驱动尝试在外围设备不可达到的地址(例如高端内存)上执行DMA时才需要这样做。创建反弹缓冲区后,数据要在原缓冲区和反弹缓冲区之间按读写方向进行相应的复制。毫无疑问,使用反弹缓冲区会降低性能,但也没有其他办法。
所谓反弹,实际上是分配一个新的bio描述符,它的segment和原始bio的segment一一对应。如果原始bio的某个segment使用的页面在DMA内存范围外,则分配一个在DMA范围内的页面,赋给新bio对应的segment;对于写操作,还需要将旧bio页面的内容复制到新bio的页面中。如果原始bio的segment使用的页面在DMA范围内,则新bio对应的segment直接指向同一页面。
最后将原始bio保存在新的bio的bi_private域中,并设置新bio的完成回调函数。
接下来交给IO调度器,由它负责合并和排序请求。合并是指将对磁盘上连续位置的请求合并为一个,通过一次SCSI命令完成。排序是指将多个请求按其在磁盘上的访问位置重新排列,使得磁头尽可能向一个方向移动。请求的合并和排序是在SCSI设备的请求队列描述符上进行的。
1 int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) 2 { 3 struct elevator_queue *e = q->elevator; 4 struct request *__rq; 5 int ret; 6 7 /* 8 * Levels of merges: 9 * nomerges: No merges at all attempted 10 * noxmerges: Only simple one-hit cache try 11 * merges: All merge tries attempted 12 */ 13 if (blk_queue_nomerges(q)) // 如果设置了QUEUE_FLAG_NOMERGES的标志位,就直接返回不合并 14 return ELEVATOR_NO_MERGE; 15 16 /* 17 * First try one-hit cache. 18 */ 19 // 如果请求队列的last_merge有缓存下来的request,调用blk_try_merge来进行尝试和它进行合并,如果可以合并,通过参数输出这个req 20 if (q->last_merge && elv_rq_merge_ok(q->last_merge, bio)) { 21 ret = blk_try_merge(q->last_merge, bio); 22 if (ret != ELEVATOR_NO_MERGE) { 23 *req = q->last_merge; 24 return ret; 25 } 26 } 27 28 // 如果设置了QUEUE_FLAG_NOXMERGES的标志位,表明不要进行“扩展”的合并尝试 29 if (blk_queue_noxmerges(q)) 30 return ELEVATOR_NO_MERGE; 31 32 /* 33 * See if our hash lookup can find a potential backmerge. 34 * 后面的代码就是所谓的“扩展”合并尝试,它包含两方面的内容: 35 * 第一部分是各种IO调度算法全都适用的,而第二部分则是各种IO调度算法特定的 36 */ 37 __rq = elv_rqhash_find(q, bio->bi_sector); 38 if (__rq && elv_rq_merge_ok(__rq, bio)) { 39 *req = __rq; 40 return ELEVATOR_BACK_MERGE; 41 } 42 43 /* 44 * IO调度特定的合并算法是通过电梯队列操作表的elevator_merge_fn回调实现的 45 */ 46 if (e->type->ops.elevator_merge_fn) 47 return e->type->ops.elevator_merge_fn(q, req, bio); 48 49 return ELEVATOR_NO_MERGE; 50 }
如果我们的请求不能合并到现有的request中,那么就要新申请request描述符了:根据bio对它初始化,然后在当前进程存在plug时把它加入plug链表,否则直接添加到IO调度器队列并运行队列。
最后,Linux块设备层采用蓄流/泄流技术来改进吞吐量:蓄流是为了让请求有机会合并和排序,然后一起泄流。泄流函数为__blk_run_queue(q):
/** * __blk_run_queue - run a single device queue * @q: The queue to run * * Description: * See @blk_run_queue. This variant must be called with the queue lock * held and interrupts disabled. */ void __blk_run_queue(struct request_queue *q) { if (unlikely(blk_queue_stopped(q))) return; __blk_run_queue_uncond(q); } /** * __blk_run_queue_uncond - run a queue whether or not it has been stopped * @q: The queue to run * * Description: * Invoke request handling on a queue if there are any pending requests. * May be used to restart request handling after a request has completed. * This variant runs the queue whether or not the queue has been * stopped. Must be called with the queue lock held and interrupts * disabled. See also @blk_run_queue. */ inline void __blk_run_queue_uncond(struct request_queue *q) { if (unlikely(blk_queue_dead(q))) return; /* * Some request_fn implementations, e.g. scsi_request_fn(), unlock * the queue lock internally. As a result multiple threads may be * running such a request function concurrently. Keep track of the * number of active request_fn invocations such that blk_drain_queue() * can wait until all these request_fn calls have finished. */ q->request_fn_active++; q->request_fn(q); // 回调函数实例化为scsi_request_fn,也就是通常所说的SCSI策略例程 q->request_fn_active--; }
对于SCSI设备,在为它分配请求队列时,将请求队列的request_fn回调函数实例化为scsi_request_fn,也就是通常所说的SCSI策略例程。
-------------------------------------------------- 少年应是春风和煦,肩头挑着草长莺飞 --------------------------------------------------