Linux 3.10.0 Block I/O Subsystem Flow (2) -- Building, Sorting, and Merging Requests

Linux block devices fall into three categories, aimed respectively at sequentially accessed physical devices, randomly accessed physical devices, and logical devices (i.e. "stacked" devices):

Type                          | make_request_fn                                       | request_fn                     | Notes
SCSI devices and the like     | builds requests from bios (merging and sorting them)  | processes requests one by one  | call blk_init_queue, which installs the default make_request_fn, blk_queue_bio (the successor of __make_request), and supply the strategy routine as request_fn
SSDs and the like             | handles the bio directly                              | none                           | call blk_alloc_queue and supply your own make_request_fn
RAID or Device Mapper devices | redirects the bio to the underlying devices           | none                           | call blk_alloc_queue and supply your own make_request_fn
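To make the table concrete, here is a minimal sketch of the two registration styles on 3.10. The my_* names are hypothetical; a request-based driver supplies a request_fn via blk_init_queue, while a bio-based driver supplies its own make_request_fn via blk_alloc_queue and blk_queue_make_request.

#include <linux/blkdev.h>
#include <linux/bio.h>

static void my_request_fn(struct request_queue *q)
{
    /* strategy routine: pull requests off the queue and service them */
}

static void my_make_request(struct request_queue *q, struct bio *bio)
{
    /* handle (or redirect) the bio directly, then complete it */
    bio_endio(bio, 0);
}

static DEFINE_SPINLOCK(my_lock);

static int my_setup(void)
{
    /* request-based (SCSI-like): merging/sorting done by the block layer */
    struct request_queue *rq_q = blk_init_queue(my_request_fn, &my_lock);

    /* bio-based (SSD/RAID/DM-like): the block layer hands us raw bios */
    struct request_queue *bio_q = blk_alloc_queue(GFP_KERNEL);
    if (bio_q)
        blk_queue_make_request(bio_q, my_make_request);

    return (rq_q && bio_q) ? 0 : -ENOMEM;
}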
 
blk_init_queue is implemented as follows:
struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
{
    return blk_init_queue_node(rfn, lock, NUMA_NO_NODE);
}
EXPORT_SYMBOL(blk_init_queue);

struct request_queue *
blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
{
    struct request_queue *uninit_q, *q;

    uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id);
    if (!uninit_q)
        return NULL;

    q = blk_init_allocated_queue(uninit_q, rfn, lock);
    if (!q)
        blk_cleanup_queue(uninit_q);

    return q;
}
EXPORT_SYMBOL(blk_init_queue_node);

struct request_queue *
blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
             spinlock_t *lock)
{
    if (!q)
        return NULL;

    if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
        return NULL;

    q->request_fn       = rfn;
    q->prep_rq_fn       = NULL;
    q->unprep_rq_fn     = NULL;
    q->queue_flags      |= QUEUE_FLAG_DEFAULT;

    /* Override internal queue lock with supplied lock pointer */
    if (lock)
        q->queue_lock   = lock;

    /*
     * This also sets hw/phys segments, boundary and size
     */
    blk_queue_make_request(q, blk_queue_bio);  // blk_init_queue binds blk_queue_bio as the default make_request_fn

    q->sg_reserved_size = INT_MAX;

    /* init elevator */
    if (elevator_init(q, NULL))    // initialize the I/O scheduler
        return NULL;

    return q;
}
EXPORT_SYMBOL(blk_init_allocated_queue);

 

Now let's trace the blk_queue_bio function:

  1 void blk_queue_bio(struct request_queue *q, struct bio *bio)
  2 {
  3     const bool sync = !!(bio->bi_rw & REQ_SYNC);
  4     struct blk_plug *plug;
  5     int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
  6     struct request *req;
  7     unsigned int request_count = 0;
  8     /*
  9      * low level driver can indicate that it wants pages above a
 10      * certain limit bounced to low memory (ie for highmem, or even
 11      * ISA dma in theory)
 12      */
 13     blk_queue_bounce(q, &bio);    // create a bounce buffer if needed
 14     if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
 15         bio_endio(bio, -EIO);
 16         return;
 17     }
 18     if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
 19         spin_lock_irq(q->queue_lock);
 20         where = ELEVATOR_INSERT_FLUSH;
 21         goto get_rq;
 22     }
 23     /*
 24      * Check if we can merge with the plugged list before grabbing any locks
 25      * (try to merge the bio into the current task's plug list first)
 26      */
 27     if (attempt_plug_merge(q, bio, &request_count))
 28         return;
 29     spin_lock_irq(q->queue_lock);
 30     el_ret = elv_merge(q, &req, bio);    // check whether the bio can be merged into an existing request
 31     // if a merge is possible, it is either a back merge or a front merge
 32     if (el_ret == ELEVATOR_BACK_MERGE) {
 33         if (bio_attempt_back_merge(q, req, bio)) {    // merge the bio if the hardware limits allow it
 34             elv_bio_merged(q, req, bio);    // notify the elevator of the merge
 35             if (!attempt_back_merge(q, req))    // after merging, two adjacent requests may now be mergeable
 36                 elv_merged_request(q, req, el_ret);
 37             goto out_unlock;
 38         }
 39     } else if (el_ret == ELEVATOR_FRONT_MERGE) {
 40         if (bio_attempt_front_merge(q, req, bio)) {
 41             elv_bio_merged(q, req, bio);
 42             if (!attempt_front_merge(q, req))
 43                 elv_merged_request(q, req, el_ret);
 44             goto out_unlock;
 45         }
 46     }
 47 // if no merge is possible, build a request from the bio
 48 get_rq:
 49     /*
 50      * This sync check and mask will be re-done in init_request_from_bio(),
 51      * but we need to set it earlier to expose the sync flag to the
 52      * rq allocator and io schedulers.
 53      */
 54     rw_flags = bio_data_dir(bio);
 55     if (sync)
 56         rw_flags |= REQ_SYNC;
 57     /*
 58      * Grab a free request. This is might sleep but can not fail.
 59      * Returns with the queue unlocked.
 60      */
 61     req = get_request(q, rw_flags, bio, GFP_NOIO);    // allocate a free request
 62     if (unlikely(!req)) {
 63         bio_endio(bio, -ENODEV);    /* @q is dead */
 64         goto out_unlock;
 65     }
 66     /*
 67      * After dropping the lock and possibly sleeping here, our request
 68      * may now be mergeable after it had proven unmergeable (above).
 69      * We don't worry about that case for efficiency. It won't happen
 70      * often, and the elevators are able to handle it.
 71      */
 72     init_request_from_bio(req, bio);    // initialize the request fields from the bio
 73     if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
 74         req->cpu = raw_smp_processor_id();
 75     plug = current->plug;
 76     // next comes the plugging/unplugging policy
 77     if (plug) {
 78         /*
 79          * If this is the first request added after a plug, fire
 80          * of a plug trace. If others have been added before, check
 81          * if we have multiple devices in this plug. If so, make a
 82          * note to sort the list before dispatch.
 83          */
 84         if (list_empty(&plug->list))
 85             trace_block_plug(q);
 86         else {
 87             if (request_count >= BLK_MAX_REQUEST_COUNT) {
 88                 blk_flush_plug_list(plug, false);
 89                 trace_block_plug(q);
 90             }
 91         }
 92         list_add_tail(&req->queuelist, &plug->list);
 93         drive_stat_acct(req, 1);
 94     } else {
 95         spin_lock_irq(q->queue_lock);
 96         add_acct_request(q, req, where);  // add the request to the I/O scheduler queue or dispatch queue; 'where' is mainly used for flush/barrier requests
 97         __blk_run_queue(q);
 98 out_unlock:
 99         spin_unlock_irq(q->queue_lock);
100     }
101 }
102 EXPORT_SYMBOL_GPL(blk_queue_bio);    /* for device mapper only */

 

At line 13, blk_queue_bounce creates a bounce buffer when needed. This usually happens when the driver attempts DMA to an address the peripheral cannot reach, for example a high-memory page. Once a bounce buffer has been created, data must be copied between the original buffer and the bounce buffer in the direction matching the read or write. Using bounce buffers unquestionably hurts performance, but there is no alternative.

The "bounce" actually consists of allocating a new bio descriptor whose segments correspond one-to-one with the segments of the original bio. If a segment of the original bio uses a page outside the DMA-reachable range, a page inside the DMA range is allocated and assigned to the corresponding segment of the new bio; for a write, the contents of the old page are copied into the new one. If the segment's page is already within the DMA range, the corresponding segment of the new bio simply points to the same page.

Finally, the original bio is saved in the new bio's bi_private field, and the new bio's completion callback is set.
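For reference, a driver declares how far its device can reach with blk_queue_bounce_limit(); that limit is what blk_queue_bounce() consults when deciding whether a page must be bounced. A minimal sketch (the function name my_set_dma_limits is hypothetical):

#include <linux/blkdev.h>

static void my_set_dma_limits(struct request_queue *q)
{
    /* device can only DMA below the highmem boundary: bounce highmem pages */
    blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);

    /* a controller that can address all of memory would instead use:
     * blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); */
}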

 

The bio is then handed to the I/O scheduler, which is responsible for merging and sorting requests. Merging combines requests for contiguous disk locations into a single request that can be completed with one SCSI command. Sorting rearranges the order in which requests access positions on the disk so that the head moves in one direction as much as possible. Merging and sorting are performed on the SCSI device's request queue descriptor.
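The geometric test behind a merge decision is simple: a bio can be back-merged when it starts exactly where a request ends, and front-merged when it ends exactly where a request starts. Roughly the logic of blk_try_merge() in 3.10 (paraphrased and simplified):

int blk_try_merge(struct request *rq, struct bio *bio)
{
    if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_sector)
        return ELEVATOR_BACK_MERGE;      /* bio starts right after rq ends */
    else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_sector)
        return ELEVATOR_FRONT_MERGE;     /* bio ends right where rq starts */
    return ELEVATOR_NO_MERGE;
}

elv_merge(), shown next, applies this test through blk_try_merge() on the cached last_merge request, then falls back to the elevator hash and the scheduler-specific callback.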

 

int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
{
    struct elevator_queue *e = q->elevator;
    struct request *__rq;
    int ret;

    /*
     * Levels of merges:
     *     nomerges:  No merges at all attempted
     *     noxmerges: Only simple one-hit cache try
     *     merges:    All merge tries attempted
     */
    if (blk_queue_nomerges(q))    // if QUEUE_FLAG_NOMERGES is set, return immediately without merging
        return ELEVATOR_NO_MERGE;

    /*
     * First try one-hit cache.
     */
    // if the queue has a request cached in last_merge, call blk_try_merge to see
    // whether the bio can be merged into it; if so, return that request through *req
    if (q->last_merge && elv_rq_merge_ok(q->last_merge, bio)) {
        ret = blk_try_merge(q->last_merge, bio);
        if (ret != ELEVATOR_NO_MERGE) {
            *req = q->last_merge;
            return ret;
        }
    }

    // if QUEUE_FLAG_NOXMERGES is set, no "extended" merge attempts should be made
    if (blk_queue_noxmerges(q))
        return ELEVATOR_NO_MERGE;

    /*
     * See if our hash lookup can find a potential backmerge.
     * The code below performs the "extended" merge attempts, which have two parts:
     * the first applies to every I/O scheduling algorithm, while the second is
     * specific to the scheduler in use.
     */
    __rq = elv_rqhash_find(q, bio->bi_sector);
    if (__rq && elv_rq_merge_ok(__rq, bio)) {
        *req = __rq;
        return ELEVATOR_BACK_MERGE;
    }

    /*
     * The scheduler-specific merge algorithm is implemented through the
     * elevator_merge_fn callback in the elevator ops table.
     */
    if (e->type->ops.elevator_merge_fn)
        return e->type->ops.elevator_merge_fn(q, req, bio);

    return ELEVATOR_NO_MERGE;
}

 

If the bio cannot be merged into an existing request, a new request descriptor must be allocated, initialized from the bio, and added to the I/O scheduler queue (or to the current task's plug list).
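For reference, init_request_from_bio() essentially copies the bio's starting sector, direction and priority into the new request and attaches the bio to it. A simplified paraphrase of the 3.10 version (field details may differ slightly; see block/blk-core.c):

void init_request_from_bio(struct request *req, struct bio *bio)
{
    req->cmd_type = REQ_TYPE_FS;                    /* a normal filesystem request */

    req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK; /* inherit WRITE/SYNC/META... flags */
    if (bio->bi_rw & REQ_RAHEAD)
        req->cmd_flags |= REQ_FAILFAST_MASK;        /* readahead may fail fast */

    req->errors = 0;
    req->__sector = bio->bi_sector;                 /* request starts where the bio starts */
    req->ioprio = bio_prio(bio);
    blk_rq_bio_prep(req->q, req, bio);              /* attach the bio and set the data length */
}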

 

Finally, the Linux block layer uses plugging/unplugging to improve throughput: requests are accumulated (plugged) so that they can be merged and sorted, then released (unplugged) together. The unplug path eventually ends up in __blk_run_queue(q), shown below.
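On the submission side, the per-task plug is set up by the caller around a batch of bios. blk_start_plug() and blk_finish_plug() are the 3.10 interfaces; the surrounding loop below is a hypothetical illustration:

#include <linux/blkdev.h>
#include <linux/bio.h>

static void submit_batch(struct bio **bios, int n)
{
    struct blk_plug plug;
    int i;

    blk_start_plug(&plug);           /* subsequent requests accumulate on current->plug */
    for (i = 0; i < n; i++)
        submit_bio(bios[i]->bi_rw, bios[i]);
    blk_finish_plug(&plug);          /* unplug: sort the plug list and dispatch it */
}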

 

/**
 * __blk_run_queue - run a single device queue
 * @q:    The queue to run
 *
 * Description:
 *    See @blk_run_queue. This variant must be called with the queue lock
 *    held and interrupts disabled.
 */
void __blk_run_queue(struct request_queue *q)
{
    if (unlikely(blk_queue_stopped(q)))
        return;

    __blk_run_queue_uncond(q);
}



/**
 * __blk_run_queue_uncond - run a queue whether or not it has been stopped
 * @q:    The queue to run
 *
 * Description:
 *    Invoke request handling on a queue if there are any pending requests.
 *    May be used to restart request handling after a request has completed.
 *    This variant runs the queue whether or not the queue has been
 *    stopped. Must be called with the queue lock held and interrupts
 *    disabled. See also @blk_run_queue.
 */
inline void __blk_run_queue_uncond(struct request_queue *q)
{
    if (unlikely(blk_queue_dead(q)))
        return;

    /*
     * Some request_fn implementations, e.g. scsi_request_fn(), unlock
     * the queue lock internally. As a result multiple threads may be
     * running such a request function concurrently. Keep track of the
     * number of active request_fn invocations such that blk_drain_queue()
     * can wait until all these request_fn calls have finished.
     */
    q->request_fn_active++;
    q->request_fn(q);    // for a SCSI device this callback is scsi_request_fn, i.e. the SCSI strategy routine
    q->request_fn_active--;
}

 

For a SCSI device, when its request queue is allocated, the queue's request_fn callback is set to scsi_request_fn, commonly known as the SCSI strategy routine.
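For context, this wiring happens in scsi_alloc_queue()/__scsi_alloc_queue() in drivers/scsi/scsi_lib.c; a simplified paraphrase (error handling and queue-limit setup omitted, helper name scsi_alloc_queue_sketch is illustrative):

static struct request_queue *scsi_alloc_queue_sketch(struct scsi_device *sdev)
{
    struct request_queue *q;

    /* installs blk_queue_bio as make_request_fn and scsi_request_fn
     * as the strategy routine (request_fn) */
    q = blk_init_queue(scsi_request_fn, NULL);
    if (!q)
        return NULL;

    blk_queue_prep_rq(q, scsi_prep_fn);            /* build a SCSI command for each request */
    blk_queue_softirq_done(q, scsi_softirq_done);  /* completion handling in softirq context */
    return q;
}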

 
