Linux 3.10.0 块 I/O 子系统流程(1)—— 上层提交请求
Linux通用块层提供给上层的接口函数是submit_bio。上层在构造好bio之后,调用submit_bio提交给通用块层处理。
submit_bio函数如下:
/*
 * submit_bio - hand a fully-built bio to the generic block layer.
 * @rw:  read/write direction and REQ_* flags, OR-ed into bio->bi_rw
 * @bio: the bio describing the I/O to perform
 *
 * Performs per-task/VM accounting for data-carrying bios, optionally
 * logs the request when block_dump is enabled, then forwards the bio
 * to generic_make_request() for the real processing.
 */
void submit_bio(int rw, struct bio *bio)
{
	bio->bi_rw |= rw;	/* record the read/write direction on the bio */

	/*
	 * Accounting only applies to bios that actually carry data
	 * (reads, writes, WRITE SAME); payload-less barrier/flush
	 * requests are skipped.
	 */
	if (bio_has_data(bio)) {
		unsigned int count;

		if (unlikely(rw & REQ_WRITE_SAME))
			/* WRITE SAME carries a single logical block of payload */
			count = bdev_logical_block_size(bio->bi_bdev) >> 9;
		else
			count = bio_sectors(bio);

		if (rw & WRITE) {
			count_vm_events(PGPGOUT, count);
		} else {
			task_io_account_read(bio->bi_size);
			count_vm_events(PGPGIN, count);
		}

		/* debug aid: dump every submitted bio when block_dump is set */
		if (unlikely(block_dump)) {
			char b[BDEVNAME_SIZE];
			printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
				current->comm, task_pid_nr(current),
				(rw & WRITE) ? "WRITE" : "READ",
				(unsigned long long)bio->bi_sector,
				bdevname(bio->bi_bdev, b),
				count);
		}
	}

	/* perform the real I/O processing */
	generic_make_request(bio);
}
/*
 * generic_make_request - deliver a bio to its device's request queue.
 *
 * Only the outermost invocation per task is "active": if the current
 * task is already inside generic_make_request() (a stacked driver's
 * make_request_fn submitting new bios), the bio is merely appended to
 * current->bio_list and processed iteratively by the active invocation,
 * which bounds recursion depth on stacked block devices.
 */
void generic_make_request(struct bio *bio)
{
	struct bio_list bio_list_on_stack;

	if (!generic_make_request_checks(bio))
		return;

	/*
	 * A non-NULL current->bio_list means this task is already
	 * active inside generic_make_request(): queue the bio for the
	 * outer invocation instead of recursing.
	 */
	if (current->bio_list) {
		bio_list_add(current->bio_list, bio);
		return;
	}

	BUG_ON(bio->bi_next);	/* caller must pass an unlinked bio */
	bio_list_init(&bio_list_on_stack);
	current->bio_list = &bio_list_on_stack;	/* mark ourselves "active" */
	do {
		/* look up the request queue backing this bio's device */
		struct request_queue *q = bdev_get_queue(bio->bi_bdev);

		/* invoke the queue's callback to process the I/O */
		q->make_request_fn(q, bio);

		/* drain any bios the callback queued on our list */
		bio = bio_list_pop(current->bio_list);
	} while (bio);
	current->bio_list = NULL; /* deactivate */
}
在调用make_request_fn处理bio的时候,可能会产生新的bio,即make_request_fn会递归调用generic_make_request,最直观的例子就是“栈式块设备”。为了防止栈式块设备递归过深而耗尽内核栈,在同一时刻只允许每个进程有一个处于“活动”状态的generic_make_request调用。为此,在进程结构中定义了一个bio等待处理链表:bio_list,并以此区分“活动”和“非活动”状态。活动状态表示进程已经在调用generic_make_request;此时,递归产生的后续bio都只链入bio_list链表,待当前bio处理完成后,再由最外层的调用逐个取出处理。
generic_make_request的执行过程:
- generic_make_request_checks
- 判断make_request是否处于活动状态。如果current->bio_list不为NULL,则表明当前进程已经有generic_make_request在执行,这时候传进来的bio都将链接到当前进程等待处理的bio链表尾部
- 设置current->bio_list表明当前的generic_make_request为活动状态,让后来的bio有机会插入等待链表
- 处理bio。这里的bio可能是传入的bio,也可能是当前进程待处理bio链表中的bio。如果是前者,上层保证了其bi_next必然为NULL;如果是后者,则在将bio从链表中脱离的时候,已经设置了bi_next为NULL
- 调用make_request_fn回调处理bio
- 检查当前进程的等待链表中是否还有bio,如果有,跳到第三步
- 至此,generic_make_request的“本轮执行周期”已经完毕,清零current->bio_list,使得generic_make_request处于“非活动”状态
这里再看下generic_make_request_checks
1 static noinline_for_stack bool 2 generic_make_request_checks(struct bio *bio) 3 { 4 struct request_queue *q; 5 int nr_sectors = bio_sectors(bio); 6 int err = -EIO; 7 char b[BDEVNAME_SIZE]; 8 struct hd_struct *part; 9 10 might_sleep(); 11 12 // 检查bio的扇区有没有超过块设备的扇区数 13 if (bio_check_eod(bio, nr_sectors)) 14 goto end_io; 15 16 // 检测块设备的请求队列是否为空 17 q = bdev_get_queue(bio->bi_bdev); 18 if (unlikely(!q)) { 19 printk(KERN_ERR 20 "generic_make_request: Trying to access " 21 "nonexistent block-device %s (%Lu)\n", 22 bdevname(bio->bi_bdev, b), 23 (long long) bio->bi_sector); 24 goto end_io; 25 } 26 27 // 检测请求的扇区长度是否超过物理限制 28 if (likely(bio_is_rw(bio) && 29 nr_sectors > queue_max_hw_sectors(q))) { 30 printk(KERN_ERR "bio too big device %s (%u > %u)\n", 31 bdevname(bio->bi_bdev, b), 32 bio_sectors(bio), 33 queue_max_hw_sectors(q)); 34 goto end_io; 35 } 36 37 part = bio->bi_bdev->bd_part; 38 if (should_fail_request(part, bio->bi_size) || 39 should_fail_request(&part_to_disk(part)->part0, 40 bio->bi_size)) 41 goto end_io; 42 43 /* 44 * If this device has partitions, remap block n of partition p to block n+start(p) of the disk. 45 * 如果请求的块设备可能代表一个分区,这里重新映射到所在的磁盘设备 46 */ 47 blk_partition_remap(bio); 48 49 if (bio_check_eod(bio, nr_sectors)) 50 goto end_io; 51 52 /* 53 * Filter flush bio's early so that make_request based 54 * drivers without flush support don't have to worry 55 * about them. 
56 */ 57 if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { 58 bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); 59 if (!nr_sectors) { 60 err = 0; 61 goto end_io; 62 } 63 } 64 65 // 检查设备对DISCARD命令的支持 66 if ((bio->bi_rw & REQ_DISCARD) && 67 (!blk_queue_discard(q) || 68 ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) { 69 err = -EOPNOTSUPP; 70 goto end_io; 71 } 72 73 if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) { 74 err = -EOPNOTSUPP; 75 goto end_io; 76 } 77 78 /* 79 * Various block parts want %current->io_context and lazy ioc 80 * allocation ends up trading a lot of pain for a small amount of 81 * memory. Just allocate it upfront. This may fail and block 82 * layer knows how to live with it. 83 */ 84 create_io_context(GFP_ATOMIC, q->node); 85 86 if (blk_throtl_bio(q, bio)) 87 return false; /* throttled, will be resubmitted later */ 88 89 trace_block_bio_queue(q, bio); 90 return true; 91 92 end_io: 93 bio_endio(bio, err); 94 return false; 95 }
-------------------------------------------------- 少年应是春风和煦,肩头挑着草长莺飞 --------------------------------------------------