linux设备驱动(22)块设备驱动详解
块设备是i/o设备中的一类, 当我们的应用层对该设备读写时,是按扇区大小来读写数据的,若读写的数据小于扇区的大小,就会需要缓存区, 可以随机读写设备的任意位置处的数据,使用缓冲区来存放暂时的数据,待条件成熟后,从缓存一次性写入设备或者从设备一次性读到缓冲区。例如 普通文件(*.txt,*.c等),硬盘,U盘,SD卡。
对比字符设备驱动,应用层读写(read()/write())字符设备驱动时,是按字节/字符来读写数据的,期间没有任何缓存区,因为数据量小,不能随机读取数据,是一个顺序的数据流设备,对这种设备的读写是按字符进行的,而且这些字符是连续地形成一个数据流。他不具备缓冲区,所以对这种设备的读写是实时的。
1 块设备的基本概念
扇区:
硬盘的基本访问单位,扇区的大小一般是512B(对于现在的有些磁盘的扇区>512B,比如光盘的一个扇区就是2048B,Linux将其看成4个扇区,无非就是需要完成4次的读写)。
块:
扇区是硬件传输数据的基本单位,硬件一次传输一个扇区的数据到内存中。但是和扇区不同的是,块是虚拟文件系统传输数据的基本单位。在Linux中,块的大小必须是2的幂,但是不能超过一个页的大小(4k)。(在X86平台,一个页的大小是4096个字节,所以块大小可以是512,1024,2048,4096)。
主要为了做scatter/gather DMA操作使用,同一个物理页面中的在硬盘存储介质上连续的多个块组成一个段。段的大小只与块有关,必须是块的整数倍。所以块通常包括多个扇区,段通常包括多个块,物理段通常包括多个段;段在内核中由结构struct bio_vec来描述,多个段的信息存放于struct bio结构中的bio_io_vec指针数组中,段数组在后续的块设备处理流程中会被合并成物理段,段结构定义如下:
定义位于:linux-3.10.73\include\linux\blk_types.h
1 /* 2 * was unsigned short, but we might as well be ready for > 64kB I/O pages 3 */ 4 struct bio_vec { 5 struct page *bv_page; 6 unsigned int bv_len; 7 unsigned int bv_offset; 8 };
扇区大小由磁盘的物理特性决定;块缓冲区大小由内核代码决定;块的大小是扇区大小的整数倍,但是不能超过一个页。
所以:扇区(512)≤块≤页(4096) 块=n*扇区(n为整数)
注意:段(struct bio_vec{})由多个块组成,一个段就是一个内存页(如果一个块是两个扇区大小,也就是1024B,那么一个段的大小可以是1024,2048,3072,4096,也就是说段的大小只与块有关,而且是整数倍)。Linux系统一次读取磁盘的大小是一个块,而不是一个扇区,块设备驱动由此得名。
2 块设备驱动数据结构
2.1 磁盘结构体gendisk
用来存储该设备的硬盘信息,包括请求队列、分区链表和块设备操作函数集等。
1 struct gendisk { 2 /* major, first_minor and minors are input parameters only, 3 * don't use directly. Use disk_devt() and disk_max_parts(). 4 */ 5 int major; /* major number of driver */设备主设备号,等于register_blkdev()函数里的major 6 int first_minor;//起始次设备号,等于0,则表示此设备号从0开始的 7 int minors; /* maximum number of minors, =1 for disks that can't be partitioned. */分区(次设备)数量,当使用alloc_disk()时,就会自动设置该成员 8 9 char disk_name[DISK_NAME_LEN]; /* name of major driver */块设备名称, 等于register_blkdev()函数里的name 10 char *(*devnode)(struct gendisk *gd, umode_t *mode); 11 12 unsigned int events; /* supported events */ 13 unsigned int async_events; /* async events, subset of all */ 14 15 /* Array of pointers to partitions indexed by partno. 16 * Protected with matching bdev lock but stat and other 17 * non-critical accesses use RCU. Always access through 18 * helpers. 19 */ 20 struct disk_part_tbl __rcu *part_tbl;//分区表的信息 21 struct hd_struct part0; 22 23 const struct block_device_operations *fops;//块设备操作函数 24 struct request_queue *queue;//请求队列,用于管理该设备IO请求队列的指针 25 void *private_data;//私有数据 26 27 int flags; 28 struct device *driverfs_dev; // FIXME: remove 29 struct kobject *slave_dir; 30 31 struct timer_rand_state *random; 32 atomic_t sync_io; /* RAID */ 33 struct disk_events *ev; 34 #ifdef CONFIG_BLK_DEV_INTEGRITY 35 struct blk_integrity *integrity; 36 #endif 37 int node_id; 38 }
2.2 结构体request
请求结构体,对块设备的IO请求,都会向块设备驱动发出一个请求,在驱动中用request结构体描述。定义位于:linux-3.10.73\include\linux\blkdev.h
1 /* 2 * try to put the fields that are referenced together in the same cacheline. 3 * if you modify this structure, be sure to check block/blk-core.c:blk_rq_init() 4 * as well! 5 */ 6 struct request { 7 struct list_head queuelist; 8 struct call_single_data csd; 9 10 struct request_queue *q;//指向请求队列 11 12 unsigned int cmd_flags;//命令标识 13 enum rq_cmd_type_bits cmd_type;////读写命令标志,为 0(READ)表示读, 为1(WRITE)表示写 14 unsigned long atomic_flags; 15 16 int cpu; 17 18 /* the following two fields are internal, NEVER access directly */ 19 unsigned int __data_len; /* total data len */ 20 sector_t __sector; /* sector cursor */ 要提交的下一个扇区偏移位置(offset) 21 22 struct bio *bio; 23 struct bio *biotail; 24 25 struct hlist_node hash; /* merge hash */ 26 /* 27 * The rb_node is only used inside the io scheduler, requests 28 * are pruned when moved to the dispatch queue. So let the 29 * completion_data share space with the rb_node. 30 */ 31 union { 32 struct rb_node rb_node; /* sort/lookup */ 33 void *completion_data; 34 }; 35 36 /* 37 * Three pointers are available for the IO schedulers, if they need 38 * more they have to dynamically allocate it. Flush requests are 39 * never put on the IO scheduler. So let the flush fields share 40 * space with the elevator data. 41 */ 42 union { 43 struct { 44 struct io_cq *icq; 45 void *priv[2]; 46 } elv; 47 48 struct { 49 unsigned int seq; 50 struct list_head list; 51 rq_end_io_fn *saved_end_io; 52 } flush; 53 }; 54 55 struct gendisk *rq_disk; 56 struct hd_struct *part; 57 unsigned long start_time; 58 #ifdef CONFIG_BLK_CGROUP 59 struct request_list *rl; /* rl this rq is alloced from */ 60 unsigned long long start_time_ns; 61 unsigned long long io_start_time_ns; /* when passed to hardware */ 62 #endif 63 /* Number of scatter-gather DMA addr+len pairs after 64 * physical address coalescing is performed. 
65 */ 66 unsigned short nr_phys_segments; 67 #if defined(CONFIG_BLK_DEV_INTEGRITY) 68 unsigned short nr_integrity_segments; 69 #endif 70 71 unsigned short ioprio; 72 73 int ref_count; 74 75 void *special; /* opaque pointer available for LLD use */ 76 char *buffer; /* kaddr of the current segment if available 当前申请队列链表的数据,用来读写扇区的数据即源地址*/ 77 78 int tag; 79 int errors; 80 81 /* 82 * when request is used as a packet command carrier 83 */ 84 unsigned char __cmd[BLK_MAX_CDB]; 85 unsigned char *cmd; 86 unsigned short cmd_len; 87 88 unsigned int extra_len; /* length of alignment and padding */ 89 unsigned int sense_len; 90 unsigned int resid_len; /* residual count */ 91 void *sense; 92 93 unsigned long deadline; 94 struct list_head timeout_list; 95 unsigned int timeout; 96 int retries; 97 98 /* 99 * completion callback. 100 */ 101 rq_end_io_fn *end_io; 102 void *end_io_data; 103 104 /* for bidi */ 105 struct request *next_rq; 106 }
2.3 结构体request_queue
请求队列结构体
1 struct request_queue {
2 /*
3 * Together with queue_head for cacheline sharing
4 */
5 struct list_head queue_head;
6 struct request *last_merge;
7 struct elevator_queue *elevator;
8 int nr_rqs[2]; /* # allocated [a]sync rqs */
9 int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
10
11 /*
12 * If blkcg is not used, @q->root_rl serves all requests. If blkcg
13 * is used, root blkg allocates from @q->root_rl and all other
14 * blkgs from their own blkg->rl. Which one to use should be
15 * determined using bio_request_list().
16 */
17 struct request_list root_rl;
18
19 request_fn_proc *request_fn;//实现驱动程序处理请求的函数,在Virtual_blkdev中实现这个函数
20 make_request_fn *make_request_fn;//将一个新的request插入请求队列的方法
21 prep_rq_fn *prep_rq_fn;
22 unprep_rq_fn *unprep_rq_fn;
23 merge_bvec_fn *merge_bvec_fn;
24 softirq_done_fn *softirq_done_fn;
25 rq_timed_out_fn *rq_timed_out_fn;
26 dma_drain_needed_fn *dma_drain_needed;
27 lld_busy_fn *lld_busy_fn;
28
29 /*
30 * Dispatch queue sorting
31 */
32 sector_t end_sector;
33 struct request *boundary_rq;
34
35 /*
36 * Delayed queue handling
37 */
38 struct delayed_work delay_work;
39
40 struct backing_dev_info backing_dev_info;
41
42 /*
43 * The queue owner gets to use this for whatever they like.
44 * ll_rw_blk doesn't touch it.
45 */
46 void *queuedata;
47
48 /*
49 * various queue flags, see QUEUE_* below
50 */
51 unsigned long queue_flags;
52
53 /*
54 * ida allocated id for this queue. Used to index queues from
55 * ioctx.
56 */
57 int id;
58
59 /*
60 * queue needs bounce pages for pages above this limit
61 */
62 gfp_t bounce_gfp;
63
64 /*
65 * protects queue structures from reentrancy. ->__queue_lock should
66 * _never_ be used directly, it is queue private. always use
67 * ->queue_lock.
68 */
69 spinlock_t __queue_lock;
70 spinlock_t *queue_lock;
71
72 /*
73 * queue kobject
74 */
75 struct kobject kobj;
76
77 #ifdef CONFIG_PM_RUNTIME
78 struct device *dev;
79 int rpm_status;
80 unsigned int nr_pending;
81 #endif
82
83 /*
84 * queue settings
85 */
86 unsigned long nr_requests; /* Max # of requests */
87 unsigned int nr_congestion_on;
88 unsigned int nr_congestion_off;
89 unsigned int nr_batching;
90
91 unsigned int dma_drain_size;
92 void *dma_drain_buffer;
93 unsigned int dma_pad_mask;
94 unsigned int dma_alignment;
95
96 struct blk_queue_tag *queue_tags;
97 struct list_head tag_busy_list;
98
99 unsigned int nr_sorted;
100 unsigned int in_flight[2];
101 /*
102 * Number of active block driver functions for which blk_drain_queue()
103 * must wait. Must be incremented around functions that unlock the
104 * queue_lock internally, e.g. scsi_request_fn().
105 */
106 unsigned int request_fn_active;
107
108 unsigned int rq_timeout;
109 struct timer_list timeout;
110 struct list_head timeout_list;
111
112 struct list_head icq_list;
113 #ifdef CONFIG_BLK_CGROUP
114 DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS);
115 struct blkcg_gq *root_blkg;
116 struct list_head blkg_list;
117 #endif
118
119 struct queue_limits limits;
120
121 /*
122 * sg stuff
123 */
124 unsigned int sg_timeout;
125 unsigned int sg_reserved_size;
126 int node;
127 #ifdef CONFIG_BLK_DEV_IO_TRACE
128 struct blk_trace *blk_trace;
129 #endif
130 /*
131 * for flush operations
132 */
133 unsigned int flush_flags;
134 unsigned int flush_not_queueable:1;
135 unsigned int flush_queue_delayed:1;
136 unsigned int flush_pending_idx:1;
137 unsigned int flush_running_idx:1;
138 unsigned long flush_pending_since;
139 struct list_head flush_queue[2];
140 struct list_head flush_data_in_flight;
141 struct request flush_rq;
142
143 struct mutex sysfs_lock;
144
145 int bypass_depth;
146
147 #if defined(CONFIG_BLK_DEV_BSG)
148 bsg_job_fn *bsg_job_fn;
149 int bsg_job_size;
150 struct bsg_class_device bsg_dev;
151 #endif
152
153 #ifdef CONFIG_BLK_CGROUP
154 struct list_head all_q_node;
155 #endif
156 #ifdef CONFIG_BLK_DEV_THROTTLING
157 /* Throttle data */
158 struct throtl_data *td;
159 #endif
160 struct rcu_head rcu_head;
161 }
2.4 bio结构体
1 struct bio { 2 sector_t bi_sector; /* device address in 512 byte 3 sectors */要传输的第一个扇区 4 struct bio *bi_next; /* request queue link */下一个 bio 5 struct block_device *bi_bdev; 6 unsigned long bi_flags; /* status, command, etc */状态、命令等 7 unsigned long bi_rw; /*低位表示 READ/WRITE,高位表示优先级*/ 8 9 unsigned short bi_vcnt; /* bio_vec 数量 */ 10 unsigned short bi_idx; /* 当前 bvl_vec 索引 */ 11 12 /* Number of segments in this BIO after 13 * physical address coalescing is performed. 14 */ 15 unsigned int bi_phys_segments;//执行物理地址合并后 sgement 的数目 16 17 unsigned int bi_size; /* residual I/O count */ 18 19 /为了明了最大的 segment 尺寸,我们考虑这个 bio 中第一个和最后一个可合并的 segment 的尺寸 */ 20 unsigned int bi_seg_front_size; 21 unsigned int bi_seg_back_size; 22 23 bio_end_io_t *bi_end_io; 24 25 void *bi_private; 26 #ifdef CONFIG_BLK_CGROUP 27 /* 28 * Optional ioc and css associated with this bio. Put on bio 29 * release. Read comment on top of bio_associate_current(). 30 */ 31 struct io_context *bi_ioc; 32 struct cgroup_subsys_state *bi_css; 33 #endif 34 #if defined(CONFIG_BLK_DEV_INTEGRITY) 35 struct bio_integrity_payload *bi_integrity; /* data integrity */ 36 #endif 37 38 /* 39 * Everything starting with bi_max_vecs will be preserved by bio_reset() 40 */ 41 42 unsigned int bi_max_vecs; /* 我们能持有的最大 bvl_vecs 数*/ 43 44 atomic_t bi_cnt; /* pin count */ 45 46 struct bio_vec *bi_io_vec; /* the actual vec list 实际的 vec 列表*/ 47 48 struct bio_set *bi_pool; 49 50 /* 51 * We can inline a number of vecs at the end of the bio, to avoid 52 * double allocations for a small number of bio_vecs. This member 53 * MUST obviously be kept at the very end of the bio. 54 */ 55 struct bio_vec bi_inline_vecs[0]; 56 }
2.5 bio_vec结构体
bio的核心是一个被称为bi_io_vec的数组,它由bio_vec组成(也就是说bio由许多bio_vec组成)。内核定义如下:
1 struct bio_vec { 2 struct page *bv_page; /* 页指针 */ 3 unsigned int bv_len; /* 传输的字节数 */ 4 unsigned int bv_offset; /* 偏移位置 */ 5 };
2.6 buffer_head结构体
bio_vec描述一个特定的片段:片段所在的物理页、块在物理页中的偏移量,整个bio_io_vec结构表示一个完整的缓冲区。当一个块被调入内存时,要储存在一个缓冲区,每个缓冲区与一个块对应,所以每一个缓冲区都有一个对应的描述符,该描述符用buffer_head结构表示,定义位于:linux-3.10.73\include\linux\buffer_head.h
1 struct buffer_head { 2 unsigned long b_state; /* buffer state bitmap (see above) */ 3 struct buffer_head *b_this_page;/* circular list of page's buffers */ 4 struct page *b_page; /* the page this bh is mapped to */ 5 6 sector_t b_blocknr; /* start block number */ 7 size_t b_size; /* size of mapping */ 8 char *b_data; /* pointer to data within the page */ 9 10 struct block_device *b_bdev; 11 bh_end_io_t *b_end_io; /* I/O completion */ 12 void *b_private; /* reserved for b_end_io */ 13 struct list_head b_assoc_buffers; /* associated with another mapping */ 14 struct address_space *b_assoc_map; /* mapping this buffer is 15 associated with */ 16 atomic_t b_count; /* users using this buffer_head */ 17 }
2.7 块设备结构体
定义位于:linux-3.10.73\include\linux\fs.h
1 struct block_device { 2 dev_t bd_dev; /* not a kdev_t - it's a search key */ 3 int bd_openers; 4 struct inode * bd_inode; /* will die */ 5 struct super_block * bd_super; 6 struct mutex bd_mutex; /* open/close mutex */ 7 struct list_head bd_inodes; 8 void * bd_claiming; 9 void * bd_holder; 10 int bd_holders; 11 bool bd_write_holder; 12 #ifdef CONFIG_SYSFS 13 struct list_head bd_holder_disks; 14 #endif 15 struct block_device * bd_contains; 16 unsigned bd_block_size; 17 struct hd_struct * bd_part; 18 /* number of times partitions within this device have been opened. */ 19 unsigned bd_part_count; 20 int bd_invalidated; 21 struct gendisk * bd_disk; 22 struct request_queue * bd_queue; 23 struct list_head bd_list; 24 /* 25 * Private data. You must have bd_claim'ed the block_device 26 * to use this. NOTE: bd_claim allows an owner to claim 27 * the same device multiple times, the owner must take special 28 * care to not mess up bd_private for that case. 29 */ 30 unsigned long bd_private; 31 32 /* The counter of freeze processes */ 33 int bd_fsfreeze_count; 34 /* Mutex for freeze */ 35 struct mutex bd_fsfreeze_mutex; 36 }
3 api
3.1 核心ll_rw_block函数
定义位于:linux-3.10.73\fs\buffer.c
1 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) 2 {//rw:读写标志位, nr:bhs[]长度, bhs[]:要读写的数据数组 3 int i; 4 5 for (i = 0; i < nr; i++) { 6 struct buffer_head *bh = bhs[i];//获取nr个buffer_head 7 8 if (!trylock_buffer(bh)) 9 continue; 10 if (rw == WRITE) { 11 if (test_clear_buffer_dirty(bh)) { 12 bh->b_end_io = end_buffer_write_sync; 13 get_bh(bh); 14 submit_bh(WRITE, bh);//提交WRITE写标志的buffer_head 15 continue; 16 } 17 } else { 18 if (!buffer_uptodate(bh)) { 19 bh->b_end_io = end_buffer_read_sync; 20 get_bh(bh); 21 submit_bh(rw, bh);//提交其它标志的buffer_head 22 continue; 23 } 24 } 25 unlock_buffer(bh); 26 } 27 }
3.2 函数submit_bh
submit_bh()函数就是通过bh来构造bio,然后调用submit_bio()提交bio。
1 int submit_bh(int rw, struct buffer_head *bh) 2 { 3 return _submit_bh(rw, bh, 0); 4 } 5 6 int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) 7 { 8 struct bio *bio;//定义一个bio(block input output),也就是块设备i/o 9 int ret = 0; 10 11 BUG_ON(!buffer_locked(bh)); 12 BUG_ON(!buffer_mapped(bh)); 13 BUG_ON(!bh->b_end_io); 14 BUG_ON(buffer_delay(bh)); 15 BUG_ON(buffer_unwritten(bh)); 16 17 /* 18 * Only clear out a write error when rewriting 19 */ 20 if (test_set_buffer_req(bh) && (rw & WRITE)) 21 clear_buffer_write_io_error(bh); 22 23 /* 24 * from here on down, it's all bio -- do the initial mapping, 25 * submit_bio -> generic_make_request may further map this bio around 26 */ 27 bio = bio_alloc(GFP_NOIO, 1);//分配bio 28 /*根据buffer_head(bh)构造bio */ 29 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);//存放逻辑块号 30 bio->bi_bdev = bh->b_bdev;//存放对应的块设备 31 bio->bi_io_vec[0].bv_page = bh->b_page;//存放缓冲区所在的物理页面 32 bio->bi_io_vec[0].bv_len = bh->b_size; //存放扇区的大小 33 bio->bi_io_vec[0].bv_offset = bh_offset(bh);//存放扇区中以字节为单位的偏移量 34 35 bio->bi_vcnt = 1;//计数值 36 bio->bi_size = bh->b_size;//存放扇区的大小 37 38 bio->bi_end_io = end_bio_bh_io_sync;//设置i/o回调函数 39 bio->bi_private = bh;//指向的缓冲区 40 bio->bi_flags |= bio_flags; 41 42 /* Take care of bh's that straddle the end of the device */ 43 guard_bh_eod(rw, bio, bh); 44 45 if (buffer_meta(bh)) 46 rw |= REQ_META; 47 if (buffer_prio(bh)) 48 rw |= REQ_PRIO; 49 50 bio_get(bio); 51 submit_bio(rw, bio);/提交bio 52 53 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 54 ret = -EOPNOTSUPP; 55 56 bio_put(bio); 57 return ret; 58 }
3.3 函数submit_bio
定义位于:linux-3.10.73\block\blk-core.c
bio的提交工作由submit_bio()去完成,通用层再调用相应的设备I/O调度器,这个调度器的调度算法,会将这个bio合并到已经存在的request中,或者创建一个新的request,并将新创建的request插入到请求队列中。
1 void submit_bio(int rw, struct bio *bio) 2 { 3 bio->bi_rw |= rw; 4 5 /* 6 * If it's a regular read/write or a barrier with data attached, 7 * go through the normal accounting stuff before submission. 8 */ 9 if (bio_has_data(bio)) { 10 unsigned int count; 11 12 if (unlikely(rw & REQ_WRITE_SAME)) 13 count = bdev_logical_block_size(bio->bi_bdev) >> 9; 14 else 15 count = bio_sectors(bio); 16 17 if (rw & WRITE) { 18 count_vm_events(PGPGOUT, count); 19 } else { 20 task_io_account_read(bio->bi_size); 21 count_vm_events(PGPGIN, count); 22 } 23 24 if (unlikely(block_dump)) { 25 char b[BDEVNAME_SIZE]; 26 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", 27 current->comm, task_pid_nr(current), 28 (rw & WRITE) ? "WRITE" : "READ", 29 (unsigned long long)bio->bi_sector, 30 bdevname(bio->bi_bdev, b), 31 count); 32 } 33 } 34 35 generic_make_request(bio); 36 }
3.4 函数generic_make_request
1 void generic_make_request(struct bio *bio) 2 { 3 struct bio_list bio_list_on_stack; 4 5 if (!generic_make_request_checks(bio)) 6 return; 7 8 /* 9 * We only want one ->make_request_fn to be active at a time, else 10 * stack usage with stacked devices could be a problem. So use 11 * current->bio_list to keep a list of requests submited by a 12 * make_request_fn function. current->bio_list is also used as a 13 * flag to say if generic_make_request is currently active in this 14 * task or not. If it is NULL, then no make_request is active. If 15 * it is non-NULL, then a make_request is active, and new requests 16 * should be added at the tail 17 */ 18 if (current->bio_list) { 19 bio_list_add(current->bio_list, bio); 20 return; 21 } 22 23 /* following loop may be a bit non-obvious, and so deserves some 24 * explanation. 25 * Before entering the loop, bio->bi_next is NULL (as all callers 26 * ensure that) so we have a list with a single bio. 27 * We pretend that we have just taken it off a longer list, so 28 * we assign bio_list to a pointer to the bio_list_on_stack, 29 * thus initialising the bio_list of new bios to be 30 * added. ->make_request() may indeed add some more bios 31 * through a recursive call to generic_make_request. If it 32 * did, we find a non-NULL value in bio_list and re-enter the loop 33 * from the top. In this case we really did just take the bio 34 * of the top of the list (no pretending) and so remove it from 35 * bio_list, and call into ->make_request() again. 36 */ 37 BUG_ON(bio->bi_next); 38 bio_list_init(&bio_list_on_stack); 39 current->bio_list = &bio_list_on_stack; 40 do { 41 struct request_queue *q = bdev_get_queue(bio->bi_bdev);//通过bio->bi_bdev获取申请队列q 42 43 q->make_request_fn(q, bio);//提交申请队列q和bio 44 45 bio = bio_list_pop(current->bio_list); 46 } while (bio); 47 current->bio_list = NULL; /* deactivate */ 48 }
函数bdev_get_queue拿到请求队列request_queue
1 static inline struct request_queue *bdev_get_queue(struct block_device *bdev) 2 { 3 return bdev->bd_disk->queue; 4 }
3.5 函数blk_queue_make_request
定义位于:linux-3.10.73\block\blk-settings.c
那么3.4中的struct request_queue的make_request_fn在哪里被初始化的呢?它在blk_queue_make_request()函数中被初始化为mfn这个参数
1 void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) 2 { 3 /* 4 * set defaults 5 */ 6 q->nr_requests = BLKDEV_MAX_RQ; 7 8 q->make_request_fn = mfn; 9 blk_queue_dma_alignment(q, 511); 10 blk_queue_congestion_threshold(q); 11 q->nr_batching = BLK_BATCH_REQ; 12 13 blk_set_default_limits(&q->limits); 14 15 /* 16 * by default assume old behaviour and bounce for any highmem page 17 */ 18 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); 19 }
这个函数主要是取出块设备相应的队列中的每个设备,再调用块设备驱动的make_request,如果没有指定make_request就调用内核默认的__make_request,这个函数主要作用就是调用I/O调度算法将bio合并,或插入到队列中合适的位置中去。
3.6 队列的初始化
3.6.1 请求队列数据结构request_queue如上所述
1 struct request_queue { 2 ... 3 request_fn_proc *request_fn; 4 make_request_fn *make_request_fn; 5 ... 6 }
3.6.2 函数blk_init_queue
函数的作用就是为了分配一个request_queue请求队列,并初始化,分配成功返回一个request_queue结构体。
参数request_fn_proc *rfn :
typedef void (request_fn_proc)(struct request_queue *q),是指向"请求处理函数"的指针,用来处理request_queue申请队列的处理函数,该函数直接和硬盘打交道,用来处理数据在内存和硬盘之间的传输。
该函数作为函数blk_init_queue的参数,主要作用就是处理请求队列中的bio,完成数据在内存和硬盘之间的传递。(注意:该函数参数中的bio都是经过i/o调度器的)。
参数spinlock_t *lock:队列访问权限的自旋锁(spinlock),该锁需要通过DEFINE_SPINLOCK()函数来定义。
1 struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) 2 { 3 return blk_init_queue_node(rfn, lock, NUMA_NO_NODE); 4 } 5 EXPORT_SYMBOL(blk_init_queue); 6 7 struct request_queue * 8 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) 9 { 10 struct request_queue *uninit_q, *q; 11 12 uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id); 13 if (!uninit_q) 14 return NULL; 15 16 q = blk_init_allocated_queue(uninit_q, rfn, lock); 17 if (!q) 18 blk_cleanup_queue(uninit_q); 19 20 return q; 21 }
3.6.3 函数make_request_fn
typedef int (make_request_fn)(struct request_queue *q,struct bio *bio)
该函数的作用是根据bio生成一个request,所以叫制造请求函数。
该函数的第一个参数是请求队列,第二个参数是bio。
注意:在不想使用I/O调度器的时候,就应该在该函数中实现对每一个传入该函数的bio的处理,完成数据在内存和硬盘之间的传输,这样就可以不使用"request_fn_proc"函数了。(所以可以看出来,如果使用i/o调度器,make_request_fn函数是在request_fn_proc函数之前执行)
3.7 函数add_disk
向内核中注册gendisk结构体,定义位于:linux-3.10.73\block\genhd.c
1 void add_disk(struct gendisk *disk) 2 { 3 struct backing_dev_info *bdi; 4 dev_t devt; 5 int retval; 6 7 /* minors == 0 indicates to use ext devt from part0 and should 8 * be accompanied with EXT_DEVT flag. Make sure all 9 * parameters make sense. 10 */ 11 WARN_ON(disk->minors && !(disk->major || disk->first_minor)); 12 WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT)); 13 14 disk->flags |= GENHD_FL_UP; 15 16 retval = blk_alloc_devt(&disk->part0, &devt); 17 if (retval) { 18 WARN_ON(1); 19 return; 20 } 21 disk_to_dev(disk)->devt = devt; 22 23 /* ->major and ->first_minor aren't supposed to be 24 * dereferenced from here on, but set them just in case. 25 */ 26 disk->major = MAJOR(devt); 27 disk->first_minor = MINOR(devt); 28 29 disk_alloc_events(disk); 30 31 /* Register BDI before referencing it from bdev */ 32 bdi = &disk->queue->backing_dev_info; 33 bdi_register_dev(bdi, disk_devt(disk)); 34 35 blk_register_region(disk_devt(disk), disk->minors, NULL, 36 exact_match, exact_lock, disk); 37 register_disk(disk); 38 blk_register_queue(disk); 39 40 /* 41 * Take an extra ref on queue which will be put on disk_release() 42 * so that it sticks around as long as @disk is there. 43 */ 44 WARN_ON_ONCE(!blk_get_queue(disk->queue)); 45 46 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, 47 "bdi"); 48 WARN_ON(retval); 49 50 disk_add_events(disk); 51 }
3.8 函数put_disk
注销内核中的gendisk结构体,在出口函数中使用。
1 void put_disk(struct gendisk *disk) 2 { 3 if (disk) 4 kobject_put(&disk_to_dev(disk)->kobj); 5 }
3.9 函数alloc_disk
分配一个gendisk结构,minors为分区数,填1表示不分区。
1 struct gendisk *alloc_disk(int minors) 2 { 3 return alloc_disk_node(minors, NUMA_NO_NODE); 4 }
3.10 函数del_gendisk
释放gendisk结构,在出口函数中使用,也就是不需要这个磁盘了
1 void del_gendisk(struct gendisk *disk) 2 { 3 struct disk_part_iter piter; 4 struct hd_struct *part; 5 6 disk_del_events(disk); 7 8 /* invalidate stuff */ 9 disk_part_iter_init(&piter, disk, 10 DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); 11 while ((part = disk_part_iter_next(&piter))) { 12 invalidate_partition(disk, part->partno); 13 delete_partition(disk, part->partno); 14 } 15 disk_part_iter_exit(&piter); 16 17 invalidate_partition(disk, 0); 18 set_capacity(disk, 0); 19 disk->flags &= ~GENHD_FL_UP; 20 21 sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); 22 bdi_unregister(&disk->queue->backing_dev_info); 23 blk_unregister_queue(disk); 24 blk_unregister_region(disk_devt(disk), disk->minors); 25 26 part_stat_set_all(&disk->part0, 0); 27 disk->part0.stamp = 0; 28 29 kobject_put(disk->part0.holder_dir); 30 kobject_put(disk->slave_dir); 31 disk->driverfs_dev = NULL; 32 if (!sysfs_deprecated) 33 sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); 34 pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); 35 device_del(disk_to_dev(disk)); 36 }
3.11 函数elv_next_request
通过电梯算法获取申请队列中未完成的申请,获取成功返回一个request结构体,不成功返回NULL。当不再使用获取到的这个申请时,应使用end_request()来结束该申请。
函数end_request
结束获取申请, 当uptodate==0,表示使用该申请读写扇区失败, uptodate==1,表示成功。
3.12 函数register_blkdev
int register_blkdev(unsigned int major, const char *name);
创建一个块设备,当major==0时,表示动态创建,创建成功会返回一个主设备号
unregister_blkdev(unsigned int major, const char *name);
卸载一个块设备, 在出口函数中使用,major:主设备号, name:名称。
4 块驱动实现步骤:
入口函数中:
(1)使用register_blkdev()创建一个块设备:
该过程是一个可选过程。也可以不用注册设备,驱动一样可以工作,该函数和字符设备的register_chrdev()函数相对应,对于大多数的块设备,第一个工作就是向内核注册自己,但是在Linux2.6以后,register_blkdev()函数的调用变得可选,内核中register_blkdev()函数的功能正在逐渐减少。基本上就只有如下作用:
1)根据major分配一个块设备号
2)在/proc/devices中新增加一行数据,表示块设备的信息
(2)使用blk_init_queue()分配一个申请队列,并赋予申请队列处理函数
根据是否需要I/O调度,将情况分为两种情况,一种是使用请求队列进行数据传输,一种是不使用请求队列进行数据传输。
(3)使用alloc_disk()分配一个gendisk结构体
(4)设置gendisk结构体的成员
(4.1)设置成员参数(major、first_minor、disk_name、fops)
(4.2)设置queue成员,等于之前分配的申请队列
(4.3)通过set_capacity()设置capacity成员,等于扇区数
(5)使用kzalloc()来获取缓存地址,用做扇区
(6)使用add_disk()注册gendisk结构体
申请队列的处理函数:
(1) while循环使用elv_next_request()获取申请队列中每个未处理的申请
(2)使用rq_data_dir()来获取每个申请的读写命令标志,为 0(READ)表示读, 为1(WRITE)表示写
(3)使用memcpy()来读或者写扇区(缓存)
(4)使用end_request()来结束获取的每个申请
出口函数中
(1)使用del_gendisk()和put_disk()来注销、释放gendisk结构体(应先调用del_gendisk(),再调用put_disk())
(2)使用kfree()释放磁盘扇区缓存
(3)使用blk_cleanup_queue()清除内存中的申请队列
(4)使用unregister_blkdev()卸载块设备
5 使用I/O调度器和不使用I/O调度器
I/O调度器看起来可以提高访问速度,但这并不是最快的,因为I/O调度过程会花费很多时间。最快的方式就是不使用I/O调度器。
5.1 不使用i/o调度器(blk_alloc_queue())
bio的流程完全由驱动开发人员控制,要达到这个目的,必须使用函数blk_alloc_queue()来申请请求队列,然后使用函数blk_queue_make_request()给bio指定具有request_fn_proc()功能的函数Virtual_blkdev_make_request来完成数据在内存和硬盘之间的传输(该函数本来是用来将bio加入request中的)。
void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
1 static int Virtual_blkdev_make_request(struct request_queue *q,struct bio *bio) 2 { 3 //因为不使用I/O调度算法,直接在该函数中完成数据在内存和硬盘之间的数据传输,该函数 4 //代替了request_fn_proc()函数的功能 5 ............ 6 } 7 Virtual_blkdev_queue = blk_alloc_queue(GFP_KERNEL); 8 if(!Virtual_blkdev_queue) 9 { 10 ret=-ENOMEM; 11 goto err_alloc_queue; 12 } 13 blk_queue_make_request(Virtual_blkdev_queue,Virtual_blkdev_make_request);
5.2 使用i/o调度器(blk_init_queue())
bio先经过__make_request()函数,I/O调度器,和request_fn_proc()完成内存和硬盘之间的数据传输。该过程使用函数blk_init_queue()函数完成队列的初始化,并指定request_fn_proc():
struct request_queue* blk_init_queue(request_fn_proc *rfn,spinlock_t *lock)
6 实例代码
6.1 使用i/o调度器
1 #include <linux/module.h> 2 #include <linux/errno.h> 3 #include <linux/interrupt.h> 4 #include <linux/mm.h> 5 #include <linux/fs.h> 6 #include <linux/kernel.h> 7 #include <linux/timer.h> 8 #include <linux/genhd.h> 9 #include <linux/hdreg.h> 10 #include <linux/ioport.h> 11 #include <linux/init.h> 12 #include <linux/wait.h> 13 #include <linux/blkdev.h> 14 #include <linux/blkpg.h> 15 #include <linux/delay.h> 16 #include <linux/io.h> 17 18 #include <asm/system.h> 19 #include <asm/uaccess.h> 20 #include <asm/dma.h> 21 22 static DEFINE_SPINLOCK(memblock_lock); //定义自旋锁 23 static request_queue_t * memblock_request; //申请队列 24 static struct gendisk *memblock_disk; //磁盘结构体 25 static int memblock_major; 26 27 #define BLOCKBUF_SIZE (1024*1024) //磁盘大小 28 #define SECTOR_SIZE (512) //扇区大小 29 static unsigned char *block_buf; //磁盘地址 30 31 32 static int memblock_getgeo(struct block_device *bdev, struct hd_geometry *geo) 33 { 34 geo->heads =2; // 2个磁头分区 35 geo->cylinders = 32; //一个磁头有32个柱面 36 geo->sectors = BLOCKBUF_SIZE/(2*32*SECTOR_SIZE); //一个柱面有多少个扇区 37 return 0; 38 } 39 40 static struct block_device_operations memblock_fops = { 41 .owner = THIS_MODULE, 42 .getgeo = memblock_getgeo, //几何,保存磁盘的信息(柱头,柱面,扇区) 43 }; 44 45 /*申请队列处理函数*/ 46 static void do_memblock_request (request_queue_t * q) 47 { 48 struct request *req; 49 unsigned long offset; 50 unsigned long len; 51 static unsigned long r_cnt = 0; 52 static unsigned long w_cnt = 0; 53 54 while ((req = elv_next_request(q)) != NULL) //获取每个申请 55 { 56 offset=req->sector*SECTOR_SIZE; //偏移值 57 len=req->current_nr_sectors*SECTOR_SIZE; //长度 58 59 if(rq_data_dir(req)==READ) 60 { 61 memcpy(req->buffer,block_buf+offset,len); //读出缓存 62 } 63 else 64 { 65 memcpy(block_buf+offset,req->buffer,len); //写入缓存 66 } 67 end_request(req, 1); //结束获取的申请 68 } 69 } 70 71 /*入口函数*/ 72 static int memblock_init(void) 73 { 74 /*1)使用register_blkdev()创建一个块设备*/ 75 memblock_major=register_blkdev(0, "memblock"); 76 77 /*2) blk_init_queue()使用分配一个申请队列,并赋申请队列处理函数*/ 78 
memblock_request=blk_init_queue(do_memblock_request,&memblock_lock); 79 80 /*3)使用alloc_disk()分配一个gendisk结构体*/ 81 memblock_disk=alloc_disk(16); //不分区 82 83 /*4)设置gendisk结构体的成员*/ 84 /*->4.1)设置成员参数(major、first_minor、disk_name、fops)*/ 85 memblock_disk->major = memblock_major; 86 memblock_disk->first_minor = 0; 87 sprintf(memblock_disk->disk_name, "memblock"); 88 memblock_disk->fops = &memblock_fops; 89 90 /*->4.2)设置queue成员,等于之前分配的申请队列*/ 91 memblock_disk->queue = memblock_request; 92 93 /*->4.3)通过set_capacity()设置capacity成员,等于扇区数*/ 94 set_capacity(memblock_disk,BLOCKBUF_SIZE/SECTOR_SIZE); 95 96 /*5)使用kzalloc()来获取缓存地址,用做扇区*/ 97 block_buf=kzalloc(BLOCKBUF_SIZE, GFP_KERNEL); 98 99 /*6)使用add_disk()注册gendisk结构体*/ 100 add_disk(memblock_disk); 101 return 0; 102 } 103 static void memblock_exit(void) 104 { 105 /*1)使用put_disk()和del_gendisk()来注销,释放gendisk结构体*/ 106 put_disk(memblock_disk); 107 del_gendisk(memblock_disk); 108 /*2)使用kfree()释放磁盘扇区缓存 */ 109 kfree(block_buf); 110 /*3)使用blk_cleanup_queue()清除内存中的申请队列 */ 111 blk_cleanup_queue(memblock_request); 112 113 /*4)使用unregister_blkdev()卸载块设备 */ 114 unregister_blkdev(memblock_major,"memblock"); 115 } 116 117 module_init(memblock_init); 118 module_exit(memblock_exit); 119 MODULE_LICENSE("GPL");
6.2 不使用IO调度器
即不使用__make_request 这个系统指定的函数。其实从blk_init_queue()函数中也能看出来,系统使用了blk_queue_make_request(q, __make_request)这个函数,那么我们也可以使用这个函数来指定我们自己的策略函数,从而替换掉__make_request函数。那初始化request_queue的blk_init_queue函数也不需要了。
1 #include<linux/init.h> 2 #include<linux/module.h> 3 #include<linux/genhd.h> 4 #include<linux/fs.h> 5 #include<linux/blkdev.h> 6 #include<linux/bio.h> 7 8 #define SIMP_BLKDEV_DISKNAME "simp_blkdev" 9 #define SIMP_BLKDEV_DEVICEMAJOR COMPAQ_SMART2_MAJOR 10 #define SIMP_BLKDEV_BYTES (8*1024*1024) 11 12 13 14 static DEFINE_SPINLOCK(rq_lock); 15 unsigned char simp_blkdev_data[SIMP_BLKDEV_BYTES]; 16 static struct gendisk *simp_blkdev_disk; 17 static struct request_queue *simp_blkdev_queue;//device's request queue 18 19 struct block_device_operations simp_blkdev_fops = { 20 .owner = THIS_MODULE, 21 }; 22 //handle bio 23 static int simp_blkdev_make_request(struct request_queue *q, struct bio *bio){ 24 struct bio_vec *bvec; 25 int i; 26 void *dsk_mem; 27 if( (bio->bi_sector << 9) + bio->bi_size > SIMP_BLKDEV_BYTES){ 28 printk(KERN_ERR SIMP_BLKDEV_DISKNAME ":bad request:block=%llu,count=%u\n",(unsigned long long)bio->bi_sector,bio->bi_size); 29 bio_endio(bio,-EIO); 30 return 0; 31 } 32 dsk_mem = simp_blkdev_data + (bio->bi_sector << 9); 33 bio_for_each_segment(bvec, bio, i){ 34 void *iovec_mem; 35 switch( bio_rw(bio) ){ 36 case READ: 37 case READA: 38 iovec_mem = kmap(bvec->bv_page) + bvec->bv_offset; 39 memcpy(iovec_mem, dsk_mem, bvec->bv_len); 40 kunmap(bvec->bv_page); 41 break; 42 case WRITE: 43 iovec_mem = kmap(bvec->bv_page) + bvec->bv_offset; 44 memcpy(dsk_mem, iovec_mem, bvec->bv_len); 45 kunmap(bvec->bv_page); 46 break; 47 default: 48 printk(KERN_ERR SIMP_BLKDEV_DISKNAME ": unknown value of bio_rw: %lu\n",bio_rw(bio)); 49 bio_endio(bio,-EIO); 50 return 0; 51 } 52 dsk_mem += bvec->bv_len; 53 } 54 bio_endio(bio,0); 55 return 0; 56 } 57 58 59 static int simp_blkdev_init(void){ 60 int ret; 61 simp_blkdev_queue = blk_alloc_queue(GFP_KERNEL); 62 if(!simp_blkdev_queue){ 63 ret = -ENOMEM; 64 goto error_alloc_queue; 65 } 66 blk_queue_make_request(simp_blkdev_queue, simp_blkdev_make_request); 67 //alloc the resource of gendisk 68 simp_blkdev_disk = alloc_disk(1); 69 
if(!simp_blkdev_disk){ 70 ret = -ENOMEM; 71 goto error_alloc_disk; 72 } 73 74 75 //populate the gendisk structure 76 strcpy(simp_blkdev_disk->disk_name,SIMP_BLKDEV_DISKNAME); 77 simp_blkdev_disk->major = SIMP_BLKDEV_DEVICEMAJOR; 78 simp_blkdev_disk->first_minor = 0; 79 simp_blkdev_disk->fops = &simp_blkdev_fops; 80 simp_blkdev_disk->queue = simp_blkdev_queue; 81 set_capacity(simp_blkdev_disk,SIMP_BLKDEV_BYTES>>9); 82 83 add_disk(simp_blkdev_disk); 84 printk("module simp_blkdev added.\n"); 85 return 0; 86 87 error_alloc_disk: 88 blk_cleanup_queue(simp_blkdev_queue); 89 90 error_alloc_queue: 91 return ret; 92 93 } 94 static void simp_blkdev_exit(void){ 95 del_gendisk(simp_blkdev_disk); 96 put_disk(simp_blkdev_disk); 97 blk_cleanup_queue(simp_blkdev_queue); 98 printk("module simp_blkdev removed.\n"); 99 } 100 101 102 103 module_init(simp_blkdev_init); 104 module_exit(simp_blkdev_exit);
参考博文: