linux设备驱动(22)块设备驱动详解

块设备是i/o设备中的一类, 当我们的应用层对该设备读写时,是按扇区大小来读写数据的,若读写的数据小于扇区的大小,就会需要缓存区, 可以随机读写设备的任意位置处的数据,使用缓冲区来存放暂时的数据,待条件成熟后,从缓存一次性写入设备或者从设备一次性读到缓冲区。例如 普通文件(*.txt,*.c等),硬盘,U盘,SD卡。

对比字符设备驱动,应用层读写(read()/write())字符设备驱动时,是按字节/字符来读写数据的,期间没有任何缓存区,因为数据量小,不能随机读取数据,是一个顺序的数据流设备,对这种设备的读写是按字符进行的,而且这些字符是连续地形成一个数据流。它不具备缓冲区,所以对这种设备的读写是实时的。

1 块设备的基本概念

扇区:

硬盘的基本访问单位,扇区的大小一般是512B(对于现在的有些磁盘的扇区>512B,比如光盘的一个扇区就是2048B,Linux将其看成4个扇区,无非就是需要完成4次的读写)。

块:

扇区是硬件传输数据的基本单位,硬件一次传输一个扇区的数据到内存中。但是和扇区不同的是,块是虚拟文件系统传输数据的基本单位。在Linux中,块的大小必须是2的幂,但是不能超过一个页的大小(4k)。(在X86平台,一个页的大小是4096个字节,所以块大小可以是512,1024,2048,4096)。

段:主要为了做scatter/gather DMA操作使用,同一个物理页面中在硬盘存储介质上连续的多个块组成一个段。段的大小只与块有关,必须是块的整数倍。所以块通常包括多个扇区,段通常包括多个块,物理段通常包括多个段;段在内核中由结构struct bio_vec来描述,多个段的信息存放于struct bio结构中的bi_io_vec数组中,段数组在后续的块设备处理流程中会被合并成物理段,段结构定义如下:

定义位于:linux-3.10.73\include\linux\blk_types.h

1 /*
2  * was unsigned short, but we might as well be ready for > 64kB I/O pages
3  */
4 struct bio_vec {
5     struct page    *bv_page;
6     unsigned int    bv_len;
7     unsigned int    bv_offset;
8 };

扇区由磁盘的物理特性决定;块缓冲区由内核代码决定;块由缓冲区决定,是块缓冲区大小的整数倍,但是不能超过一个页。

所以:扇区(512)≤块≤页(4096) 块=n*扇区(n为整数)
注意:段(struct bio_vec{})由多个块组成,一个段就是一个内存页(如果一个块是两个扇区大小,也就是1024B,那么一个段的大小可以是1024,2048,3072,4096,也就是说段的大小只与块有关,而且是整数倍)。Linux系统一次读取磁盘的大小是一个块,而不是一个扇区,块设备驱动由此得名。

2 块设备驱动数据结构

2.1 磁盘结构体gendisk

用来存储该设备的硬盘信息,包括请求队列、分区链表和块设备操作函数集等。

 1 struct gendisk {
 2     /* major, first_minor and minors are input parameters only,
 3      * don't use directly.  Use disk_devt() and disk_max_parts().
 4      */
 5     int major;            /* major number of driver */设备主设备号,等于register_blkdev()函数里的major 
 6     int first_minor;//起始次设备号,等于0,则表示此设备号从0开始的
 7     int minors;                     /* maximum number of minors, =1 for disks that can't be partitioned. */分区(次设备)数量,当使用alloc_disk()时,就会自动设置该成员
 8 
 9     char disk_name[DISK_NAME_LEN];    /* name of major driver */块设备名称, 等于register_blkdev()函数里的name
10     char *(*devnode)(struct gendisk *gd, umode_t *mode);
11 
12     unsigned int events;        /* supported events */
13     unsigned int async_events;    /* async events, subset of all */
14 
15     /* Array of pointers to partitions indexed by partno.
16      * Protected with matching bdev lock but stat and other
17      * non-critical accesses use RCU.  Always access through
18      * helpers.
19      */
20     struct disk_part_tbl __rcu *part_tbl;//分区表的信息
21     struct hd_struct part0;
22 
23     const struct block_device_operations *fops;//块设备操作函数
24     struct request_queue *queue;//请求队列,用于管理该设备IO请求队列的指针
25     void *private_data;//私有数据
26 
27     int flags;
28     struct device *driverfs_dev;  // FIXME: remove
29     struct kobject *slave_dir;
30 
31     struct timer_rand_state *random;
32     atomic_t sync_io;        /* RAID */
33     struct disk_events *ev;
34 #ifdef  CONFIG_BLK_DEV_INTEGRITY
35     struct blk_integrity *integrity;
36 #endif
37     int node_id;
38 }

2.2 结构体request

请求结构体,对块设备的IO请求,都会向块设备驱动发出一个请求,在驱动中用request结构体描述。定义位于:linux-3.10.73\include\linux\blkdev.h

  1 /*
  2  * try to put the fields that are referenced together in the same cacheline.
  3  * if you modify this structure, be sure to check block/blk-core.c:blk_rq_init()
  4  * as well!
  5  */
  6 struct request {
  7     struct list_head queuelist;
  8     struct call_single_data csd;
  9 
 10     struct request_queue *q;//指向请求队列
 11 
 12     unsigned int cmd_flags;//命令标识
 13     enum rq_cmd_type_bits cmd_type;////读写命令标志,为 0(READ)表示读, 为1(WRITE)表示写
 14     unsigned long atomic_flags;
 15 
 16     int cpu;
 17 
 18     /* the following two fields are internal, NEVER access directly */
 19     unsigned int __data_len;    /* total data len */
 20     sector_t __sector;        /* sector cursor */ 要提交的下一个扇区偏移位置(offset)
 21 
 22     struct bio *bio;
 23     struct bio *biotail;
 24 
 25     struct hlist_node hash;    /* merge hash */
 26     /*
 27      * The rb_node is only used inside the io scheduler, requests
 28      * are pruned when moved to the dispatch queue. So let the
 29      * completion_data share space with the rb_node.
 30      */
 31     union {
 32         struct rb_node rb_node;    /* sort/lookup */
 33         void *completion_data;
 34     };
 35 
 36     /*
 37      * Three pointers are available for the IO schedulers, if they need
 38      * more they have to dynamically allocate it.  Flush requests are
 39      * never put on the IO scheduler. So let the flush fields share
 40      * space with the elevator data.
 41      */
 42     union {
 43         struct {
 44             struct io_cq        *icq;
 45             void            *priv[2];
 46         } elv;
 47 
 48         struct {
 49             unsigned int        seq;
 50             struct list_head    list;
 51             rq_end_io_fn        *saved_end_io;
 52         } flush;
 53     };
 54 
 55     struct gendisk *rq_disk;
 56     struct hd_struct *part;
 57     unsigned long start_time;
 58 #ifdef CONFIG_BLK_CGROUP
 59     struct request_list *rl;        /* rl this rq is alloced from */
 60     unsigned long long start_time_ns;
 61     unsigned long long io_start_time_ns;    /* when passed to hardware */
 62 #endif
 63     /* Number of scatter-gather DMA addr+len pairs after
 64      * physical address coalescing is performed.
 65      */
 66     unsigned short nr_phys_segments;
 67 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 68     unsigned short nr_integrity_segments;
 69 #endif
 70 
 71     unsigned short ioprio;
 72 
 73     int ref_count;
 74 
 75     void *special;        /* opaque pointer available for LLD use */
 76     char *buffer;        /* kaddr of the current segment if available 当前申请队列链表的数据,用来读写扇区的数据即源地址*/
 77 
 78     int tag;
 79     int errors;
 80 
 81     /*
 82      * when request is used as a packet command carrier
 83      */
 84     unsigned char __cmd[BLK_MAX_CDB];
 85     unsigned char *cmd;
 86     unsigned short cmd_len;
 87 
 88     unsigned int extra_len;    /* length of alignment and padding */
 89     unsigned int sense_len;
 90     unsigned int resid_len;    /* residual count */
 91     void *sense;
 92 
 93     unsigned long deadline;
 94     struct list_head timeout_list;
 95     unsigned int timeout;
 96     int retries;
 97 
 98     /*
 99      * completion callback.
100      */
101     rq_end_io_fn *end_io;
102     void *end_io_data;
103 
104     /* for bidi */
105     struct request *next_rq;
106 }

2.3 结构体request_queue

请求队列结构体

  1 struct request_queue {
  2     /*
  3      * Together with queue_head for cacheline sharing
  4      */
  5     struct list_head    queue_head;
  6     struct request        *last_merge;
  7     struct elevator_queue    *elevator;
  8     int            nr_rqs[2];    /* # allocated [a]sync rqs */
  9     int            nr_rqs_elvpriv;    /* # allocated rqs w/ elvpriv */
 10 
 11     /*
 12      * If blkcg is not used, @q->root_rl serves all requests.  If blkcg
 13      * is used, root blkg allocates from @q->root_rl and all other
 14      * blkgs from their own blkg->rl.  Which one to use should be
 15      * determined using bio_request_list().
 16      */
 17     struct request_list    root_rl;
 18 
 19     request_fn_proc        *request_fn;//实现驱动程序处理请求的函数,在Virtual_blkdev中实现这个函数
 20     make_request_fn        *make_request_fn;//将一个新的request插入请求队列的方法
 21     prep_rq_fn        *prep_rq_fn;
 22     unprep_rq_fn        *unprep_rq_fn;
 23     merge_bvec_fn        *merge_bvec_fn;
 24     softirq_done_fn        *softirq_done_fn;
 25     rq_timed_out_fn        *rq_timed_out_fn;
 26     dma_drain_needed_fn    *dma_drain_needed;
 27     lld_busy_fn        *lld_busy_fn;
 28 
 29     /*
 30      * Dispatch queue sorting
 31      */
 32     sector_t        end_sector;
 33     struct request        *boundary_rq;
 34 
 35     /*
 36      * Delayed queue handling
 37      */
 38     struct delayed_work    delay_work;
 39 
 40     struct backing_dev_info    backing_dev_info;
 41 
 42     /*
 43      * The queue owner gets to use this for whatever they like.
 44      * ll_rw_blk doesn't touch it.
 45      */
 46     void            *queuedata;
 47 
 48     /*
 49      * various queue flags, see QUEUE_* below
 50      */
 51     unsigned long        queue_flags;
 52 
 53     /*
 54      * ida allocated id for this queue.  Used to index queues from
 55      * ioctx.
 56      */
 57     int            id;
 58 
 59     /*
 60      * queue needs bounce pages for pages above this limit
 61      */
 62     gfp_t            bounce_gfp;
 63 
 64     /*
 65      * protects queue structures from reentrancy. ->__queue_lock should
 66      * _never_ be used directly, it is queue private. always use
 67      * ->queue_lock.
 68      */
 69     spinlock_t        __queue_lock;
 70     spinlock_t        *queue_lock;
 71 
 72     /*
 73      * queue kobject
 74      */
 75     struct kobject kobj;
 76 
 77 #ifdef CONFIG_PM_RUNTIME
 78     struct device        *dev;
 79     int            rpm_status;
 80     unsigned int        nr_pending;
 81 #endif
 82 
 83     /*
 84      * queue settings
 85      */
 86     unsigned long        nr_requests;    /* Max # of requests */
 87     unsigned int        nr_congestion_on;
 88     unsigned int        nr_congestion_off;
 89     unsigned int        nr_batching;
 90 
 91     unsigned int        dma_drain_size;
 92     void            *dma_drain_buffer;
 93     unsigned int        dma_pad_mask;
 94     unsigned int        dma_alignment;
 95 
 96     struct blk_queue_tag    *queue_tags;
 97     struct list_head    tag_busy_list;
 98 
 99     unsigned int        nr_sorted;
100     unsigned int        in_flight[2];
101     /*
102      * Number of active block driver functions for which blk_drain_queue()
103      * must wait. Must be incremented around functions that unlock the
104      * queue_lock internally, e.g. scsi_request_fn().
105      */
106     unsigned int        request_fn_active;
107 
108     unsigned int        rq_timeout;
109     struct timer_list    timeout;
110     struct list_head    timeout_list;
111 
112     struct list_head    icq_list;
113 #ifdef CONFIG_BLK_CGROUP
114     DECLARE_BITMAP        (blkcg_pols, BLKCG_MAX_POLS);
115     struct blkcg_gq        *root_blkg;
116     struct list_head    blkg_list;
117 #endif
118 
119     struct queue_limits    limits;
120 
121     /*
122      * sg stuff
123      */
124     unsigned int        sg_timeout;
125     unsigned int        sg_reserved_size;
126     int            node;
127 #ifdef CONFIG_BLK_DEV_IO_TRACE
128     struct blk_trace    *blk_trace;
129 #endif
130     /*
131      * for flush operations
132      */
133     unsigned int        flush_flags;
134     unsigned int        flush_not_queueable:1;
135     unsigned int        flush_queue_delayed:1;
136     unsigned int        flush_pending_idx:1;
137     unsigned int        flush_running_idx:1;
138     unsigned long        flush_pending_since;
139     struct list_head    flush_queue[2];
140     struct list_head    flush_data_in_flight;
141     struct request        flush_rq;
142 
143     struct mutex        sysfs_lock;
144 
145     int            bypass_depth;
146 
147 #if defined(CONFIG_BLK_DEV_BSG)
148     bsg_job_fn        *bsg_job_fn;
149     int            bsg_job_size;
150     struct bsg_class_device bsg_dev;
151 #endif
152 
153 #ifdef CONFIG_BLK_CGROUP
154     struct list_head    all_q_node;
155 #endif
156 #ifdef CONFIG_BLK_DEV_THROTTLING
157     /* Throttle data */
158     struct throtl_data *td;
159 #endif
160     struct rcu_head        rcu_head;
161 }

2.4 bio结构体

 1 struct bio {
 2     sector_t        bi_sector;    /* device address in 512 byte
 3                            sectors */要传输的第一个扇区
 4     struct bio        *bi_next;    /* request queue link */下一个 bio 
 5     struct block_device    *bi_bdev;
 6     unsigned long        bi_flags;    /* status, command, etc */状态、命令等
 7     unsigned long        bi_rw;        /*低位表示 READ/WRITE,高位表示优先级*/
 8 
 9     unsigned short        bi_vcnt;    /* bio_vec 数量 */
10     unsigned short        bi_idx;        /* 当前 bvl_vec 索引 */
11 
12     /* Number of segments in this BIO after
13      * physical address coalescing is performed.
14      */
15     unsigned int        bi_phys_segments;//执行物理地址合并后 segment 的数目
16 
17     unsigned int        bi_size;    /* residual I/O count */
18 
19     /* 为了明了最大的 segment 尺寸,我们考虑这个 bio 中第一个和最后一个可合并的 segment 的尺寸 */
20     unsigned int        bi_seg_front_size;
21     unsigned int        bi_seg_back_size;
22 
23     bio_end_io_t        *bi_end_io;
24 
25     void            *bi_private;
26 #ifdef CONFIG_BLK_CGROUP
27     /*
28      * Optional ioc and css associated with this bio.  Put on bio
29      * release.  Read comment on top of bio_associate_current().
30      */
31     struct io_context    *bi_ioc;
32     struct cgroup_subsys_state *bi_css;
33 #endif
34 #if defined(CONFIG_BLK_DEV_INTEGRITY)
35     struct bio_integrity_payload *bi_integrity;  /* data integrity */
36 #endif
37 
38     /*
39      * Everything starting with bi_max_vecs will be preserved by bio_reset()
40      */
41 
42     unsigned int        bi_max_vecs;    /* 我们能持有的最大 bvl_vecs 数*/
43 
44     atomic_t        bi_cnt;        /* pin count */
45 
46     struct bio_vec        *bi_io_vec;    /* the actual vec list 实际的 vec 列表*/
47 
48     struct bio_set        *bi_pool;
49 
50     /*
51      * We can inline a number of vecs at the end of the bio, to avoid
52      * double allocations for a small number of bio_vecs. This member
53      * MUST obviously be kept at the very end of the bio.
54      */
55     struct bio_vec        bi_inline_vecs[0];
56 }

2.5 bio_vec结构体

bio的核心是一个被称为bi_io_vec的数组,它由bio_vec组成(也就是说bio由许多bio_vec组成)。内核定义如下:

1 struct bio_vec {
2  struct page *bv_page; /* 页指针 */
3  unsigned int bv_len; /* 传输的字节数 */
4  unsigned int bv_offset; /* 偏移位置 */
5  };

2.6 buffer_head结构体

bio_vec描述一个特定的片段:片段所在的物理页、片段在物理页中的偏移量以及片段的长度,整个bi_io_vec数组表示一个完整的缓冲区。当一个块被调入内存时,要储存在一个缓冲区中,每个缓冲区与一个块对应,所以每一个缓冲区都有一个对应的描述符,该描述符用buffer_head结构表示,定义位于:linux-3.10.73\include\linux\buffer_head.h

 1 struct buffer_head {
 2     unsigned long b_state;        /* buffer state bitmap (see above) */
 3     struct buffer_head *b_this_page;/* circular list of page's buffers */
 4     struct page *b_page;        /* the page this bh is mapped to */
 5 
 6     sector_t b_blocknr;        /* start block number */
 7     size_t b_size;            /* size of mapping */
 8     char *b_data;            /* pointer to data within the page */
 9 
10     struct block_device *b_bdev;
11     bh_end_io_t *b_end_io;        /* I/O completion */
12      void *b_private;        /* reserved for b_end_io */
13     struct list_head b_assoc_buffers; /* associated with another mapping */
14     struct address_space *b_assoc_map;    /* mapping this buffer is
15                            associated with */
16     atomic_t b_count;        /* users using this buffer_head */
17 }

2.7 块设备结构体

定义位于:linux-3.10.73\include\linux\fs.h

 1 struct block_device {
 2     dev_t            bd_dev;  /* not a kdev_t - it's a search key */
 3     int            bd_openers;
 4     struct inode *        bd_inode;    /* will die */
 5     struct super_block *    bd_super;
 6     struct mutex        bd_mutex;    /* open/close mutex */
 7     struct list_head    bd_inodes;
 8     void *            bd_claiming;
 9     void *            bd_holder;
10     int            bd_holders;
11     bool            bd_write_holder;
12 #ifdef CONFIG_SYSFS
13     struct list_head    bd_holder_disks;
14 #endif
15     struct block_device *    bd_contains;
16     unsigned        bd_block_size;
17     struct hd_struct *    bd_part;
18     /* number of times partitions within this device have been opened. */
19     unsigned        bd_part_count;
20     int            bd_invalidated;
21     struct gendisk *    bd_disk;
22     struct request_queue *  bd_queue;
23     struct list_head    bd_list;
24     /*
25      * Private data.  You must have bd_claim'ed the block_device
26      * to use this.  NOTE:  bd_claim allows an owner to claim
27      * the same device multiple times, the owner must take special
28      * care to not mess up bd_private for that case.
29      */
30     unsigned long        bd_private;
31 
32     /* The counter of freeze processes */
33     int            bd_fsfreeze_count;
34     /* Mutex for freeze */
35     struct mutex        bd_fsfreeze_mutex;
36 }

3 api

3.1 核心ll_rw_block函数

定义位于:linux-3.10.73\fs\buffer.c

 1 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
 2 {//rw:读写标志位,  nr:bhs[]长度,  bhs[]:要读写的数据数组
 3     int i;
 4 
 5     for (i = 0; i < nr; i++) {
 6         struct buffer_head *bh = bhs[i];//获取nr个buffer_head
 7 
 8         if (!trylock_buffer(bh))
 9             continue;
10         if (rw == WRITE) {
11             if (test_clear_buffer_dirty(bh)) {
12                 bh->b_end_io = end_buffer_write_sync;
13                 get_bh(bh);
14                 submit_bh(WRITE, bh);//提交WRITE写标志的buffer_head
15                 continue;
16             }
17         } else {
18             if (!buffer_uptodate(bh)) {
19                 bh->b_end_io = end_buffer_read_sync;
20                 get_bh(bh);
21                 submit_bh(rw, bh);//提交其它标志的buffer_head
22                 continue;
23             }
24         }
25         unlock_buffer(bh);
26     }
27 }

3.2 函数submit_bh

submit_bh()函数就是通过bh来构造bio,然后调用submit_bio()提交bio。

 1 int submit_bh(int rw, struct buffer_head *bh)
 2 {
 3     return _submit_bh(rw, bh, 0);
 4 }
 5 
 6 int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
 7 {
 8     struct bio *bio;//定义一个bio(block input output),也就是块设备i/o
 9     int ret = 0;
10 
11     BUG_ON(!buffer_locked(bh));
12     BUG_ON(!buffer_mapped(bh));
13     BUG_ON(!bh->b_end_io);
14     BUG_ON(buffer_delay(bh));
15     BUG_ON(buffer_unwritten(bh));
16 
17     /*
18      * Only clear out a write error when rewriting
19      */
20     if (test_set_buffer_req(bh) && (rw & WRITE))
21         clear_buffer_write_io_error(bh);
22 
23     /*
24      * from here on down, it's all bio -- do the initial mapping,
25      * submit_bio -> generic_make_request may further map this bio around
26      */
27     bio = bio_alloc(GFP_NOIO, 1);//分配bio
28         /*根据buffer_head(bh)构造bio */
29     bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);//存放逻辑块号
30     bio->bi_bdev = bh->b_bdev;//存放对应的块设备
31     bio->bi_io_vec[0].bv_page = bh->b_page;//存放缓冲区所在的物理页面
32     bio->bi_io_vec[0].bv_len = bh->b_size; //存放扇区的大小
33     bio->bi_io_vec[0].bv_offset = bh_offset(bh);//存放扇区中以字节为单位的偏移量
34 
35     bio->bi_vcnt = 1;//计数值
36     bio->bi_size = bh->b_size;//存放扇区的大小
37 
38     bio->bi_end_io = end_bio_bh_io_sync;//设置i/o回调函数
39     bio->bi_private = bh;//指向的缓冲区
40     bio->bi_flags |= bio_flags;
41 
42     /* Take care of bh's that straddle the end of the device */
43     guard_bh_eod(rw, bio, bh);
44 
45     if (buffer_meta(bh))
46         rw |= REQ_META;
47     if (buffer_prio(bh))
48         rw |= REQ_PRIO;
49 
50     bio_get(bio);
51     submit_bio(rw, bio);//提交bio
52 
53     if (bio_flagged(bio, BIO_EOPNOTSUPP))
54         ret = -EOPNOTSUPP;
55 
56     bio_put(bio);
57     return ret;
58 }

3.3 函数submit_bio

定义位于:linux-3.10.73\block\blk-core.c

请求结构体request的提交工作由submit_bio()去完成,通用层再调用相应的设备IO调度器,由这个调度器的调度算法将这个bio合并到已经存在的request中,或者创建一个新的request,并将新创建的request插入到请求队列中。

 1 void submit_bio(int rw, struct bio *bio)
 2 {
 3     bio->bi_rw |= rw;
 4 
 5     /*
 6      * If it's a regular read/write or a barrier with data attached,
 7      * go through the normal accounting stuff before submission.
 8      */
 9     if (bio_has_data(bio)) {
10         unsigned int count;
11 
12         if (unlikely(rw & REQ_WRITE_SAME))
13             count = bdev_logical_block_size(bio->bi_bdev) >> 9;
14         else
15             count = bio_sectors(bio);
16 
17         if (rw & WRITE) {
18             count_vm_events(PGPGOUT, count);
19         } else {
20             task_io_account_read(bio->bi_size);
21             count_vm_events(PGPGIN, count);
22         }
23 
24         if (unlikely(block_dump)) {
25             char b[BDEVNAME_SIZE];
26             printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
27             current->comm, task_pid_nr(current),
28                 (rw & WRITE) ? "WRITE" : "READ",
29                 (unsigned long long)bio->bi_sector,
30                 bdevname(bio->bi_bdev, b),
31                 count);
32         }
33     }
34 
35     generic_make_request(bio);
36 }

3.4  函数generic_make_request

 1 void generic_make_request(struct bio *bio)
 2 {
 3     struct bio_list bio_list_on_stack;
 4 
 5     if (!generic_make_request_checks(bio))
 6         return;
 7 
 8     /*
 9      * We only want one ->make_request_fn to be active at a time, else
10      * stack usage with stacked devices could be a problem.  So use
11      * current->bio_list to keep a list of requests submited by a
12      * make_request_fn function.  current->bio_list is also used as a
13      * flag to say if generic_make_request is currently active in this
14      * task or not.  If it is NULL, then no make_request is active.  If
15      * it is non-NULL, then a make_request is active, and new requests
16      * should be added at the tail
17      */
18     if (current->bio_list) {
19         bio_list_add(current->bio_list, bio);
20         return;
21     }
22 
23     /* following loop may be a bit non-obvious, and so deserves some
24      * explanation.
25      * Before entering the loop, bio->bi_next is NULL (as all callers
26      * ensure that) so we have a list with a single bio.
27      * We pretend that we have just taken it off a longer list, so
28      * we assign bio_list to a pointer to the bio_list_on_stack,
29      * thus initialising the bio_list of new bios to be
30      * added.  ->make_request() may indeed add some more bios
31      * through a recursive call to generic_make_request.  If it
32      * did, we find a non-NULL value in bio_list and re-enter the loop
33      * from the top.  In this case we really did just take the bio
34      * of the top of the list (no pretending) and so remove it from
35      * bio_list, and call into ->make_request() again.
36      */
37     BUG_ON(bio->bi_next);
38     bio_list_init(&bio_list_on_stack);
39     current->bio_list = &bio_list_on_stack;
40     do {
41         struct request_queue *q = bdev_get_queue(bio->bi_bdev);//通过bio->bi_bdev获取申请队列q
42 
43         q->make_request_fn(q, bio);//提交申请队列q和bio
44 
45         bio = bio_list_pop(current->bio_list);
46     } while (bio);
47     current->bio_list = NULL; /* deactivate */
48 }

函数bdev_get_queue拿到请求队列request_queue

1 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
2 {
3     return bdev->bd_disk->queue;
4 }

3.5 函数blk_queue_make_request

定义位于:linux-3.10.73\block\blk-settings.c

那么3.4中的struct request_queue的make_request_fn在哪里被初始化的呢?它在blk_queue_make_request()函数中被初始化为mfn这个参数

 1 void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
 2 {
 3     /*
 4      * set defaults
 5      */
 6     q->nr_requests = BLKDEV_MAX_RQ;
 7 
 8     q->make_request_fn = mfn;
 9     blk_queue_dma_alignment(q, 511);
10     blk_queue_congestion_threshold(q);
11     q->nr_batching = BLK_BATCH_REQ;
12 
13     blk_set_default_limits(&q->limits);
14 
15     /*
16      * by default assume old behaviour and bounce for any highmem page
17      */
18     blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
19 }

 这个函数主要是取出块设备相应的队列中的每个设备,再调用块设备驱动的make_request,如果没有指定make_request就调用内核默认的__make_request,这个函数主要作用就是调用I/O调度算法将bio合并,或插入到队列中合适的位置中去。

3.6 队列的初始化

3.6.1 请求队列数据结构request_queue如上所述

1 struct request_queue {
2     ...
3     request_fn_proc        *request_fn;
4     make_request_fn        *make_request_fn;
5     ...
6 }

3.6.2 函数blk_init_queue

函数的作用就是为了分配一个request_queue请求队列,并初始化,分配成功返回一个request_queue结构体。

参数request_fn_proc *rfn :

typedef void (request_fn_proc)(struct request_queue *q),是指向"请求处理函数"的指针,用来处理request_queue申请队列的处理函数,该函数直接和硬盘打交道,用来处理数据在内存和硬盘之间的传输。

该函数作为函数blk_init_queue的参数,主要作用就是处理请求队列中的bio,完成数据在内存和硬盘之间的传递。(注意:该函数参数中的bio都是经过i/o调度器的)。

参数spinlock_t *lock:队列访问权限的自旋锁(spinlock),该锁需要通过DEFINE_SPINLOCK()宏来定义。

 1 struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
 2 {
 3     return blk_init_queue_node(rfn, lock, NUMA_NO_NODE);
 4 }
 5 EXPORT_SYMBOL(blk_init_queue);
 6 
 7 struct request_queue *
 8 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 9 {
10     struct request_queue *uninit_q, *q;
11 
12     uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id);
13     if (!uninit_q)
14         return NULL;
15 
16     q = blk_init_allocated_queue(uninit_q, rfn, lock);
17     if (!q)
18         blk_cleanup_queue(uninit_q);
19 
20     return q;
21 }

3.6.3  函数make_request_fn

typedef int (make_request_fn)(struct request_queue *q,struct bio *bio)

该函数的作用是根据bio生成一个request,所以叫制造请求函数。
该函数的第一个参数是请求队列,第二个参数是bio。
注意:在不想使用I/O调度器的时候,就应该在该函数中实现对每一个传入该函数的bio直接进行处理,完成数据在内存和硬盘之间的传输,这样就可以不使用"request_fn_proc"函数了。(所以可以看出来,如果使用i/o调度器,make_request_fn函数是在request_fn_proc函数之前执行)

3.7 函数add_disk

向内核中注册gendisk结构体,定义位于:linux-3.10.73\block\genhd.c

 1 void add_disk(struct gendisk *disk)
 2 {
 3     struct backing_dev_info *bdi;
 4     dev_t devt;
 5     int retval;
 6 
 7     /* minors == 0 indicates to use ext devt from part0 and should
 8      * be accompanied with EXT_DEVT flag.  Make sure all
 9      * parameters make sense.
10      */
11     WARN_ON(disk->minors && !(disk->major || disk->first_minor));
12     WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT));
13 
14     disk->flags |= GENHD_FL_UP;
15 
16     retval = blk_alloc_devt(&disk->part0, &devt);
17     if (retval) {
18         WARN_ON(1);
19         return;
20     }
21     disk_to_dev(disk)->devt = devt;
22 
23     /* ->major and ->first_minor aren't supposed to be
24      * dereferenced from here on, but set them just in case.
25      */
26     disk->major = MAJOR(devt);
27     disk->first_minor = MINOR(devt);
28 
29     disk_alloc_events(disk);
30 
31     /* Register BDI before referencing it from bdev */
32     bdi = &disk->queue->backing_dev_info;
33     bdi_register_dev(bdi, disk_devt(disk));
34 
35     blk_register_region(disk_devt(disk), disk->minors, NULL,
36                 exact_match, exact_lock, disk);
37     register_disk(disk);
38     blk_register_queue(disk);
39 
40     /*
41      * Take an extra ref on queue which will be put on disk_release()
42      * so that it sticks around as long as @disk is there.
43      */
44     WARN_ON_ONCE(!blk_get_queue(disk->queue));
45 
46     retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
47                    "bdi");
48     WARN_ON(retval);
49 
50     disk_add_events(disk);
51 }

3.8 函数put_disk

减少gendisk结构体的引用计数(内部调用kobject_put()),引用计数为0时该结构体被释放;在出口函数中、del_gendisk()之后使用。

1 void put_disk(struct gendisk *disk)
2 {
3     if (disk)
4         kobject_put(&disk_to_dev(disk)->kobj);
5 }

3.9 函数alloc_disk

分配一个gendisk结构,minors为分区数,填1表示不分区。

1 struct gendisk *alloc_disk(int minors)
2 {
3     return alloc_disk_node(minors, NUMA_NO_NODE);
4 }

3.10 函数del_gendisk

从内核中注销gendisk结构(与add_disk()相对应),在出口函数中使用,也就是不需要这个磁盘了

 1 void del_gendisk(struct gendisk *disk)
 2 {
 3     struct disk_part_iter piter;
 4     struct hd_struct *part;
 5 
 6     disk_del_events(disk);
 7 
 8     /* invalidate stuff */
 9     disk_part_iter_init(&piter, disk,
10                  DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
11     while ((part = disk_part_iter_next(&piter))) {
12         invalidate_partition(disk, part->partno);
13         delete_partition(disk, part->partno);
14     }
15     disk_part_iter_exit(&piter);
16 
17     invalidate_partition(disk, 0);
18     set_capacity(disk, 0);
19     disk->flags &= ~GENHD_FL_UP;
20 
21     sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
22     bdi_unregister(&disk->queue->backing_dev_info);
23     blk_unregister_queue(disk);
24     blk_unregister_region(disk_devt(disk), disk->minors);
25 
26     part_stat_set_all(&disk->part0, 0);
27     disk->part0.stamp = 0;
28 
29     kobject_put(disk->part0.holder_dir);
30     kobject_put(disk->slave_dir);
31     disk->driverfs_dev = NULL;
32     if (!sysfs_deprecated)
33         sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
34     pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
35     device_del(disk_to_dev(disk));
36 }

3.11 函数elv_next_request

通过电梯算法获取申请队列中未完成的申请,获取成功返回一个request结构体,不成功返回NULL。不使用获取到的这个申请时,应使用end_request()来结束获取申请。

函数end_request  

结束获取申请, 当uptodate==0,表示使用该申请读写扇区失败, uptodate==1,表示成功。

3.12 函数register_blkdev

int register_blkdev(unsigned int major, const char *name);

创建一个块设备,当major==0时,表示动态创建,创建成功会返回一个主设备号

void unregister_blkdev(unsigned int major, const char *name);

卸载一个块设备, 在出口函数中使用,major:主设备号, name:名称。

4 块驱动实现步骤:

入口函数中:

(1)使用register_blkdev()创建一个块设备:

  该过程是一个可选过程。也可以不用注册设备,驱动一样可以工作,该函数和字符设备的register_chrdev()函数相对应,对于大多数的块设备,第一个工作就是向内核注册自己,但是在Linux2.6以后,register_blkdev()函数的调用变得可选,内核中register_blkdev()函数的功能正在逐渐减少。基本上就只有如下作用:

     1)根据major分配一个块设备号
     2)在/proc/devices中新增加一行数据,表示块设备的信息

(2)使用blk_init_queue()分配一个申请队列,并赋申请队列处理函数

        根据是否需要I/O调度,将情况分为两种情况,一种是使用请求队列进行数据传输,一种是不使用请求队列进行数据传输。

(3)使用alloc_disk()分配一个gendisk结构体

(4)设置gendisk结构体的成员

  (4.1)设置成员参数(major、first_minor、disk_name、fops)

  (4.2)设置queue成员,等于之前分配的申请队列

  (4.3)通过set_capacity()设置capacity成员,等于扇区数

(5)使用kzalloc()来获取缓存地址,用做扇区

(6)使用add_disk()注册gendisk结构体

申请队列的处理函数:

(1) while循环使用elv_next_request()获取申请队列中每个未处理的申请

(2)使用rq_data_dir()来获取每个申请的读写命令标志,为 0(READ)表示读, 为1(WRITE)表示写

(3)使用memcpy()来读或者写扇区(缓存)

(4)使用end_request()来结束获取的每个申请

出口函数中

(1)使用del_gendisk()和put_disk()来注销、释放gendisk结构体

(2)使用kfree()释放磁盘扇区缓存

(3)使用blk_cleanup_queue()清除内存中的申请队列

(4)使用unregister_blkdev()卸载块设备

5 使用I/O调度器和不使用I/O调度器

I/O调度器看起来可以提高访问速度,但这并不是最快的,因为I/O调度过程会花费很多时间。最快的方式就是不使用I/O调度器。

5.1 不使用i/o调度器(blk_alloc_queue())
bio的流程完全由驱动开发人员控制,要达到这个目的,必须使用函数blk_alloc_queue()来申请请求队列,然后使用函数blk_queue_make_request()给bio指定具有request_fn_proc()功能的函数Virtual_blkdev_make_request来完成数据在内存和硬盘之间的传输(该函数本来是用来将bio加入request中的)。

void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)

 1 static int Virtual_blkdev_make_request(struct request_queue *q, struct bio *bio)
 2 {
 3    //因为不使用I/O调度算法,直接在该函数中完成数据在内存和硬盘之间的数据传输,该函数
 4    //代替了request_fn_proc()函数的功能
 5    ............
 6 }
 7 Virtual_blkdev_queue = blk_alloc_queue(GFP_KERNEL);
 8 if(!Virtual_blkdev_queue)
 9 {
10    ret=-ENOMEM;
11    goto err_alloc_queue;
12 }
13 blk_queue_make_request(Virtual_blkdev_queue,Virtual_blkdev_make_request);

5.2 使用i/o调度器(blk_init_queue())
bio先经过__make_request()函数,I/O调度器,和request_fn_proc()完成内存和硬盘之间的数据传输。该过程使用函数blk_init_queue()函数完成队列的初始化,并指定request_fn_proc():

struct request_queue* blk_init_queue(request_fn_proc *rfn,spinlock_t *lock)

6 实例代码

6.1 使用i/o调度器

  1 #include <linux/module.h>
  2 #include <linux/errno.h>
  3 #include <linux/interrupt.h>
  4 #include <linux/mm.h>
  5 #include <linux/fs.h>
  6 #include <linux/kernel.h>
  7 #include <linux/timer.h>
  8 #include <linux/genhd.h>
  9 #include <linux/hdreg.h>
 10 #include <linux/ioport.h>
 11 #include <linux/init.h>
 12 #include <linux/wait.h>
 13 #include <linux/blkdev.h>
 14 #include <linux/blkpg.h>
 15 #include <linux/delay.h>
 16 #include <linux/io.h>
 17 
 18 #include <asm/system.h>
 19 #include <asm/uaccess.h>
 20 #include <asm/dma.h>
 21 
 22 static DEFINE_SPINLOCK(memblock_lock);                // spinlock handed to blk_init_queue() together with the handler
 23 static request_queue_t * memblock_request;                // request queue of the RAM disk
 24 static struct gendisk   *memblock_disk;                  // gendisk describing the RAM disk
 25 static int memblock_major;                               // major number returned by register_blkdev()
 26 
 27 #define BLOCKBUF_SIZE               (1024*1024)              // disk size in bytes
 28 #define SECTOR_SIZE                   (512)                    // sector size in bytes
 29 static unsigned char   *block_buf;                              // memory that backs the disk
 30 
 31 
 32 static int memblock_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 33 {      /* Report a fixed, made-up CHS geometry; always returns 0. */
 34        geo->sectors   = BLOCKBUF_SIZE / SECTOR_SIZE / (2 * 32);  /* total sectors / (heads * cylinders) */
 35        geo->cylinders = 32;                                      /* number of cylinders */
 36        geo->heads     = 2;                                       /* number of heads */
 37        return 0;
 38 }
 39 
 40 static struct block_device_operations memblock_fops = {
 41        .owner    = THIS_MODULE,
 42        .getgeo   =  memblock_getgeo,                // reports the disk geometry (heads, cylinders, sectors)
 43 };
 44 
 45 /*
 46  * Request-queue handler: serves every pending request on q by copying
 47  * between the request's buffer and the in-memory disk (block_buf).
 48  */
 49 static void do_memblock_request (request_queue_t * q)
 50 {
 51         struct request *req;
 52 
 53         while ((req = elv_next_request(q)) != NULL)        /* fetch the next pending request */
 54         {
 55                 /* Byte offset into the disk and byte count of the current chunk. */
 56                 unsigned long offset = req->sector * SECTOR_SIZE;
 57                 unsigned long len = req->current_nr_sectors * SECTOR_SIZE;
 58 
 59                 if (rq_data_dir(req) == READ)
 60                 {
 61                         memcpy(req->buffer, block_buf + offset, len);   /* disk -> request buffer */
 62                 }
 63                 else
 64                 {
 65                         memcpy(block_buf + offset, req->buffer, len);   /* request buffer -> disk */
 66                 }
 67                 end_request(req, 1);                        /* finish this request, reporting success */
 68         }
 69 }
 70 
 71 /* Module entry point: builds and registers the RAM disk; returns 0 or a negative errno. */
 72 static int memblock_init(void)
 73 {
 74      memblock_major = register_blkdev(0, "memblock");     /* 1) major 0: let the kernel pick one */
 75      if (memblock_major < 0)
 76           return memblock_major;
 77      memblock_request = blk_init_queue(do_memblock_request, &memblock_lock);  /* 2) queue + handler */
 78      if (!memblock_request)
 79           goto err_unregister;
 80      memblock_disk = alloc_disk(16);     /* 3) 16 minors: the whole disk plus up to 15 partitions */
 81      if (!memblock_disk)
 82           goto err_queue;
 83      block_buf = kzalloc(BLOCKBUF_SIZE, GFP_KERNEL);      /* 5) zeroed memory that backs the disk */
 84      if (!block_buf)
 85           goto err_disk;
 86      /* 4) Fill in the gendisk: identity, operations, queue and capacity (in sectors). */
 87      memblock_disk->major = memblock_major;
 88      memblock_disk->first_minor = 0;
 89      sprintf(memblock_disk->disk_name, "memblock");
 90      memblock_disk->fops = &memblock_fops;
 91      memblock_disk->queue = memblock_request;
 92      set_capacity(memblock_disk, BLOCKBUF_SIZE/SECTOR_SIZE);
 93      add_disk(memblock_disk);            /* 6) publish the disk; everything above must be ready */
 94      return 0;
 95 err_disk:
 96      put_disk(memblock_disk);
 97 err_queue:
 98      blk_cleanup_queue(memblock_request);
 99 err_unregister:
100      unregister_blkdev(memblock_major, "memblock");
101      return -ENOMEM;
102 }
103 static void memblock_exit(void)
104 {
105       /* Module exit: release everything memblock_init() set up. */
106       del_gendisk(memblock_disk);       /* 1) remove the disk from the system first ... */
107       put_disk(memblock_disk);          /*    ... and only then drop the reference to it */
108       /* 2) Tear down the request queue before freeing the memory its handler copies to/from. */
109       blk_cleanup_queue(memblock_request);
110       /* 3) Free the memory that backed the disk. */
111       kfree(block_buf);
112 
113       /* 4) Give the major number back. */
114       unregister_blkdev(memblock_major,"memblock");
115 }
116 
117 module_init(memblock_init);    /* memblock_init() is the module's load-time entry point */
118 module_exit(memblock_exit);    /* memblock_exit() is the module's unload-time exit point */
119 MODULE_LICENSE("GPL");

6.2  不使用IO调度器

即不使用__make_request 这个系统指定的函数。其实从blk_init_queue()函数中也能看出来,系统使用了blk_queue_make_request(q, __make_request)这个函数,那么我们也可以使用这个函数来指定我们自己的策略函数,从而替换掉__make_request函数。那初始化request_queue的blk_init_queue函数也不需要了。

  1 #include<linux/init.h>
  2 #include<linux/module.h>
  3 #include<linux/genhd.h>
  4 #include<linux/fs.h>
  5 #include<linux/blkdev.h>
  6 #include<linux/bio.h>
  7  
  8 #define SIMP_BLKDEV_DISKNAME "simp_blkdev"              /* name given to the gendisk */
  9 #define SIMP_BLKDEV_DEVICEMAJOR COMPAQ_SMART2_MAJOR     /* NOTE(review): reuses an existing driver's major; never passed to register_blkdev() in this listing */
 10 #define SIMP_BLKDEV_BYTES (8*1024*1024)                 /* disk size in bytes */
 11  
 12  
 13  
 14 static DEFINE_SPINLOCK(rq_lock);                        /* NOTE(review): not referenced anywhere else in this listing */
 15 unsigned char simp_blkdev_data[SIMP_BLKDEV_BYTES];      /* statically allocated memory that backs the disk */
 16 static struct gendisk *simp_blkdev_disk;                /* the disk object */
 17 static struct request_queue *simp_blkdev_queue;//device's request queue
 18  
 19 struct block_device_operations simp_blkdev_fops = {     /* only the owner is set; no methods are provided */
 20     .owner = THIS_MODULE,
 21 };
 22 /*
 23  * make_request handler: services a bio directly by copying each of its
 24  * segments to or from simp_blkdev_data, then completes it. Always returns 0.
 25  */
 26 static int simp_blkdev_make_request(struct request_queue *q, struct bio *bio){
 27     struct bio_vec *seg;
 28     void *disk_pos;
 29     int idx;
 30     /* Fail a bio that would run past the end of the disk. */
 31     if( (bio->bi_sector << 9) + bio->bi_size > SIMP_BLKDEV_BYTES){
 32         printk(KERN_ERR SIMP_BLKDEV_DISKNAME ":bad request:block=%llu,count=%u\n",(unsigned long long)bio->bi_sector,bio->bi_size);
 33         bio_endio(bio,-EIO);
 34         return 0;
 35     }
 36     disk_pos = simp_blkdev_data + (bio->bi_sector << 9);   /* sector number -> byte offset */
 37     bio_for_each_segment(seg, bio, idx){
 38         unsigned long dir = bio_rw(bio);
 39         void *page_buf;
 40         if(dir != READ && dir != READA && dir != WRITE){
 41             printk(KERN_ERR SIMP_BLKDEV_DISKNAME ": unknown value of bio_rw: %lu\n",bio_rw(bio));
 42             bio_endio(bio,-EIO);
 43             return 0;
 44         }
 45         page_buf = kmap(seg->bv_page) + seg->bv_offset;    /* map the segment's page */
 46         if(dir == WRITE)
 47             memcpy(disk_pos, page_buf, seg->bv_len);       /* segment -> disk */
 48         else
 49             memcpy(page_buf, disk_pos, seg->bv_len);       /* disk -> segment (READ / READA) */
 50         kunmap(seg->bv_page);
 51         disk_pos += seg->bv_len;                           /* next segment continues where this one ended */
 52     }
 53     /* Every segment was copied: complete the bio without error. */
 54     bio_endio(bio,0);
 55     return 0;
 56 }
 57  
 58  
 59 /*
 60  * Module entry point: creates the request queue and the gendisk, then adds
 61  * the disk. Returns 0 on success or -ENOMEM when an allocation fails.
 62  */
 63 static int simp_blkdev_init(void){
 64     int ret;
 65     /* Allocate a bare queue and route bios to our own make_request handler. */
 66     simp_blkdev_queue = blk_alloc_queue(GFP_KERNEL);
 67     if(!simp_blkdev_queue){
 68         ret = -ENOMEM;
 69         goto error_alloc_queue;
 70     }
 71     blk_queue_make_request(simp_blkdev_queue, simp_blkdev_make_request);
 72     //alloc the resource of gendisk
 73     simp_blkdev_disk = alloc_disk(1);
 74     if(!simp_blkdev_disk){
 75         ret = -ENOMEM;
 76         goto error_alloc_disk;
 77     }
 78     //populate the gendisk structure
 79     strcpy(simp_blkdev_disk->disk_name,SIMP_BLKDEV_DISKNAME);
 80     simp_blkdev_disk->major = SIMP_BLKDEV_DEVICEMAJOR;
 81     simp_blkdev_disk->first_minor = 0;
 82     simp_blkdev_disk->fops = &simp_blkdev_fops;
 83     simp_blkdev_disk->queue = simp_blkdev_queue;
 84     set_capacity(simp_blkdev_disk,SIMP_BLKDEV_BYTES>>9);   /* bytes >> 9 = number of 512-byte sectors */
 85     add_disk(simp_blkdev_disk);
 86     printk("module simp_blkdev added.\n");
 87     return 0;
 88 error_alloc_disk:
 89     /* The queue was created but the disk was not: release the queue. */
 90     blk_cleanup_queue(simp_blkdev_queue);
 91 error_alloc_queue:
 92     /* Nothing was allocated yet, so there is nothing to undo. */
 93     return ret;
 94 }
 94 static void simp_blkdev_exit(void){          /* module exit: releases what simp_blkdev_init() set up */
 95     del_gendisk(simp_blkdev_disk);           /* remove the disk that add_disk() registered */
 96     put_disk(simp_blkdev_disk);              /* drop the reference obtained from alloc_disk() */
 97     blk_cleanup_queue(simp_blkdev_queue);    /* release the queue from blk_alloc_queue() */
 98     printk("module simp_blkdev romoved.\n");
 99 }
100  
101  
102  
103 module_init(simp_blkdev_init);    /* simp_blkdev_init() is the module's load-time entry point */
104 module_exit(simp_blkdev_exit);    /* simp_blkdev_exit() is the module's unload-time exit point */
105 MODULE_LICENSE("GPL");            /* declare the license so that loading the module does not taint the kernel */

参考博文:

https://blog.csdn.net/qq_16933601/article/details/103553403

https://www.cnblogs.com/lifexy/p/7661454.html

posted @ 2020-06-27 09:58  Action_er  阅读(1709)  评论(0编辑  收藏  举报