linux文件系统初探--Day8~11

Day8中并没有代码的更新,只是介绍了一下struct file,以及open等系统调用关联文件系统中的很多操作。struct file保存内核看到的文件的特征信息。

Day9中实现了支持page cache的文件操作,重点涉及file_operations的实现以及页缓存的相关知识。

file_operations

file_operations的定义如下:

/*
 * Table of function pointers describing the operations that can be
 * performed on an open file (struct file).  A filesystem fills in the
 * members it supports.  Quoted from the kernel headers for reference.
 */
struct file_operations {
	struct module *owner;
	/* Reposition the current read/write offset of the file. */
	loff_t (*llseek) (struct file *, loff_t, int);
	/* Read/write data: file, user-space buffer, byte count, file offset. */
	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
	/* Per this article: the newer names of the old aio_read/aio_write members. */
	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
	int (*iterate) (struct file *, struct dir_context *);
	int (*iterate_shared) (struct file *, struct dir_context *);
	__poll_t (*poll) (struct file *, struct poll_table_struct *);
	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
	/* Map the file contents into a virtual address space (vm_area_struct). */
	int (*mmap) (struct file *, struct vm_area_struct *);
	unsigned long mmap_supported_flags;
	int (*open) (struct inode *, struct file *);
	int (*flush) (struct file *, fl_owner_t id);
	int (*release) (struct inode *, struct file *);
	/* Synchronize in-memory data with the storage medium. */
	int (*fsync) (struct file *, loff_t, loff_t, int datasync);
	int (*fasync) (int, struct file *, int);
	int (*lock) (struct file *, int, struct file_lock *);
	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
	int (*check_flags)(int);
	int (*flock) (struct file *, int, struct file_lock *);
	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
	int (*setlease)(struct file *, long, struct file_lock **, void **);
	long (*fallocate)(struct file *file, int mode, loff_t offset,
			  loff_t len);
	void (*show_fdinfo)(struct seq_file *m, struct file *f);
	/* Only present when the kernel is built without CONFIG_MMU. */
#ifndef CONFIG_MMU
	unsigned (*mmap_capabilities)(struct file *);
#endif
	ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
			loff_t, size_t, unsigned int);
	int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t,
			u64);
	int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t,
			u64);
	int (*fadvise)(struct file *, loff_t, loff_t, int);
} __randomize_layout;

而下面是file.c中的实现:

/*
 * File operations of the sample filesystem as written in the original
 * file.c.  NOTE(review): several members below (.aio_read, .aio_write,
 * .sendfile) do not exist in the struct file_operations quoted above,
 * so this table targets an older kernel API and is shown here as the
 * starting point for porting.
 */
struct file_operations sfs_file_operations = {
	.read           = do_sync_read,
	/* Old-kernel member name; the struct above calls this read_iter. */
	.aio_read	= generic_file_aio_read,
	.write          = do_sync_write,
	/* Old-kernel member name; the struct above calls this write_iter. */
	.aio_write	= generic_file_aio_write,
	.mmap           = generic_file_mmap,
	.fsync          = simple_sync_file,
	/* No counterpart in the struct above; presumably removed upstream. */
	.sendfile       = generic_file_sendfile,
	.llseek         = generic_file_llseek,
};

read 和 write 分别负责读写数据,参数分别为文件对象(struct file *)、数据缓冲区、数据大小和文件偏移量。这里追一下read_write.c中read系统调用的代码就会发现,do_sync_read可用new_sync_read替代,do_sync_write类似。而且我们发现,实际上new_sync_read也调用了generic_file_read_iter;同时查看了ext2、ext4和通用块设备的file_operations,发现它们都没有设置 read 和 write 成员,所以我们在实验中也不设置 read 和 write。

aio_read 和 aio_write 用于异步读写操作,在新版本内核中更名为 read_iter 和 write_iter,对应的generic函数在filemap.c中,更名为 generic_file_read_iter 和 generic_file_write_iter。这两个函数在这里不做具体分析。

mmap将文件内容映射到虚拟地址空间,减少了一次拷贝操作。举例来说,通常情况下,一个文件写操作需要经过以下几次拷贝:用户态->内核态->写文件,而mmap则省略了用户态到内核态的拷贝,可以直接从用户态访问具体文件。具体实现是generic_file_mmap,没变化。

fsync用于同步内存中和存储介质上的数据。这里不妨使用libfs.c中的generic_file_fsync。

sendfile这个成员函数在新版本内核中被去掉了。关于sendfile的相关知识,可以查看这篇文章;总体上来讲,就是绕过用户态与内核态之间的切换和复制,直接用DMA把数据从一个内核缓冲区传到另一个内核缓冲区。所以在本次实验中直接去掉。

llseek很简单,就是调整文件描述符到指定的位置,改变当前的读写位置。generic_file_llseek在read_write.c中,具体代码在这里不细说了。

address_space

个人理解是:地址空间是内核针对缓存的一个统一抽象。

地址空间是内核中最关键的数据结构之一,可以认为是内核最基本的抽象机制之一,其重要性堪比进程、文件等抽象结构。

address_space定义如下:

/*
 * Describes the set of cached pages belonging to one owner (an inode or
 * a block device, see `host`) together with the operations table used
 * to manipulate those pages (`a_ops`).  Quoted from the kernel headers
 * for reference.
 */
struct address_space {
	struct inode		*host;		/* owner: inode, block_device */
	struct radix_tree_root	i_pages;	/* cached pages */
	atomic_t		i_mmap_writable;/* count VM_SHARED mappings */
	struct rb_root_cached	i_mmap;		/* tree of private and shared mappings */
	struct rw_semaphore	i_mmap_rwsem;	/* protect tree, count, list */
	/* Protected by the i_pages lock */
	unsigned long		nrpages;	/* number of total pages */
	/* number of shadow or DAX exceptional entries */
	unsigned long		nrexceptional;
	pgoff_t			writeback_index;/* writeback starts here */
	const struct address_space_operations *a_ops;	/* methods */
	unsigned long		flags;		/* error bits */
	spinlock_t		private_lock;	/* for use by the address_space */
	gfp_t			gfp_mask;	/* implicit gfp mask for allocations */
	struct list_head	private_list;	/* for use by the address_space */
	void			*private_data;	/* ditto */
	errseq_t		wb_err;
} __attribute__((aligned(sizeof(long)))) __randomize_layout;

host保存当前地址空间的所有者,inode或者是块设备。

i_pages指向一个基数树的根,这个基数树列出了当前地址空间中所有的物理内存页。

nrpages保存缓存页的总数。

a_ops指向关于address_space的操作结构,定义了一些处理地址空间的特定操作。下文详述。

其他的内容我们目前不关注。必须指出的是,新内核中移除了backing_dev_info这个成员。该成员主要保存后备存储器的设备信息,而后备存储器指定了地址空间中页的数据来源;这部分功能在新内核中是怎样实现的还不清楚。

address_space_operations

address_space_operations定义如下:

/*
 * Table of callbacks that operate on the pages of an address_space
 * (reached through address_space.a_ops).  Quoted from the kernel
 * headers for reference.
 */
struct address_space_operations {
	int (*writepage)(struct page *page, struct writeback_control *wbc);
	/* Per this article: read one page from backing storage into a page frame. */
	int (*readpage)(struct file *, struct page *);

	/* Write back some dirty pages from this mapping. */
	int (*writepages)(struct address_space *, struct writeback_control *);

	/* Set a page dirty.  Return true if this dirtied it */
	int (*set_page_dirty)(struct page *page);

	/*
	 * Reads in the requested pages. Unlike ->readpage(), this is
	 * PURELY used for read-ahead!.
	 */
	int (*readpages)(struct file *filp, struct address_space *mapping,
			struct list_head *pages, unsigned nr_pages);

	/*
	 * Per this article: the two halves of a write triggered by the write
	 * system call; they must be called as a pair and in this order.
	 */
	int (*write_begin)(struct file *, struct address_space *mapping,
				loff_t pos, unsigned len, unsigned flags,
				struct page **pagep, void **fsdata);
	int (*write_end)(struct file *, struct address_space *mapping,
				loff_t pos, unsigned len, unsigned copied,
				struct page *page, void *fsdata);

	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
	sector_t (*bmap)(struct address_space *, sector_t);
	void (*invalidatepage) (struct page *, unsigned int, unsigned int);
	int (*releasepage) (struct page *, gfp_t);
	void (*freepage)(struct page *);
	ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
	/*
	 * migrate the contents of a page to the specified target. If
	 * migrate_mode is MIGRATE_ASYNC, it must not block.
	 */
	int (*migratepage) (struct address_space *,
			struct page *, struct page *, enum migrate_mode);
	bool (*isolate_page)(struct page *, isolate_mode_t);
	void (*putback_page)(struct page *);
	int (*launder_page) (struct page *);
	int (*is_partially_uptodate) (struct page *, unsigned long,
					unsigned long);
	void (*is_dirty_writeback) (struct page *, bool *, bool *);
	int (*error_remove_page)(struct address_space *, struct page *);

	/* swapfile support */
	int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
				sector_t *span);
	void (*swap_deactivate)(struct file *file);
};

在file.c中,具体实现如下:

/*
 * Address-space operations of the sample filesystem: only page read and
 * the write_begin/write_end pair are provided; every other callback is
 * left unset.
 */
struct address_space_operations sfs_aops = {
	.readpage       = simple_readpage,
	.write_begin	= simple_write_begin,
	.write_end	= simple_write_end
};

readpage从存储器将一页读入页帧。

write_begin 和 write_end 执行由write系统调用触发的写操作:write_begin将事务数据存储到日志,write_end执行实际的写操作。在写入时,内核必须保证两个函数成对使用,并且顺序正确,否则日志机制会失效。当前将写操作划分为两部分已经成为一个约定俗成的传统。

注意,原来的老版本代码中 write_begin 和 write_end 分别命名为 prepare_write 和 commit_write。

实验结果

可见,基本的文件夹创建、文件读写操作目前都已经支持。

Day10 11

samplefs整体已经完成构建,在day10和11中也只是对address_space_operations中的 readpages 和 writepages 进行了介绍,以及在inode_operations中加入了对硬链接和软链接的处理。

参考资料

linux文件系统二 VFS读写流程
linux内核 address_space 结构

posted @ 2021-02-03 15:37  xinze  阅读(281)  评论(0)  编辑  收藏  举报