挂个文件系统
每个文件系统都是一个独立的世界,统管着属于自己的文件们。如果你想进入这世界瞧一瞧,要准备一扇门(文件夹),然后施展魔法(mount命令),门的另一边便通向了新文件系统。
/*
 * mount(2) system-call entry point.
 *
 * Copies the filesystem type, mount directory, device name and option data
 * from user space into kernel memory, passes them to do_mount(), then
 * releases the kernel copies in reverse order of acquisition.
 *
 * Returns do_mount()'s result, or the error from whichever copy step failed.
 */
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
char __user *, type, unsigned long, flags, void __user *, data)
{
int ret;
char *kernel_type;
char *kernel_dir;
char *kernel_dev;
unsigned long data_page;
ret = copy_mount_string(type, &kernel_type);
if (ret < 0)
goto out_type;
kernel_dir = getname(dir_name); // copy the mount-point path in from user space
if (IS_ERR(kernel_dir)) {
ret = PTR_ERR(kernel_dir);
goto out_dir;
}
ret = copy_mount_string(dev_name, &kernel_dev); // device path
/**
 * "copy_mount_string --> strndup_user --> memdup_user"
 *
 * Every user-to-kernel copy involves two necessary steps:
 * void *memdup_user(const void __user *src, size_t len)
 * {
 * void *p;
 *
 * p = kmalloc_track_caller(len, GFP_KERNEL); // step 1: allocate kernel memory
 * if (!p)
 * return ERR_PTR(-ENOMEM);
 *
 * if (copy_from_user(p, src, len)) { // step 2: copy the bytes in from user space
 * kfree(p);
 * return ERR_PTR(-EFAULT);
 * }
 *
 * return p;
 * }
 */
if (ret < 0)
goto out_dev;
ret = copy_mount_options(data, &data_page); // fetch the option data into data_page
if (ret < 0)
goto out_data;
/* Everything above only gathers the arguments for do_mount(). */
ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags, (void *)data_page); // --> see do_mount()
free_page(data_page);
/* Each label below undoes one acquisition; a failure jumps in at the right depth. */
out_data:
kfree(kernel_dev);
out_dev:
putname(kernel_dir);
out_dir:
kfree(kernel_type);
out_type:
return ret;
}
mount正式开始:
/*
 * Validate a mount request, translate the user-visible MS_* flags into
 * per-mount MNT_* flags, and dispatch to the handler for the requested
 * operation (remount, bind, propagation change, move, or new mount).
 *
 * @dev_name:  device name, passed through to the handlers
 * @dir_name:  mount-point path; must be non-empty and NUL-terminated
 *             within PAGE_SIZE bytes
 * @type_page: filesystem type name
 * @flags:     MS_* flags from the caller
 * @data_page: option data; its last byte is forced to 0 when non-NULL
 *
 * Returns -EINVAL for a bad dir_name, the error from kern_path() or
 * security_sb_mount(), or otherwise the result of the chosen handler.
 */
long do_mount(char *dev_name, char *dir_name, char *type_page,
unsigned long flags, void *data_page)
{
struct path path;
int retval = 0;
int mnt_flags = 0;
/* Discard magic */
if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
flags &= ~MS_MGC_MSK;
/* Basic sanity checks */
if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE)) // the path must contain its NUL within one page
return -EINVAL;
/**
 * void *memchr(const void *s, int c, size_t n)
 * {
 * const unsigned char *p = s;
 * while (n-- != 0) {
 * if ((unsigned char)c == *p++) { // look for the terminating NUL
 * return (void *)(p - 1);
 * }
 * }
 * return NULL;
 * }
 */
/* Guarantee the option page is NUL-terminated. */
if (data_page)
((char *)data_page)[PAGE_SIZE - 1] = 0;
/* ... and get the mountpoint */
retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
if (retval)
return retval;
retval = security_sb_mount(dev_name, &path,
type_page, flags, data_page);
if (retval)
goto dput_out;
/* Default to relatime unless overridden */
if (!(flags & MS_NOATIME))
mnt_flags |= MNT_RELATIME;
/* Separate the per-mountpoint flags (security-related; the details get fairly involved) */
if (flags & MS_NOSUID)
mnt_flags |= MNT_NOSUID;
if (flags & MS_NODEV)
mnt_flags |= MNT_NODEV;
if (flags & MS_NOEXEC)
mnt_flags |= MNT_NOEXEC;
if (flags & MS_NOATIME)
mnt_flags |= MNT_NOATIME;
if (flags & MS_NODIRATIME)
mnt_flags |= MNT_NODIRATIME;
/* Must come after the atime flags above: strictatime clears both of them. */
if (flags & MS_STRICTATIME)
mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
if (flags & MS_RDONLY)
mnt_flags |= MNT_READONLY;
/* Clear the flags handled above along with MS_ACTIVE, MS_BORN, MS_RELATIME and MS_KERNMOUNT. */
flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
MS_STRICTATIME);
if (flags & MS_REMOUNT)
retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
data_page); // change the options of an already mounted filesystem
else if (flags & MS_BIND)
retval = do_loopback(&path, dev_name, flags & MS_REC); // MS_BIND (bind mount) is handled by do_loopback() -->
else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
retval = do_change_type(&path, flags);
else if (flags & MS_MOVE)
retval = do_move_mount(&path, dev_name);
else
retval = do_new_mount(&path, type_page, flags, mnt_flags, // create a brand-new mount
dev_name, data_page);
dput_out:
/* Drop the reference on the mount-point path taken by kern_path(). */
path_put(&path);
return retval;
}
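这里出现了个loopback。需要说明的是,do_loopback处理的是MS_BIND,也就是bind mount(mount --bind),和/dev/loopN这种回接设备并不是一回事;而mount -o loop是由用户态的mount命令先用losetup关联好回接设备,再走普通的do_new_mount流程。不过既然名字相近,就顺便认识一下回接设备。何为回接设备,一种机制的体现,什么机制?将一个普通文件当作块设备用。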
[jesse@localhost linux-3.0]$ ls /dev/loop
loop0 loop1 loop2 loop3 loop4 loop5 loop6 loop7
新建文件blkfile
[root@localhost test]# dd if=/dev/zero of=./blkfile bs=1k count=100
100+0 records in
100+0 records out
102400 bytes (102 kB) copied, 0.00113843 s, 89.9 MB/s
[root@localhost test]# ll
total 100
-rw-rw-r--. 1 jesse jesse 102400 Nov 13 11:35 blkfile
选loop1回接
[root@localhost test]# losetup /dev/loop1 ./blkfile
直接格式化blkfile当然不行,not a block special device...
[root@localhost test]# mkfs -t ext2 ./blkfile 100
mke2fs 1.41.12 (17-May-2010)
./blkfile is not a block special device.
格式化loop1,其实操作的是blkfile
[root@localhost test]# mkfs -t ext2 /dev/loop1 100
mke2fs 1.41.12 (17-May-2010)
Filesystem label=
OS type: Linux
Block size=1024 (log=0)
Fragment size=1024 (log=0)
Stride=0 blocks, Stripe width=0 blocks
16 inodes, 100 blocks
5 blocks (5.00%) reserved for the super user
First data block=1
1 block group
8192 blocks per group, 8192 fragments per group
16 inodes per group
Writing inode tables: done
Writing superblocks and filesystem accounting information: done
This filesystem will be automatically checked every 25 mounts or
180 days, whichever comes first. Use tune2fs -c or -i to override.
结论:看来回接至少能将一个普通文件当块设备来用。然后挂载到/mnt,哇!就这么有了自定义的子系统。
然后,恍然大悟:
/backup/iso/rhel6.1.iso /rhel6 iso9660 loop 0 0
呵呵~
----------------------------------------------------------------------------------------------------------------------------------
了解了回接设备,我们返回正题。
//其实,我们更关心这个
retval = do_new_mount(&path, type_page, flags, mnt_flags,
dev_name, data_page);
/*
* create a new mount for userspace and request it to be added into the
* namespace's tree
*/
static int do_new_mount(struct path *path, char *type, int flags,
int mnt_flags, char *name, void *data)
{
struct vfsmount *mnt;
int err;
if (!type)
return -EINVAL;
/* we need capabilities... */
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
mnt = do_kern_mount(type, flags, name, data); //-->
if (IS_ERR(mnt))
return PTR_ERR(mnt);
err = do_add_mount(mnt, path, mnt_flags); //
if (err)
mntput(mnt);
return err;
}
-->
/*
 * Look up the filesystem type named @fstype and have vfs_kern_mount() build
 * a struct vfsmount for it.
 *
 * Returns ERR_PTR(-ENODEV) when the type is unknown, otherwise the result
 * of vfs_kern_mount() (possibly passed through fs_set_subtype()).
 *
 * NOTE(review): the "View Code" label and the struct vfsmount definition in
 * the middle of this listing appear to be an inline reference excerpt from
 * the article, not part of the function body.
 */
struct vfsmount *
do_kern_mount(const char *fstype, int flags, const char *name, void *data)
{
struct file_system_type *type = get_fs_type(fstype);
struct vfsmount *mnt; // the mount being built (a filesystem attached to a directory node)
if (!type)
return ERR_PTR(-ENODEV);
mnt = vfs_kern_mount(type, flags, name, data); // --> returns the new struct vfsmount for this filesystem type
View Code
struct vfsmount {
struct list_head mnt_hash;
struct vfsmount *mnt_parent; /* fs we are mounted on */
struct dentry *mnt_mountpoint; /* dentry of mountpoint */
struct dentry *mnt_root; /* root of the mounted tree */
struct super_block *mnt_sb; /* pointer to superblock */
#ifdef CONFIG_SMP
struct mnt_pcp __percpu *mnt_pcp;
atomic_t mnt_longterm; /* how many of the refs are longterm */
#else
int mnt_count;
int mnt_writers;
#endif
struct list_head mnt_mounts; /* list of children, anchored here */
struct list_head mnt_child; /* and going through their mnt_child */
int mnt_flags;
/* 4 bytes hole on 64bits arches without fsnotify */
#ifdef CONFIG_FSNOTIFY
__u32 mnt_fsnotify_mask;
struct hlist_head mnt_fsnotify_marks;
#endif
const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
struct list_head mnt_list;
struct list_head mnt_expire; /* link in fs-specific expiry list */
struct list_head mnt_share; /* circular list of shared mounts */
struct list_head mnt_slave_list;/* list of slave mounts */
struct list_head mnt_slave; /* slave list entry */
struct vfsmount *mnt_master; /* slave is on master->mnt_slave_list */
struct mnt_namespace *mnt_ns; /* containing namespace */
int mnt_id; /* mount identifier */
int mnt_group_id; /* peer group identifier */
int mnt_expiry_mark; /* true if marked for expiry */
int mnt_pinned;
int mnt_ghosts;
};
/* Only for types flagged FS_HAS_SUBTYPE whose superblock has no subtype yet. */
if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
!mnt->mnt_sb->s_subtype)
mnt = fs_set_subtype(mnt, fstype);
/* presumably balances the reference taken by get_fs_type() -- TODO confirm */
put_filesystem(type);
return mnt;
}
-->
struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
struct vfsmount *mnt;
struct dentry *root;
if (!type)
return ERR_PTR(-ENODEV);
mnt = alloc_vfsmnt(name); //申请 struct vfsmount 空间,然后便是填充
if (!mnt)
return ERR_PTR(-ENOMEM);
if (flags & MS_KERNMOUNT)
mnt->mnt_flags = MNT_INTERNAL;
root = mount_fs(type, flags, name, data); //-->
if (IS_ERR(root)) {
free_vfsmnt(mnt);
return ERR_CAST(root);
}
mnt->mnt_root = root;
mnt->mnt_sb = root->d_sb;
mnt->mnt_mountpoint = mnt->mnt_root;
mnt->mnt_parent = mnt;
return mnt;
}
-->
/*
 * Abridged listing of mount_fs(): the "..." lines stand for code the article
 * leaves out (including the declarations of root/error and the
 * out_free_secdata label). The part shown is the dispatch into the
 * filesystem-specific mount callback.
 */
struct dentry *
mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
{
...
root = type->mount(type, flags, name, data); // call back into the concrete filesystem's mount()
if (IS_ERR(root)) {
error = PTR_ERR(root);
goto out_free_secdata;
}
...
}
看到一个回调函数:
root = type->mount(type, flags, name, data);
内核支持的文件系统不止一种,每一种文件系统都该有个结构体来描述:
-- include/linux/fs.h --
/*
 * Describes one filesystem implementation known to the kernel. Instances
 * are chained through ->next and searched by name in find_filesystem().
 */
struct file_system_type {
const char *name; /* name matched by find_filesystem(), e.g. "ext2" */
int fs_flags; /* FS_* flags, e.g. FS_REQUIRES_DEV, FS_HAS_SUBTYPE */
/* mount callback invoked from mount_fs(); returns the root dentry or an ERR_PTR */
struct dentry *(*mount) (struct file_system_type *, int,
const char *, void *);
void (*kill_sb) (struct super_block *);
struct module *owner; /* module pinned with try_module_get() during lookup */
struct file_system_type * next; /* next entry in the registered-filesystem list */
struct list_head fs_supers;
/* lock class keys; presumably used for lock-dependency annotation -- TODO confirm */
struct lock_class_key s_lock_key;
struct lock_class_key s_umount_key;
struct lock_class_key s_vfs_rename_key;
struct lock_class_key i_lock_key;
struct lock_class_key i_mutex_key;
struct lock_class_key i_mutex_dir_key;
struct lock_class_key i_alloc_sem_key;
};
回调函数调的是哪个文件系统的mount,这就取决于type->mount中的type。而type怎么来的,当然是find出来的咯。
/*
 * Call-chain sketch (not compilable code): how do_kern_mount() turns the
 * filesystem name into a struct file_system_type. Each "|" appears to step
 * one level down the chain: do_kern_mount -> get_fs_type -> __get_fs_type.
 */
struct vfsmount *
do_kern_mount(const char *fstype, int flags, const char *name, void *data)
{
struct file_system_type *type = get_fs_type(fstype);
|
fs = __get_fs_type(name, len);
|
read_lock(&file_systems_lock);
fs = *(find_filesystem(name, len)); // --> look the name up, see find_filesystem()
if (fs && !try_module_get(fs->owner)) // take a reference on the owning module; drop the match if that fails
fs = NULL;
read_unlock(&file_systems_lock);
... ...
}
-->
/*
 * Walk the singly linked list headed by file_systems looking for an entry
 * whose name has length @len and equals the first @len bytes of @name.
 *
 * Returns the address of the link pointing at the match, or the address of
 * the terminating NULL link when nothing matches. The result is never NULL.
 */
static struct file_system_type **find_filesystem(const char *name, unsigned len)
{
	struct file_system_type **link = &file_systems;

	while (*link) {
		const char *candidate = (*link)->name;

		if (strlen(candidate) == len && strncmp(candidate, name, len) == 0)
			return link;
		link = &(*link)->next;
	}
	return link;
}
-->
新添加个文件系统
-- fs/ext2/super.c --
/*
 * ext2 init routine: set up xattr support and the inode cache, then
 * register ext2_fs_type. On failure the steps already completed are undone
 * in reverse order.
 *
 * Returns 0 on success or the error from the step that failed.
 */
static int __init init_ext2_fs(void)
{
int err = init_ext2_xattr();
if (err)
return err;
err = init_inodecache();
if (err)
goto out1;
err = register_filesystem(&ext2_fs_type); // add ext2 to the list of registered filesystems
if (err)
goto out;
return 0;
out:
destroy_inodecache();
out1:
exit_ext2_xattr();
return err;
}
说到底,要看的其实是具体fs的mount函数。获得struct vfsmount之后,开始正式挂载上去。
mnt = do_kern_mount(type, flags, name, data);
if (IS_ERR(mnt))
return PTR_ERR(mnt);
err = do_add_mount(mnt, path, mnt_flags); //-->
if (err)
mntput(mnt);
-->
/*
 * Add the mount @newmnt into a namespace's mount tree at @path.
 *
 * Returns lock_mount()'s error, -EINVAL when the target mount fails
 * check_mnt() (unless MNT_SHRINKABLE is set) or when the new root is a
 * symlink, -EBUSY when the same filesystem is already mounted on this
 * mount point, or otherwise graft_tree()'s result.
 */
static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
{
	int rc;

	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);

	rc = lock_mount(path);
	if (rc)
		return rc;

	if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) {
		rc = -EINVAL;
	} else if (path->mnt->mnt_sb == newmnt->mnt_sb &&
		   path->mnt->mnt_root == path->dentry) {
		/* Refuse the same filesystem on the same mount point. */
		rc = -EBUSY;
	} else if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode)) {
		rc = -EINVAL;
	} else {
		newmnt->mnt_flags = mnt_flags;
		/* --> graft the new mount onto the mount tree */
		rc = graft_tree(newmnt, path);
	}

	unlock_mount(path);
	return rc;
}
一个新的文件系统就这么挂了上去,只是大概的流程,要具体到文件系统,就要从回调mount入手。
-- fs/ext2/super.c --
/*
 * ext2 init routine, repeated to show what gets registered.
 *
 * NOTE(review): the ext2_fs_type definition in the middle of this listing
 * appears to be an inline excerpt showing the object passed to
 * register_filesystem(), not part of the function body.
 */
static int __init init_ext2_fs(void)
{
int err = init_ext2_xattr();
if (err)
return err;
err = init_inodecache();
if (err)
goto out1;
err = register_filesystem(&ext2_fs_type);
static struct file_system_type ext2_fs_type = {
.owner = THIS_MODULE,
.name = "ext2",
.mount = ext2_mount, // --> the mount callback, see ext2_mount()
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
};
if (err)
goto out;
return 0;
/* Undo the completed steps in reverse order. */
out:
destroy_inodecache();
out1:
exit_ext2_xattr();
return err;
}
ext2的挂载:
/*
 * ext2's mount callback (the .mount member of ext2_fs_type): delegates to
 * mount_bdev(), supplying ext2_fill_super as the superblock fill routine.
 * Returns whatever mount_bdev() returns.
 */
static struct dentry *ext2_mount(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data)
{
	struct dentry *root;

	root = mount_bdev(fs_type, flags, dev_name, data, ext2_fill_super);
	return root;
}
mount_bdev函数首先
/* open a block device by name */
struct block_device *bdev = blkdev_get_by_path();
然后
/* find or create a superblock */
struct super_block *s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
获得了超级块指针,如果s->s_root,也就是该文件系统的根目录项(root dentry)为空的话,说明这个超级块是新建的,那就要填充ext2的超级块(ext2_fill_super),这涉及到ext2文件系统格式的理解。
挂上文件系统后意味着什么?意味着struct file_operations,struct inode_operations挂上了具体的操作函数。
读取一个文件由vfs层的read调到ext2的read,又调到具体的磁盘驱动;又或者是嵌入式,先调到mtd的read,最后调到nand驱动的read,根据nand datasheet的时序控制gpio引脚,发送cmd,接收数据。:-)
最后,来个山寨小实验:
增加个自己的文件系统,ext2_jes。其实就是个ext2的副本。
第一步:
[jesse@localhost linux-2.6.39]$ vim fs/ext2/
acl.c ialloc.c super.c xattr_user.c
acl.h inode.c symlink.c xip.c
balloc.c ioctl.c xattr.c xip.h
dir.c Kconfig xattr.h
ext2.h Makefile xattr_security.c
file.c namei.c xattr_trusted.c
[jesse@localhost linux-2.6.39]$ vim include/linux/
ext2_fs.h ext2_fs_sb.h
以上部分果断山寨一份。
[jesse@localhost linux-2.6.39]$ vim fs/ext2_jes/
acl.c ialloc.c super.c xattr_user.c
acl.h inode.c symlink.c xip.c
balloc.c ioctl.c xattr.c xip.h
dir.c Kconfig xattr.h
ext2.h Makefile xattr_security.c
file.c namei.c xattr_trusted.c
[jesse@localhost linux-2.6.39]$ vim include/linux/
ext2_fs_jes.h ext2_fs_sb_jes.h
当然,好的山寨必将是彻底的,将fs/ext2_jes/里内容里的所有ext2改为ext2_jes。看来你需要一个脚本。
# Rename every "ext2" to "ext2_jes" in the copied .c sources.
# The result is written to a temp file first and only moved over the
# original when sed succeeds. Run this once only: a second run would turn
# "ext2_jes" into "ext2_jes_jes".
for f in fs/ext2_jes/*.c; do
    sed 's/ext2/ext2_jes/g' "$f" > "${f}_tmp" && mv "${f}_tmp" "$f"
done
*.h文件同理。
要记得大写的EXT2也要改哦。
第二步:
修改Makefile和Kconfig,照猫画虎即可。
第三步:
编译过程中,可能会不断报错,原因大多是缺少函数,复制对应的然后粘贴即可。
最后还要改一下magic,起个吉利的8888作为新的magic值。
-- include/linux/magic.h --
#define EXT2_SUPER_MAGIC 0xEF53
#define EXT2_JES_SUPER_MAGIC 0x8888
编译好后,app作个测试,用到之前说到的回接:
#dd if=/dev/zero of=jesfs bs=1M count=1
#mkfs.ext2 jesfs
/**
* 注意,这里的mkfs.ext2生成的仍然是ext2格式,镜像里的magic还是0xEF53,需要手工改掉:
* 用vim以二进制方式打开jesfs(例如 vim -b jesfs 后执行 :%!xxd),
* 找到超级块里的magic(位于文件偏移0x438处,小端存储为 53 EF),改为0x8888(88 88)即可
*/
#mount -t ext2_jes -o loop ./jesfs /mnt