挂个文件系统

每个文件系统都是一个独立的世界，统管着属于自己的文件们。如果你想进入这世界瞧一瞧，要准备一扇门（文件夹），然后施展魔法（mount命令），门的另一边便通向了新文件系统。

/*
 * mount(2) entry point.
 *
 * Copies each user-space argument into kernel memory, passes the kernel
 * copies to do_mount(), then releases them in the reverse order of
 * acquisition.
 *
 * Returns the do_mount() result, or the negative error of the first
 * argument copy that failed.
 */
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
            char __user *, type, unsigned long, flags, void __user *, data)
{
    int ret;
    char *kernel_type;
    char *kernel_dir;
    char *kernel_dev;
    unsigned long data_page;

    /* filesystem type name */
    ret = copy_mount_string(type, &kernel_type);
    if (ret < 0)
        goto out_type;

    /* path of the directory the device is to be mounted on */
    kernel_dir = getname(dir_name);
    if (IS_ERR(kernel_dir)) {
        ret = PTR_ERR(kernel_dir);
        goto out_dir;
    }

/**
 *    "copy_mount_string --> strndup_user --> memdup_user"
 *
 *    Every user-to-kernel copy involves the same two steps:
 *    void *memdup_user(const void __user *src, size_t len)
 *    {
 *        void *p;
 *
 *        p = kmalloc_track_caller(len, GFP_KERNEL);    // 1. allocate kernel memory
 *        if (!p)
 *            return ERR_PTR(-ENOMEM);
 *
 *        if (copy_from_user(p, src, len)) {        // 2. copy the bytes from user space
 *            kfree(p);
 *            return ERR_PTR(-EFAULT);
 *        }
 *
 *        return p;
 *    }
 */

    /* device path */
    ret = copy_mount_string(dev_name, &kernel_dev);
    if (ret < 0)
        goto out_dev;

    /* mount options, returned as a page address in data_page */
    ret = copy_mount_options(data, &data_page);
    if (ret < 0)
        goto out_data;

    /* everything above only exists to build the arguments of do_mount() */
    ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags, (void *)data_page);    //-->

    /* each label below undoes the acquisitions that succeeded before the failing step */
    free_page(data_page);
out_data:
    kfree(kernel_dev);
out_dev:
    putname(kernel_dir);
out_dir:
    kfree(kernel_type);
out_type:
    return ret;
}

mount正式开始：

/*
 * Core of mount(2), operating on kernel copies of the arguments.
 *
 * dev_name:  device/source name; handed to the security hook and to the
 *            bind, move and new-mount helpers
 * dir_name:  mount point path; must be non-empty and NUL-terminated within
 *            PAGE_SIZE bytes
 * type_page: filesystem type name; handed to the security hook and to
 *            do_new_mount()
 * flags:     MS_* flags as passed by the caller
 * data_page: filesystem-specific option page, may be NULL
 *
 * Returns -EINVAL for a bad dir_name; otherwise the error from the path
 * lookup or the security hook, or the result of the selected operation.
 */
long do_mount(char *dev_name, char *dir_name, char *type_page,
          unsigned long flags, void *data_page)
{
    struct path path;
    int retval = 0;
    int mnt_flags = 0;

    /* Discard magic */
    if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
        flags &= ~MS_MGC_MSK;

    /* Basic sanity checks: the string must terminate within one page */
    if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
        return -EINVAL;
/**
 *    void *memchr(const void *s, int c, size_t n)
 *    {
 *        const unsigned char *p = s;
 *        while (n-- != 0) {
 *            if ((unsigned char)c == *p++) {    // with c == 0 this finds the end of the string
 *                return (void *)(p - 1);
 *            }
 *        }
 *        return NULL;
 *    }
 */

    /* force termination of the option page */
    if (data_page)
        ((char *)data_page)[PAGE_SIZE - 1] = 0;

    /* ... and get the mountpoint */
    retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
    if (retval)
        return retval;

    retval = security_sb_mount(dev_name, &path,
                   type_page, flags, data_page);
    if (retval)
        goto dput_out;

    /* Default to relatime unless overridden */
    if (!(flags & MS_NOATIME))
        mnt_flags |= MNT_RELATIME;

    /* Separate the per-mountpoint flags (restriction options; the details get fairly involved) */
    if (flags & MS_NOSUID)
        mnt_flags |= MNT_NOSUID;
    if (flags & MS_NODEV)
        mnt_flags |= MNT_NODEV;
    if (flags & MS_NOEXEC)
        mnt_flags |= MNT_NOEXEC;
    if (flags & MS_NOATIME)
        mnt_flags |= MNT_NOATIME;
    if (flags & MS_NODIRATIME)
        mnt_flags |= MNT_NODIRATIME;
    if (flags & MS_STRICTATIME)
        mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
    if (flags & MS_RDONLY)
        mnt_flags |= MNT_READONLY;

    /* these bits were either translated into mnt_flags above or are not accepted from the caller */
    flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
           MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
           MS_STRICTATIME);

    /* Dispatch: the first matching flag decides which operation runs */
    if (flags & MS_REMOUNT)
        retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
                    data_page);        /* change how an already-mounted device is mounted */
    else if (flags & MS_BIND)
        retval = do_loopback(&path, dev_name, flags & MS_REC);    /* bind mount (MS_BIND) --> */
    else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
        retval = do_change_type(&path, flags);
    else if (flags & MS_MOVE)
        retval = do_move_mount(&path, dev_name);
    else
        retval = do_new_mount(&path, type_page, flags, mnt_flags,    /* add a new mount */
                      dev_name, data_page);
dput_out:
    path_put(&path);
    return retval;
}

 
  
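这里出现了个loopback，回接设备。何为回接设备，一种机制的体现，什么机制？将一个普通文件当作块设备用。（注意：上面的 do_loopback 是在 MS_BIND 分支里调用的，处理的是绑定挂载（bind mount）；它和下面演示的 /dev/loopN 回接块设备只是名字相近，并不是同一个机制，这里只是借这个名字引出话题。）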

[jesse@localhost linux-3.0]$ ls /dev/loop
loop0  loop1  loop2  loop3  loop4  loop5  loop6  loop7

新建文件blkfile

[root@localhost test]# dd if=/dev/zero of=./blkfile bs=1k count=100
100+0 records in
100+0 records out
102400 bytes (102 kB) copied, 0.00113843 s, 89.9 MB/s

[root@localhost test]# ll
total 100
-rw-rw-r--. 1 jesse jesse 102400 Nov 13 11:35 blkfile

选loop1回接

[root@localhost test]# losetup  /dev/loop1 ./blkfile

直接格式化blkfile当然不行，not a block special device...

[root@localhost test]# mkfs -t ext2 ./blkfile 100
mke2fs 1.41.12 (17-May-2010)
./blkfile is not a block special device.

格式化loop1，其实操作的是blkfile

[root@localhost test]# mkfs -t ext2 /dev/loop1 100

mke2fs 1.41.12 (17-May-2010)
Filesystem label=
OS type: Linux
Block size=1024 (log=0)
Fragment size=1024 (log=0)
Stride=0 blocks, Stripe width=0 blocks
16 inodes, 100 blocks
5 blocks (5.00%) reserved for the super user
First data block=1
1 block group
8192 blocks per group, 8192 fragments per group
16 inodes per group

Writing inode tables: done                            
Writing superblocks and filesystem accounting information: done

This filesystem will be automatically checked every 25 mounts or
180 days, whichever comes first.  Use tune2fs -c or -i to override.

结论：看来回接至少能将一个普通文件当块设备来用。然后挂载到/mnt，哇！就这么有了自定义的子系统。
然后，恍然大悟：
/backup/iso/rhel6.1.iso /rhel6 iso9660 loop 0 0
呵呵～

----------------------------------------------------------------------------------------------------------------------------------

了解了回接设备，我们返回正题。

//其实，我们更关心这个
retval = do_new_mount(&path, type_page, flags, mnt_flags,
               dev_name, data_page);

/*
 * create a new mount for userspace and request it to be added into the
 * namespace's tree
 */
static int do_new_mount(struct path *path, char *type, int flags,
            int mnt_flags, char *name, void *data)
{
    struct vfsmount *mnt;
    int err; 

    if (!type)
        return -EINVAL;

    /* we need capabilities... */
    if (!capable(CAP_SYS_ADMIN))
        return -EPERM;

    mnt = do_kern_mount(type, flags, name, data);    //-->
    if (IS_ERR(mnt))
        return PTR_ERR(mnt);

    err = do_add_mount(mnt, path, mnt_flags);        //
    if (err)
        mntput(mnt);
    return err; 
}

-->

/*
 * Look up the filesystem type named @fstype with get_fs_type(), create a
 * mount of it through vfs_kern_mount(), apply a subtype when the type sets
 * FS_HAS_SUBTYPE and the superblock has none yet, then release the type
 * with put_filesystem().
 *
 * Returns ERR_PTR(-ENODEV) when the type is not found; otherwise the
 * pointer produced by vfs_kern_mount() / fs_set_subtype().
 *
 * NOTE(review): in this article the function body is interrupted by a
 * pasted definition of struct vfsmount; the body resumes after the struct.
 */
struct vfsmount *
do_kern_mount(const char *fstype, int flags, const char *name, void *data)
{       
    struct file_system_type *type = get_fs_type(fstype);
    struct vfsmount *mnt;    // mount descriptor: attaches a device to a directory node
    if (!type)
        return ERR_PTR(-ENODEV);

    mnt = vfs_kern_mount(type, flags, name, data);    // builds the struct vfsmount for this filesystem type -->

/* NOTE(review): the next two text lines are page residue introducing the type of "mnt", not C code. */
mnt

View Code

/*
 * Mount descriptor. vfs_kern_mount() fills in mnt_root, mnt_sb,
 * mnt_mountpoint and mnt_parent (and mnt_flags for MS_KERNMOUNT).
 */
struct vfsmount {

    struct list_head mnt_hash;
    struct vfsmount *mnt_parent;        /* fs we are mounted on */
    struct dentry *mnt_mountpoint;      /* dentry of mountpoint */
    struct dentry *mnt_root;        /* root of the mounted tree */
    struct super_block *mnt_sb;     /* pointer to superblock */

#ifdef CONFIG_SMP
    struct mnt_pcp __percpu *mnt_pcp;
    atomic_t mnt_longterm;      /* how many of the refs are longterm */
#else
    int mnt_count;
    int mnt_writers;
#endif
    struct list_head mnt_mounts;        /* list of children, anchored here */
    struct list_head mnt_child;     /* and going through their mnt_child */
    int mnt_flags;
    /* 4 bytes hole on 64bits arches without fsnotify */
#ifdef CONFIG_FSNOTIFY
    __u32 mnt_fsnotify_mask;
    struct hlist_head mnt_fsnotify_marks;
#endif
    const char *mnt_devname;        /* Name of device e.g. /dev/dsk/hda1 */
    struct list_head mnt_list;
    struct list_head mnt_expire;    /* link in fs-specific expiry list */
    struct list_head mnt_share;     /* circular list of shared mounts */
    struct list_head mnt_slave_list;/* list of slave mounts */
    struct list_head mnt_slave;     /* slave list entry */
    struct vfsmount *mnt_master;    /* slave is on master->mnt_slave_list */
    struct mnt_namespace *mnt_ns;   /* containing namespace */
    int mnt_id;                 /* mount identifier */
    int mnt_group_id;               /* peer group identifier */
    int mnt_expiry_mark;            /* true if marked for expiry */
    int mnt_pinned;
    int mnt_ghosts;
};




    /* remainder of do_kern_mount(): optional subtype, then release the type */
    if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
        !mnt->mnt_sb->s_subtype)
        mnt = fs_set_subtype(mnt, fstype);

    put_filesystem(type);
    return mnt;
}

-->

struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{   
    struct vfsmount *mnt;
    struct dentry *root;
    
    if (!type)
        return ERR_PTR(-ENODEV);
    
    mnt = alloc_vfsmnt(name);    //申请 struct vfsmount 空间，然后便是填充
    if (!mnt)
        return ERR_PTR(-ENOMEM);
    
    if (flags & MS_KERNMOUNT)
        mnt->mnt_flags = MNT_INTERNAL;

    root = mount_fs(type, flags, name, data);    //-->
    if (IS_ERR(root)) {
        free_vfsmnt(mnt);
        return ERR_CAST(root);
    }

    mnt->mnt_root = root;
    mnt->mnt_sb = root->d_sb;
    mnt->mnt_mountpoint = mnt->mnt_root;
    mnt->mnt_parent = mnt;
    return mnt;
}

-->

/*
 * Excerpt of mount_fs(); the parts marked "..." are elided by the article.
 * The visible part invokes the filesystem type's ->mount() callback and
 * bails out through out_free_secdata when it returns an error pointer.
 */
struct dentry *
mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
{
    ...   

    root = type->mount(type, flags, name, data);    // call back into the concrete filesystem's mount
    if (IS_ERR(root)) {
        error = PTR_ERR(root);
        goto out_free_secdata;
    }    
    ...
}

看到一个回调函数：
root = type->mount(type, flags, name, data);

内核支持的文件系统不止一种，每一种文件系统都该有个结构体来描述：

View Code

-- include/linux/fs.h --

/*
 * Describes one filesystem implementation known to the kernel. Instances
 * are chained through ->next and searched by name in find_filesystem().
 */
struct file_system_type {
    const char *name;               /* type name, e.g. "ext2"; matched by find_filesystem() */
    int fs_flags;                   /* FS_* flags, e.g. FS_REQUIRES_DEV, FS_HAS_SUBTYPE */
    struct dentry *(*mount) (struct file_system_type *, int,
               const char *, void *);       /* called from mount_fs() as type->mount(type, flags, name, data) */
    void (*kill_sb) (struct super_block *);  /* ext2 sets this to kill_block_super */
    struct module *owner;           /* passed to try_module_get() during type lookup */
    struct file_system_type * next; /* link to the next registered type */
    struct list_head fs_supers;
    
    /* NOTE(review): presumably lockdep classes for per-superblock locks -- confirm */
    struct lock_class_key s_lock_key;
    struct lock_class_key s_umount_key;
    struct lock_class_key s_vfs_rename_key;
        
    /* NOTE(review): presumably lockdep classes for per-inode locks -- confirm */
    struct lock_class_key i_lock_key;
    struct lock_class_key i_mutex_key;
    struct lock_class_key i_mutex_dir_key;
    struct lock_class_key i_alloc_sem_key;
};

回调函数调的是哪个文件系统的mount，这就取决于type->mount中的type。而type怎么来的，当然是find出来的咯。

/*
 * Call-chain sketch, not compilable code: the '|' lines show
 * get_fs_type() expanding into __get_fs_type(), whose body looks the type
 * up under file_systems_lock.
 */
struct vfsmount *
do_kern_mount(const char *fstype, int flags, const char *name, void *data)
{       
    struct file_system_type *type = get_fs_type(fstype);
                                        |
                             fs = __get_fs_type(name, len);
                                        |
                read_lock(&file_systems_lock);
                fs = *(find_filesystem(name, len));        // look the type up by name -->
                if (fs && !try_module_get(fs->owner))      // take a reference on fs->owner; give up on fs if that fails
                    fs = NULL;
                read_unlock(&file_systems_lock);
    ... ...
}

-->

/*
 * Walk the registered-filesystem list (a plain singly linked list headed by
 * file_systems) looking for an entry whose name is exactly @len bytes long
 * and equals the first @len bytes of @name.
 *
 * Returns the address of the link that points at the match, or the address
 * of the terminating NULL link when nothing matches.
 */
static struct file_system_type **find_filesystem(const char *name, unsigned len)
{
    struct file_system_type **link = &file_systems;

    while (*link) {
        const char *candidate = (*link)->name;

        if (strlen(candidate) == len && strncmp(candidate, name, len) == 0)
            return link;
        link = &(*link)->next;
    }
    return link;
}

-->
新添加个文件系统

-- fs/ext2/super.c --

/*
 * Module init for ext2: set up xattr support and the inode cache, then
 * register ext2_fs_type so the filesystem is added to the list of known
 * types. On failure, the steps that already succeeded are undone in
 * reverse order.
 *
 * Returns 0 on success or the error of the step that failed.
 */
static int __init init_ext2_fs(void)
{
    int rc;

    rc = init_ext2_xattr();
    if (rc)
        return rc;

    rc = init_inodecache();
    if (rc)
        goto undo_xattr;

    /* put ourselves on the list of registered filesystems */
    rc = register_filesystem(&ext2_fs_type);
    if (rc)
        goto undo_inodecache;

    return 0;

undo_inodecache:
    destroy_inodecache();
undo_xattr:
    exit_ext2_xattr();
    return rc;
}

说到底，要看的其实是具体fs的mount函数。获得struct vfsmount之后，开始正式挂载上去。

    mnt = do_kern_mount(type, flags, name, data);
    if (IS_ERR(mnt))
        return PTR_ERR(mnt);

    err = do_add_mount(mnt, path, mnt_flags);    //-->
    if (err)
        mntput(mnt);

-->

/*
 * Attach @newmnt at @path inside the namespace's mount tree.
 *
 * Returns the lock_mount() error, -EINVAL when @path's mount fails
 * check_mnt() (unless MNT_SHRINKABLE is set) or when the new root is a
 * symlink, -EBUSY when the same superblock is already mounted at exactly
 * this point, otherwise the graft_tree() result.
 */
static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
{
    int rc;

    /* these flags are never taken from the caller */
    mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);

    rc = lock_mount(path);
    if (rc)
        return rc;

    if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) {
        rc = -EINVAL;
        goto unlock;
    }

    /* Refuse the same filesystem on the same mount point */
    if (path->mnt->mnt_sb == newmnt->mnt_sb &&
        path->mnt->mnt_root == path->dentry) {
        rc = -EBUSY;
        goto unlock;
    }

    if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode)) {
        rc = -EINVAL;
        goto unlock;
    }

    newmnt->mnt_flags = mnt_flags;
    rc = graft_tree(newmnt, path);    /* splice the new mount into the tree --> */

unlock:
    unlock_mount(path);
    return rc;
}

一个新的文件系统就这么挂了上去，只是大概的流程，要具体到文件系统，就要从回调mount入手。

-- fs/ext2/super.c --

/*
 * Module init for ext2, quoted again to show what is being registered.
 *
 * NOTE(review): the function body is interrupted by a pasted definition of
 * ext2_fs_type (the argument of register_filesystem()); the body resumes
 * after that definition.
 */
static int __init init_ext2_fs(void)
{
    int err = init_ext2_xattr();
    if (err)
        return err; 
    err = init_inodecache();
    if (err)
        goto out1;
        err = register_filesystem(&ext2_fs_type);

/* NOTE(review): the next text line is page residue naming the object defined below, not C code. */
ext2_fs_type

/* The ext2 type descriptor: ->mount is the callback reached through type->mount in mount_fs(). */
static struct file_system_type ext2_fs_type = {
    .owner      = THIS_MODULE,
    .name       = "ext2",
    .mount      = ext2_mount,        //-->
    .kill_sb    = kill_block_super,
    .fs_flags   = FS_REQUIRES_DEV,
};

    /* remainder of init_ext2_fs(): undo earlier steps when registration failed */
    if (err)
        goto out; 
    return 0;
out:
    destroy_inodecache();
out1:
    exit_ext2_xattr();
    return err; 
}

ext2的挂载：

/*
 * ->mount callback of ext2_fs_type: delegates to mount_bdev(), supplying
 * ext2_fill_super as the superblock fill routine, and returns its result
 * unchanged.
 */
static struct dentry *ext2_mount(struct file_system_type *fs_type,
    int flags, const char *dev_name, void *data)
{
    struct dentry *root;

    root = mount_bdev(fs_type, flags, dev_name, data, ext2_fill_super);
    return root;
}

mount_bdev函数首先
/* open a block device by name */

struct block_device *bdev = blkdev_get_by_path();

然后

/* find or create a superblock */

struct super_block *s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);

获得了超级块指针，如果s->s_root（也就是该超级块的根目录项）为空的话，那就要调用传进来的ext2_fill_super去填充ext2的超级块，这涉及到对ext2文件系统格式的理解。

挂上文件系统后意味着什么？意味着struct file_operations，struct inode_operations挂上了具体的操作函数。

读取一个文件由vfs层的read调到ext2的read，又调到具体的磁盘驱动；又或者是嵌入式，先调到mtd的read，最后调到nand驱动的read，根据nand datasheet的时序控制gpio引脚，发送cmd，接收数据。：-）

最后，来个山寨小实验：

　　增加个自己的文件系统，ext2_jes。其实就是个ext2的副本。

第一步：

[jesse@localhost linux-2.6.39]$ vim fs/ext2/
acl.c             ialloc.c          super.c           xattr_user.c
acl.h             inode.c           symlink.c         xip.c
balloc.c          ioctl.c           xattr.c           xip.h
dir.c             Kconfig           xattr.h           
ext2.h            Makefile          xattr_security.c  
file.c            namei.c           xattr_trusted.c

[jesse@localhost linux-2.6.39]$ vim include/linux/
ext2_fs.h     ext2_fs_sb.h

以上部分果断山寨一份。

[jesse@localhost linux-2.6.39]$ vim fs/ext2_jes/
acl.c             ialloc.c          super.c           xattr_user.c
acl.h             inode.c           symlink.c         xip.c
balloc.c          ioctl.c           xattr.c           xip.h
dir.c             Kconfig           xattr.h           
ext2.h            Makefile          xattr_security.c  
file.c            namei.c           xattr_trusted.c

[jesse@localhost linux-2.6.39]$ vim include/linux/
ext2_fs_jes.h     ext2_fs_sb_jes.h

当然，好的山寨必将是彻底的，将fs/ext2_jes/里内容里的所有ext2改为ext2_jes。看来你需要一个脚本。

# Replace every "ext2" with "ext2_jes" in the file named by $f.
# $f is expected to be set by an enclosing loop over the copied files (not shown here).
# The original is only overwritten when sed succeeded.
sed 's/ext2/ext2_jes/g' "$f" > "${f}_tmp" &&
    mv "${f}_tmp" "$f"

*.h文件同理。
要记得大写的EXT2也要改哦。

第二步：

　　修改Makefile和Kconfig，照猫画虎即可。

第三步：

　　编译过程中，可能会不断报错，原因大多是缺少函数，复制对应的然后粘贴即可。
　　最后还要改一下magic（魔数），起个吉利的0x8888作为它的值。

-- include/linux/magic.h --

#define EXT2_SUPER_MAGIC    0xEF53
#define EXT2_JES_SUPER_MAGIC    0x8888

编译好后，app作个测试，用到之前说到的回接：

#dd if=/dev/zero of=jesfs bs=1M count=1
#mkfs.ext2 jesfs

/**
 * 注意，这里的mkfs.ext2仍然用的是ext2格式，需要改下它的magic，
 * 用vim打开，找到0xEF53，然后改为0x8888即可
 */

#mount -t ext2_jes -o loop ./jesfs /mnt

posted @ 2011-11-13 17:50 郝壹贰叁阅读(1993) 评论(7) 编辑收藏举报

刷新页面返回顶部

机器学习水很深

We all have two lives. The second one starts when we realize that we only have one. --- Tom Hiddleston

挂个文件系统

公告