系统调用之“一调到底”

先写个简单的test.c

#include <linux/module.h>
#include <linux/init.h>

#include <linux/fs.h>
#include <linux/cdev.h>

#include <linux/errno.h>


/* Character device object; initialised by cdev_init() and registered by cdev_add() in init_test(). */
struct cdev test_cdev;
/* Device number filled in by alloc_chrdev_region() in init_test(). */
dev_t devno;
/* Major and minor parts of devno, extracted with MAJOR()/MINOR() in init_test(). */
unsigned int major = 0;
unsigned int minor = 0;

/*
 * test_open - open handler of the test character device.
 * @nod:  inode of the device node being opened (not used here)
 * @filp: file object for this open (not used here)
 *
 * Only logs its own name so that the call shows up in the kernel log.
 * Always returns 0.
 */
int test_open(struct inode *nod, struct file *filp)
{
    /* __func__ is the standard C99 identifier; __FUNCTION__ is a GCC-only alias. */
    printk("<kernel> %s\n", __func__);
    return 0;
}

struct file_operations test_ops = {
    .open = test_open,
};

int init_test(void)
{
    int err = 0;

    err = alloc_chrdev_region(&devno, 0, 1, "alloc register");
    if(err){
        printk("<kernel> cdev_add failed\n");            
        err = -EBUSY;
        goto fail;
    }

    major = MAJOR(devno);
    minor = MINOR(devno);
    printk("major is [%d], minor is [%d]\n", major, minor);

    cdev_init(&test_cdev, &test_ops);

    err = cdev_add(&test_cdev, devno, 1);    
    if(err){
        printk("<kernel> cdev_add failed\n");            
        err = -ENODEV;
        goto fail1;
    }

    printk("<kernel>init \n");        
    return 0;

fail:
    return err;
fail1:
    unregister_chrdev_region(devno, 1);        
    return err;
}

/*
 * exit_test - module exit point.
 *
 * Undoes init_test() in reverse order: removes test_cdev first, then
 * releases the device number, and finally logs a farewell message.
 */
void exit_test(void)
{
    cdev_del(&test_cdev);
    unregister_chrdev_region(devno, 1);        
    printk("bye\n");        
}

/* Register init_test/exit_test as the module's load and unload entry points. */
module_init(init_test);
module_exit(exit_test);

/* Module metadata. */
MODULE_LICENSE("GPL");

MODULE_AUTHOR("Jesse");
MODULE_DESCRIPTION("this is a test module");
MODULE_VERSION("v0.1");

仅一个简单的open，应该不会有更简单的字符设备驱动了。

app 层还应该有这么个东西。

fd = open("/dev/test", O_RDWR);

好了，上面下面都有了。那，中间是怎么个回事？

大致的过程：

fd = open("/dev/test", O_RDWR);

sys_open

test_open

这个 sys_open() 可不是一个简单的函数，它包含了文件路径查找、文件权限检查等一系列复杂的步骤。况且，不知从何时起，内核里的 sys_open 已不再以普通函数的形式直接定义：用 ctags 跳转不到它，即便用 find/grep 搜到了，看到的也是被 SYSCALL_DEFINE 宏包装过的形式，早已面目全非。

-- fs/open.c --
 
/*
 * open system call entry point (quoted from fs/open.c).
 * Adds O_LARGEFILE to flags when force_o_largefile() is true, then hands
 * the request to do_sys_open() relative to AT_FDCWD and returns its result.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
{
    long ret; 

    if (force_o_largefile())
        flags |= O_LARGEFILE;

    ret = do_sys_open(AT_FDCWD, filename, flags, mode);        //==>bb

    /* avoid REGPARM breakage on x86: */
    asmlinkage_protect(3, ret, filename, flags, mode);
    return ret; 
}

有人问了，这个SYSCALL_DEFINE3是个什么东西，“你最好不要追究这样的问题”。

内核里的各种宏定义，不是一般的有才。把相关的宏摘出来，用 gcc -E 预处理一下，简单瞧瞧展开后的样子。

/*
 * Emits the pieces of one system call with x arguments:
 *   - a prototype for sys<name>,
 *   - a forward declaration of the static inline SYSC<name>,
 *   - the SyS<name> wrapper, which runs __SC_TEST<x> on the arguments and
 *     returns SYSC<name> called with the __SC_CAST<x> arguments,
 *   - SYSCALL_ALIAS(sys<name>, SyS<name>),
 *   - and finally the header of SYSC<name>; the function body written after
 *     the macro invocation becomes the body of SYSC<name>.
 */
#define __SYSCALL_DEFINEx(x, name, ...)                 \
    asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__));       \
    static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__));   \
    asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__))        \
    {                               \
        __SC_TEST##x(__VA_ARGS__);              \
        return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__));    \
    }                               \
    SYSCALL_ALIAS(sys##name, SyS##name);                \
    static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__))



/*
 * Defines two static string arrays for the system call (types_<sname> built
 * with __SC_STR_TDECL<x>, args_<sname> built with __SC_STR_ADECL<x>),
 * invokes SYSCALL_METADATA(sname, x), and then expands
 * __SYSCALL_DEFINEx to emit the system call itself.
 */
#define SYSCALL_DEFINEx(x, sname, ...)              \
    static const char *types_##sname[] = {          \
        __SC_STR_TDECL##x(__VA_ARGS__)          \
    };                          \
    static const char *args_##sname[] = {           \
        __SC_STR_ADECL##x(__VA_ARGS__)          \
    };                          \
    SYSCALL_METADATA(sname, x);             \
    __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)


/* Three-argument form: forwards to SYSCALL_DEFINEx with x = 3 and the name prefixed by '_'. */
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)

/* Throwaway driver for the macros above: preprocess it with gcc -E to see what SYSCALL_DEFINE3 expands to. */
int main(void)
{
    SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode);

}

展开真面目：

/*
 * Result of expanding SYSCALL_DEFINE3(open, ...) inside the demo main().
 * Only the outer macros are expanded; the __SC_*, SYSCALL_METADATA and
 * SYSCALL_ALIAS helpers are left as they are. This is illustrative
 * preprocessor output, not code meant to be compiled.
 */
int main(void)
{

    static const char *types__open[] = { __SC_STR_TDECL3(const char __user *, filename, int, flags, int, mode) };
    static const char *args__open[] = { __SC_STR_ADECL3(const char __user *, filename, int, flags, int, mode) }; 
    SYSCALL_METADATA(_open, 3); 
    asmlinkage long sys_open(__SC_DECL3(const char __user *, filename, int, flags, int, mode)); 
    static inline long SYSC_open(__SC_DECL3(const char __user *, filename, int, flags, int, mode)); 
    asmlinkage long SyS_open(__SC_LONG3(const char __user *, filename, int, flags, int, mode)) 
    {
         __SC_TEST3(const char __user *, filename, int, flags, int, mode); 
         return (long) SYSC_open(__SC_CAST3(const char __user *, filename, int, flags, int, mode)); 
    }
    SYSCALL_ALIAS(sys_open, SyS_open); 
    /* At a real definition site the system call body follows this header instead of the ';'. */
    static inline long SYSC_open(__SC_DECL3(const char __user *, filename, int, flags, int, mode));


}

一些宏还未展开，点到为止，见好就收吧。

我们继续往下看。

bb:

/*
 * Core of the open system call: copy the path name into kernel space,
 * reserve a file descriptor, open the file and bind the two together.
 *
 * Returns the new file descriptor, or the error value produced by the
 * step that failed.
 */
long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
{
    struct file *filp;
    char *kname = getname(filename);    /* kernel-space copy of filename ==>cc */
    int fd = PTR_ERR(kname);            /* the pointer value viewed as a number */

    if (IS_ERR(kname))
        return fd;

    fd = get_unused_fd_flags(flags);    /* reserve a free descriptor ==>dd */
    if (fd < 0)
        goto release_name;

    filp = do_filp_open(dfd, kname, flags, mode, 0);    /* ==>ee */
    if (IS_ERR(filp)) {
        /* Opening failed: hand the reserved descriptor back and report the error. */
        put_unused_fd(fd);
        fd = PTR_ERR(filp);
        goto release_name;
    }

    fsnotify_open(filp->f_path.dentry);    /* ==>ff */
    fd_install(fd, filp);    /* bind fd to the file so read/write etc. can use it ==>gg */

release_name:
    putname(kname);    /* the temporary kernel copy of the name is no longer needed */
    return fd;
}

cc:

/* Allocate a path-name buffer from the names_cachep cache using the given gfp flags. */
#define __getname_gfp(gfp)  kmem_cache_alloc(names_cachep, (gfp))
/* Same as above, with GFP_KERNEL. */
#define __getname()     __getname_gfp(GFP_KERNEL)

/*
 * Copy a user-space path name into a freshly allocated kernel buffer.
 *
 * Returns the kernel buffer on success. If no buffer could be allocated the
 * result is ERR_PTR(-ENOMEM); if the copy fails the buffer is released and
 * the result is ERR_PTR() of do_getname()'s return value. The result is
 * passed to audit_getname() in every case.
 */
char * getname(const char __user * filename)
{
    char *result;
    char *kbuf = __getname();    /* kmem_cache_alloc: grab a buffer for the name */

    if (!kbuf) {
        result = ERR_PTR(-ENOMEM);
    } else {
        /* copy the file name into kernel data space (kbuf) before using it */
        int rc = do_getname(filename, kbuf);

        if (rc < 0) {
            __putname(kbuf);
            result = ERR_PTR(rc);
        } else {
            result = kbuf;
        }
    }

    audit_getname(result);
    return result;
}

dd:

/* Allocate a file descriptor with the given flags, starting the search at 0 (see alloc_fd). */
#define get_unused_fd_flags(flags) alloc_fd(0, (flags))

-- fs/file.c --

/*
 * allocate a file descriptor, mark it busy.
 *
 * start: lowest descriptor number to consider (raised to files->next_fd
 *        when that is larger).
 * flags: only O_CLOEXEC is examined here; it decides whether the new
 *        descriptor is marked close-on-exec.
 *
 * Returns the allocated descriptor, or the negative value returned by
 * expand_files() on failure. Runs under files->file_lock.
 */
int alloc_fd(unsigned start, unsigned flags)
{   
    struct files_struct *files = current->files;
    unsigned int fd;
    int error;
    struct fdtable *fdt;

    spin_lock(&files->file_lock);

repeat:
    fdt = files_fdtable(files);
    fd = start;
    if (fd < files->next_fd)
        fd = files->next_fd;

    /* Search the open_fds bitmap for the first clear bit at or after fd. */
    if (fd < fdt->max_fds)
        fd = find_next_zero_bit(fdt->open_fds->fds_bits,
                                fdt->max_fds, 
                                fd);    // a very familiar function ==>ddD

    error = expand_files(files, fd);
    if (error < 0)
        goto out;

    /*
     * If we needed to expand the fs array we
     * might have blocked - try again.
     */
    if (error)
        goto repeat;

    /* Advance the next_fd search hint past the descriptor just chosen. */
    if (start <= files->next_fd)
        files->next_fd = fd + 1;

    /* Mark fd as in use and set or clear its close-on-exec bit. */
    FD_SET(fd, fdt->open_fds);
    if (flags & O_CLOEXEC)
        FD_SET(fd, fdt->close_on_exec);
    else
        FD_CLR(fd, fdt->close_on_exec);

    error = fd;

#if 1
    /* Sanity check */
    if (rcu_dereference(fdt->fd[fd]) != NULL) {
        printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
        rcu_assign_pointer(fdt->fd[fd], NULL);
    }
#endif

out:
    spin_unlock(&files->file_lock);
    return error;
}

ddD:

　　一个出镜率很高的函数，常用于各种描述符、编号的分配。这些编号按从小到大的顺序分配：用一个位图（类似数组）记录使用情况，值为 0 的位表示尚未分配，函数从 offset 位置开始向后查找第一个为 0 的位。

/*
 * Find the first zero bit at or after bit `offset` in the bitmap `addr`,
 * which holds `size` bits.
 *
 * Returns the index of that bit, or `size` when offset >= size or no zero
 * bit exists in [offset, size).
 * NOTE(review): relies on ffz() returning the index of the lowest zero bit
 * of a word and on BITOP_WORD(n) being n / BITS_PER_LONG; neither is shown
 * here, confirm against their definitions.
 */
unsigned long find_next_zero_bit(const unsigned long *addr, 
                         　　　　 unsigned long size,
                                 unsigned long offset)
{
    const unsigned long *p = addr + BITOP_WORD(offset);    // word holding bit `offset`; equals addr when offset < BITS_PER_LONG
    unsigned long result = offset & ~(BITS_PER_LONG-1);    // offset rounded down to a word boundary; 0 when offset < BITS_PER_LONG
    unsigned long tmp;

    if (offset >= size)
        return size;

    size   -= result;        
    offset %= BITS_PER_LONG; 
    if (offset) {
        /* Partial first word: treat the bits below `offset` as already set. */
        tmp = *(p++);
        tmp |= ~0UL >> (BITS_PER_LONG - offset);
        if (size < BITS_PER_LONG)
            goto found_first;
        if (~tmp)
            goto found_middle;
        size -= BITS_PER_LONG;
        result += BITS_PER_LONG;
    }

    /* Scan whole words while at least BITS_PER_LONG bits remain. */
    while (size & ~(BITS_PER_LONG-1)) {

        if (~(tmp = *(p++)))
            goto found_middle;
        result += BITS_PER_LONG;
        size -= BITS_PER_LONG;
    }

    if (!size)
        return result;

    tmp = *p;

found_first:
    /* Last, partial word: treat the bits at and beyond `size` as set. */
    tmp |= ~0UL << size;
    if (tmp == ~0UL)    /* Are any bits zero? */
        return result + size;   /* Nope. */

found_middle:
    return result + ffz(tmp);
}

下面是理解的重点，也是一调到底的精髓。重点在于struct file的分配。

ee:

/*
 * Note that the low bits of the passed in "open_flag"
 * are not the same as in the local variable "flag". See
 * open_to_namei_flags() for more details.
 *
 * Excerpt: only the plain-lookup path (no O_CREAT) and the common "ok"
 * tail are shown; the lines containing "..." mark code left out of the
 * quotation (including the "exit" label that the tail jumps to).
 */
struct file *do_filp_open(int dfd, const char *pathname,        
                          int open_flag, int mode, int acc_mode)
{
    struct file *filp;
    struct nameidata nd;
    int error;
    struct path path;
    struct dentry *dir;
    int count = 0;
    int will_write;
    int flag = open_to_namei_flags(open_flag);

    /* Derive the access mode for this open when the caller passed none */
    if (!acc_mode)
        acc_mode = MAY_OPEN | ACC_MODE(flag);
                
    /* O_TRUNC implies we need access checks for write permissions */
    if (flag & O_TRUNC)
        acc_mode |= MAY_WRITE; 
            
    /* Allow the LSM permission hook to distinguish append 
       access from general write access. */
    if (flag & O_APPEND)
        acc_mode |= MAY_APPEND;

    /*
     * The simplest case - just a plain lookup.
     */
    if (!(flag & O_CREAT)) {
        error = path_lookup_open(dfd, pathname, lookup_flags(flag),
                                 &nd, flag);    
        if (error)
            return ERR_PTR(error);
        goto ok;
    }

    ...
    ...


ok:

    /*
     * Consider:
     * 1. may_open() truncates a file
     * 2. a rw->ro mount transition occurs
     * 3. nameidata_to_filp() fails due to
     *    the ro mount.
     * That would be inconsistent, and should
     * be avoided. Taking this mnt write here
     * ensures that (2) can not occur.
     */
    will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
    if (will_write) {
        error = mnt_want_write(nd.path.mnt);
        if (error)
            goto exit;
    }
    error = may_open(&nd.path, acc_mode, flag);    //**
    if (error) {
        if (will_write)
            mnt_drop_write(nd.path.mnt);
        goto exit;
    }
    filp = nameidata_to_filp(&nd, open_flag);    // obtain the struct file (filp) for the looked-up path
    if (IS_ERR(filp))
        ima_counts_put(&nd.path,
                       acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
    /*
     * It is now safe to drop the mnt write
     * because the filp has had a write taken
     * on its behalf.
     */
    if (will_write)
        mnt_drop_write(nd.path.mnt);
    if (nd.root.mnt)
        path_put(&nd.root);
    return filp;

    ...
    ...

}

/*
 * Turn the result of a path lookup into an opened struct file.
 *
 * The file object comes from the open intent stored in nd. If its dentry is
 * already set, only the path reference held by nd is dropped; otherwise the
 * file is set up by __dentry_open() and that call's result is returned.
 */
struct file *nameidata_to_filp(struct nameidata *nd, int flags)
{
    const struct cred *cred = current_cred();
    /* Pick up the filp from the open intent */
    struct file *filp = nd->intent.open.file;

    /* Has the filesystem initialised the file for us? */
    if (filp->f_path.dentry != NULL) {
        path_put(&nd->path);
        return filp;
    }

    /* !!! not yet initialised: let __dentry_open() do it */
    return __dentry_open(nd->path.dentry, nd->path.mnt, flags, filp,
                 NULL, cred);
}

/*
 * Fill in the struct file `f` for `dentry` on mount `mnt` and call the
 * file's open method.
 *
 * open: explicit open routine to call; when NULL, f->f_op->open is used
 *       if f->f_op is set.
 *
 * Excerpt: the quotation stops before the end of the function, so the
 * return statement and the cleanup_all / cleanup_file labels are not shown.
 */
static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
                    int flags, struct file *f,
                    int (*open)(struct inode *, struct file *),
                    const struct cred *cred)
{
    struct inode *inode;
    int error;

    f->f_flags = flags;
    f->f_mode = (__force fmode_t)((flags+1) & O_ACCMODE) | FMODE_LSEEK |
                FMODE_PREAD | FMODE_PWRITE;
    inode = dentry->d_inode;
    if (f->f_mode & FMODE_WRITE) {
        error = __get_file_write_access(inode, mnt);
        if (error)
            goto cleanup_file;
        if (!special_file(inode->i_mode))
            file_take_write(f);
    }

    f->f_mapping = inode->i_mapping;
    f->f_path.dentry = dentry;
    f->f_path.mnt = mnt;
    f->f_pos = 0;
    f->f_op = fops_get(inode->i_fop);　　//!!! !!! the file's operations are taken from the inode's i_fop
    file_move(f, &inode->i_sb->s_files);

    error = security_dentry_open(f, cred);
    if (error)
        goto cleanup_all;

    if (!open && f->f_op)　　// no explicit open was passed: fall back to the one in f->f_op
        open = f->f_op->open;　　
    if (open) {
        error = open(inode, f);
        if (error)
            goto cleanup_all;
    }

    f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);

    file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
    /* (excerpt ends here; the rest of the function is omitted) */

说下六个感叹号的地方。记得我们在注册字符设备的时候是否有个cdev_init ？她的体内是不是有个 cdev->ops = fops ?

inode里是不是有个i_cdev ?

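这里，file 的 f_op 是不是被赋了 inode 的 i_fop？（补充一步：对字符设备节点而言，inode->i_fop 并不直接是 test_ops，而是内核通用的 def_chr_fops，它的 open 是 chrdev_open()。chrdev_open() 根据设备号找到注册时 cdev_add 进去的 cdev，把 inode->i_cdev 指向它，再把 filp->f_op 换成 cdev->ops（也就是 test_ops），最后调用其中的 open。）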

打开struct file, struct inode的定义处，多瞧上两眼。这里就不贴了。

就这样，fd = open("/dev/test", O_RDWR) 最终还是调到了test_open 。

最后就是个收尾函数，将得到的 fd 和 struct file 关联起来。

gg:

/*
 * Bind an already reserved descriptor to an open file.
 *
 * fd:   descriptor number whose slot in the current task's fd table must
 *       still be empty (enforced with BUG_ON).
 * file: the struct file to store in that slot.
 *
 * The slot is written under files->file_lock using rcu_assign_pointer().
 */
void fd_install(unsigned int fd, struct file *file)
{   
    struct files_struct *files = current->files;
    struct fdtable *fdt;

    spin_lock(&files->file_lock);

    fdt = files_fdtable(files);
    BUG_ON(fdt->fd[fd] != NULL);
    rcu_assign_pointer(fdt->fd[fd], file);

    spin_unlock(&files->file_lock);
}

do_sys_open 的结尾 return fd;  返回给 app。

fd = open("/dev/test", O_RDWR)

你懂的。

posted @ 2011-09-21 10:31 郝壹贰叁阅读(1898) 评论(0) 收藏举报

刷新页面返回顶部

机器学习水很深

We all have two lives. The second one starts when we realize that we only have one. --- Tom Hiddleston

系统调用之“一调到底”

公告