系统调用之“一调到底”

先写个简单的test.c

#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/cdev.h>
#include <linux/errno.h>


/* Character device object that init_test() initialises and adds. */
struct cdev test_cdev;

/* Device number filled in by alloc_chrdev_region() in init_test(). */
dev_t devno;

/* Major and minor parts of devno, extracted with MAJOR()/MINOR() for logging. */
unsigned int major = 0;
unsigned int minor = 0;

/*
 * open() handler of the test device.
 *
 * Only logs its own function name and reports success; neither the inode
 * nor the file argument is touched.
 *
 * Returns 0 always.
 */
int test_open (struct inode *nod, struct file *filp)
{
	printk("<kernel> %s\n", __func__);

	return 0;
}

struct file_operations test_ops = {
.open
= test_open,
};

int init_test(void)
{
int err = 0;

err
= alloc_chrdev_region(&devno, 0, 1, "alloc register");
if(err){
printk(
"<kernel> cdev_add failed\n");
err
= -EBUSY;
goto fail;
}

major
= MAJOR(devno);
minor
= MINOR(devno);
printk(
"major is [%d], minor is [%d]\n", major, minor);

cdev_init(
&test_cdev, &test_ops);

err
= cdev_add(&test_cdev, devno, 1);
if(err){
printk(
"<kernel> cdev_add failed\n");
err
= -ENODEV;
goto fail1;
}

printk(
"<kernel>init \n");
return 0;

fail:
return err;
fail1:
unregister_chrdev_region(devno,
1);
return err;
}

/*
 * exit_test - module exit point.
 *
 * Undoes init_test() in reverse order: removes the cdev first, then gives
 * back the single device number held in devno, and finally logs "bye".
 */
void exit_test(void)
{
	cdev_del(&test_cdev);
	unregister_chrdev_region(devno, 1);

	printk("bye\n");
}

/* Entry and exit points of the module. */
module_init(init_test);
module_exit(exit_test);

/* Module metadata. */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jesse");
MODULE_DESCRIPTION("this is a test module");
MODULE_VERSION("v0.1");

  

仅一个简单的open,应该不会有更简单的字符设备驱动了。

app 层还应该有这么个东西。

fd = open("/dev/test", O_RDWR);

好了,上面下面都有了。那,中间是怎么个回事?

大致的过程:

  

fd = open("/dev/test", O_RDWR);
sys_open
test_open

    

这个sys_open()可不是一个简单的函数,它包括了文件路径查找、文件权限判断等各种复杂的步骤。况且,不知从哪个内核版本起,sys_open 已不再以普通函数的形式直接定义,而是由 SYSCALL_DEFINE3 宏展开生成:用 ctags 定位不到它,即便 find/grep 到了,看到的也只是宏的形式,早已面目全非。

     

-- fs/open.c --

/*
 * open(2) system call entry.
 *
 * ORs O_LARGEFILE into flags when force_o_largefile() says so, then hands
 * the whole job to do_sys_open() relative to AT_FDCWD and returns its
 * result unchanged.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
{
	long ret;

	if (force_o_largefile())
		flags |= O_LARGEFILE;

	ret = do_sys_open(AT_FDCWD, filename, flags, mode);	/* ==> bb */

	/* avoid REGPARM breakage on x86: */
	asmlinkage_protect(3, ret, filename, flags, mode);

	return ret;
}

有人问了,这个SYSCALL_DEFINE3是个什么东西,“你最好不要追究这样的问题”。

内核里的各种宏定义,不是一般的有才。把相关的宏摘出来,简单地 gcc -E 预处理一下,瞧瞧展开后的样子。
    

/*
 * __SYSCALL_DEFINEx - core of the syscall-definition machinery.
 *
 * For a syscall <name> with x arguments this emits, in order:
 *   - a prototype for sys<name> using the __SC_DECLx argument list;
 *   - a prototype for the static inline worker SYSC<name>;
 *   - the definition of SyS<name>, which takes the __SC_LONGx argument
 *     list, runs __SC_TESTx on the arguments and calls SYSC<name> with the
 *     arguments passed through __SC_CASTx;
 *   - SYSCALL_ALIAS(sys<name>, SyS<name>);
 *   - the header of SYSC<name>, left open so that the { ... } body written
 *     after the macro invocation becomes the body of SYSC<name>.
 *
 * Every line of the body must end in a continuation backslash; otherwise
 * the definition stops early and the remaining lines become stray
 * file-scope code.
 */
#define __SYSCALL_DEFINEx(x, name, ...)					\
	asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__));		\
	static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__));	\
	asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__))		\
	{								\
		__SC_TEST##x(__VA_ARGS__);				\
		return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__));	\
	}								\
	SYSCALL_ALIAS(sys##name, SyS##name);				\
	static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__))

/*
 * SYSCALL_DEFINEx - metadata-emitting wrapper around __SYSCALL_DEFINEx.
 *
 * Emits the string tables types_<sname> and args_<sname> (built by
 * __SC_STR_TDECLx / __SC_STR_ADECLx), invokes SYSCALL_METADATA(sname, x),
 * and then expands __SYSCALL_DEFINEx for the same arguments.
 */
#define SYSCALL_DEFINEx(x, sname, ...)					\
	static const char *types_##sname[] = {				\
		__SC_STR_TDECL##x(__VA_ARGS__)				\
	};								\
	static const char *args_##sname[] = {				\
		__SC_STR_ADECL##x(__VA_ARGS__)				\
	};								\
	SYSCALL_METADATA(sname, x);					\
	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)

/*
 * SYSCALL_DEFINE3 - three-argument front end: prefixes the name with '_'
 * (open -> _open, giving sys_open / SyS_open / SYSC_open) and fixes x = 3.
 */
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)

/*
 * Throw-away driver for the preprocessor experiment: it exists only so
 * that `gcc -E` has one SYSCALL_DEFINE3 invocation to expand.
 */
int main(void)
{
	SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode);

}

展开真面目:

/*
 * Preprocessed form of the demo main(): the three SYSCALL_DEFINE* layers
 * have been expanded for name "open" (sname "_open", x = 3).
 * The __SC_* helpers, SYSCALL_METADATA, SYSCALL_ALIAS and asmlinkage are
 * still present as-is because the demo did not define them.
 * The trailing ';' on the last SYSC_open line comes from the ';' written
 * after the macro invocation in the demo.
 */
int main(void)
{

static const char *types__open[] = { __SC_STR_TDECL3(const char __user *, filename, int, flags, int, mode) };
static const char *args__open[] = { __SC_STR_ADECL3(const char __user *, filename, int, flags, int, mode) };
SYSCALL_METADATA(_open,
3);
/* prototype of the public entry point */
asmlinkage
long sys_open(__SC_DECL3(const char __user *, filename, int, flags, int, mode));
/* prototype of the worker that carries the real body */
static inline long SYSC_open(__SC_DECL3(const char __user *, filename, int, flags, int, mode));
/* wrapper: checks the arguments, then forwards to the worker */
asmlinkage
long SyS_open(__SC_LONG3(const char __user *, filename, int, flags, int, mode))
{
__SC_TEST3(
const char __user *, filename, int, flags, int, mode);
return (long) SYSC_open(__SC_CAST3(const char __user *, filename, int, flags, int, mode));
}
/* sys_open is made an alias of SyS_open */
SYSCALL_ALIAS(sys_open, SyS_open);
/* header of the worker; in real use the syscall body follows here */
static inline long SYSC_open(__SC_DECL3(const char __user *, filename, int, flags, int, mode));


}

一些宏还未展开,点到为止,见好就收吧。

       

我们继续往下看。

     

bb:

/*
 * do_sys_open - common implementation behind open().
 *
 * @dfd:      directory fd that the path lookup is relative to
 * @filename: user-space pointer to the path
 * @flags:    open flags
 * @mode:     creation mode
 *
 * Copies the path into the kernel, reserves a file descriptor, opens the
 * file and binds the two together.
 *
 * Returns the new fd on success. On failure returns the error carried by
 * getname(), get_unused_fd_flags() or do_filp_open(), whichever failed
 * first.
 */
long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
{
	struct file *f;
	char *tmp = getname(filename);	/* kernel copy of filename ==> cc */
	int fd = PTR_ERR(tmp);

	if (IS_ERR(tmp))
		return fd;

	fd = get_unused_fd_flags(flags);	/* reserve an unused fd ==> dd */
	if (fd < 0)
		goto out_putname;

	f = do_filp_open(dfd, tmp, flags, mode, 0);	/* ==> ee */
	if (IS_ERR(f)) {
		/* Open failed: give the reserved fd back and report the error. */
		put_unused_fd(fd);
		fd = PTR_ERR(f);
		goto out_putname;
	}

	fsnotify_open(f->f_path.dentry);	/* ==> ff */
	/*
	 * Bind fd to the struct file so that later read/write system calls
	 * can find it ==> gg
	 */
	fd_install(fd, f);

out_putname:
	/* Done with the temporary kernel copy of the filename (kmem_cache_free). */
	putname(tmp);
	return fd;
}

         

cc:

/* Allocate one name buffer from names_cachep with the given gfp flags. */
#define __getname_gfp(gfp)  kmem_cache_alloc(names_cachep, (gfp))
/* Same, with GFP_KERNEL. */
#define __getname() __getname_gfp(GFP_KERNEL)

/*
 * getname - bring a user-space path into a kernel buffer.
 *
 * Returns the kernel buffer on success, ERR_PTR(-ENOMEM) when no buffer
 * could be allocated, or ERR_PTR(retval) when do_getname() fails (the
 * buffer is released first). The result, success or error, is always
 * passed to audit_getname() before being returned.
 */
char * getname(const char __user * filename)
{
	char *kbuf = __getname();	/* kmem_cache_alloc: grab a buffer */
	char *result;

	if (!kbuf) {
		result = ERR_PTR(-ENOMEM);
	} else {
		/* copy the filename into the kernel buffer before using it */
		int retval = do_getname(filename, kbuf);

		if (retval < 0) {
			__putname(kbuf);
			result = ERR_PTR(retval);
		} else {
			result = kbuf;
		}
	}

	audit_getname(result);
	return result;
}

dd:

/* Reserve a file descriptor: alloc_fd() with start fixed at 0 and the caller's flags. */
#define get_unused_fd_flags(flags) alloc_fd(0, (flags))

-- fs/file.c --

/*
* allocate a file descriptor, mark it busy.
*
* start: lowest descriptor number the caller will accept.
* flags: only O_CLOEXEC is examined here; it selects whether the
*        close-on-exec bit of the new descriptor is set or cleared.
*
* Returns the allocated descriptor (>= 0), or the negative value returned
* by expand_files(). The whole operation runs under files->file_lock.
*/
int alloc_fd(unsigned start, unsigned flags)
{
struct files_struct *files = current->files;
unsigned
int fd;
int error;
struct fdtable *fdt;

spin_lock(
&files->file_lock);

repeat:
/* Re-read the table on every pass: the retry below jumps back here. */
fdt
= files_fdtable(files);
/* Begin the search at the larger of start and files->next_fd. */
fd
= start;
if (fd < files->next_fd)
fd
= files->next_fd;

/* Within the current table: find the first clear bit in open_fds at or after fd. */
if (fd < fdt->max_fds)
fd
= find_next_zero_bit(fdt->open_fds->fds_bits,
fdt
->max_fds,
fd);
// a very familiar function ==> ddD

/* Negative: failure. Positive: handled by the retry below. Zero: proceed. */
error
= expand_files(files, fd);
if (error < 0)
goto out;

/*
* If we needed to expand the fs array we
* might have blocked - try again.
*/
if (error)
goto repeat;

/* Advance the search hint only when this search was not started above it. */
if (start <= files->next_fd)
files
->next_fd = fd + 1;

/* Mark the descriptor busy and set its close-on-exec bit according to flags. */
FD_SET(fd, fdt
->open_fds);
if (flags & O_CLOEXEC)
FD_SET(fd, fdt
->close_on_exec);
else
FD_CLR(fd, fdt
->close_on_exec);

/* Success: the return value is the descriptor itself. */
error
= fd;

#if 1
/* Sanity check */
/* A freshly allocated slot should be empty; warn and clear it if it is not. */
if (rcu_dereference(fdt->fd[fd]) != NULL) {
printk(KERN_WARNING
"alloc_fd: slot %d not NULL!\n", fd);
rcu_assign_pointer(fdt
->fd[fd], NULL);
}
#endif

out:
spin_unlock(
&files->file_lock);
return error;
}

ddD:

  一个出镜率很高的函数,常用于各种描述符(例如文件描述符)的分配。当然了,这些描述符都是按顺序分配的:用一个类似数组的位图来记录,位为 0 表示未分配,然后从 offset 开始遍历去找第一个 0。

/*
 * find_next_zero_bit - find the first clear bit at or after a position.
 *
 * addr:   start of the bitmap (array of unsigned long words)
 * size:   number of bits in the bitmap
 * offset: bit index at which the search starts
 *
 * Returns the index of the first zero bit found at or after offset, or
 * size when offset >= size or no such bit exists. Throughout the search
 * "result" is the bit index of the word about to be examined and "size"
 * is the number of bits still to examine, so result + size stays equal to
 * the original size.
 * NOTE(review): ffz(tmp) presumably yields the index of the lowest clear
 * bit of tmp, and BITOP_WORD(n) the word index n / BITS_PER_LONG; neither
 * is defined here - confirm.
 */
unsigned long find_next_zero_bit(const unsigned long *addr, 
     unsigned
long size,
unsigned
long offset)
{
const unsigned long *p = addr + BITOP_WORD(offset); // word holding bit 'offset' (p == addr while offset < BITS_PER_LONG)
unsigned long result = offset & ~(BITS_PER_LONG-1); // offset rounded down to a word boundary (0 while offset < BITS_PER_LONG)
unsigned long tmp;

if (offset >= size)
return size;

size
-= result;
offset %= BITS_PER_LONG;
/* Search starts mid-word: treat the bits below offset as already set. */
if (offset) {
tmp
= *(p++);
tmp
|= ~0UL >> (BITS_PER_LONG - offset);
/* This partial word is also the last one. */
if (size < BITS_PER_LONG)
goto found_first;
/* A zero bit exists somewhere in this word. */
if (~tmp)
goto found_middle;
size
-= BITS_PER_LONG;
result
+= BITS_PER_LONG;
}

/* Scan whole words while at least BITS_PER_LONG bits remain. */
while (size & ~(BITS_PER_LONG-1)) {

if (~(tmp = *(p++)))
goto found_middle;
result
+= BITS_PER_LONG;
size
-= BITS_PER_LONG;
}

/* Nothing left to examine: result now equals the original size. */
if (!size)
return result;

/* Trailing partial word. */
tmp
= *p;

found_first:
/* Treat the bits at and above 'size' (beyond the bitmap) as set. */
tmp
|= ~0UL << size;
if (tmp == ~0UL) /* Are any bits zero? */
return result + size; /* Nope. */

found_middle:
return result + ffz(tmp);
}

           
下面是理解的重点,也是一调到底的精髓。重点在于struct file的分配。

         
ee:

/*
* Excerpt: the lines marked "..." were left out by the author, including
* the O_CREAT path and the code at the "exit" label.
*
* Resolve pathname (relative to dfd) and return the opened struct file, or
* an ERR_PTR() value on failure. When acc_mode is 0 it is derived from
* the flags.
*
* Note that the low bits of the passed in "open_flag"
* are not the same as in the local variable "flag". See
* open_to_namei_flags() for more details.
*/
struct file *do_filp_open(int dfd, const char *pathname,
int open_flag, int mode, int acc_mode)
{
struct file *filp;
struct nameidata nd;
int error;
struct path path;
struct dentry *dir;
int count = 0;
int will_write;
int flag = open_to_namei_flags(open_flag);

/* Set up the access mode for this open when the caller gave none. */
if (!acc_mode)
acc_mode = MAY_OPEN | ACC_MODE(flag);

/* O_TRUNC implies we need access checks for write permissions */
if (flag & O_TRUNC)
acc_mode |= MAY_WRITE;

/* Allow the LSM permission hook to distinguish append
access from general write access.
*/
if (flag & O_APPEND)
acc_mode |= MAY_APPEND;

/*
* The simplest case - just a plain lookup.
*/
if (!(flag & O_CREAT)) {
error = path_lookup_open(dfd, pathname, lookup_flags(flag),
&nd, flag);
if (error)
return ERR_PTR(error);
goto ok;
}

...
...


ok:

/*
* Consider:
* 1. may_open() truncates a file
* 2. a rw->ro mount transition occurs
* 3. nameidata_to_filp() fails due to
* the ro mount.
* That would be inconsistent, and should
* be avoided. Taking this mnt write here
* ensures that (2) can not occur.
*/
will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
if (will_write) {
error = mnt_want_write(nd.path.mnt);
if (error)
goto exit;
}
error = may_open(&nd.path, acc_mode, flag); //**
if (error) {
/* Undo the mnt_want_write() taken above before bailing out. */
if (will_write)
mnt_drop_write(nd.path.mnt);
goto exit;
}
filp = nameidata_to_filp(&nd, open_flag); // turn the lookup result into the struct file (filp)
if (IS_ERR(filp))
ima_counts_put(&nd.path,
acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
/*
* It is now safe to drop the mnt write
* because the filp has had a write taken
* on its behalf.
*/
if (will_write)
mnt_drop_write(nd.path.mnt);
if (nd.root.mnt)
path_put(&nd.root);
return filp;

...
...

}

            

/*
 * nameidata_to_filp - produce the struct file for a completed lookup.
 *
 * Takes the file stored in the open intent of @nd. If its f_path.dentry is
 * already set, the path reference held by @nd is dropped and the file is
 * returned as-is. Otherwise the file is completed by __dentry_open() with
 * no explicit open callback (NULL) and the current credentials.
 */
struct file *nameidata_to_filp(struct nameidata *nd, int flags)
{
	const struct cred *cred = current_cred();
	/* Pick up the filp from the open intent */
	struct file *filp = nd->intent.open.file;

	/* Has the filesystem initialised the file for us? */
	if (filp->f_path.dentry != NULL) {
		path_put(&nd->path);
		return filp;
	}

	/* !!! */
	return __dentry_open(nd->path.dentry, nd->path.mnt, flags, filp,
			     NULL, cred);
}

           

static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
int flags, struct file *f,
int (*open)(struct inode *, struct file *),
const struct cred *cred)
{
struct inode *inode;
int error;

f->f_flags = flags;
f->f_mode = (__force fmode_t)((flags+1) & O_ACCMODE) | FMODE_LSEEK |
FMODE_PREAD | FMODE_PWRITE;
inode = dentry->d_inode;
if (f->f_mode & FMODE_WRITE) {
error = __get_file_write_access(inode, mnt);
if (error)
goto cleanup_file;
if (!special_file(inode->i_mode))
file_take_write(f);
}

f->f_mapping = inode->i_mapping;
f->f_path.dentry = dentry;
f->f_path.mnt = mnt;
f->f_pos = 0;
f->f_op = fops_get(inode->i_fop);  //!!! !!!
file_move(f, &inode->i_sb->s_files);

error = security_dentry_open(f, cred);
if (error)
goto cleanup_all;

if (!open && f->f_op)  //f->f_op若有,则执行open
open = f->f_op->open;  
if (open) {
error = open(inode, f);
if (error)
goto cleanup_all;
}

f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);

file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);

          
说下六个感叹号的地方。记得我们在注册字符设备的时候是否有个cdev_init ? 她的体内是不是有个 cdev->ops = fops ?

inode里是不是有个i_cdev ?

这里,file的f_op是不是被赋了inode的i_fop ?

打开struct file, struct inode的定义处,多瞧上两眼。这里就不贴了。

         

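就这样,fd = open("/dev/test", O_RDWR) 最终还是调到了test_open 。补充中间的一步:字符设备 inode 的 i_fop 是内核的 def_chr_fops,它的 open 是 chrdev_open();chrdev_open() 根据设备号找到我们用 cdev_add 注册进去的 cdev,把 file 的 f_op 换成 cdev->ops(也就是 test_ops),再调用其中的 open,即 test_open。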

             

最后就是个收尾函数,将得到的fd和struct file关联起来。

         

gg:

/*
 * fd_install - publish a struct file under a file descriptor.
 *
 * Stores @file in slot @fd of the current task's descriptor table while
 * holding the table's file_lock. The slot must be empty; a non-NULL slot
 * triggers BUG_ON().
 */
void fd_install(unsigned int fd, struct file *file)
{
	struct files_struct *cur_files = current->files;
	struct fdtable *table;

	spin_lock(&cur_files->file_lock);

	table = files_fdtable(cur_files);
	BUG_ON(table->fd[fd] != NULL);
	rcu_assign_pointer(table->fd[fd], file);

	spin_unlock(&cur_files->file_lock);
}

           

do_sys_open 的结尾 return fd;  返回给 app。

  fd = open("/dev/test", O_RDWR) 

          

  你懂的。

  

posted @ 2011-09-21 10:31  郝壹贰叁  阅读(1886)  评论(0)  编辑  收藏  举报