系统调用之“一调到底”
先写个简单的test.c
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/cdev.h>
#include <linux/errno.h>
/* Device state shared by init_test() and exit_test(). */
struct cdev test_cdev;		/* cdev object passed to cdev_init()/cdev_add() */
dev_t devno;			/* device number filled in by alloc_chrdev_region() */
unsigned int major, minor;	/* MAJOR()/MINOR() of devno; zero until init_test() runs */
int test_open (struct inode *nod, struct file *filp)
{
printk("<kernel> %s\n", __FUNCTION__);
return 0;
}
struct file_operations test_ops = {
.open = test_open,
};
int init_test(void)
{
int err = 0;
err = alloc_chrdev_region(&devno, 0, 1, "alloc register");
if(err){
printk("<kernel> cdev_add failed\n");
err = -EBUSY;
goto fail;
}
major = MAJOR(devno);
minor = MINOR(devno);
printk("major is [%d], minor is [%d]\n", major, minor);
cdev_init(&test_cdev, &test_ops);
err = cdev_add(&test_cdev, devno, 1);
if(err){
printk("<kernel> cdev_add failed\n");
err = -ENODEV;
goto fail1;
}
printk("<kernel>init \n");
return 0;
fail:
return err;
fail1:
unregister_chrdev_region(devno, 1);
return err;
}
void exit_test(void)
{
cdev_del(&test_cdev);
unregister_chrdev_region(devno, 1);
printk("bye\n");
}
/* Module metadata. */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jesse");
MODULE_DESCRIPTION("this is a test module");
MODULE_VERSION("v0.1");

/* Register init_test()/exit_test() as the module's init and exit functions. */
module_init(init_test);
module_exit(exit_test);
仅一个简单的open,应该不会有更简单的字符设备驱动了。
app 层还应该有这么一句调用（前提是已按加载模块时打印出的主设备号创建了设备节点，例如 mknod /dev/test c <major> 0）。
fd = open("/dev/test", O_RDWR);
好了,上面下面都有了。那,中间是怎么个回事?
大致的过程:
fd = open("/dev/test", O_RDWR);
sys_open
test_open
这个 sys_open() 可不是一个简单的函数，它包括了文件路径查找、文件权限判断等各种复杂（变态）的步骤。况且，不知何时起，内核里的 sys_open 已不是曾经那个光明磊落的 sys_open：用 ctags 跳转不到它的定义，即便用 find/grep 搜到，也是被宏包装过的变态形式，早已面目全非。
-- fs/open.c --
/*
 * open system call as defined in fs/open.c: adjusts the flags and hands
 * everything to do_sys_open(), whose result is returned unchanged.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
{
long ret;
/* OR in O_LARGEFILE whenever force_o_largefile() says so */
if (force_o_largefile())
flags |= O_LARGEFILE;
/* the real work happens in do_sys_open(), called with AT_FDCWD as the directory fd */
ret = do_sys_open(AT_FDCWD, filename, flags, mode); //==>bb
/* avoid REGPARM breakage on x86: */
asmlinkage_protect(3, ret, filename, flags, mode);
return ret;
}
有人问了,这个SYSCALL_DEFINE3是个什么东西,“你最好不要追究这样的问题”。
内核里的各种宏定义，不是一般的有才。把相关的宏摘出来，用 gcc -E 简单地预处理一下，瞧瞧它展开成什么样。
/*
 * __SYSCALL_DEFINEx(x, name, ...) emits, in this order:
 *  - a declaration of sys<name> taking the __SC_DECL<x> parameter list,
 *  - a forward declaration of the static inline SYSC<name>,
 *  - the definition of SyS<name>, which runs __SC_TEST<x> and returns
 *    SYSC<name>(__SC_CAST<x>(...)) cast to long,
 *  - SYSCALL_ALIAS(sys<name>, SyS<name>),
 *  - the header of SYSC<name>, so the brace block written after the macro
 *    use becomes the body of SYSC<name>.
 */
#define __SYSCALL_DEFINEx(x, name, ...) \
asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)); \
static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)); \
asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__)) \
{ \
__SC_TEST##x(__VA_ARGS__); \
return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__)); \
} \
SYSCALL_ALIAS(sys##name, SyS##name); \
static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__))
/*
 * SYSCALL_DEFINEx(x, sname, ...) defines the string arrays types_<sname> and
 * args_<sname>, invokes SYSCALL_METADATA(sname, x), then expands
 * __SYSCALL_DEFINEx with the same arguments.
 */
#define SYSCALL_DEFINEx(x, sname, ...) \
static const char *types_##sname[] = { \
__SC_STR_TDECL##x(__VA_ARGS__) \
}; \
static const char *args_##sname[] = { \
__SC_STR_ADECL##x(__VA_ARGS__) \
}; \
SYSCALL_METADATA(sname, x); \
__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
/* Three-argument form: prefixes the name with '_' and passes x = 3. */
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
/*
 * Throwaway wrapper: it exists only so that the macro use below can be fed
 * to the preprocessor (gcc -E); it is not meant to be compiled or run.
 */
int main(void)
{
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode);
}
展开真面目:
/*
 * Preprocessed form of the SYSCALL_DEFINE3(open, ...) use: the three
 * SYSCALL_DEFINE* layers are expanded, while the __SC_* helpers,
 * SYSCALL_METADATA and SYSCALL_ALIAS are left unexpanded.
 */
int main(void)
{
static const char *types__open[] = { __SC_STR_TDECL3(const char __user *, filename, int, flags, int, mode) };
static const char *args__open[] = { __SC_STR_ADECL3(const char __user *, filename, int, flags, int, mode) };
SYSCALL_METADATA(_open, 3);
asmlinkage long sys_open(__SC_DECL3(const char __user *, filename, int, flags, int, mode));
static inline long SYSC_open(__SC_DECL3(const char __user *, filename, int, flags, int, mode));
/* SyS_open: checks the arguments with __SC_TEST3, then forwards to SYSC_open */
asmlinkage long SyS_open(__SC_LONG3(const char __user *, filename, int, flags, int, mode))
{
__SC_TEST3(const char __user *, filename, int, flags, int, mode);
return (long) SYSC_open(__SC_CAST3(const char __user *, filename, int, flags, int, mode));
}
SYSCALL_ALIAS(sys_open, SyS_open);
/* in fs/open.c the syscall body follows this header; here only the ';' of the demo statement does */
static inline long SYSC_open(__SC_DECL3(const char __user *, filename, int, flags, int, mode));
}
一些宏还未展开,点到为止,见好就收吧。
我们继续往下看。
bb:
/*
 * do_sys_open - body of the open syscall: copy the path into the kernel,
 * reserve a file descriptor, open the file and bind the two together.
 *
 * Returns the new fd on success. On failure it returns PTR_ERR() of the
 * failing getname()/do_filp_open() result, or the negative value returned by
 * get_unused_fd_flags().
 */
long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
{
char *tmp = getname(filename); // copy filename into kernel space; tmp points at the kernel copy ==>cc
int fd = PTR_ERR(tmp); // PTR_ERR() is just "return (long) ptr;" - only meaningful when IS_ERR(tmp)
if (!IS_ERR(tmp)) {
fd = get_unused_fd_flags(flags); // obtain a usable fd ==>dd
if (fd >= 0) {
struct file *f = do_filp_open(dfd, tmp, flags, mode, 0); // ==>ee
if (IS_ERR(f)) {
// the open failed: give the reserved fd back and return the error instead
put_unused_fd(fd);
fd = PTR_ERR(f);
} else {
fsnotify_open(f->f_path.dentry); //==>ff
fd_install(fd, f); // bind fd to the struct file so that read, write and other syscalls can use it ==>gg
}
}
putname(tmp); // done: release the kernel buffer that temporarily held filename (kmem_cache_free)
}
return fd;
}
cc:
/* Allocate one object from the names_cachep cache with the given gfp flags. */
#define __getname_gfp(gfp) kmem_cache_alloc(names_cachep, (gfp))
/* Same allocation with GFP_KERNEL. */
#define __getname() __getname_gfp(GFP_KERNEL)
/*
 * getname - copy a user-space file name into a freshly allocated kernel buffer.
 *
 * Returns the kernel buffer on success, ERR_PTR(-ENOMEM) when no buffer
 * could be allocated, or ERR_PTR(retval) when do_getname() fails (the buffer
 * is released first). The result, error or not, is passed to audit_getname().
 */
char * getname(const char __user * filename)
{
char *tmp, *result;
result = ERR_PTR(-ENOMEM);
tmp = __getname(); // kmem_cache_alloc: allocate a buffer for the name
if (tmp) {
int retval = do_getname(filename, tmp); //copy filenames to the kernel data space(*tmp) before using them
result = tmp;
if (retval < 0) {
__putname(tmp);
result = ERR_PTR(retval);
}
}
audit_getname(result);
return result;
}
dd:
/* get_unused_fd_flags(flags): alloc_fd() with the search starting at fd 0. */
#define get_unused_fd_flags(flags) alloc_fd(0, (flags))
-- fs/file.c --
/*
 * allocate a file descriptor, mark it busy.
 *
 * @start: lowest fd number the caller accepts
 * @flags: only O_CLOEXEC is examined here
 *
 * Works on current->files with files->file_lock held throughout.
 * Returns the allocated fd, or the negative value returned by expand_files().
 */
int alloc_fd(unsigned start, unsigned flags)
{
struct files_struct *files = current->files;
unsigned int fd;
int error;
struct fdtable *fdt;
spin_lock(&files->file_lock);
repeat:
fdt = files_fdtable(files);
/* begin the search at start, or at files->next_fd when that is higher */
fd = start;
if (fd < files->next_fd)
fd = files->next_fd;
/* look for the first clear bit in the open_fds bitmap */
if (fd < fdt->max_fds)
fd = find_next_zero_bit(fdt->open_fds->fds_bits,
fdt->max_fds,
fd); // a very familiar helper ==>ddD
error = expand_files(files, fd);
if (error < 0)
goto out;
/*
 * If we needed to expand the fs array we
 * might have blocked - try again.
 */
if (error)
goto repeat;
/* advance next_fd past the fd just chosen, unless the caller started above it */
if (start <= files->next_fd)
files->next_fd = fd + 1;
/* mark the fd busy and set or clear its close-on-exec bit according to O_CLOEXEC */
FD_SET(fd, fdt->open_fds);
if (flags & O_CLOEXEC)
FD_SET(fd, fdt->close_on_exec);
else
FD_CLR(fd, fdt->close_on_exec);
error = fd;
#if 1
/* Sanity check */
if (rcu_dereference(fdt->fd[fd]) != NULL) {
printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
rcu_assign_pointer(fdt->fd[fd], NULL);
}
#endif
out:
spin_unlock(&files->file_lock);
return error;
}
ddD:
一个出镜率很高的函数，常用于各种标识符（比如 fd）的分配。当然了，这些标识符都是按顺序分配的：用位图（类似数组）的形式记录，位图里的 0 表示未分配，然后从起始位置往后遍历，去找第一个 0。
/*
 * find_next_zero_bit - find the first clear bit at or after a given position.
 * @addr:   start of the bitmap (array of unsigned long)
 * @size:   number of bits in the bitmap
 * @offset: bit index at which the search starts
 *
 * Returns the index of the clear bit found, or @size when offset >= size or
 * when no clear bit exists in [offset, size).
 * Throughout the function, result + size equals the original @size.
 */
unsigned long find_next_zero_bit(const unsigned long *addr,
unsigned long size,
unsigned long offset)
{
const unsigned long *p = addr + BITOP_WORD(offset); // word holding bit 'offset'; equals addr only while offset < BITS_PER_LONG (assumes BITOP_WORD(n) is n / BITS_PER_LONG - TODO confirm)
unsigned long result = offset & ~(BITS_PER_LONG-1); // offset with its low bits cleared, i.e. index of the first bit of that word; 0 only while offset < BITS_PER_LONG
unsigned long tmp;
if (offset >= size)
return size;
size -= result;
offset %= BITS_PER_LONG;
/* search starts in the middle of a word: treat the bits below offset as set */
if (offset) {
tmp = *(p++);
tmp |= ~0UL >> (BITS_PER_LONG - offset);
if (size < BITS_PER_LONG)
goto found_first;
if (~tmp)
goto found_middle;
size -= BITS_PER_LONG;
result += BITS_PER_LONG;
}
/* scan whole words while at least BITS_PER_LONG bits remain */
while (size & ~(BITS_PER_LONG-1)) {
if (~(tmp = *(p++)))
goto found_middle;
result += BITS_PER_LONG;
size -= BITS_PER_LONG;
}
if (!size)
return result;
tmp = *p;
found_first:
/* partial last word: treat the bits at and above the remaining size as set */
tmp |= ~0UL << size;
if (tmp == ~0UL) /* Are any bits zero? */
return result + size; /* Nope. */
found_middle:
return result + ffz(tmp);
}
下面是理解的重点,也是一调到底的精髓。重点在于struct file的分配。
ee:
/*
 * do_filp_open - excerpt: only the plain lookup path (no O_CREAT) and the
 * common "ok" tail are shown; the parts replaced by "..." (including the
 * "exit" label) are elided.
 *
 * Note that the low bits of the passed in "open_flag"
 * are not the same as in the local variable "flag". See
 * open_to_namei_flags() for more details.
 */
struct file *do_filp_open(int dfd, const char *pathname,
int open_flag, int mode, int acc_mode)
{
struct file *filp;
struct nameidata nd;
int error;
struct path path;
struct dentry *dir;
int count = 0;
int will_write;
int flag = open_to_namei_flags(open_flag);
/* derive the access mode from the flags when the caller passed none */
if (!acc_mode)
acc_mode = MAY_OPEN | ACC_MODE(flag);
/* O_TRUNC implies we need access checks for write permissions */
if (flag & O_TRUNC)
acc_mode |= MAY_WRITE;
/* Allow the LSM permission hook to distinguish append
access from general write access. */
if (flag & O_APPEND)
acc_mode |= MAY_APPEND;
/*
 * The simplest case - just a plain lookup.
 */
if (!(flag & O_CREAT)) {
error = path_lookup_open(dfd, pathname, lookup_flags(flag),
&nd, flag);
if (error)
return ERR_PTR(error);
goto ok;
}
...
...
ok:
/*
 * Consider:
 * 1. may_open() truncates a file
 * 2. a rw->ro mount transition occurs
 * 3. nameidata_to_filp() fails due to
 * the ro mount.
 * That would be inconsistent, and should
 * be avoided. Taking this mnt write here
 * ensures that (2) can not occur.
 */
will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
if (will_write) {
error = mnt_want_write(nd.path.mnt);
if (error)
goto exit;
}
error = may_open(&nd.path, acc_mode, flag); //**
if (error) {
if (will_write)
mnt_drop_write(nd.path.mnt);
goto exit;
}
filp = nameidata_to_filp(&nd, open_flag); // turn the lookup result into the struct file; filp is what gets returned
if (IS_ERR(filp))
ima_counts_put(&nd.path,
acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
/*
 * It is now safe to drop the mnt write
 * because the filp has had a write taken
 * on its behalf.
 */
if (will_write)
mnt_drop_write(nd.path.mnt);
if (nd.root.mnt)
path_put(&nd.root);
return filp;
...
...
}
/*
 * nameidata_to_filp - produce the struct file for a finished path lookup.
 *
 * Takes the struct file stored in the open intent of @nd. If its
 * f_path.dentry is still NULL, the file is completed by __dentry_open();
 * otherwise only the reference on nd->path is dropped.
 * Returns the resulting file pointer (whatever __dentry_open() returned in
 * the first case).
 */
struct file *nameidata_to_filp(struct nameidata *nd, int flags)
{
const struct cred *cred = current_cred();
struct file *filp;
/* Pick up the filp from the open intent */
filp = nd->intent.open.file;
/* Has the filesystem initialised the file for us? */
if (filp->f_path.dentry == NULL)
filp = __dentry_open(nd->path.dentry, nd->path.mnt, flags, filp,
NULL, cred); //!!!
else
path_put(&nd->path);
return filp;
}
/*
 * __dentry_open - fill in struct file @f for @dentry/@mnt and call the open
 * method. If @open is NULL, f->f_op->open is used when f->f_op is set.
 *
 * Excerpt only: the quotation stops before the end of the function, so the
 * success return and the cleanup_all/cleanup_file labels are not shown.
 */
static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
int flags, struct file *f,
int (*open)(struct inode *, struct file *),
const struct cred *cred)
{
struct inode *inode;
int error;
f->f_flags = flags;
f->f_mode = (__force fmode_t)((flags+1) & O_ACCMODE) | FMODE_LSEEK |
FMODE_PREAD | FMODE_PWRITE;
inode = dentry->d_inode;
/* opening for write: get write access first */
if (f->f_mode & FMODE_WRITE) {
error = __get_file_write_access(inode, mnt);
if (error)
goto cleanup_file;
if (!special_file(inode->i_mode))
file_take_write(f);
}
f->f_mapping = inode->i_mapping;
f->f_path.dentry = dentry;
f->f_path.mnt = mnt;
f->f_pos = 0;
f->f_op = fops_get(inode->i_fop); //!!! !!! the file's operations come from the inode's i_fop
file_move(f, &inode->i_sb->s_files);
error = security_dentry_open(f, cred);
if (error)
goto cleanup_all;
if (!open && f->f_op) // no explicit open callback passed in: use f->f_op->open when f_op is set
open = f->f_op->open;
if (open) {
error = open(inode, f);
if (error)
goto cleanup_all;
}
/* these flags are cleared from f_flags once the open has succeeded */
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
说下标了六个感叹号的那一行（f->f_op = fops_get(inode->i_fop)）。记得我们在注册字符设备的时候是否有个 cdev_init？它的体内是不是有个 cdev->ops = fops？
inode里是不是有个i_cdev ?
这里,file的f_op是不是被赋了inode的i_fop ?
打开struct file, struct inode的定义处,多瞧上两眼。这里就不贴了。
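就这样，fd = open("/dev/test", O_RDWR) 最终还是调到了 test_open。补充中间的一环：字符设备 inode 的 i_fop 是内核通用的 def_chr_fops，它的 open 是 chrdev_open()（fs/char_dev.c）；chrdev_open() 根据设备号找到我们用 cdev_add 注册的 cdev，记到 inode->i_cdev 里，再把 filp->f_op 换成 cdev->ops（也就是 test_ops），最后调用其中的 open，即 test_open。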
最后是个收尾函数，将得到的 fd 和 struct file 关联起来。
gg:
/*
 * fd_install - store @file in slot @fd of the current task's fd table.
 *
 * Runs under files->file_lock. The slot must still be empty, otherwise
 * BUG_ON() fires.
 */
void fd_install(unsigned int fd, struct file *file)
{
struct files_struct *files = current->files;
struct fdtable *fdt;
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
BUG_ON(fdt->fd[fd] != NULL);
/* publish the pointer with rcu_assign_pointer() */
rcu_assign_pointer(fdt->fd[fd], file);
spin_unlock(&files->file_lock);
}
do_sys_open 的结尾 return fd; 返回给 app。
fd = open("/dev/test", O_RDWR)
你懂的。