用crash来分析一下proc的文件访问
一般来说,用户通过传入文件名(filename),调用open系统调用来获取fd,然后read的时候,内核再通过这个fd来查找对应的file*
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode) { long ret; if (force_o_largefile()) flags |= O_LARGEFILE; ret = do_sys_open(AT_FDCWD, filename, flags, mode); /* avoid REGPARM breakage on x86: */ asmlinkage_protect(3, ret, filename, flags, mode); return ret; } SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) { struct file *file; ssize_t ret = -EBADF; int fput_needed; file = fget_light(fd, &fput_needed); if (file) { loff_t pos = file_pos_read(file); ret = vfs_read(file, buf, count, &pos); file_pos_write(file, pos); fput_light(file, fput_needed); } return ret; }
所以file*是fget_light的返回值,下面通过一个crash来分析对应的file指针
file指针是fget_light的返回值,默认在rax中,调用完之后,rax赋值给了rbx,且rbx在调用vfs_read之前未更改,所以rbx里面存放的就是file指针,进入vfs_read之后,rbx被保存在rbp-0x18的位置 crash> dis -l vfs_read /usr/src/debug/kernel-2.6.32-220.el6/linux-2.6.32-220.el6.x86_64/fs/read_write.c: 278 0xffffffff8117e9c0 <vfs_read>: push %rbp 0xffffffff8117e9c1 <vfs_read+1>: mov %rsp,%rbp 0xffffffff8117e9c4 <vfs_read+4>: sub $0x30,%rsp 0xffffffff8117e9c8 <vfs_read+8>: mov %rbx,-0x18(%rbp) crash> px 0xffff8817fc9e1f28-0x18 $2 = 0xffff8817fc9e1f10 crash> crash> struct file ffff880c9ddbba80 struct file { f_u = { fu_list = { next = 0xffff880c860d8500, prev = 0xffff880c1188cce8 }, fu_rcuhead = { next = 0xffff880c860d8500, func = 0xffff880c1188cce8 } }, f_path = { mnt = 0xffff88180ec5eec0, dentry = 0xffff8818014b1540 }, f_op = 0xffffffff8161f980 <proc_sops+128>,---------------------这个f_op是我们要继续分析的: f_lock = { raw_lock = { slock = 0 } }, f_count = { counter = 1 }, f_flags = 32768, f_mode = 13, f_pos = 20480, f_owner = { lock = { raw_lock = { lock = 16777216 } }, pid = 0x0, pid_type = PIDTYPE_PID, uid = 0, euid = 0, signum = 0 }, f_cred = 0xffff880c0ecd8b00, f_ra = { start = 0, size = 0, async_size = 0, ra_pages = 32, mmap_miss = 0, prev_pos = -1 }, f_version = 0, f_security = 0x0, private_data = 0xffff8817f3eee740, f_ep_links = { next = 0xffff880c9ddbbb28, prev = 0xffff880c9ddbbb28 }, f_mapping = 0xffff88100a811918 }
我们继续取对应的f_op成员分析。不同的文件系统有不同的f_op函数集,proc文件系统的f_op如下:
crash> struct file_operations 0xffffffff8161f980 struct file_operations { owner = 0x0, llseek = 0xffffffff811e3600 <proc_reg_llseek>, read = 0xffffffff811e3540 <proc_reg_read>,-----------------这个read要继续跟进 write = 0xffffffff811e3480 <proc_reg_write>, aio_read = 0x0, aio_write = 0x0, readdir = 0x0, poll = 0xffffffff811e33d0 <proc_reg_poll>, ioctl = 0x0, unlocked_ioctl = 0xffffffff811e36b0 <proc_reg_unlocked_ioctl>, compat_ioctl = 0x0, mmap = 0xffffffff811e3320 <proc_reg_mmap>, open = 0xffffffff811e3a50 <proc_reg_open>, flush = 0x0, release = 0xffffffff811e31f0 <proc_reg_release>, fsync = 0x0, aio_fsync = 0x0, fasync = 0x0, lock = 0x0, sendpage = 0x0, get_unmapped_area = 0x0, check_flags = 0x0, flock = 0x0, splice_write = 0x0, splice_read = 0x0, setlease = 0x0 }
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read)) return -EINVAL; if (unlikely(!access_ok(VERIFY_WRITE, buf, count))) return -EFAULT; ret = rw_verify_area(READ, file, pos, count); if (ret >= 0) { count = ret; if (file->f_op->read) ret = file->f_op->read(file, buf, count, pos);-------proc文件系统,这个为proc_reg_read else
static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);-----------根据inode来获取pde
一个proc文件系统的entry结构(proc_dir_entry)是通过PDE这个内联函数来获取的。
PDE的实现如下:
static inline struct proc_inode *PROC_I(const struct inode *inode)
{
return container_of(inode, struct proc_inode, vfs_inode);
}
static inline struct proc_dir_entry *PDE(const struct inode *inode)
{
return PROC_I(inode)->pde;
}
所以vfs层的inode结构其实就是嵌入到了proc_inode中;
crash> struct -xo proc_inode
struct proc_inode {
[0x0] struct pid *pid;
[0x8] int fd;
[0x10] union proc_op op;
[0x18] struct proc_dir_entry *pde;
[0x20] struct ctl_table_header *sysctl;
[0x28] struct ctl_table *sysctl_entry;
[0x30] struct inode vfs_inode;-------------------vfs层的inode
}
通过vfs_inode获取到proc_inode之后,就会获取proc_dir_entry
crash> struct -xo proc_dir_entry
struct proc_dir_entry {
[0x0] unsigned int low_ino;
[0x4] unsigned short namelen;
[0x8] const char *name;
[0x10] mode_t mode;
[0x18] nlink_t nlink;
[0x20] uid_t uid;
[0x24] gid_t gid;
[0x28] loff_t size;
[0x30] const struct inode_operations *proc_iops;
[0x38] const struct file_operations *proc_fops;
[0x40] struct proc_dir_entry *next;
[0x48] struct proc_dir_entry *parent;
[0x50] struct proc_dir_entry *subdir;
[0x58] void *data;
[0x60] read_proc_t *read_proc;
[0x68] write_proc_t *write_proc;
[0x70] atomic_t count;
[0x74] int pde_users;
[0x78] spinlock_t pde_unload_lock;
[0x80] struct completion *pde_unload_completion;
[0x88] struct list_head pde_openers;
}
SIZE: 0x98
所以我们找出inode,就可以找出pde。
crash> struct file.f_path ffff880c9ddbba80 f_path = { mnt = 0xffff88180ec5eec0, dentry = 0xffff8818014b1540 }
crash> struct file.f_path.dentry ffff880c9ddbba80 f_path.dentry = 0xffff8818014b1540 crash> struct dentry.d_inode 0xffff8818014b1540 d_inode = 0xffff88100a8117f8
crash> px 0xffff88100a8117f8-0x30 $11 = 0xffff88100a8117c8 crash> struct proc_inode.pde 0xffff88100a8117c8 pde = 0xffff880c0ef13b00
crash> struct proc_dir_entry.proc_fops 0xffff880c0ef13b00 proc_fops = 0xffffffff8161cdc0
crash> struct file_operations 0xffffffff8161cdc0 struct file_operations { owner = 0x0, llseek = 0xffffffff811a0490 <seq_lseek>, read = 0xffffffff811a0950 <seq_read>, write = 0xffffffff811687c0 <slabinfo_write>, aio_read = 0x0, aio_write = 0x0, readdir = 0x0, poll = 0x0, ioctl = 0x0, unlocked_ioctl = 0x0, compat_ioctl = 0x0, mmap = 0x0, open = 0xffffffff81165640 <slabinfo_open>,----------------最终调用的open flush = 0x0, release = 0xffffffff8119fde0 <seq_release>, fsync = 0x0, aio_fsync = 0x0, fasync = 0x0, lock = 0x0, sendpage = 0x0, get_unmapped_area = 0x0, check_flags = 0x0, flock = 0x0, splice_write = 0x0, splice_read = 0x0, setlease = 0x0 }
static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); ssize_t rv = -EIO; ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); spin_lock(&pde->pde_unload_lock); if (!pde->proc_fops) { spin_unlock(&pde->pde_unload_lock); return rv; } pde->pde_users++; read = pde->proc_fops->read;--------------------调用的read是seq_read spin_unlock(&pde->pde_unload_lock); if (read) rv = read(file, buf, count, ppos); pde_users_dec(pde); return rv; }
所以本案例中,proc文件系统最终调用的read是seq_read,open是:slabinfo_open,当然,这个跟案例相关,因为proc的封装是到proc_reg_read 为止,函数proc_reg_read是vfs read进入proc的入口。看linux源码的时候,不能只见树木,不见森林,要有清晰的层次概念。后面的实现与不同的proc文件相关,不同的层次完成不同的功能是linux的设计哲学,由于seq_read是seq类文件的封装,实现在seq_file.c中,主要功能如下:
ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct seq_file *m = (struct seq_file *)file->private_data; 。。。。。 p = m->op->start(m, &pos); while (1) { err = PTR_ERR(p); if (!p || IS_ERR(p)) break; err = m->op->show(m, p); if (err < 0) break; if (unlikely(err)) m->count = 0; if (unlikely(!m->count)) { p = m->op->next(m, p, &pos); m->index = pos; continue; } if (m->count < m->size) goto Fill; m->op->stop(m, p); kfree(m->buf); m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); if (!m->buf) goto Enomem; m->count = 0; m->version = 0; pos = m->index; p = m->op->start(m, &pos); } m->op->stop(m, p); m->count = 0; goto Done; Fill:
ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct seq_file *m = (struct seq_file *)file->private_data; 。。。。。 p = m->op->start(m, &pos);------------标准的start while (1) { err = PTR_ERR(p); if (!p || IS_ERR(p)) break; err = m->op->show(m, p); if (err < 0) break; if (unlikely(err)) m->count = 0; if (unlikely(!m->count)) { p = m->op->next(m, p, &pos);----next动作 m->index = pos; continue; } if (m->count < m->size) goto Fill; m->op->stop(m, p);----stop动作 kfree(m->buf); m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); if (!m->buf) goto Enomem; m->count = 0; m->version = 0; pos = m->index; p = m->op->start(m, &pos);-----循环start } m->op->stop(m, p); m->count = 0; goto Done; Fill:
crash> struct file.private_data ffff880c9ddbba80 private_data = 0xffff8817f3eee740 crash> struct seq_file 0xffff8817f3eee740 struct seq_file { buf = 0xffff8817f4448000 "size-65536(DMA) 0 0 65536 1 16 : tunables 8 4 0 : slabdata 0 0 0\nsize-65536 19 19 65536 1 16 : tunables 8 4 0 : slabdata 19 19 0\nsize-32768(DMA) 0 0 32768 "..., size = 4096, from = 963, count = 0, index = 189, read_pos = 20480, version = 0, lock = { count = { counter = 0 }, wait_lock = { raw_lock = { slock = 0 } }, wait_list = { next = 0xffff8817f3eee780, prev = 0xffff8817f3eee780 }, owner = 0xffff8817fc9e0000 }, op = 0xffffffff8161cea0 <proc_slabinfo_operations+192>, private = 0x0 } crash> struct seq_file struct seq_file { char *buf; size_t size; size_t from; size_t count; loff_t index; loff_t read_pos; u64 version; struct mutex lock; const struct seq_operations *op; void *private; } SIZE: 104 crash> struct seq_operations 0xffffffff8161cea0 struct seq_operations { start = 0xffffffff81165680 <s_start>, stop = 0xffffffff811652f0 <s_stop>, next = 0xffffffff81165660 <s_next>, show = 0xffffffff81166420 <s_show> }
有seq_file,一般就会涉及seq_operations,上面就是分析对应seq_operations中的函数的情况。由于后面已经不是proc文件系统层的范畴,本文结束。
水平有限,如果有错误,请帮忙提醒我。如果您觉得本文对您有帮助,可以点击下面的 推荐 支持一下我。版权所有,需要转发请带上本文源地址,博客一直在更新,欢迎 关注 。