用crash来分析一下proc的文件访问

一般来说，用户先传入文件名（filename）调用open系统调用来获取fd，然后在read的时候，内核通过这个fd来查找对应的file*

/*
 * open system call entry point.
 *
 * Takes a user-space path (filename), open flags and a mode, and delegates
 * the work to do_sys_open() with AT_FDCWD as the directory argument. The
 * value handed back to the caller is whatever do_sys_open() returned.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
{
    long ret;

    /* OR O_LARGEFILE into the flags when force_o_largefile() is true. */
    if (force_o_largefile())
        flags |= O_LARGEFILE;

    ret = do_sys_open(AT_FDCWD, filename, flags, mode);
    /* avoid REGPARM breakage on x86: */
    asmlinkage_protect(3, ret, filename, flags, mode);
    return ret;
}

/*
 * read system call entry point.
 *
 * Looks up the struct file for fd via fget_light() and, if one is found,
 * calls vfs_read() on it using the file's current position.
 *
 * Returns the result of vfs_read(), or -EBADF when fget_light() returns
 * NULL (ret keeps its initial value in that case).
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
    struct file *file;
    ssize_t ret = -EBADF;
    int fput_needed;

    /* fput_needed is filled in here and handed back to fput_light() below. */
    file = fget_light(fd, &fput_needed);
    if (file) {
        /* Work on a local copy of the position, then store it back. */
        loff_t pos = file_pos_read(file);
        ret = vfs_read(file, buf, count, &pos);
        file_pos_write(file, pos);
        fput_light(file, fput_needed);
    }

    return ret;
}

所以file*是fget_light的返回值,下面通过一个crash来分析对应的file指针

file指针是fget_light的返回值，默认在rax中；调用完之后，rax赋值给了rbx，且rbx在调用vfs_read之前未更改，所以rbx里面存放的就是file指针。进入vfs_read之后，rbx被保存到栈上rbp-0x18的位置，因此用vfs_read栈帧的rbp减去0x18得到栈上的地址，该地址处保存的值就是file指针

crash> dis -l vfs_read
/usr/src/debug/kernel-2.6.32-220.el6/linux-2.6.32-220.el6.x86_64/fs/read_write.c: 278
0xffffffff8117e9c0 <vfs_read>:  push   %rbp
0xffffffff8117e9c1 <vfs_read+1>:        mov    %rsp,%rbp
0xffffffff8117e9c4 <vfs_read+4>:        sub    $0x30,%rsp
0xffffffff8117e9c8 <vfs_read+8>:        mov    %rbx,-0x18(%rbp)
crash> px 0xffff8817fc9e1f28-0x18
$2 = 0xffff8817fc9e1f10
crash>
crash> struct file ffff880c9ddbba80
struct file {
  f_u = {
    fu_list = {
      next = 0xffff880c860d8500,
      prev = 0xffff880c1188cce8
    },
    fu_rcuhead = {
      next = 0xffff880c860d8500,
      func = 0xffff880c1188cce8
    }
  },
  f_path = {
    mnt = 0xffff88180ec5eec0,
    dentry = 0xffff8818014b1540
  },
  f_op = 0xffffffff8161f980 <proc_sops+128>,---------------------这个f_op是我们要继续分析的：
  f_lock = {
    raw_lock = {
      slock = 0
    }
  },
  f_count = {
    counter = 1
  },
  f_flags = 32768,
  f_mode = 13,
  f_pos = 20480,
  f_owner = {
    lock = {
      raw_lock = {
        lock = 16777216
      }
    },
    pid = 0x0,
    pid_type = PIDTYPE_PID,
    uid = 0,
    euid = 0,
    signum = 0
  },
  f_cred = 0xffff880c0ecd8b00,
  f_ra = {
    start = 0,
    size = 0,
    async_size = 0,
    ra_pages = 32,
    mmap_miss = 0,
    prev_pos = -1
  },
  f_version = 0,
  f_security = 0x0,
  private_data = 0xffff8817f3eee740,
  f_ep_links = {
    next = 0xffff880c9ddbbb28,
    prev = 0xffff880c9ddbbb28
  },
  f_mapping = 0xffff88100a811918
}

我们继续取对应的f_op成员分析。不同的文件系统有不同的f_op实现，proc文件系统的f_op如下：

crash> struct file_operations 0xffffffff8161f980
struct file_operations {
  owner = 0x0,
  llseek = 0xffffffff811e3600 <proc_reg_llseek>,
  read = 0xffffffff811e3540 <proc_reg_read>,-----------------这个read要继续跟进
  write = 0xffffffff811e3480 <proc_reg_write>,
  aio_read = 0x0,
  aio_write = 0x0,
  readdir = 0x0,
  poll = 0xffffffff811e33d0 <proc_reg_poll>,
  ioctl = 0x0,
  unlocked_ioctl = 0xffffffff811e36b0 <proc_reg_unlocked_ioctl>,
  compat_ioctl = 0x0,
  mmap = 0xffffffff811e3320 <proc_reg_mmap>,
  open = 0xffffffff811e3a50 <proc_reg_open>,
  flush = 0x0,
  release = 0xffffffff811e31f0 <proc_reg_release>,
  fsync = 0x0,
  aio_fsync = 0x0,
  fasync = 0x0,
  lock = 0x0,
  sendpage = 0x0,
  get_unmapped_area = 0x0,
  check_flags = 0x0,
  flock = 0x0,
  splice_write = 0x0,
  splice_read = 0x0,
  setlease = 0x0
}

ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
    ssize_t ret;

    if (!(file->f_mode & FMODE_READ))
        return -EBADF;
    if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
        return -EINVAL;
    if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
        return -EFAULT;

    ret = rw_verify_area(READ, file, pos, count);
    if (ret >= 0) {
        count = ret;
        if (file->f_op->read)
            ret = file->f_op->read(file, buf, count, pos);-------proc文件系统，这个为proc_reg_read
        else

static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
    struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);-----------根据inode来获取pde

一个proc文件系统的entry结构是通过PDE宏来获取的。

PDE的实现如下：

/*
 * Map a VFS inode to the struct proc_inode that embeds it: container_of()
 * steps back from the vfs_inode member to the start of the proc_inode.
 */
static inline struct proc_inode *PROC_I(const struct inode *inode)
{
    return container_of(inode, struct proc_inode, vfs_inode);
}

/*
 * Return the proc_dir_entry for a VFS inode: the pde member of the
 * proc_inode obtained through PROC_I().
 */
static inline struct proc_dir_entry *PDE(const struct inode *inode)
{
    return PROC_I(inode)->pde;
}

所以vfs层的inode结构其实就是嵌入到了proc_inode中；

crash> struct -xo proc_inode
struct proc_inode {
    [0x0] struct pid *pid;
    [0x8] int fd;
   [0x10] union proc_op op;
   [0x18] struct proc_dir_entry *pde;
   [0x20] struct ctl_table_header *sysctl;
   [0x28] struct ctl_table *sysctl_entry;
   [0x30] struct inode vfs_inode;-------------------vfs层的inode
}

通过vfs_inode获取到proc_inode之后，就会获取proc_dir_entry

crash> struct -xo proc_dir_entry
struct proc_dir_entry {
   [0x0] unsigned int low_ino;
   [0x4] unsigned short namelen;
   [0x8] const char *name;
  [0x10] mode_t mode;
  [0x18] nlink_t nlink;
  [0x20] uid_t uid;
  [0x24] gid_t gid;
  [0x28] loff_t size;
  [0x30] const struct inode_operations *proc_iops;
  [0x38] const struct file_operations *proc_fops;
  [0x40] struct proc_dir_entry *next;
  [0x48] struct proc_dir_entry *parent;
  [0x50] struct proc_dir_entry *subdir;
  [0x58] void *data;
  [0x60] read_proc_t *read_proc;
  [0x68] write_proc_t *write_proc;
  [0x70] atomic_t count;
  [0x74] int pde_users;
  [0x78] spinlock_t pde_unload_lock;
  [0x80] struct completion *pde_unload_completion;
  [0x88] struct list_head pde_openers;
}
SIZE: 0x98

所以我们找出inode，就可以找出pde。

crash> struct file.f_path ffff880c9ddbba80
  f_path = {
    mnt = 0xffff88180ec5eec0,
    dentry = 0xffff8818014b1540
  }

crash> struct file.f_path.dentry ffff880c9ddbba80
  f_path.dentry = 0xffff8818014b1540
crash> struct dentry.d_inode 0xffff8818014b1540
  d_inode = 0xffff88100a8117f8


crash> px 0xffff88100a8117f8-0x30
$11 = 0xffff88100a8117c8
crash> struct proc_inode.pde 0xffff88100a8117c8
  pde = 0xffff880c0ef13b00

crash> struct proc_dir_entry.proc_fops 0xffff880c0ef13b00
  proc_fops = 0xffffffff8161cdc0

crash> struct file_operations 0xffffffff8161cdc0
struct file_operations {
  owner = 0x0,
  llseek = 0xffffffff811a0490 <seq_lseek>,
  read = 0xffffffff811a0950 <seq_read>,
  write = 0xffffffff811687c0 <slabinfo_write>,
  aio_read = 0x0,
  aio_write = 0x0,
  readdir = 0x0,
  poll = 0x0,
  ioctl = 0x0,
  unlocked_ioctl = 0x0,
  compat_ioctl = 0x0,
  mmap = 0x0,
  open = 0xffffffff81165640 <slabinfo_open>,----------------最终调用的open
  flush = 0x0,
  release = 0xffffffff8119fde0 <seq_release>,
  fsync = 0x0,
  aio_fsync = 0x0,
  fasync = 0x0,
  lock = 0x0,
  sendpage = 0x0,
  get_unmapped_area = 0x0,
  check_flags = 0x0,
  flock = 0x0,
  splice_write = 0x0,
  splice_read = 0x0,
  setlease = 0x0
}

/*
 * read handler that forwards to the read method in the proc_dir_entry's
 * proc_fops.
 *
 * The pde is looked up from the file's dentry inode. Under pde_unload_lock
 * the function checks that proc_fops is still set, increments pde_users and
 * takes a copy of the read function pointer; the call itself is made after
 * the lock is released, and pde_users_dec() is called afterwards.
 *
 * Returns -EIO when proc_fops is NULL or when proc_fops has no read method,
 * otherwise the value returned by that read method.
 */
static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
    struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
    ssize_t rv = -EIO;
    ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);

    spin_lock(&pde->pde_unload_lock);
    /* No proc_fops: drop the lock and return -EIO. */
    if (!pde->proc_fops) {
        spin_unlock(&pde->pde_unload_lock);
        return rv;
    }
    /* Count this caller as a user before dropping the lock. */
    pde->pde_users++;
    read = pde->proc_fops->read;--------------------调用的read是seq_read
    spin_unlock(&pde->pde_unload_lock);

    /* Call through the copied pointer, outside the spinlock. */
    if (read)
        rv = read(file, buf, count, ppos);

    pde_users_dec(pde);
    return rv;
}

所以本案例中，proc文件系统最终调用的read是seq_read，open是slabinfo_open。当然，这个跟具体案例相关，因为proc层的封装到proc_reg_read为止，函数proc_reg_read是vfs read进入proc的入口。看linux源码的时候，不能只见树木，不见森林，要有清晰的层次概念。后面的实现与具体的proc文件相关，不同的层次完成不同的功能是linux的设计哲学。seq_read是对seq类文件的封装，实现在seq_file.c中，主要逻辑如下：

ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
{
    struct seq_file *m = (struct seq_file *)file->private_data;
。。。。。
    p = m->op->start(m, &pos);
    while (1) {
        err = PTR_ERR(p);
        if (!p || IS_ERR(p))
            break;
        err = m->op->show(m, p);
        if (err < 0)
            break;
        if (unlikely(err))
            m->count = 0;
        if (unlikely(!m->count)) {
            p = m->op->next(m, p, &pos);
            m->index = pos;
            continue;
        }
        if (m->count < m->size)
            goto Fill;
        m->op->stop(m, p);
        kfree(m->buf);
        m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
        if (!m->buf)
            goto Enomem;
        m->count = 0;
        m->version = 0;
        pos = m->index;
        p = m->op->start(m, &pos);
    }
    m->op->stop(m, p);
    m->count = 0;
    goto Done;
Fill:

ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
{
    struct seq_file *m = (struct seq_file *)file->private_data;
。。。。。
    p = m->op->start(m, &pos);------------标准的start
    while (1) {
        err = PTR_ERR(p);
        if (!p || IS_ERR(p))
            break;
        err = m->op->show(m, p);
        if (err < 0)
            break;
        if (unlikely(err))
            m->count = 0;
        if (unlikely(!m->count)) {
            p = m->op->next(m, p, &pos);----next动作
            m->index = pos;
            continue;
        }
        if (m->count < m->size)
            goto Fill;
        m->op->stop(m, p);----stop动作
        kfree(m->buf);
        m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
        if (!m->buf)
            goto Enomem;
        m->count = 0;
        m->version = 0;
        pos = m->index;
        p = m->op->start(m, &pos);-----循环start
    }
    m->op->stop(m, p);
    m->count = 0;
    goto Done;
Fill:

crash> struct file.private_data ffff880c9ddbba80
  private_data = 0xffff8817f3eee740
crash> struct seq_file 0xffff8817f3eee740
struct seq_file {
  buf = 0xffff8817f4448000 "size-65536(DMA)        0      0  65536    1   16 : tunables    8    4    0 : slabdata      0      0      0\nsize-65536            19     19  65536    1   16 : tunables    8    4    0 : slabdata     19     19      0\nsize-32768(DMA)        0      0  32768    "...,
  size = 4096,
  from = 963,
  count = 0,
  index = 189,
  read_pos = 20480,
  version = 0,
  lock = {
    count = {
      counter = 0
    },
    wait_lock = {
      raw_lock = {
        slock = 0
      }
    },
    wait_list = {
      next = 0xffff8817f3eee780,
      prev = 0xffff8817f3eee780
    },
    owner = 0xffff8817fc9e0000
  },
  op = 0xffffffff8161cea0 <proc_slabinfo_operations+192>,
  private = 0x0
}
crash> struct seq_file
struct seq_file {
    char *buf;
    size_t size;
    size_t from;
    size_t count;
    loff_t index;
    loff_t read_pos;
    u64 version;
    struct mutex lock;
    const struct seq_operations *op;
    void *private;
}
SIZE: 104
crash> struct seq_operations 0xffffffff8161cea0
struct seq_operations {
  start = 0xffffffff81165680 <s_start>,
  stop = 0xffffffff811652f0 <s_stop>,
  next = 0xffffffff81165660 <s_next>,
  show = 0xffffffff81166420 <s_show>
}

有seq_file，一般就会有对应的seq_operations，上面就是本案例中seq_operations里各个函数的情况。由于后面已经不是proc文件系统层的范畴，本文到此结束。

posted on 2018-08-22 14:31 _备忘录阅读(843) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

安庆

导航

公告

用crash来分析一下proc的文件访问