PROC文件系统
1. seq_file
参考: http://blog.chinaunix.net/uid-26084833-id-1754437.html
seq_file的结构体定义:
1: struct seq_file {
2: char *buf;
3: size_t size;
4: size_t from;
5: size_t count;
6: loff_t index;
7: loff_t read_pos;
8: u64 version;
9: struct mutex lock;
10: const struct seq_operations *op;
11: void *private;
12: };
seq_operations的定义:
1: struct seq_operations {
2: void * (*start) (struct seq_file *m, loff_t *pos);
3: void (*stop) (struct seq_file *m, void *v);
4: void * (*next) (struct seq_file *m, void *v, loff_t *pos);
5: int (*show) (struct seq_file *m, void *v);
6: };
start函数
用于指定seq_file文件的读开始位置,返回实际读开始位置,如果指定的位置超过文件末尾,应当返回NULL,start函数可以有一个特殊的返回SEQ_START_TOKEN,它用于让show函数输出文件头,但这只能在pos为0时使用;
next函数
用于把seq_file 文件的当前读位置移动到下一个读位置,返回实际的下一个读位置,如果已经到达文件末尾,返回NULL;
stop函数
用于在读完seq_file文件后调用,它类似于文件操作close,用于做一些必要的清理,如释放内存等;
show函数
用于格式化输出,如果成功返回0,否则返回出错码。
我们查看一下用来打印/proc/mounts信息对应的seq_file操作函数:
1: const struct seq_operations mounts_op = {
2: .start = m_start,
3: .next = m_next,
4: .stop = m_stop,
5: .show = show_vfsmnt
6: };
依次来看各个函数的实现:
1: static void *m_start(struct seq_file *m, loff_t *pos)
2: {
3: struct proc_mounts *p = m->private;
4:
5: down_read(&namespace_sem);
6: return seq_list_start(&p->ns->list, *pos);
7: }
8:
down_read(&namespace_sem);
用来以读者(共享)方式获取读写信号量namespace_sem,表示当前正在读取namespace相关的信息;多个读者可以同时持有,但会与写者互斥。
1: static struct list_head *mount_hashtable __read_mostly;
2: static struct kmem_cache *mnt_cache __read_mostly;
3: static struct rw_semaphore namespace_sem;
namespace_sem用来保护对mount_hashtable的并发读写。
struct proc_mounts *p = m->private;
这里可以知道,给mountinfo使用的seq_file的成员private用来保存proc_mounts结构体指针。
1: struct proc_mounts {
2: struct seq_file m; /* must be the first element */
3: struct mnt_namespace *ns;
4: struct path root;
5: int event;
6: };
顾名思义,proc_mounts保存的是我们想要的/proc/mounts信息的数据结构。
那么,proc_mounts结构体中的数据是从哪里得到的呢?
1: static int mounts_open_common(struct inode *inode, struct file *file,
2: const struct seq_operations *op)
3: {
4: struct task_struct *task = get_proc_task(inode);
5: struct nsproxy *nsp;
6: struct mnt_namespace *ns = NULL;
7: struct path root;
8: struct proc_mounts *p;
9: int ret = -EINVAL;
10:
11: if (task) {
12: rcu_read_lock();
13: nsp = task_nsproxy(task);
14: if (nsp) {
15: ns = nsp->mnt_ns;
16: if (ns)
17: get_mnt_ns(ns);
18: }
19: rcu_read_unlock();
20: if (ns && get_task_root(task, &root) == 0)
21: ret = 0;
22: put_task_struct(task);
23: }
24:
25: if (!ns)
26: goto err;
27: if (ret)
28: goto err_put_ns;
29:
30: ret = -ENOMEM;
31: p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
32: if (!p)
33: goto err_put_path;
34:
35: file->private_data = &p->m;
36: ret = seq_open(file, op);
37: if (ret)
38: goto err_free;
39:
40: p->m.private = p;
41: p->ns = ns;
42: p->root = root;
43: p->event = ns->event;
44:
45: return 0;
46:
47: err_free:
48: kfree(p);
49: err_put_path:
50: path_put(&root);
51: err_put_ns:
52: put_mnt_ns(ns);
53: err:
54: return ret;
55: }
首先看到这段代码
p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
if (!p)
goto err_put_path;
file->private_data = &p->m;
ret = seq_open(file, op);
if (ret)
goto err_free;
p->m.private = p;
p->ns = ns;
p->root = root;
p->event = ns->event;
可以确定,proc_mounts结构体是在这里创建并且初始化的。其中最重要的数据ns是怎么来的呢?
if (task) {
rcu_read_lock();
nsp = task_nsproxy(task);
if (nsp) {
ns = nsp->mnt_ns;
if (ns)
get_mnt_ns(ns);
}
rcu_read_unlock();
if (ns && get_task_root(task, &root) == 0)
ret = 0;
put_task_struct(task);
}
1: /*
2: * A structure to contain pointers to all per-process
3: * namespaces - fs (mount), uts, network, sysvipc, etc.
4: *
5: * 'count' is the number of tasks holding a reference.
6: * The count for each namespace, then, will be the number
7: * of nsproxies pointing to it, not the number of tasks.
8: *
9: * The nsproxy is shared by tasks which share all namespaces.
10: * As soon as a single namespace is cloned or unshared, the
11: * nsproxy is copied.
12: */
13: struct nsproxy {
14: atomic_t count;
15: struct uts_namespace *uts_ns;
16: struct ipc_namespace *ipc_ns;
17: struct mnt_namespace *mnt_ns;
18: struct pid_namespace *pid_ns;
19: struct net *net_ns;
20: };
那么是哪里调到了mounts_open_common函数呢?
1: static int mounts_open(struct inode *inode, struct file *file)
2: {
3: return mounts_open_common(inode, file, &mounts_op);
4: }
5:
6: static const struct file_operations proc_mounts_operations = {
7: .open = mounts_open,
8: .read = seq_read,
9: .llseek = seq_lseek,
10: .release = mounts_release,
11: .poll = mounts_poll,
12: };
我们又在fs/proc/base.c中有了大发现:
1: static const struct pid_entry tgid_base_stuff[] = {
2: DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
3: DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
4: DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
5: DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
6: #ifdef CONFIG_NET
7: DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
8: #endif
9: REG("environ", S_IRUSR, proc_environ_operations),
10: INF("auxv", S_IRUSR, proc_pid_auxv),
11: ONE("status", S_IRUGO, proc_pid_status),
12: ONE("personality", S_IRUGO, proc_pid_personality),
13: INF("limits", S_IRUGO, proc_pid_limits),
14: #ifdef CONFIG_SCHED_DEBUG
15: REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
16: #endif
17: #ifdef CONFIG_SCHED_AUTOGROUP
18: REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
19: #endif
20: REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
21: #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
22: INF("syscall", S_IRUGO, proc_pid_syscall),
23: #endif
24: INF("cmdline", S_IRUGO, proc_pid_cmdline),
25: ONE("stat", S_IRUGO, proc_tgid_stat),
26: ONE("statm", S_IRUGO, proc_pid_statm),
27: REG("maps", S_IRUGO, proc_maps_operations),
28: #ifdef CONFIG_NUMA
29: REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
30: #endif
31: REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
32: LNK("cwd", proc_cwd_link),
33: LNK("root", proc_root_link),
34: LNK("exe", proc_exe_link),
35: REG("mounts", S_IRUGO, proc_mounts_operations),
36: REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
37: REG("mountstats", S_IRUSR, proc_mountstats_operations),
38: #ifdef CONFIG_PROC_PAGE_MONITOR
39: REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
40: REG("smaps", S_IRUGO, proc_smaps_operations),
41: REG("pagemap", S_IRUGO, proc_pagemap_operations),
42: #endif
43: #ifdef CONFIG_SECURITY
44: DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
45: #endif
46: #ifdef CONFIG_KALLSYMS
47: INF("wchan", S_IRUGO, proc_pid_wchan),
48: #endif
49: #ifdef CONFIG_STACKTRACE
50: ONE("stack", S_IRUGO, proc_pid_stack),
51: #endif
52: #ifdef CONFIG_SCHEDSTATS
53: INF("schedstat", S_IRUGO, proc_pid_schedstat),
54: #endif
55: #ifdef CONFIG_LATENCYTOP
56: REG("latency", S_IRUGO, proc_lstats_operations),
57: #endif
58: #ifdef CONFIG_PROC_PID_CPUSET
59: REG("cpuset", S_IRUGO, proc_cpuset_operations),
60: #endif
61: #ifdef CONFIG_CGROUPS
62: REG("cgroup", S_IRUGO, proc_cgroup_operations),
63: #endif
64: INF("oom_score", S_IRUGO, proc_oom_score),
65: REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
66: REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
67: #ifdef CONFIG_AUDITSYSCALL
68: REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
69: REG("sessionid", S_IRUGO, proc_sessionid_operations),
70: #endif
71: #ifdef CONFIG_FAULT_INJECTION
72: REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
73: #endif
74: #ifdef CONFIG_ELF_CORE
75: REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
76: #endif
77: #ifdef CONFIG_TASK_IO_ACCOUNTING
78: INF("io", S_IRUSR, proc_tgid_io_accounting),
79: #endif
80: #ifdef CONFIG_HARDWALL
81: INF("hardwall", S_IRUGO, proc_pid_hardwall),
82: #endif
83: };
这里定义着在每个/proc/[pid]下面的所有目录项
那么/proc/mounts呢,我们查看一下/proc/mounts的信息:
1: #ls -l /proc
2: ......
3: lrwxrwxrwx 1 root root 11 2014-01-26 22:11 mounts -> self/mounts
4: ......
5: lrwxrwxrwx 1 root root 64 2014-01-23 01:22 self -> 10590
6: ......
因此,一切都明了了,/proc/mounts其实是到当前任务的/proc/self/mounts的软链接。
proc_mounts的数据源头,以及生成数据的调用层次问题已经找到了答案,接下来再回过头来看看seq_file。
return seq_list_start(&p->ns->list, *pos);
1: struct list_head *seq_list_start(struct list_head *head, loff_t pos)
2: {
3: struct list_head *lh;
4:
5: list_for_each(lh, head)
6: if (pos-- == 0)
7: return lh;
8:
9: return NULL;
10: }
11: EXPORT_SYMBOL(seq_list_start);
其实很简单,就是返回到双链表head的第pos项的位置指针。如果pos超出了head双链表中的项目数目,就返回NULL。
可见,这是为了读取seq_file中的内容做准备。
对于m_next和m_stop的逻辑也很简单,不再详述。
1: static void *m_next(struct seq_file *m, void *v, loff_t *pos)
2: {
3: struct proc_mounts *p = m->private;
4:
5: return seq_list_next(v, &p->ns->list, pos);
6: }
7:
8: static void m_stop(struct seq_file *m, void *v)
9: {
10: up_read(&namespace_sem);
11: }
1: struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
2: {
3: struct list_head *lh;
4:
5: lh = ((struct list_head *)v)->next;
6: ++*ppos;
7: return lh == head ? NULL : lh;
8: }
9: EXPORT_SYMBOL(seq_list_next);
总结一下,就是m_start/m_next向外界暴露proc_mounts->ns->list的位置指针,允许外界对其内容进行读取。
m_stop用来当读取结束后做清理工作,这里是恢复namespace_sem信号量。
显示函数
1: static int show_vfsmnt(struct seq_file *m, void *v)
2: {
3: struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
4: int err = 0;
5: struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
6:
7: if (mnt->mnt_sb->s_op->show_devname) {
8: err = mnt->mnt_sb->s_op->show_devname(m, mnt);
9: if (err)
10: goto out;
11: } else {
12: mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
13: }
14: seq_putc(m, ' ');
15: seq_path(m, &mnt_path, " \t\n\\");
16: seq_putc(m, ' ');
17: show_type(m, mnt->mnt_sb);
18: seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
19: err = show_sb_opts(m, mnt->mnt_sb);
20: if (err)
21: goto out;
22: show_mnt_opts(m, mnt);
23: if (mnt->mnt_sb->s_op->show_options)
24: err = mnt->mnt_sb->s_op->show_options(m, mnt);
25: seq_puts(m, " 0 0\n");
26: out:
27: return err;
28: }
从show函数来看,是将v指针指向的vfsmount结构体的mnt_list内容以一定的格式写到seq_file的buffer里面去。
结合m_start/m_next的实现可以知道,v实际上是p->ns->list链表中的一个节点,
即某个vfsmount结构体的mnt_list成员的地址,show函数通过list_entry由它还原出vfsmount指针。
接下来我们看一下,这些简单的功能(m_start/m_next/m_stop/show_vfsmnt)是怎样发挥作用的:
1: ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
2: {
3: struct seq_file *m = file->private_data;
4: size_t copied = 0;
5: loff_t pos;
6: size_t n;
7: void *p;
8: int err = 0;
9:
10: mutex_lock(&m->lock);
11:
12: /* Don't assume *ppos is where we left it */
13: if (unlikely(*ppos != m->read_pos)) {
14: m->read_pos = *ppos;
15: while ((err = traverse(m, *ppos)) == -EAGAIN)
16: ;
17: if (err) {
18: /* With prejudice... */
19: m->read_pos = 0;
20: m->version = 0;
21: m->index = 0;
22: m->count = 0;
23: goto Done;
24: }
25: }
26:
27: /*
28: * seq_file->op->..m_start/m_stop/m_next may do special actions
29: * or optimisations based on the file->f_version, so we want to
30: * pass the file->f_version to those methods.
31: *
32: * seq_file->version is just copy of f_version, and seq_file
33: * methods can treat it simply as file version.
34: * It is copied in first and copied out after all operations.
35: * It is convenient to have it as part of structure to avoid the
36: * need of passing another argument to all the seq_file methods.
37: */
38: m->version = file->f_version;
39: /* grab buffer if we didn't have one */
40: if (!m->buf) {
41: m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
42: if (!m->buf)
43: goto Enomem;
44: }
45: /* if not empty - flush it first */
46: if (m->count) {
47: n = min(m->count, size);
48: err = copy_to_user(buf, m->buf + m->from, n);
49: if (err)
50: goto Efault;
51: m->count -= n;
52: m->from += n;
53: size -= n;
54: buf += n;
55: copied += n;
56: if (!m->count)
57: m->index++;
58: if (!size)
59: goto Done;
60: }
61: /* we need at least one record in buffer */
62: pos = m->index;
63: p = m->op->start(m, &pos);
64: while (1) {
65: err = PTR_ERR(p);
66: if (!p || IS_ERR(p))
67: break;
68: err = m->op->show(m, p);
69: if (err < 0)
70: break;
71: if (unlikely(err))
72: m->count = 0;
73: if (unlikely(!m->count)) {
74: p = m->op->next(m, p, &pos);
75: m->index = pos;
76: continue;
77: }
78: if (m->count < m->size)
79: goto Fill;
80: m->op->stop(m, p);
81: kfree(m->buf);
82: m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
83: if (!m->buf)
84: goto Enomem;
85: m->count = 0;
86: m->version = 0;
87: pos = m->index;
88: p = m->op->start(m, &pos);
89: }
90: m->op->stop(m, p);
91: m->count = 0;
92: goto Done;
93: Fill:
94: /* they want more? let's try to get some more */
95: while (m->count < size) {
96: size_t offs = m->count;
97: loff_t next = pos;
98: p = m->op->next(m, p, &next);
99: if (!p || IS_ERR(p)) {
100: err = PTR_ERR(p);
101: break;
102: }
103: err = m->op->show(m, p);
104: if (m->count == m->size || err) {
105: m->count = offs;
106: if (likely(err <= 0))
107: break;
108: }
109: pos = next;
110: }
111: m->op->stop(m, p);
112: n = min(m->count, size);
113: err = copy_to_user(buf, m->buf, n);
114: if (err)
115: goto Efault;
116: copied += n;
117: m->count -= n;
118: if (m->count)
119: m->from = n;
120: else
121: pos++;
122: m->index = pos;
123: Done:
124: if (!copied)
125: copied = err;
126: else {
127: *ppos += copied;
128: m->read_pos += copied;
129: }
130: file->f_version = m->version;
131: mutex_unlock(&m->lock);
132: return copied;
133: Enomem:
134: err = -ENOMEM;
135: goto Done;
136: Efault:
137: err = -EFAULT;
138: goto Done;
139: }
seq_read,显然是用来读取文件内容的,但是其接口并不是seq_file,而是file,这就表明这个接口是把seq_file的实现细节隐藏在了该函数的内部,而对于外面来说,可以通过常用的struct file接口来调用该函数。
因此该函数起到了Adapter的作用。
下面这段是核心代码
pos = m->index;
p = m->op->start(m, &pos);
while (1) {
err = PTR_ERR(p);
if (!p || IS_ERR(p))
break;
err = m->op->show(m, p);
if (err < 0)
break;
if (unlikely(err))
m->count = 0;
if (unlikely(!m->count)) {
p = m->op->next(m, p, &pos);
m->index = pos;
continue;
}
if (m->count < m->size)
goto Fill;
m->op->stop(m, p);
kfree(m->buf);
m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
if (!m->buf)
goto Enomem;
m->count = 0;
m->version = 0;
pos = m->index;
p = m->op->start(m, &pos);
}
m->op->stop(m, p);
m->count = 0;
goto Done;
循环的控制逻辑取决于show的返回值err以及m->count:
err < 0, 代表出错,跳出循环;
err > 0, 代表跳过当前这条记录(m->count被清零),即调用m_next来读下一条;
err = 0, 代表show成功:如果m->count < m->size,说明这条记录已完整写入buffer,跳转到Fill继续填充;否则说明buffer已被写满(记录可能没有写完整),将buffer大小调整为原来的2倍,再从m->index处尝试重新读。
其中,m->count代表已经写到m->buf中的字节数目,m->size代表m->buf的总大小(初始为PAGE_SIZE,不够时翻倍)。