inode缓存与dentry缓存
1. inode缓存
1: struct inode {
2: /* RCU path lookup touches following: */
3: umode_t i_mode;
4: uid_t i_uid;
5: gid_t i_gid;
6: const struct inode_operations *i_op;
7: struct super_block *i_sb;
8:
9: spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
10: unsigned int i_flags;
11: unsigned long i_state;
12: #ifdef CONFIG_SECURITY
13: void *i_security;
14: #endif
15: struct mutex i_mutex;
16:
17:
18: unsigned long dirtied_when; /* jiffies of first dirtying */
19:
20: struct hlist_node i_hash;
21: struct list_head i_wb_list; /* backing dev IO list */
22: struct list_head i_lru; /* inode LRU list */
23: struct list_head i_sb_list;
24: union {
25: struct list_head i_dentry;
26: struct rcu_head i_rcu;
27: };
28: unsigned long i_ino;
29: atomic_t i_count;
30: unsigned int i_nlink;
31: dev_t i_rdev;
32: unsigned int i_blkbits;
33: u64 i_version;
34: loff_t i_size;
35: #ifdef __NEED_I_SIZE_ORDERED
36: seqcount_t i_size_seqcount;
37: #endif
38: struct timespec i_atime;
39: struct timespec i_mtime;
40: struct timespec i_ctime;
41: blkcnt_t i_blocks;
42: unsigned short i_bytes;
43: struct rw_semaphore i_alloc_sem;
44: const struct file_operations *i_fop; /* former ->i_op->default_file_ops */
45: struct file_lock *i_flock;
46: struct address_space *i_mapping;
47: struct address_space i_data;
48: #ifdef CONFIG_QUOTA
49: struct dquot *i_dquot[MAXQUOTAS];
50: #endif
51: struct list_head i_devices;
52: union {
53: struct pipe_inode_info *i_pipe;
54: struct block_device *i_bdev;
55: struct cdev *i_cdev;
56: };
57:
58: __u32 i_generation;
59:
60: #ifdef CONFIG_FSNOTIFY
61: __u32 i_fsnotify_mask; /* all events this inode cares about */
62: struct hlist_head i_fsnotify_marks;
63: #endif
64:
65: #ifdef CONFIG_IMA
66: atomic_t i_readcount; /* struct files open RO */
67: #endif
68: atomic_t i_writecount;
69: #ifdef CONFIG_FS_POSIX_ACL
70: struct posix_acl *i_acl;
71: struct posix_acl *i_default_acl;
72: #endif
73: void *i_private; /* fs or device private pointer */
74: };
inode可能处于三种状态:
1)unused,里面没有保存有效的内容,可以被复用为新的用途;
2)in use,正在被使用,其成员i_count以及i_nlink一定大于0,此时inode与文件系统或者说设备上的文件相关联,但是自从上次与设备同步后,内容没有发生改变,即不是dirty的;
3)dirty,inode里面的内容已经与文件系统中的文件内容不一致了,即脏了,需要进行文件同步操作。
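从上面的结构体成员可以看出inode所在的几个链表:unused状态的inode通过i_lru挂在一个全局的LRU链表中;dirty状态的inode通过i_wb_list挂在后备设备(backing device)的回写IO链表中;此外,每个inode都通过i_sb_list挂在其所属super_block结构体的s_inodes链表中。(在更早的内核版本中,前两种状态的inode各自位于一个全局的链表中,而dirty的inode位于super_block结构体中的一个链表中。)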
先看inode结构体中的一个成员:
struct list_head i_lru; /* inode LRU list */
对应着一个全局的链表:
static LIST_HEAD(inode_lru);
static DEFINE_SPINLOCK(inode_lru_lock);
1: /*
2: * Called when we're dropping the last reference
3: * to an inode.
4: *
5: * Call the FS "drop_inode()" function, defaulting to
6: * the legacy UNIX filesystem behaviour. If it tells
7: * us to evict inode, do so. Otherwise, retain inode
8: * in cache if fs is alive, sync and evict if fs is
9: * shutting down.
10: */
11: static void iput_final(struct inode *inode)
12: {
13: struct super_block *sb = inode->i_sb;
14: const struct super_operations *op = inode->i_sb->s_op;
15: int drop;
16:
17: WARN_ON(inode->i_state & I_NEW);
18:
19: if (op && op->drop_inode)
20: drop = op->drop_inode(inode);
21: else
22: drop = generic_drop_inode(inode);
23:
24: if (!drop && (sb->s_flags & MS_ACTIVE)) {
25: inode->i_state |= I_REFERENCED;
26: if (!(inode->i_state & (I_DIRTY|I_SYNC)))
27: inode_lru_list_add(inode);
28: spin_unlock(&inode->i_lock);
29: return;
30: }
31:
32: if (!drop) {
33: inode->i_state |= I_WILL_FREE;
34: spin_unlock(&inode->i_lock);
35: write_inode_now(inode, 1);
36: spin_lock(&inode->i_lock);
37: WARN_ON(inode->i_state & I_NEW);
38: inode->i_state &= ~I_WILL_FREE;
39: }
40:
41: inode->i_state |= I_FREEING;
42: inode_lru_list_del(inode);
43: spin_unlock(&inode->i_lock);
44:
45: evict(inode);
46: }
函数iput_final在inode的最后一个引用被释放时(即inode不再被任何地方引用、变成unused状态时)被调用,由它决定是将inode保留在缓存中,还是将其回收。
if (op && op->drop_inode)
drop = op->drop_inode(inode);
else
drop = generic_drop_inode(inode);
drop非0时,表示i_nlink为0,或者inode已经不在inode_hashtable的哈希链表中(inode_unhashed),即这个inode可以被释放掉;drop为0时,表示inode仍然有效(i_nlink不为0并且仍在哈希表中),应当尽量保留在缓存中。
1: /*
2: * Normal UNIX filesystem behaviour: delete the
3: * inode when the usage count drops to zero, and
4: * i_nlink is zero.
5: */
6: int generic_drop_inode(struct inode *inode)
7: {
8: return !inode->i_nlink || inode_unhashed(inode);
9: }
10: EXPORT_SYMBOL_GPL(generic_drop_inode);
if (!drop && (sb->s_flags & MS_ACTIVE)) {
inode->i_state |= I_REFERENCED;
if (!(inode->i_state & (I_DIRTY|I_SYNC)))
inode_lru_list_add(inode);
spin_unlock(&inode->i_lock);
return;
}
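如果drop为0,并且superblock仍处于活动状态(s_flags中设置了MS_ACTIVE),就给inode设置I_REFERENCED标志;若inode既不是脏的、也没有正在同步(未设置I_DIRTY或I_SYNC),则调用inode_lru_list_add将inode添加到unused列表中,即将inode缓存起来,然后直接返回。
否则,如果drop为0但superblock已经不再活动,就先调用write_inode_now将inode写回到磁盘上;随后(drop非0时则直接走到这一步)设置I_FREEING标志,调用inode_lru_list_del将inode从LRU缓存链表中摘除,最后调用evict函数将inode彻底删除。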
1: static void inode_lru_list_add(struct inode *inode)
2: {
3: spin_lock(&inode_lru_lock);
4: if (list_empty(&inode->i_lru)) {
5: list_add(&inode->i_lru, &inode_lru);
6: inodes_stat.nr_unused++;
7: }
8: spin_unlock(&inode_lru_lock);
9: }
因此inode_lru就是全局的unused inode列表,通过“Least Recently Used”的顺序保存。
此外,操作inode_lru的函数还有prune_icache
1: /*
2: * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
3: * temporary list and then are freed outside inode_lru_lock by dispose_list().
4: *
5: * Any inodes which are pinned purely because of attached pagecache have their
6: * pagecache removed. If the inode has metadata buffers attached to
7: * mapping->private_list then try to remove them.
8: *
9: * If the inode has the I_REFERENCED flag set, then it means that it has been
10: * used recently - the flag is set in iput_final(). When we encounter such an
11: * inode, clear the flag and move it to the back of the LRU so it gets another
12: * pass through the LRU before it gets reclaimed. This is necessary because of
13: * the fact we are doing lazy LRU updates to minimise lock contention so the
14: * LRU does not have strict ordering. Hence we don't want to reclaim inodes
15: * with this flag set because they are the inodes that are out of order.
16: */
17: static void prune_icache(int nr_to_scan)
18: {
19: LIST_HEAD(freeable);
20: int nr_scanned;
21: unsigned long reap = 0;
22:
23: down_read(&iprune_sem);
24: spin_lock(&inode_lru_lock);
25: for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
26: struct inode *inode;
27:
28: if (list_empty(&inode_lru))
29: break;
30:
31: inode = list_entry(inode_lru.prev, struct inode, i_lru);
32:
33: /*
34: * we are inverting the inode_lru_lock/inode->i_lock here,
35: * so use a trylock. If we fail to get the lock, just move the
36: * inode to the back of the list so we don't spin on it.
37: */
38: if (!spin_trylock(&inode->i_lock)) {
39: list_move(&inode->i_lru, &inode_lru);
40: continue;
41: }
42:
43: /*
44: * Referenced or dirty inodes are still in use. Give them
45: * another pass through the LRU as we canot reclaim them now.
46: */
47: if (atomic_read(&inode->i_count) ||
48: (inode->i_state & ~I_REFERENCED)) {
49: list_del_init(&inode->i_lru);
50: spin_unlock(&inode->i_lock);
51: inodes_stat.nr_unused--;
52: continue;
53: }
54:
55: /* recently referenced inodes get one more pass */
56: if (inode->i_state & I_REFERENCED) {
57: inode->i_state &= ~I_REFERENCED;
58: list_move(&inode->i_lru, &inode_lru);
59: spin_unlock(&inode->i_lock);
60: continue;
61: }
62: if (inode_has_buffers(inode) || inode->i_data.nrpages) {
63: __iget(inode);
64: spin_unlock(&inode->i_lock);
65: spin_unlock(&inode_lru_lock);
66: if (remove_inode_buffers(inode))
67: reap += invalidate_mapping_pages(&inode->i_data,
68: 0, -1);
69: iput(inode);
70: spin_lock(&inode_lru_lock);
71:
72: if (inode != list_entry(inode_lru.next,
73: struct inode, i_lru))
74: continue; /* wrong inode or list_empty */
75: /* avoid lock inversions with trylock */
76: if (!spin_trylock(&inode->i_lock))
77: continue;
78: if (!can_unuse(inode)) {
79: spin_unlock(&inode->i_lock);
80: continue;
81: }
82: }
83: WARN_ON(inode->i_state & I_NEW);
84: inode->i_state |= I_FREEING;
85: spin_unlock(&inode->i_lock);
86:
87: list_move(&inode->i_lru, &freeable);
88: inodes_stat.nr_unused--;
89: }
90: if (current_is_kswapd())
91: __count_vm_events(KSWAPD_INODESTEAL, reap);
92: else
93: __count_vm_events(PGINODESTEAL, reap);
94: spin_unlock(&inode_lru_lock);
95:
96: dispose_list(&freeable);
97: up_read(&iprune_sem);
98: }
该函数的作用是在内存压力较大时,通过缩减缓存的inode列表inode_lru以释放出更多的内存。
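该函数每次从inode_lru的尾部(inode_lru.prev,即最久未被使用的一端)取出一个inode,做一些简单检查:如果暂时无法获取i_lock,或者inode带有I_REFERENCED标志(刚刚被引用过),就用list_move将该inode移回链表的头部,让它在LRU中再多停留一轮;如果inode的引用计数非0,或者i_state中除I_REFERENCED之外还设置了其他标志(例如脏、正在同步),说明它仍在使用,就用list_del_init将其从LRU链表中摘除;然后检查下一个inode。通过全部检查的inode会被设置I_FREEING标志并移到临时链表freeable中,最后由dispose_list统一释放。
使得inode本轮不被回收的原因包括:无法获取到保护inode中数据的锁i_lock;inode中的数据是脏的;inode的使用计数非0;inode刚刚被引用过等等。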
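还有一个比较实际的问题:iput_final只有在inode的最后一个引用被释放时才会被调用,此时如果i_nlink不为0并且inode仍在哈希表中(drop为0),就将其放到缓存inode_lru中;如果i_nlink为0或者inode已经不在哈希表中(drop非0),则直接调用evict将其删除。而在prune_icache时,同样会检查i_count引用计数是否为0。
这也就是说,如果一个inode对应的磁盘文件已经被删除了(i_nlink为0),但是还有进程对其进行操作(i_count大于0)的话,那么它不会被直接删除,而是继续保留在内存中,也就是说对其操作的进程仍然可以对已经缓存下来的数据页面page进行操作。
直到没有进程再对其进行操作了(最后一个引用被释放,进入iput_final),它才会被真正删除。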
inode中还有两个链表节点成员,分别是i_sb_list和i_wb_list,其中i_sb_list用于将inode链入super_block->s_inodes列表,而i_wb_list用于将inode链入后备设备(backing device)的回写IO列表。
2. dentry缓存
dentry缓存的目的,是为了减少对慢速磁盘的访问:每当VFS对底层的数据进行访问时,都会将访问的结果缓存下来,保存成一个dentry对象。
而且dentry对象的组织与管理,是和inode缓存极其相似的,也有一个hash表,和一个lru队列。
而且当内存压力较大时,也会调用prune_dcache来企图释放lru中优先级较低的dentry项目。
区别在于,inode是不需要维护目录的关系的,但是dentry需要,因此dentry的组织比inode要复杂。
1: static struct hlist_bl_head *dentry_hashtable __read_mostly;
2:
在super_block中
1: /* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */
2: struct list_head s_dentry_lru; /* unused dentry lru */
3:
因此,dentry的hash表(dentry_hashtable)是全局的,而保存unused dentry的LRU链表(s_dentry_lru)则位于各个super_block数据结构中。
1: /*
2: * dentry_lru_(add|del|move_tail) must be called with d_lock held.
3: */
4: static void dentry_lru_add(struct dentry *dentry)
5: {
6: if (list_empty(&dentry->d_lru)) {
7: spin_lock(&dcache_lru_lock);
8: list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
9: dentry->d_sb->s_nr_dentry_unused++;
10: dentry_stat.nr_unused++;
11: spin_unlock(&dcache_lru_lock);
12: }
13: }
dentry_lru_add函数用于向dentry缓存(所属super_block的s_dentry_lru链表)中添加一个不再被使用的dentry,它被函数dput调用。
1: /*
2: * This is dput
3: *
4: * This is complicated by the fact that we do not want to put
5: * dentries that are no longer on any hash chain on the unused
6: * list: we'd much rather just get rid of them immediately.
7: *
8: * However, that implies that we have to traverse the dentry
9: * tree upwards to the parents which might _also_ now be
10: * scheduled for deletion (it may have been only waiting for
11: * its last child to go away).
12: *
13: * This tail recursion is done by hand as we don't want to depend
14: * on the compiler to always get this right (gcc generally doesn't).
15: * Real recursion would eat up our stack space.
16: */
17:
18: /*
19: * dput - release a dentry
20: * @dentry: dentry to release
21: *
22: * Release a dentry. This will drop the usage count and if appropriate
23: * call the dentry unlink method as well as removing it from the queues and
24: * releasing its resources. If the parent dentries were scheduled for release
25: * they too may now get deleted.
26: */
27: void dput(struct dentry *dentry)
28: {
29: if (!dentry)
30: return;
31:
32: repeat:
33: if (dentry->d_count == 1)
34: might_sleep();
35: spin_lock(&dentry->d_lock);
36: BUG_ON(!dentry->d_count);
37: if (dentry->d_count > 1) {
38: dentry->d_count--;
39: spin_unlock(&dentry->d_lock);
40: return;
41: }
42:
43: if (dentry->d_flags & DCACHE_OP_DELETE) {
44: if (dentry->d_op->d_delete(dentry))
45: goto kill_it;
46: }
47:
48: /* Unreachable? Get rid of it */
49: if (d_unhashed(dentry))
50: goto kill_it;
51:
52: /* Otherwise leave it cached and ensure it's on the LRU */
53: dentry->d_flags |= DCACHE_REFERENCED;
54: dentry_lru_add(dentry);
55:
56: dentry->d_count--;
57: spin_unlock(&dentry->d_lock);
58: return;
59:
60: kill_it:
61: dentry = dentry_kill(dentry, 1);
62: if (dentry)
63: goto repeat;
64: }
65: EXPORT_SYMBOL(dput);
所有的dentry实例会形成一个网络,用于反映文件系统的结构。
d_subdirs成员,里面保存着所有的子目录以及该目录下的文件组成的列表。
d_child成员,是该dentry链接到其父目录的dentry节点的锚点。
这两个成员,是构成文件系统的层次结构的基本设施。
if (dentry->d_count == 1)
might_sleep();
参考:http://yuxu9710108.blog.163.com/blog/static/23751534201011715413404/
用于调试时,提示atomic context的可能睡眠情况。
分析dput函数的逻辑:
如果dentry的引用计数大于1,那么代表还有其他的地方在使用这个dentry,因此只减少引用计数,直接返回;
如果dentry->d_flags里面设置了DCACHE_OP_DELETE标志,那么先调用d_op->d_delete函数指针,由具体文件系统判断该dentry是否应当被删除;如果d_delete返回非0,就跳转到kill_it,调用dentry_kill来处理;
【d_op->d_delete与dentry_kill在功能上有什么不同?】
如果在全局的hash表中也已经找不到该dentry了(d_unhashed),那么直接调用dentry_kill来处理;
如果dentry的引用计数为1,而且也不属于上面两种需要调用dentry_kill的情况,那么就给它设置DCACHE_REFERENCED标志,调用dentry_lru_add将其缓存在super_block的LRU队列中,然后将引用计数减1。
我们看一种可能的d_delete的实现
1: /*
2: * This is called from dput() when d_count is going to 0.
3: */
4: static int nfs_dentry_delete(const struct dentry *dentry)
5: {
6: dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n",
7: dentry->d_parent->d_name.name, dentry->d_name.name,
8: dentry->d_flags);
9:
10: /* Unhash any dentry with a stale inode */
11: if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode))
12: return 1;
13:
14: if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
15: /* Unhash it, so that ->d_iput() would be called */
16: return 1;
17: }
18: if (!(dentry->d_sb->s_flags & MS_ACTIVE)) {
19: /* Unhash it, so that ancestors of killed async unlink
20: * files will be cleaned up during umount */
21: return 1;
22: }
23: return 0;
24:
25: }
可见,该函数是进行一些内部的判断,决定是否需要将该dentry从全局的hash表中删除掉。
if (dentry->d_flags & DCACHE_OP_DELETE) {
if (dentry->d_op->d_delete(dentry))
goto kill_it;
}
1: /*
2: * Finish off a dentry we've decided to kill.
3: * dentry->d_lock must be held, returns with it unlocked.
4: * If ref is non-zero, then decrement the refcount too.
5: * Returns dentry requiring refcount drop, or NULL if we're done.
6: */
7: static inline struct dentry *dentry_kill(struct dentry *dentry, int ref)
8: __releases(dentry->d_lock)
9: {
10: struct inode *inode;
11: struct dentry *parent;
12:
13: inode = dentry->d_inode;
14: if (inode && !spin_trylock(&inode->i_lock)) {
15: relock:
16: spin_unlock(&dentry->d_lock);
17: cpu_relax();
18: return dentry; /* try again with same dentry */
19: }
20: if (IS_ROOT(dentry))
21: parent = NULL;
22: else
23: parent = dentry->d_parent;
24: if (parent && !spin_trylock(&parent->d_lock)) {
25: if (inode)
26: spin_unlock(&inode->i_lock);
27: goto relock;
28: }
29:
30: if (ref)
31: dentry->d_count--;
32: /* if dentry was on the d_lru list delete it from there */
33: dentry_lru_del(dentry);
34: /* if it was on the hash then remove it */
35: __d_drop(dentry);
36: return d_kill(dentry, parent);
37: }