从buffer、cache区别看linux文件系统的内存管理
一、free命令的man手册说明
从这个描述可以看到,free命令的数据源主要是从/proc/meminfo文件读取
DESCRIPTION
free displays the total amount of free and used physical and swap memory in the system, as well as the buffers and caches used by the kernel. The information is gathered by parsing /proc/meminfo. The displayed columns are: total Total installed memory (MemTotal and SwapTotal in /proc/meminfo) used Used memory (calculated as total - free - buffers - cache) free Unused memory (MemFree and SwapFree in /proc/meminfo) shared Memory used (mostly) by tmpfs (Shmem in /proc/meminfo, available on kernels 2.6.32, displayed as zero if not available) buffers Memory used by kernel buffers (Buffers in /proc/meminfo) cache Memory used by the page cache and slabs (Cached and Slab in /proc/meminfo) buff/cache Sum of buffers and cache available Estimation of how much memory is available for starting new applications, without swapping. Unlike the data provided by the cache or free fields, this field takes into account page cache and also that not all reclaimable memory slabs will be reclaimed due to items being in use (MemAvailable in /proc/meminfo, available on kernels 3.14, emulated on kernels 2.6.27+, otherwise the same as free)
二、/proc/meminfo文件中信息的由来
tsecer@harry:head /proc/meminfo
MemTotal: XXX kB
MemFree: XXX kB
MemAvailable: XXX kB
Buffers: XXX kB
Cached: XXX kB
从代码中看,cache的数据是文件系统的页面减去所有块设备使用的页面。文件页面(NR_FILE_PAGES)这个比较容易理解:通俗的说,就是从文件中读取的数据;或者更准确的说,这些页面是文件的数据在内存中的缓存。
它们的特点是如果修改了可以写回到磁盘,如果未修改,丢弃之后可以再次从磁盘上读取。
总之,这些数据是用来提高文件的读写速度的,如果没有的话读写速度会变慢,但是不会有错误。
linux-3.12.6\fs\proc\meminfo.c
/*
 * Show handler for /proc/meminfo (excerpt; "//……" marks elided kernel code).
 *
 * The "Cached" value is computed here, not read from a single counter:
 * all file-backed pages (NR_FILE_PAGES) minus swap-cache pages and minus
 * block-device pages (i.bufferram) — the latter are reported separately
 * as "Buffers".
 */
static int meminfo_proc_show(struct seq_file *m, void *v)
{
//……
/* Fill i (struct sysinfo) with global RAM and swap counters. */
si_meminfo(&i);
si_swapinfo(&i);
//……
cached = global_page_state(NR_FILE_PAGES) -
total_swapcache_pages() - i.bufferram;
/* The counters are sampled without a common lock and may transiently
 * disagree; clamp a negative intermediate result to zero. */
if (cached < 0)
cached = 0;
//……
/*
* Tagged format, for easy grepping and expansion.
*/
seq_printf(m,
"MemTotal: %8lu kB\n"
"MemFree: %8lu kB\n"
"Buffers: %8lu kB\n"
"Cached: %8lu kB\n"
//……
K(i.totalram),
K(i.freeram),
K(i.bufferram),
K(cached),
//……
}
/*
 * si_meminfo - fill @val with system-wide memory statistics.
 *
 * bufferram comes from nr_blockdev_pages(): pages cached in the block
 * devices' own inode mappings. This is the value /proc/meminfo reports
 * as "Buffers" and that free(1) shows in the buffers column.
 */
void si_meminfo(struct sysinfo *val)
{
val->totalram = totalram_pages;
val->sharedram = 0; /* always reported as zero in this kernel version */
val->freeram = global_page_state(NR_FREE_PAGES);
val->bufferram = nr_blockdev_pages();
val->totalhigh = totalhigh_pages;
val->freehigh = nr_free_highpages();
val->mem_unit = PAGE_SIZE; /* all counts above are in units of one page */
}
/*
 * nr_blockdev_pages - total number of page-cache pages held by all
 * block devices.
 *
 * Walks the global list of block devices under bdev_lock and sums the
 * pages cached in each bdev inode's address space (bd_inode->i_mapping).
 * These pages hold raw device blocks — filesystem metadata and direct
 * device reads — rather than the contents of any regular file.
 */
long nr_blockdev_pages(void)
{
struct block_device *bdev;
long ret = 0;
spin_lock(&bdev_lock);
list_for_each_entry(bdev, &all_bdevs, bd_list) {
ret += bdev->bd_inode->i_mapping->nrpages;
}
spin_unlock(&bdev_lock);
return ret;
}
三、buffers的由来
绝大部分情况下,我们操作的都是文件的内容。但是对文件系统来说,除了文件内容本身,还有很多其它的元数据(meta data)。
例如,记录设备空闲扇区数量、空闲inode数量的控制信息,文件修改时间、文件名、文件夹中文件名、各个文件对应的block信息等。这些信息在打开文件、创建文件时也需要使用,它们位于块设备对应inode(bdev->bd_inode)的i_mapping中,例如代码下面是打开文件时可能触发的对设备页面的读取。
当然,还有一些其它的场景,例如直接把设备作为一个文件打开,但是这种通常不是很常见。
ext3_lookup>>ext3_iget>>__ext3_get_inode_loc>>sb_getblk>>__getblk>>__getblk_slow>>grow_buffers==>>grow_dev_page
linux-3.12.6\fs\buffer.c
/*
 * Create the page-cache page that contains the requested block.
 *
 * This is used purely for blockdev mappings.
 *
 * (Excerpt; "//……" marks elided kernel code.) Note the page is looked
 * up / created in the BLOCK DEVICE's own inode mapping
 * (bdev->bd_inode->i_mapping), not in any regular file's mapping —
 * which is exactly why such pages are counted by nr_blockdev_pages()
 * and show up as "Buffers" rather than "Cached".
 */
static int
grow_dev_page(struct block_device *bdev, sector_t block,
pgoff_t index, int size, int sizebits)
{
struct inode *inode = bdev->bd_inode;
struct page *page;
struct buffer_head *bh;
//……
page = find_or_create_page(inode->i_mapping, index, gfp_mask);
//……
}
从上面的代码也可以看到:所谓的buffers就是设备文件使用的页面cache,只是这些信息通常对文件系统的使用者不直接可见。例如,文件系统的使用者不会关心一个磁盘中inode的分配和使用情况,扇区如何管理等,这些信息不属于磁盘上的任何一个具体文件。所以内核做了抽象,为每个块设备定义了一个inode,从而使它可以容纳到已经存在的常规文件的cache机制中。或者更通俗的说,buffer和cache从实现上看,并没有区别,都是磁盘扇区的页面缓存,只是buffer是元数据内容,而cache则是用户可见的常规文件内容的页面缓存。
四、常规文件读写时的页面操作
1、当首次读取时
由于page_lru_base_type对文件页面返回LRU_INACTIVE_FILE,所以文件页面刚加入LRU时默认位于非活跃(inactive)队列。
do_generic_file_read>>add_to_page_cache_lru>>lru_cache_add_file>>__lru_cache_add>>__pagevec_lru_add==>>__pagevec_lru_add_fn
linux-3.12.6\mm\swap.c
/*
 * __pagevec_lru_add_fn - place one page on the LRU list it belongs to.
 *
 * The target list (file vs anon, active vs inactive, unevictable) is
 * chosen by page_lru() from the page's current flags; a freshly read
 * file page is not PageActive, so it lands on the inactive file list.
 */
static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
void *arg)
{
int file = page_is_file_cache(page);   /* file-backed vs anonymous */
int active = PageActive(page);
enum lru_list lru = page_lru(page);    /* which LRU list to use */
VM_BUG_ON(PageLRU(page));              /* must not already be on an LRU */
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, lru);
update_page_reclaim_stat(lruvec, file, active);
trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page));
}
2、page_lru的由来
lru所属的类型通过page_lru函数决定
linux-3.12.6\include\linux\mm_inline.h
/**
 * page_lru_base_type - which LRU list type should a page be on?
 * @page: the page to test
 *
 * Used for LRU list index arithmetic.
 *
 * Returns the base LRU type - file or anon - @page should be on.
 */
static inline enum lru_list page_lru_base_type(struct page *page)
{
/* The base type is always the INACTIVE variant; page_lru() adds
 * LRU_ACTIVE on top when the page is marked active. */
if (page_is_file_cache(page))
return LRU_INACTIVE_FILE;
return LRU_INACTIVE_ANON;
}
……
/**
 * page_lru - which LRU list should a page be on?
 * @page: the page to test
 *
 * Returns the LRU list a page should be on, as an index
 * into the array of LRU lists.
 */
static __always_inline enum lru_list page_lru(struct page *page)
{
enum lru_list lru;
if (PageUnevictable(page))
lru = LRU_UNEVICTABLE;
else {
/* Start from the inactive file/anon base type ... */
lru = page_lru_base_type(page);
/* ... and shift to the corresponding active list if the
 * page has the Active flag set. */
if (PageActive(page))
lru += LRU_ACTIVE;
}
return lru;
}
3、进入active队列
/*
 * do_generic_file_read - generic buffered-read path (excerpt; "……"
 * marks elided kernel code).
 *
 * The part quoted here shows where a read touches the LRU state:
 * mark_page_accessed() records activity on the page, which is what
 * eventually promotes it from the inactive to the active list.
 */
static void do_generic_file_read(struct file *filp, loff_t *ppos,
read_descriptor_t *desc, read_actor_t actor)
{
……
/*
 * When a sequential read accesses a page several times,
 * only mark it as accessed the first time.
 */
if (prev_index != index || offset != prev_offset)
mark_page_accessed(page);
……
}
linux-3.12.6\mm\swap.c
/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced -> inactive,referenced
 * inactive,referenced -> active,unreferenced
 * active,unreferenced -> active,referenced
 *
 * Two touches are needed to activate a page: the first access only
 * sets the Referenced flag (else-branch below); the second access
 * finds Referenced set and actually moves the page to the active LRU.
 */
void mark_page_accessed(struct page *page)
{
if (!PageActive(page) && !PageUnevictable(page) &&
PageReferenced(page)) {
/*
* If the page is on the LRU, queue it for activation via
* activate_page_pvecs. Otherwise, assume the page is on a
* pagevec, mark it active and it'll be moved to the active
* LRU on the next drain.
*/
if (PageLRU(page))
activate_page(page);
else
__lru_cache_activate_page(page);
ClearPageReferenced(page); /* restart the two-touch cycle */
} else if (!PageReferenced(page)) {
SetPageReferenced(page); /* first touch: remember, don't activate */
}
}
4、首次写入时
在触发写操作时,同样是在分配之后手动添加到lru列表中的。
/*
 * Find or create a page at the given pagecache position. Return the locked
 * page. This function is specifically for buffered writes.
 *
 * (Excerpt; "……" marks elided kernel code.) Shows that on a buffered
 * write the newly allocated page is explicitly inserted into both the
 * page cache and the LRU via add_to_page_cache_lru() — the same entry
 * point used by the read path.
 */
struct page *grab_cache_page_write_begin(struct address_space *mapping,
pgoff_t index, unsigned flags)
{
……
page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
if (!page)
return NULL;
status = add_to_page_cache_lru(page, mapping, index,
GFP_KERNEL & ~gfp_notmask);
……
}
五、共享内存读写时页面操作
共享内存(shmem)文件系统有一个特殊之处:它不像普通文件系统那样以磁盘等块设备作为后备存储,其数据就存放在内存中,这导致这些页面没有办法被写回到持久存储介质之后释放(只能换出到swap)。
1、shm页面的读取
/*
 * shmem_write_begin - ->write_begin hook for shmem/tmpfs files.
 *
 * Computes the page index from the file position and delegates to
 * shmem_getpage() with SGP_WRITE to find or allocate the page that
 * will receive the written data.
 */
static int
shmem_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
pgoff_t index = pos >> PAGE_CACHE_SHIFT; /* byte offset -> page index */
return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
}
shmem_getpage==>>shmem_getpage_gfp
/*
 * Like add_to_page_cache_locked, but error if expected item has gone.
 *
 * (Excerpt; "……" marks elided kernel code.) On success the page is
 * accounted under BOTH NR_FILE_PAGES and NR_SHMEM — which is why
 * /proc/meminfo's "Cached" value includes shared-memory pages.
 */
static int shmem_add_to_page_cache(struct page *page,
struct address_space *mapping,
pgoff_t index, gfp_t gfp, void *expected)
{
int error;
……
if (!error) {
mapping->nrpages++;
__inc_zone_page_state(page, NR_FILE_PAGES); /* counted in "Cached" */
__inc_zone_page_state(page, NR_SHMEM);      /* counted in "Shmem" */
spin_unlock_irq(&mapping->tree_lock);
} else {
/* Insertion failed: detach and drop the page reference. */
page->mapping = NULL;
spin_unlock_irq(&mapping->tree_lock);
page_cache_release(page);
}
return error;
}
2、shm页面的写回
在写页面时,shm的页面是同时加入到了NR_SHMEM和NR_FILE_PAGES,所以当考虑这个内容的时候其实是包括了这两部分内容。
/*
 * shmem_add_to_page_cache (quoted again for this section; excerpt,
 * "……" marks elided kernel code).
 *
 * The point illustrated here: each successfully inserted shmem page
 * bumps both NR_FILE_PAGES and NR_SHMEM, so meminfo's cache figure
 * includes shared memory.
 */
static int shmem_add_to_page_cache(struct page *page,
struct address_space *mapping,
pgoff_t index, gfp_t gfp, void *expected)
{
……
if (!error) {
mapping->nrpages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
__inc_zone_page_state(page, NR_SHMEM);
spin_unlock_irq(&mapping->tree_lock);
} else {
page->mapping = NULL;
spin_unlock_irq(&mapping->tree_lock);
page_cache_release(page);
}
return error;
}
可以看到:
- 在上面引用的shmem_add_to_page_cache路径中,页面没有被加入到LRU队列,这意味着即使通过/proc/sys/vm/drop_caches也无法回收这些页面;
- 它们存在于NR_FILE_PAGES,所以meminfo中的cache会包括共享内存数量。
- 它们存在于NR_SHMEM,所以meminfo中的shm会包括共享内存数量。
六、inode什么时候释放
当文件关闭之后,inode(以及对应的数据)不会释放,而是首先进入LRU队列,等待之后回收。
__fput>>dput>>dentry_kill>>d_kill>>dentry_iput>>iput>>iput_final>>inode_add_lru>>inode_lru_list_add
/*
 * inode_lru_list_add - put an unused inode on its superblock's LRU.
 *
 * Called from the final iput() path: the inode is kept cached on
 * i_sb->s_inode_lru for possible reuse instead of being freed
 * immediately. The nr_unused per-cpu counter is bumped only when
 * list_lru_add() reports the inode was actually (newly) added.
 */
static void inode_lru_list_add(struct inode *inode)
{
if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_inc(nr_unused);
}