Linux内存相关sysfs、工具
1. 全局内存分析
1.1 /proc/meminfo
详细参考:《/proc/meminfo》。
while true; do cat /proc/meminfo | grep MemFree; sleep 10; done
1.2 /proc/pagetypeinfo
1.3 slab相关问题定位(/proc/slabinfo、/sys/kernel/slab、slabinfo)
在内核中打开slub_debug,相关的工具有slabinfo。
slabinfo的数据来源是/sys/kernel/slab。
slabinfo从/sys/kernel/slab目录获取数据,并格式化输出。每个slab的详细信息可以从/sys/kernel/slab/找出。
关于slab,内核提供CONFIG_DEBUG_SLAB_LEAK用于监测slab的泄漏,但是对slub不生效。
CONFIG_SLUB_STATS提供每个slab更加详细的统计信息,这些信息用于分析slab分配器的性能,可作为优化分配器的参考依据。
1.3.1 通过/proc/slabinfo查看slub统计信息
/proc/slabinfo在slab_proc_init()中创建,核心函数是slab_show()。
/*
 * Init-time hook that creates the "slabinfo" proc entry (i.e. /proc/slabinfo).
 *
 * The entry is registered with SLABINFO_RIGHTS, a NULL parent, and
 * proc_slabinfo_operations as its file operations.
 *
 * Always returns 0; the result of proc_create() is not checked here.
 */
static int __init slab_proc_init(void)
{
	proc_create("slabinfo", SLABINFO_RIGHTS, NULL,
						&proc_slabinfo_operations);
	return 0;
}
数据结构slabinfo表示每个slab的统计信息,是/proc/slabinfo的数据来源。
struct slabinfo { unsigned long active_objs;----------使用中的高速缓存数目。 unsigned long num_objs;-------------总高速缓存数目。 unsigned long active_slabs;---------使用中的slab数目。 unsigned long num_slabs;------------总slab数目。 unsigned long shared_avail; unsigned int limit; unsigned int batchcount; unsigned int shared; unsigned int objects_per_slab;------一个slab包含多少高速缓存。 unsigned int cache_order;-----------一个slab占用页面数order。 };
slab_show()显示/proc/slabinfo头后,遍历所有的struct kmem_cache,然后通过get_slabinfo()获取信息,cache_show()显示信息。
static int slab_show(struct seq_file *m, void *p) { struct kmem_cache *s = list_entry(p, struct kmem_cache, list); if (p == slab_caches.next) print_slabinfo_header(m); if (is_root_cache(s)) cache_show(s, m); return 0; } static void print_slabinfo_header(struct seq_file *m) { /* * Output format version, so at least we can change it * without _too_ many complaints. */ #ifdef CONFIG_DEBUG_SLAB seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); #else seq_puts(m, "slabinfo - version: 2.1\n"); #endif seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>"); seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); #ifdef CONFIG_DEBUG_SLAB seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> <error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); #endif seq_putc(m, '\n'); } static void cache_show(struct kmem_cache *s, struct seq_file *m) { struct slabinfo sinfo; memset(&sinfo, 0, sizeof(sinfo)); get_slabinfo(s, &sinfo);------------------------------根据s在所有node中遍历,填充sinfo。 memcg_accumulate_slabinfo(s, &sinfo);-----------------在未定义CONFIG_MEMCG的时候无效。 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size, sinfo.objects_per_slab, (1 << sinfo.cache_order)); seq_printf(m, " : tunables %4u %4u %4u", sinfo.limit, sinfo.batchcount, sinfo.shared); seq_printf(m, " : slabdata %6lu %6lu %6lu", sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); slabinfo_show_stats(m, s); seq_putc(m, '\n'); } void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) { unsigned long nr_slabs = 0; unsigned long nr_objs = 0; unsigned long nr_free = 0; int node; struct kmem_cache_node *n; for_each_kmem_cache_node(s, node, n) {--------------根据struct kmem_cache->node[]找到具体node下的对应struct kmem_cache_node,也即n。 nr_slabs += 
node_nr_slabs(n);-------------------然后累计nr_slabs、nr_objs、free数目。 nr_objs += node_nr_objs(n); nr_free += count_partial(n, count_free); } sinfo->active_objs = nr_objs - nr_free; sinfo->num_objs = nr_objs; sinfo->active_slabs = nr_slabs; sinfo->num_slabs = nr_slabs; sinfo->objects_per_slab = oo_objects(s->oo);-------同一类型struct kmem_cache占用相同的页面阶数。 sinfo->cache_order = oo_order(s->oo); }
结合上面的代码和下面的/proc/slabinfo输出可知,kmalloc-1024共78个高速缓存obj,其中72个在使用中。每个obj占用大小为1248,这个大小是包含meta data的。
每个slab包含13个obj,共占用4个页面;一共6个slab,都处于活跃状态。
slabinfo - version: 2.1 # name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab> : tunables <limit> <batchcount> <sharedfactor> : slabdata <active_slabs> <num_slabs> <sharedavail>... kmalloc-2048 27 28 2272 14 8 : tunables 0 0 0 : slabdata 2 2 0 kmalloc-1024 72 78 1248 13 4 : tunables 0 0 0 : slabdata 6 6 0...
/proc/slabinfo和slabinfo工具的objsize不一样,是因为两者差别在是否包含meta data。
对应struct kmem_cache中的size和object_size。
Name Objects Objsize Space Slabs/Part/Cpu O/S O %Fr %Ef Flg ... kmalloc-1024 72 1024 98.3K 6/1/0 13 2 16 75 PZFU kmalloc-2048 27 2048 65.5K 2/1/0 14 3 50 84 PZFU ...
通过/proc/slabinfo前后对比,可以分析出不同高速缓存的增加或者减少情况。对于分析内存使用,或者高速缓存泄漏问题大有裨益。
1.3.2 slab详细信息接口/sys/kernel/slab
系统initcall阶段,调用slab_sysfs_init()创建/sys/kernel/slab目录。
static int __init slab_sysfs_init(void) { struct kmem_cache *s; int err; mutex_lock(&slab_mutex); slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); ... mutex_unlock(&slab_mutex); resiliency_test(); return 0; }
系统在kmem_cache_create()创建高速缓存的时候,会在/sys/kernel/slab目录下创建同名的目录。
在目录下创建一系列节点slab_attr_group,用于设置显示高速缓存的信息。
struct kmem_cache * kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { ... s = create_cache(cache_name, size, size, calculate_alignment(flags, align, size), flags, ctor, NULL, NULL); ... } static struct kmem_cache *create_cache(const char *name, size_t object_size, size_t size, size_t align, unsigned long flags, void (*ctor)(void *), struct mem_cgroup *memcg, struct kmem_cache *root_cache) { ... err = __kmem_cache_create(s, flags); ... } int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) { ... err = sysfs_slab_add(s); if (err) __kmem_cache_release(s); ... } static int sysfs_slab_add(struct kmem_cache *s) { ... s->kobj.kset = cache_kset(s); err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); if (err) goto out; err = sysfs_create_group(&s->kobj, &slab_attr_group); if (err) goto out_del_kobj; ... } static struct attribute_group slab_attr_group = { .attrs = slab_attrs, }; static struct attribute *slab_attrs[] = { &slab_size_attr.attr, &object_size_attr.attr, &objs_per_slab_attr.attr, &order_attr.attr, &min_partial_attr.attr, &cpu_partial_attr.attr, .... NULL };
alloc_calls/free_calls:显示分配/释放者
struct location用于记录slabcache在什么地方分配、释放。还记录了最大最小耗时、最大最小pid。
/*
 * One aggregated record per call site, as reported through the
 * alloc_calls / free_calls attributes by list_locations().
 */
struct location {
	unsigned long count;	/* hit count for this call site; printed first */
	unsigned long addr;	/* call-site address, printed with %pS; 0 prints "<not-available>" */
	long long sum_time;	/* sum of ages; divided by count to print the average age */
	long min_time;		/* smallest age seen (first value of "age=") */
	long max_time;		/* largest age seen (last value of "age=") */
	long min_pid;		/* lower end of the printed pid range */
	long max_pid;		/* upper end of the printed pid range */
	DECLARE_BITMAP(cpus, NR_CPUS);	/* NOTE(review): presumably the CPUs seen for this call site - confirm */
	nodemask_t nodes;	/* NOTE(review): presumably the NUMA nodes seen for this call site - confirm */
};
alloc_calls_show()用于显示分配者的信息,free_calls_show()用于显示释放者的信息。
两者都通过list_locations()来输出struct location中记录的信息。
/*
 * Show handler for the read-only "alloc_calls" attribute.
 *
 * When the cache has SLAB_STORE_USER set, the allocation call sites are
 * written to buf by list_locations() and its result is returned.
 * Without that flag the handler returns -ENOSYS.
 */
static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
{
	if (s->flags & SLAB_STORE_USER)
		return list_locations(s, buf, TRACK_ALLOC);

	return -ENOSYS;
}
SLAB_ATTR_RO(alloc_calls);
/*
 * Show handler for the read-only "free_calls" attribute.
 *
 * When the cache has SLAB_STORE_USER set, the free call sites are
 * written to buf by list_locations() and its result is returned.
 * Without that flag the handler returns -ENOSYS.
 */
static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
{
	if (s->flags & SLAB_STORE_USER)
		return list_locations(s, buf, TRACK_FREE);

	return -ENOSYS;
}
SLAB_ATTR_RO(free_calls);
static int list_locations(struct kmem_cache *s, char *buf, enum track_item alloc) { ... for_each_kmem_cache_node(s, node, n) { unsigned long flags; struct page *page; if (!atomic_long_read(&n->nr_slabs)) continue; spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, lru) process_slab(&t, s, page, alloc, map); list_for_each_entry(page, &n->full, lru) process_slab(&t, s, page, alloc, map); spin_unlock_irqrestore(&n->list_lock, flags); } for (i = 0; i < t.count; i++) { struct location *l = &t.loc[i]; if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100) break; len += sprintf(buf + len, "%7ld ", l->count);------------此location总次数。 if (l->addr) len += sprintf(buf + len, "%pS", (void *)l->addr);---此location对应地址的函数名,否则<not-available>。 else len += sprintf(buf + len, "<not-available>"); if (l->sum_time != l->min_time) {------------------------age依次显示min/average/max。 len += sprintf(buf + len, " age=%ld/%ld/%ld", l->min_time, (long)div_u64(l->sum_time, l->count), l->max_time); } else len += sprintf(buf + len, " age=%ld", l->min_time); if (l->min_pid != l->max_pid)----------------------------显示pid范围。 len += sprintf(buf + len, " pid=%ld-%ld", l->min_pid, l->max_pid); else len += sprintf(buf + len, " pid=%ld", l->min_pid); ... len += sprintf(buf + len, "\n"); } ... }
如下分别是kmalloc-32的alloc_calls和free_calls调用者:
5 register_tracer+0xa2/0x19c age=703110/703970/704186 pid=1 3 ipc_init_proc_interface+0x2e/0xa4 age=704057/704057/704057 pid=1 ... 21 register_blkdev+0x4c/0xec age=703173/703452/704152 pid=1 29 disk_expand_part_tbl+0x4a/0xbc age=703072/703748/703896 pid=1-78 4371 <not-available> age=635262 pid=0 53 of_clk_init+0x1c0/0x224 age=710262/710262/710262 pid=0 ... 1 led_trigger_set+0x11e/0x1b8 age=709217 pid=1 154 __of_attach_node_sysfs+0x74/0x114 age=710237/710244/710253 pid=1
在/proc/slabinfo可以看到不同高速缓存的增加减小的量,通过/sys/kernel/slab/可以更加详细地看到是谁分配或者释放了此高速缓存。
shrink:尽可能释放高速缓存
使能shrink则会尽可能释放高速缓存,调用kmem_cache_shrink()。
/*
 * Store handler for the "shrink" attribute.
 *
 * Only input whose first byte is '1' is accepted: it triggers
 * kmem_cache_shrink() on the cache and the handler returns length.
 * Any other input returns -EINVAL without touching the cache.
 */
static ssize_t shrink_store(struct kmem_cache *s,
			const char *buf, size_t length)
{
	if (buf[0] != '1')
		return -EINVAL;

	kmem_cache_shrink(s);
	return length;
}
SLAB_ATTR(shrink);

/*
 * Shrink a cache and return the result of __kmem_cache_shrink().
 *
 * The work is bracketed by get_online_cpus()/get_online_mems() and the
 * matching put_*() calls in reverse order; kasan_cache_shrink() is
 * invoked before the actual shrink.
 */
int kmem_cache_shrink(struct kmem_cache *cachep)
{
	int rc;

	get_online_cpus();
	get_online_mems();

	kasan_cache_shrink(cachep);
	rc = __kmem_cache_shrink(cachep);

	put_online_mems();
	put_online_cpus();

	return rc;
}
store_user:记录调用者信息
使能store_user将调用者信息记录到struct track中。
trace:跟踪高速缓存分配释放
通过写入0/1来关闭/打开对高速缓存的分配释放的跟踪,是通过flags中增加删减SLAB_TRACE来设置的。
/*
 * Show handler for the "trace" attribute: writes "1\n" to buf when
 * SLAB_TRACE is set in s->flags and "0\n" otherwise.
 */
static ssize_t trace_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
}

/*
 * Store handler for the "trace" attribute.
 *
 * Returns -EINVAL when s->refcount is greater than 1; otherwise updates
 * SLAB_TRACE in s->flags and returns length.
 */
static ssize_t trace_store(struct kmem_cache *s, const char *buf,
							size_t length)
{
	/* NOTE(review): refcount > 1 presumably means the cache is shared with other users - confirm */
	if (s->refcount > 1)
		return -EINVAL;

	/* Tracing is switched off first; only input starting with '1' switches it back on. */
	s->flags &= ~SLAB_TRACE;
	if (buf[0] == '1') {
		/* NOTE(review): __CMPXCHG_DOUBLE is cleared together with enabling tracing; the reason is not visible here - confirm */
		s->flags &= ~__CMPXCHG_DOUBLE;
		s->flags |= SLAB_TRACE;
	}
	return length;
}
SLAB_ATTR(trace);
在__slab_alloc()中调用alloc_debug_processing(),在__slab_free()中调用free_debug_processing()。
最终都是通过trace()实现:
static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc) { if (s->flags & SLAB_TRACE) { pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n", s->name, alloc ? "alloc" : "free", object, page->inuse, page->freelist); if (!alloc) print_section(KERN_INFO, "Object ", (void *)object, s->object_size); dump_stack();-----------------------------------------------显示当前调用栈信息。 } }
在shell中ls一下,监控kmalloc-32实际输出如下:
[ 7685.244170] TRACE kmalloc-32 alloc 0xbc99d820 inuse=16 fp=0x (null)--------------kmalloc-32的alloc,进程是sh。 [ 7685.250561] CPU: 0 PID: 218 Comm: sh Not tainted 4.9.56 #93 [ 7685.256140] Call Trace: [<803006fe>] dump_stack+0x1e/0x3c [<8012c180>] alloc_debug_processing+0x5c/0x17c [<8012c46e>] ___slab_alloc.constprop.28+0x1ce/0x22c [<8012c52c>] __slab_alloc.constprop.27+0x60/0xb0 [<8012c6b0>] __kmalloc+0x134/0x158 [<8018c644>] load_elf_binary+0x254/0x12ec [<8013ee2a>] search_binary_handler+0x7a/0x1a4 [<8013f8ac>] do_execveat_common+0x4f4/0x6a0 [<8013fd7c>] SyS_execve+0x38/0x4c [<80046186>] csky_systemcall+0x96/0xe0 [ 7685.300343] TRACE kmalloc-32 free 0xbc99d820 inuse=8 fp=0xbc99d720 [ 7685.306560] Object bc99d820: 2f 6c 69 62 2f 6c 64 2e 73 6f 2e 31 00 6b 6b 6b /lib/ld.so.1.kkk [ 7685.315181] Object bc99d830: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b a5 kkkkkkkkkkkkkkk. [ 7685.323807] CPU: 0 PID: 218 Comm: which Not tainted 4.9.56 #93 [ 7685.329647] Call Trace: [<803006fe>] dump_stack+0x1e/0x3c [<8012cd34>] free_debug_processing+0x28c/0x3b8 [<8012d05c>] __slab_free+0x1fc/0x310 [<8012d6c8>] kfree+0x148/0x178 [<8018d438>] load_elf_binary+0x1048/0x12ec [<8013ee2a>] search_binary_handler+0x7a/0x1a4 [<8013f8ac>] do_execveat_common+0x4f4/0x6a0 [<8013fd7c>] SyS_execve+0x38/0x4c [<80046186>] csky_systemcall+0x96/0xe0
1.3.3 slabinfo
slabinfo从/sys/kernel/slab中获取数据。
直接输入slabinfo可以获得一个统计信息列表:
Name Objects Objsize Space Slabs/Part/Cpu O/S O %Fr %Ef Flg anon_vma 84 32 24.5K 6/4/0 19 0 66 10 PZFU anon_vma_chain 111 32 28.6K 7/5/0 19 0 71 12 PZFU bdev_cache 4 408 8.1K 1/1/0 12 1 100 19 APaZFU bio-0 60 132 36.8K 9/4/0 11 0 44 21 APZFU bio_integrity_payload 2 104 4.0K 1/1/0 12 0 100 5 APZFU ...
slabinfo -r显示所有单个高速缓存的详细统计信息。
slabinfo -r kmalloc-32查看单个高速缓存的详细信息,包括分配者和释放者信息。
Slabcache: kmalloc-32 Aliases: 0 Order : 0 Objects: 5126 Sizes (bytes) Slabs Debug Memory ------------------------------------------------------------------------ Object : 32 Total : 321 Sanity Checks : On Total: 1314816 SlabObj: 256 Full : 319 Redzoning : On Used : 164032 SlabSiz: 4096 Partial: 2 Poisoning : On Loss : 1150784 Loss : 224 CpuSlab: 0 Tracking : On Lalig: 1148224 Align : 32 Objects: 16 Tracing : Off Lpadd: 0 kmalloc-32 has no kmem_cache operations kmalloc-32: Kernel object allocation ----------------------------------------------------------------------- 5 register_tracer+0xa2/0x19c age=1203320/1204180/1204396 pid=1 3 ipc_init_proc_interface+0x2e/0xa4 age=1204267/1204267/1204267 pid=1 ... 8 blk_mq_realloc_hw_ctxs+0x1b8/0x3f0 age=1204078/1204082/1204086 pid=1 8 blk_mq_init_allocated_queue+0x3e/0x2cc age=1204078/1204082/1204086 pid=1 kmalloc-32: Kernel object freeing ------------------------------------------------------------------------ 4371 <not-available> age=1129512 pid=0 53 of_clk_init+0x1c0/0x224 age=1204512/1204512/1204512 pid=0 1 free_resource+0x62/0x70 age=1204203 pid=1 ... 1 serio_handle_event+0x162/0x248 age=1203596 pid=19 2 media_entity_graph_walk_cleanup+0x1e/0x30 age=1203347/1203360/1203373 pid=98 1 led_trigger_set+0x11e/0x1b8 age=1203467 pid=1 154 __of_attach_node_sysfs+0x74/0x114 age=1204487/1204494/1204503 pid=1 kmalloc-32: No NUMA information available.
slabinfo -s尽量释放所有高速缓存,slabinfo -s <cache name>则释放单个高速缓存。
slabinfo -T显示所有高速缓存的整体统计信息。