kfence源码分析【转】
转自:https://www.cnblogs.com/pengdonglin137/p/16342898.html
参考
作者
pengdonglin137@163.com
内核版本
linux-5.14
实现分析
Kfence (Kernel Electric Fence) 是 Linux 内核引入的一种低开销的内存错误检测机制,因为是低开销的所以它可以在运行的生产环境中开启,同样由于是低开销所以它的功能相比较 KASAN 会偏弱。
-
Kfence是一种基于采样的低开销的内存安全错误检测技术。可以检测UAF、非法释放、OOB三种内存错误,目前支持x86和ARM64,它在slab和slub内存分配器中添加了hook函数。
-
Kfence的设计理念:如果有足够长的总的运行时间,kfence可以在非生产环境的测试程序无法充分测试的代码路径上检测到bug。可以通过大范围部署kfence来快速达到足够长的总运行时间。
-
Kfence管理的每个object都分别存放在一个单独的内存页的左边或者右边,跟这个内存页紧邻的左右两侧的内存页被称为保护页,这些保护页的内存属性被设置成保护状态(即清除PTE页表项的P位),如果访问这些保护页,就会导致缺页异常,而kfence在缺页异常中会解析和报告发生的错误。
-
从kfence内存池中分配object是基于一个采样间隔,这个间隔可以通过内核启动参数 kfence.sample_interval 来修改。当经过了一个采样间隔的时间,下一次从slab或slub中分配的object将会来自kfence内存池。然后需要再经过一个采样间隔,slab或者slub才能再次从kfence内存池中分配一个object。
-
由于采用了static key机制,可以省去判断逻辑,所以不管是否开启kfence,从slub或者slab的快速路径分配内存时的性能都不会受到影响。
-
Kfence内存池的大小是固定的,如果Kfence内存池被用光了,那么就不能再从kfence内存池分配内存了。默认的内核配置是kfence内存池大小为2MB,可以分配255个object,每个object对应一个内存页。
初始化
kfence内存池框图:
其中data区域是用来分配的,fence区域是用来检测内存越界的。metadata数组的元素跟data区域一一对应,用于描述data区域的信息。
start_kernel | |
-> mm_init | |
-> kfence_alloc_pool | |
// 将memblock分配器中的空闲页面释放给伙伴分配器,之前被memblock分配出去还没有释放的内存也就不会出现在伙伴系统里,虽然如此,这部分内存还是有 | |
// 与之对应的page结构体 | |
-> mem_init | |
-> kfence_init |
- kfence_alloc_pool [mm\kfence\core.c]
void __init kfence_alloc_pool(void) | |
{ | |
// 如果采样间隔为0的话,不初始化kfence。需要通过内核配置选项CONFIG_KFENCE_SAMPLE_INTERVAL或者内核启动参数kfence.sample_interval来设置 | |
if (!kfence_sample_interval) | |
return; | |
// 申请kfence pool内存池,大小为:((CONFIG_KFENCE_NUM_OBJECTS + 1) * 2 * PAGE_SIZE),对齐到PAGE_SIZE | |
// CONFIG_KFENCE_NUM_OBJECTS最大为65535,最小为1. | |
__kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); | |
} |
此时伙伴分配器不能使用,所以给kfence的内存在伙伴系统之外,不属于伙伴系统管理,所以也就不用担心被伙伴系统分配出去。
- kfence_init
void __init kfence_init(void) | |
{ | |
/* 如果采样间隔为0,那么会关闭kfence */ | |
if (!kfence_sample_interval) | |
return; | |
// 初始化kfence内存池 | |
kfence_init_pool(); | |
// 表示kfence可以工作了 | |
WRITE_ONCE(kfence_enabled, true); | |
/* | |
用于周期性开启kfence内存池的任务,这里delay时间为0,表示立刻开启,见下文toggle_allocation_gate | |
*/ | |
queue_delayed_work(system_unbound_wq, &kfence_timer, 0); | |
pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE, | |
CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool, | |
(void *)(__kfence_pool + KFENCE_POOL_SIZE)); | |
} |
- kfence_init_pool [kfence_init -> kfence_init_pool]
static bool __init kfence_init_pool(void) | |
{ | |
unsigned long addr = (unsigned long)__kfence_pool; | |
struct page *pages; | |
int i; | |
/* 对于x86架构,会检查__kfence_pool是否映射到物理地址了 */ | |
arch_kfence_init_pool(); | |
/* 获取kfence内存池首地址对应的page结构体 */ | |
pages = virt_to_page(addr); | |
for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { | |
if (!i || (i % 2)) // 跳过第0页和所有的奇数页 | |
continue; | |
/* 1. 设置所有的偶数页的struct page结构体的slab标志,因为在调用kmem_cache_free时会检查 | |
虚拟地址对应的page结构体是否设置了slab标志,如果没有设置,那么无法释放 | |
2. 如果用kfree释放,这个标志可以保证调用slab_free -> __slab_free -> kfence_free | |
*/ | |
__SetPageSlab(&pages[i]); | |
} | |
// 将前两页在页表中的PTE项的Present标志去掉,这样当cpu访问前两页时,就会触发缺页异常 | |
for (i = 0; i < 2; i++) { | |
kfence_protect(addr); | |
addr += PAGE_SIZE; | |
} | |
// kfence_metadata是一个数据类型为struct kfence_metadata的数组,元素个数是CONFIG_KFENCE_NUM_OBJECTS | |
// 从这里可以看出,每一个kfence_metadata数组成员管理一个object | |
for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { | |
struct kfence_metadata *meta = &kfence_metadata[i]; | |
/* Initialize metadata. */ | |
INIT_LIST_HEAD(&meta->list); | |
raw_spin_lock_init(&meta->lock); | |
meta->state = KFENCE_OBJECT_UNUSED; // object的初始状态为UNUSED | |
meta->addr = addr; /* object所在的4KB内存的起始地址 */ | |
list_add_tail(&meta->list, &kfence_freelist); // 添加到全局链表中 | |
// 将object所在的4KB内存的下一个4KB的页表映射信息置为无效,用来检测内存越界访问 | |
kfence_protect(addr + PAGE_SIZE); | |
addr += 2 * PAGE_SIZE; | |
} | |
// 之前在调用memblock_alloc时在kmemleak中有记录,这里先删除这部分记录,防止后面调用kfence_alloc出现冲突 | |
kmemleak_free(__kfence_pool); | |
return true; | |
} |
周期性开启kfence内存池
在kfence_init中还添加了一个kfence_timer的延迟任务,用于周期性开启kfence内存分配,实现如下:
- toggle_allocation_gate
/* | |
* Set up delayed work, which will enable and disable the static key. We need to | |
* use a work queue (rather than a simple timer), since enabling and disabling a | |
* static key cannot be done from an interrupt. | |
* | |
* Note: Toggling a static branch currently causes IPIs, and here we'll end up | |
* with a total of 2 IPIs to all CPUs. If this ends up a problem in future (with | |
* more aggressive sampling intervals), we could get away with a variant that | |
* avoids IPIs, at the cost of not immediately capturing allocations if the | |
* instructions remain cached. | |
*/ | |
static struct delayed_work kfence_timer; | |
static void toggle_allocation_gate(struct work_struct *work) | |
{ | |
if (!READ_ONCE(kfence_enabled)) | |
return; | |
// 周期性将kfence_allocation_gate设置为0,这个作为一个kfence内存池开启的标志位,0表示开启,非0表示关闭, | |
// 保证每隔一定时间最多只允许从kfence内存池分配一次内存 | |
atomic_set(&kfence_allocation_gate, 0); | |
// 使用static key来优化性能,因为直接通过读取kfence_allocation_gate的值是否为0来判断的性能开销比较大 | |
/* 打开static key,并且等待从kfence内存池分配 */ | |
static_branch_enable(&kfence_allocation_key); | |
if (sysctl_hung_task_timeout_secs) { // 内核发出hung task警告所需的最短阻塞时长,一般为120秒 | |
/* | |
* 如果内存分配没有那么频繁,就有可能出现等待时间过长的问题,这里将等待超时时间设置为hang task警告时间的一半, | |
这样内核就不会因为处于D状态过长导致内核出现警告。 | |
被唤醒的原因: | |
1. 当有人从kfence分配了内存,会将kfence_allocation_gate设置为1,然后唤醒阻塞在allocation_wait里的任务 | |
2. 超时 | |
*/ | |
wait_event_idle_timeout(allocation_wait, atomic_read(&kfence_allocation_gate), | |
sysctl_hung_task_timeout_secs * HZ / 2); | |
} else { | |
/* 如果hangtask检测时间为0,表示时间无限长,那么可以放心地等待下去,直到有人从kfence分配了内存,会将kfence_allocation_gate | |
设置为1,然后唤醒阻塞在allocation_wait里的任务 | |
*/ | |
wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate)); | |
} | |
/* 将static keys关闭,保证不会进入__kfence_alloc */ | |
static_branch_disable(&kfence_allocation_key); | |
// 等待kfence_sample_interval,单位是毫秒,然后再次开启kfence内存池 | |
queue_delayed_work(system_unbound_wq, &kfence_timer, | |
msecs_to_jiffies(kfence_sample_interval)); | |
} | |
static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate); |
分配内存
框图:
- 入口1:
kmalloc | |
-> kmem_cache_alloc_trace | |
-> slab_alloc | |
-> return | |
-> __kmalloc | |
-> slab_alloc | |
-> return |
- 入口2
kmem_cache_alloc | |
-> slab_alloc |
上面两个路径最后都会调用到slab_alloc:
slab_alloc | |
-> slab_alloc_node | |
-> kfence_alloc | |
-> 如果kfence_alloc返回NULL的话,走常规的slub分配 |
- kfence_alloc
static __always_inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) | |
{ | |
/* 如果内核配置了kfence_static_keys,那么走这个优化分支 */ | |
if (static_branch_unlikely(&kfence_allocation_key)) | |
/* 常规的判断分支,性能比static key分支差 */ | |
if (unlikely(!atomic_read(&kfence_allocation_gate))) | |
return __kfence_alloc(s, size, flags); | |
return NULL; | |
} |
- __kfence_alloc
void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) | |
{ | |
/* | |
目前kfence内存池仅支持大小不超过一页的内存大小object分配 | |
*/ | |
if (size > PAGE_SIZE) | |
return NULL; | |
/* | |
* 需要从DMA、DMA32、HIGHMEM分配内存的话,kfence内存池不支持。因为kfence内存池的内存 | |
属性不一定满足需求,比如dma一般要求内存是不带cache的,而kfence内存池中的内存不能保证这一点。 | |
*/ | |
if ((flags & GFP_ZONEMASK) || | |
(s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) | |
return NULL; | |
/* | |
下面的判断可以保证只有一个分配者可以进入,进入后kfence内存池就关闭了,在下次开启之前,所有的分配者 | |
都无法进入,只能返回NULL,从而走常规的slub分配器。 | |
*/ | |
if (atomic_read(&kfence_allocation_gate) || atomic_inc_return(&kfence_allocation_gate) > 1) | |
return NULL; | |
/* | |
* 检查allocation_wait中是否有进程在阻塞,有的话,会起一个work来唤醒被阻塞的进程 | |
*/ | |
if (waitqueue_active(&allocation_wait)) { | |
/* | |
* Calling wake_up() here may deadlock when allocations happen | |
* from within timer code. Use an irq_work to defer it. | |
*/ | |
irq_work_queue(&wake_up_kfence_timer_work); | |
} | |
// 判断kfence功能是否使能了 | |
if (!READ_ONCE(kfence_enabled)) | |
return NULL; | |
// 从kfence内存池中分配object | |
return kfence_guarded_alloc(s, size, flags); | |
} |
- kfence_guarded_alloc [kfence_alloc -> __kfence_alloc -> kfence_guarded_alloc]
static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp) | |
{ | |
struct kfence_metadata *meta = NULL; | |
unsigned long flags; | |
struct page *page; | |
void *addr; | |
// 检查kfence内存池是否还有空闲的内存页 | |
if (!list_empty(&kfence_freelist)) { | |
// 获取空闲内存页对应的kfence_metadata数据结构 | |
meta = list_entry(kfence_freelist.next, struct kfence_metadata, list); | |
list_del_init(&meta->list); | |
} | |
// 如果为空,表示kfence内存池已经分配完了。需要用常规的slub分配器分配。 | |
if (!meta) | |
return NULL; | |
// 获取meta对应的空闲内存页的虚拟首地址 | |
meta->addr = metadata_to_pageaddr(meta); | |
/* 如果是空闲的,那么需要恢复这个内存页在页表的PTE的present标志,保证cpu可以正常访问这页内存而不发生缺页异常 | |
这里为什么要判断freed呢?因为在初始化函数kfence_init_pool中设置的初始状态是KFENCE_OBJECT_UNUSED,表示 | |
这页内存还没有使用过,而且初始化时也没有调用kfence_protect来保护该页,所以对于UNUSED的页就没有必要kfence_unprotect | |
只有当这页被分配出去,然后释放的时候会将该页设置为freed,并且调用kfence_protect来保护该页,用于检查use after free。 | |
所以对于free的内存页在下次分配的时候当然要进行kfence_unprotect处理。 | |
*/ | |
if (meta->state == KFENCE_OBJECT_FREED) | |
kfence_unprotect(meta->addr); | |
/* | |
* Note: for allocations made before RNG initialization, will always | |
* return zero. We still benefit from enabling KFENCE as early as | |
* possible, even when the RNG is not yet available, as this will allow | |
* KFENCE to detect bugs due to earlier allocations. The only downside | |
* is that the out-of-bounds accesses detected are deterministic for | |
* such allocations. | |
如果在随机数发生器初始化之前分配,prandom_u32_max总是返回0,那么object的地址是从这页内存的起始位置开始。当随机数 | |
发生器可以工作了,object会被随机地放到这页内存的最左侧或者最右侧 | |
*/ | |
if (prandom_u32_max(2)) { | |
/* Allocate on the "right" side, re-calculate address. */ | |
meta->addr += PAGE_SIZE - size; | |
meta->addr = ALIGN_DOWN(meta->addr, cache->align); | |
} | |
// object起始地址 | |
addr = (void *)meta->addr; | |
/* | |
这个函数做了几件事: | |
1. 将当前进程的调用栈记录到meta的alloc_track中,即内存分配栈 | |
2. 将当前进程的pid记录到meta的pid中 | |
3. 设置meta的状态为KFENCE_OBJECT_ALLOCATED,表示meta描述的一页内存已经被分配 | |
*/ | |
metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED); | |
/* 将当前kmem_cache记录到meta中 */ | |
WRITE_ONCE(meta->cache, cache); | |
/* 记录object的大小 */ | |
meta->size = size; | |
/* 将这页内存中除了给object用的size大小的空间之外的填充成一个跟地址相关的pattern数 | |
目的是在释放时检查有没有发生内存越界访问 | |
*/ | |
for_each_canary(meta, set_canary_byte); | |
/* 获取这页内存对应的struct page结构 */ | |
page = virt_to_page(meta->addr); | |
/* 在page中记录对应的kmem_cache,将来释放的时候要用到 */ | |
page->slab_cache = cache; | |
/* 由于kfence内存池中一个页只放了一个object,所以这里将objects设置为1 */ | |
if (IS_ENABLED(CONFIG_SLUB)) | |
page->objects = 1; | |
// 如果是slab分配器,s_mem会记录第一个object的地址 | |
if (IS_ENABLED(CONFIG_SLAB)) | |
page->s_mem = addr; | |
/* Memory initialization. */ | |
/* | |
* We check slab_want_init_on_alloc() ourselves, rather than letting | |
* SL*B do the initialization, as otherwise we might overwrite KFENCE's | |
* redzone. | |
*/ | |
if (unlikely(slab_want_init_on_alloc(gfp, cache))) // 如果设置了__GFP_ZERO标志,返回true | |
memzero_explicit(addr, size); // 将object使用的那部分区域清零 | |
if (cache->ctor) // 如果有构造函数 | |
cache->ctor(addr); | |
/* KFENCE_COUNTER_ALLOCATED 表示kfence内存池中有多少object被分配出去了,在释放的时候会减一 */ | |
atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCATED]); | |
/* KFENCE_COUNTER_ALLOCS 表示发生从kfence内存池分配内存的次数,单调递增 */ | |
atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]); | |
return addr; | |
} |
释放内存
- 路径1:
kfree | |
-> slab_free | |
-> slab_free_hook | |
-> do_slab_free | |
-> __slab_free | |
-> kfence_free |
- 路径2
kmem_cache_free | |
-> slab_free |
释放内存时,最终会调用到kfence_free
- kfence_free
static __always_inline __must_check bool kfence_free(void *addr) | |
{ | |
// 检查要释放的虚拟地址是否在kfence内存池的虚拟地址范围内 | |
if (!is_kfence_address(addr)) | |
return false; | |
__kfence_free(addr); | |
return true; | |
} |
- __kfence_free
void __kfence_free(void *addr) | |
{ | |
/* | |
根据object的地址可以获取对应的meta。根据addr跟kfence内存池起始地址的偏移可以计算出一个索引,然后从kfence_metadata数组 | |
中就可以得到索引对应的meta | |
*/ | |
struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); | |
/* | |
* 如果meta对应的kmem_cache有SLAB_TYPESAFE_BY_RCU,那么不能立刻释放,需要异步处理,当过了一个宽限期再释放 | |
在rcu_guarded_free会直接调用kfence_guarded_free | |
*/ | |
if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU))) | |
call_rcu(&meta->rcu_head, rcu_guarded_free); | |
else | |
kfence_guarded_free(addr, meta, false); | |
} |
- kfence_guarded_free [kfence_free -> __kfence_free -> kfence_guarded_free]
static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie) | |
{ | |
struct kcsan_scoped_access assert_page_exclusive; | |
unsigned long flags; | |
// 如果meta的状态不是已分配的话或者地址不匹配,或者是释放了两次,或者是释放时传的地址跟申请时获得的不一样 | |
if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) { | |
/* Invalid or double-free, bail out. */ | |
atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); // 将kfence检测到的内存问题的个数加1 | |
kfence_report_error((unsigned long)addr, false, NULL, meta, | |
KFENCE_ERROR_INVALID_FREE); | |
raw_spin_unlock_irqrestore(&meta->lock, flags); | |
return; | |
} | |
/* 如果在缺页异常中检测到OOB内存错误,那么unprotected_page会记录发生异常的地址 */ | |
if (meta->unprotected_page) { | |
// 将发生OOB的地址所在的page页清零 | |
memzero_explicit((void *)ALIGN_DOWN(meta->unprotected_page, PAGE_SIZE), PAGE_SIZE); | |
// 将发生OOB的地址所在的内存页设置为保护,因为缺页异常的最后会取消保护发生异常的地址所在的页 | |
kfence_protect(meta->unprotected_page); | |
meta->unprotected_page = 0; | |
} | |
/* 检查object所在的内存页的空闲区域的pattern值是否发生了改变,以此来判断是否发生了OOB | |
for_each_canary首先检查object左侧的pattern,将第一个pattern不一致的信息输出。然后检查object右侧 | |
的pattern,也只输出第一个pattern不一致的信息 | |
*/ | |
for_each_canary(meta, check_canary_byte); | |
/* | |
* Clear memory if init-on-free is set. While we protect the page, the | |
* data is still there, and after a use-after-free is detected, we | |
* unprotect the page, so the data is still accessible. | |
*/ | |
if (!zombie && unlikely(slab_want_init_on_free(meta->cache))) | |
memzero_explicit(addr, meta->size); | |
/* 这个函数做如下几件事: | |
1. 将当前进程的调用栈存放到meta的free_track中,即内存释放栈 | |
2. 记录当前进程的pid到meta的pid成员中 | |
3. 设置meta的状态为KFENCE_OBJECT_FREED,表示对应的内存页空闲了 | |
*/ | |
metadata_update_state(meta, KFENCE_OBJECT_FREED); | |
/* 将这页内存保护起来,用来检测use after free类型的内存访问错误 */ | |
kfence_protect((unsigned long)addr); | |
if (!zombie) { | |
/* 将meta重新放回空闲链表 */ | |
list_add_tail(&meta->list, &kfence_freelist); | |
// 将KFENCE_COUNTER_ALLOCATED的计数减1,该计数表示当前kfence内存池里有多少object被分配出去了 | |
atomic_long_dec(&counters[KFENCE_COUNTER_ALLOCATED]); | |
// 将KFENCE_COUNTER_FREES的计数加1,表示kfence内存池发生了多少次object释放,单调递增 | |
atomic_long_inc(&counters[KFENCE_COUNTER_FREES]); | |
} else { | |
/* 当kmem_cache被销毁时,所有尚未释放的object个数会记录到KFENCE_COUNTER_ZOMBIES中 | |
处于zombie的object也是free的,但是不能再被分配了 | |
*/ | |
atomic_long_inc(&counters[KFENCE_COUNTER_ZOMBIES]); | |
} | |
} |
检查pattern区
- for_each_canary [kfence_free -> __kfence_free -> kfence_guarded_free -> for_each_canary]
/* __always_inline this to ensure we won't do an indirect call to fn. */ | |
static __always_inline void for_each_canary(const struct kfence_metadata *meta, bool (*fn)(u8 *)) | |
{ | |
const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE); | |
unsigned long addr; | |
/* 检查object所在的内存页的左侧的pattern区域 */ | |
for (addr = pageaddr; addr < meta->addr; addr++) { | |
if (!fn((u8 *)addr)) // 如果不匹配,会输出kfence错误log,并返回false | |
break; | |
} | |
/* 检查object所在的内存页的右侧的pattern区域 */ | |
for (addr = meta->addr + meta->size; addr < pageaddr + PAGE_SIZE; addr++) { | |
if (!fn((u8 *)addr)) // 如果不匹配,会输出kfence错误log,并返回false | |
break; | |
} | |
} |
- check_canary_byte [kfence_free -> __kfence_free -> kfence_guarded_free -> for_each_canary -> check_canary_byte ]
/* Check canary byte at @addr. */ | |
static inline bool check_canary_byte(u8 *addr) | |
{ | |
if (likely(*addr == KFENCE_CANARY_PATTERN(addr))) | |
return true; | |
// 如果内存页中的空闲区域的值跟之前的pattern值不同,表示在该页内部发生了越界,这种越界不会触发缺页 | |
// KFENCE_COUNTER_BUGS的计数加1,表示kfence检测到的内存问题的个数 | |
atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); | |
kfence_report_error((unsigned long)addr, false, NULL, addr_to_metadata((unsigned long)addr), | |
KFENCE_ERROR_CORRUPTION); | |
return false; | |
} |
kmem_cache销毁
kmem_cache_destroy | |
-> shutdown_cache | |
-> kfence_shutdown_cache |
- kfence_shutdown_cache
void kfence_shutdown_cache(struct kmem_cache *s) | |
{ | |
unsigned long flags; | |
struct kfence_metadata *meta; | |
int i; | |
for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { | |
bool in_use; | |
meta = &kfence_metadata[i]; | |
/* 跳过不跟指定kmem_cache匹配的meta以及状态不是已分配的meta | |
*/ | |
if (READ_ONCE(meta->cache) != s || | |
READ_ONCE(meta->state) != KFENCE_OBJECT_ALLOCATED) | |
continue; | |
raw_spin_lock_irqsave(&meta->lock, flags); | |
in_use = meta->cache == s && meta->state == KFENCE_OBJECT_ALLOCATED; | |
raw_spin_unlock_irqrestore(&meta->lock, flags); | |
if (in_use) { | |
/* | |
* This cache still has allocations, and we should not | |
* release them back into the freelist so they can still | |
* safely be used and retain the kernel's default | |
* behaviour of keeping the allocations alive (leak the | |
* cache); however, they effectively become "zombie | |
* allocations" as the KFENCE objects are the only ones | |
* still in use and the owning cache is being destroyed. | |
* | |
* We mark them freed, so that any subsequent use shows | |
* more useful error messages that will include stack | |
* traces of the user of the object, the original | |
* allocation, and caller to shutdown_cache(). | |
*/ | |
kfence_guarded_free((void *)meta->addr, meta, /*zombie=*/true); | |
// 将zombie设置为true,被释放的meta并不会加入到kfence_freelist中,也就不会被分配出去 | |
// 处于zombie的object也属于free,但是不能再被分配 | |
} | |
} | |
for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { | |
meta = &kfence_metadata[i]; | |
/* See above. */ | |
if (READ_ONCE(meta->cache) != s || READ_ONCE(meta->state) != KFENCE_OBJECT_FREED) | |
continue; | |
raw_spin_lock_irqsave(&meta->lock, flags); | |
// 将meta的cache字段清除,这样通过/sys/kernel/debug/kfence/objects知道哪些object是zombie的 | |
if (meta->cache == s && meta->state == KFENCE_OBJECT_FREED) | |
meta->cache = NULL; | |
raw_spin_unlock_irqrestore(&meta->lock, flags); | |
} | |
} |
缺页异常
-
当发生内存越界访问导致被protect的页被访问,此时会发生缺页。
-
当发生了use after free,即object被释放后在没有申请的情况下,又访问这个object,也会发生缺页。因为在释放时,空闲object所在的内存页已经被保护了。
路径:
handle_page_fault | |
-> do_kern_addr_fault | |
-> bad_area_nosemaphore | |
-> __bad_area_nosemaphore | |
-> kernelmode_fixup_or_oops | |
-> page_fault_oops | |
-> kfence_handle_page_fault |
- kfence_handle_page_fault
/* | |
addr是导致缺页的地址 | |
is_write表示是否是写访问 | |
regs记录缺页发生时的cpu寄存器上下文 | |
*/ | |
bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs) | |
{ | |
/* | |
根据缺页发生的地址计算在kfence内存池中的索引 | |
*/ | |
const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE; | |
struct kfence_metadata *to_report = NULL; | |
enum kfence_error_type error_type; | |
unsigned long flags; | |
// 判断是否为kfence内存池的地址范围 | |
if (!is_kfence_address((void *)addr)) | |
return false; | |
// 检查kfence是否被关闭了,可以向/sys/module/kfence/parameters/sample_interval写入0关闭kfence | |
if (!READ_ONCE(kfence_enabled)) /* If disabled at runtime ... */ | |
return kfence_unprotect(addr); /* ... unprotect and proceed. */ | |
// KFENCE_COUNTER_BUGS计数加1,表示检测到的内存错误的个数 | |
atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); | |
if (page_index % 2) { | |
/* | |
如果是在kfence内存池中奇数页上发生的缺页,表示发生了内存越界。因为在初始化时,已经将奇数页保护起来了 | |
*/ | |
/* This is a redzone, report a buffer overflow. */ | |
struct kfence_metadata *meta; | |
int distance = 0; | |
// 获取缺页地址左边的一页对应的meta,因为奇数页不用来存放object。 | |
meta = addr_to_metadata(addr - PAGE_SIZE); | |
if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) { // 检查左边的页是否分配了 | |
to_report = meta; | |
/* Data race ok; distance calculation approximate. | |
计算发生缺页的地址跟左边被分配出去的object的结尾地址之间的距离 | |
*/ | |
distance = addr - data_race(meta->addr + meta->size); | |
} | |
// 检查缺页地址右边的页对应的meta | |
meta = addr_to_metadata(addr + PAGE_SIZE); | |
if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) { // 检查右边的页是否分配了 | |
/* Data race ok; distance calculation approximate. | |
如果to_report是空,表示左边的页没有分配,那么当前右边的页就是发生越界的object所在的页 | |
如果左边的页也分配了,需要比较右边的页中object的起始地址距离缺页发生的地址之间的距离跟左边页计算出来 | |
的距离,距离小的一边就是发生越界的object所在的页 | |
*/ | |
if (!to_report || distance > data_race(meta->addr) - addr) | |
to_report = meta; | |
} | |
// 如果左边和右边的页都没有分配出去,这是一种kfence也不敢确定的异常行为,可能是UAF或者OOB | |
if (!to_report) | |
goto out; | |
raw_spin_lock_irqsave(&to_report->lock, flags); | |
// 记录缺页发生的地址 | |
to_report->unprotected_page = addr; | |
// kfence检测到的错误类型为越界访问 | |
error_type = KFENCE_ERROR_OOB; | |
/* | |
* If the object was freed before we took the look we can still | |
* report this as an OOB -- the report will simply show the | |
* stacktrace of the free as well. | |
*/ | |
} else { | |
// 表示发生了UAF,在偶数页上发生了缺页,只有一种可能,就是object被释放后,没有申请的情况下,又访问了这个object。 | |
// 在前面的分析中知道,对于偶数页,只有在free后才会被protect起来。 | |
to_report = addr_to_metadata(addr); | |
if (!to_report) | |
goto out; | |
raw_spin_lock_irqsave(&to_report->lock, flags); | |
// kfence检测到UAF内存访问错误 | |
error_type = KFENCE_ERROR_UAF; | |
/* | |
* We may race with __kfence_alloc(), and it is possible that a | |
* freed object may be reallocated. We simply report this as a | |
* use-after-free, with the stack trace showing the place where | |
* the object was re-allocated. | |
*/ | |
} | |
out: | |
if (to_report) { | |
// 报告OOB或者UAF内存访问错误,具体类型由error_type决定 | |
kfence_report_error(addr, is_write, regs, to_report, error_type); | |
raw_spin_unlock_irqrestore(&to_report->lock, flags); | |
} else { | |
/* 触发OOB的左侧和右侧的内存页都没有分配,既可能是UAF,也可能是OOB | |
This may be a UAF or OOB access, but we can't be sure. */ | |
kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID); | |
} | |
// 执行到这里,说明kfence不希望系统宕机,所以撤销发生缺页的地址所在的内存区的保护,保证系统还可以正常跑下去 | |
return kfence_unprotect(addr); /* Unprotect and let access proceed. */ | |
} |
错误报告
当检测到内存错误访问时,会调用kfence_report_error输出错误log。
错误种类分为如下几种:
-
缺页异常中检测到的访问了protect页的oob:KFENCE_ERROR_OOB
-
释放内存时检测到的访问了object所在的内存区的空闲区域的OOB:KFENCE_ERROR_CORRUPTION
-
缺页异常中检测到的访问了被释放的object所在的内存页的UAF:KFENCE_ERROR_UAF
-
释放内存时kfence检测到的重复释放或者申请和释放的地址不一致:KFENCE_ERROR_INVALID_FREE
-
缺页异常中检测到的kfence无法确定的内存访问错误,比如发生OOB时但是protect页左右的内存页都没有分配出去:KFENCE_ERROR_INVALID
- kfence_report_error
/* | |
address: 导致内存问题的地址 | |
is_write: 是不是写访问 | |
regs: 发生缺页异常时的cpu上下文 | |
meta:跟导致内存异常的地址关联的meta,对于访问protect区域的oob来说,meta表示的是因为访问哪个object导致的oob,即这个object对应的meta | |
type:内存问题的类型 | |
*/ | |
void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs, | |
const struct kfence_metadata *meta, enum kfence_error_type type) | |
{ | |
unsigned long stack_entries[KFENCE_STACK_DEPTH] = { 0 }; | |
const ptrdiff_t object_index = meta ? meta - kfence_metadata : -1; | |
int num_stack_entries; | |
int skipnr = 0; | |
/* | |
对于regs非空,是因为触发了缺页的情况,此时根据regs得到的调用栈不需要skip任何一项,所以skipnr为0,因为regs记录的就是异常发生那 | |
一刻的栈的状态; | |
对于regs为空的场景,是通过释放内存触发的,记录调用栈的时候,调用栈里不可避免的会出现kfence、slab以及kmem_cache相关的函数,这些 | |
函数对于分析问题没啥帮助,对分析问题有帮助的是谁调用了这些函数,即谁在哪里执行了释放内存的操作,因此需要将这部分的调用栈输出出来, | |
以节省开发人员时间,所以skipnr非0 | |
*/ | |
if (regs) { | |
/* 根据pt_regs获取发生异常时的调用栈,并且存放到stack_entries中,深度为64 */ | |
num_stack_entries = stack_trace_save_regs(regs, stack_entries, KFENCE_STACK_DEPTH, 0); | |
} else { | |
/* 如果没有传递pt_regs,那么记录的是当前的调用栈,但是会去掉调用栈的第一项,即stack_trace_save */ | |
num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 1); | |
/* 解析调用栈,目的是尽量得到导致内存问题的业务逻辑的位置,跳过kfence、slab、kfree、kmem_cache、kmalloc相关的函数 | |
这样更加方便定位问题 | |
*/ | |
skipnr = get_stack_skipnr(stack_entries, num_stack_entries, &type); | |
} | |
/* Require non-NULL meta, except if KFENCE_ERROR_INVALID. */ | |
if (WARN_ON(type != KFENCE_ERROR_INVALID && !meta)) | |
return; | |
if (meta) | |
lockdep_assert_held(&meta->lock); | |
/* | |
* Because we may generate reports in printk-unfriendly parts of the | |
* kernel, such as scheduler code, the use of printk() could deadlock. | |
* Until such time that all printing code here is safe in all parts of | |
* the kernel, accept the risk, and just get our message out (given the | |
* system might already behave unpredictably due to the memory error). | |
* As such, also disable lockdep to hide warnings, and avoid disabling | |
* lockdep for the rest of the kernel. | |
*/ | |
lockdep_off(); | |
pr_err("==================================================================\n"); | |
/* Print report header. */ | |
switch (type) { | |
case KFENCE_ERROR_OOB: { // 访问了protect的内存页导致的OOB | |
// 如果触发异常的地址小于meta对应的object地址,意味着访问了与object所在的内存页紧邻的左边的protect内存页 | |
// 否则,意味着访问的是与object所在的内存页紧邻的右边的protect内存页 | |
const bool left_of_object = address < meta->addr; | |
pr_err("BUG: KFENCE: out-of-bounds %s in %pS\n\n", get_access_type(is_write), | |
(void *)stack_entries[skipnr]); | |
// 输出访问类型,缺页地址,缺页地址跟object之间的字节偏移,缺页地址在object的左边内存页还是右边内存页,以及object的索引 | |
pr_err("Out-of-bounds %s at 0x%p (%luB %s of kfence-#%td):\n", | |
get_access_type(is_write), (void *)address, | |
left_of_object ? meta->addr - address : address - meta->addr, | |
left_of_object ? "left" : "right", object_index); | |
break; | |
} | |
case KFENCE_ERROR_UAF: // object被释放了,没有申请,又访问了 | |
pr_err("BUG: KFENCE: use-after-free %s in %pS\n\n", get_access_type(is_write), | |
(void *)stack_entries[skipnr]); | |
pr_err("Use-after-free %s at 0x%p (in kfence-#%td):\n", | |
get_access_type(is_write), (void *)address, object_index); | |
break; | |
case KFENCE_ERROR_CORRUPTION: // object所在的内存页的空闲区域的pattern被破坏,也属于OOB | |
pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]); | |
pr_err("Corrupted memory at 0x%p ", (void *)address); // 发生pattern不一致的地址 | |
print_diff_canary(address, 16, meta); // 显示pattern不一致的地址右侧16字节地址范围内的数据的匹配信息 | |
pr_cont(" (in kfence-#%td):\n", object_index); // object的索引 | |
break; | |
case KFENCE_ERROR_INVALID: // 缺页异常里检测到的无效的错误 | |
pr_err("BUG: KFENCE: invalid %s in %pS\n\n", get_access_type(is_write), | |
(void *)stack_entries[skipnr]); | |
pr_err("Invalid %s at 0x%p:\n", get_access_type(is_write), | |
(void *)address); | |
break; | |
case KFENCE_ERROR_INVALID_FREE: // kfence_free检测到的重复释放以及申请和释放的地址不一致的错误 | |
pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]); | |
pr_err("Invalid free of 0x%p (in kfence-#%td):\n", (void *)address, | |
object_index); | |
break; | |
} | |
/* 输出内存错误发生的调用栈,其中skipnr用于帮助跳过一些对分析问题没有帮助的mm内部函数 */ | |
stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, 0); | |
if (meta) { | |
pr_err("\n"); | |
/* | |
1. 输出meta的状态信息,object的地址范围,kmem_cache以及进程pid | |
2. 输出object被分配出去时的调用栈 | |
3. 如果meta是free状态,那么还会输出内存释放时的调用栈,以及调用者的pid | |
*/ | |
kfence_print_object(NULL, meta); | |
} | |
/* Print report footer. */ | |
pr_err("\n"); | |
if (no_hash_pointers && regs) // 可以通过启动参数no_hash_pointers来设置为1 | |
show_regs(regs); // 输出缺页异常发生时的CPU寄存器内容以及调用栈 | |
else | |
dump_stack_print_info(KERN_ERR); // 简略的debug信息 | |
trace_error_report_end(ERROR_DETECTOR_KFENCE, address); | |
pr_err("==================================================================\n"); | |
lockdep_on(); | |
if (panic_on_warn) // 可以通过将/proc/sys/kernel/panic_on_warn设置为1让系统宕机 | |
panic("panic_on_warn set ...\n"); | |
/* We encountered a memory safety error, taint the kernel! | |
可以通过给启动参数设置'panic_on_taint=0x20',这样当添加TAINT_BAD_PAGE类型的taint时,会发生宕机 | |
*/ | |
add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK); | |
} |
- get_stack_skipnr [kfence_report_error -> get_stack_skipnr ]
从调用栈里将mm的内部函数跳过。
/* | |
* Get the number of stack entries to skip to get out of MM internals. @type is | |
* optional, and if set to NULL, assumes an allocation or free stack. | |
*/ | |
static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries, | |
const enum kfence_error_type *type) | |
{ | |
char buf[64]; | |
int skipnr, fallback = 0; | |
if (type) { | |
/* Depending on error type, find different stack entries. */ | |
switch (*type) { | |
case KFENCE_ERROR_UAF: | |
case KFENCE_ERROR_OOB: | |
case KFENCE_ERROR_INVALID: | |
/* | |
* kfence_handle_page_fault() may be called with pt_regs | |
* set to NULL; in that case we'll simply show the full | |
* stack trace. | |
*/ | |
return 0; | |
case KFENCE_ERROR_CORRUPTION: | |
case KFENCE_ERROR_INVALID_FREE: | |
break; | |
} | |
} | |
for (skipnr = 0; skipnr < num_entries; skipnr++) { | |
int len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skipnr]); | |
if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfence_") || | |
str_has_prefix(buf, ARCH_FUNC_PREFIX "__kfence_") || | |
!strncmp(buf, ARCH_FUNC_PREFIX "__slab_free", len)) { | |
/* | |
* In case of tail calls from any of the below | |
* to any of the above. | |
*/ | |
fallback = skipnr + 1; | |
} | |
/* Also the *_bulk() variants by only checking prefixes. */ | |
if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfree") || | |
str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_free") || | |
str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmalloc") || | |
str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_alloc")) | |
goto found; | |
} | |
if (fallback < num_entries) | |
return fallback; | |
found: | |
skipnr++; | |
return skipnr < num_entries ? skipnr : 0; | |
} |
- print_diff_canary [kfence_report_error -> print_diff_canary]
/* | |
* Show bytes at @addr that are different from the expected canary values, up to | |
* @max_bytes. | |
address: pattern不一致的地址,这个地址可能是左侧pattern区域或者右侧pattern区域的,通过跟meta->addr比较就可以知道,参考下图 | |
bytes_to_show: 最多输出多少个地址的匹配信息 | |
meta:pattern区所在的内存页对应的meta信息 | |
*/ | |
static void print_diff_canary(unsigned long address, size_t bytes_to_show, | |
const struct kfence_metadata *meta) | |
{ | |
const unsigned long show_until_addr = address + bytes_to_show; // | |
const u8 *cur, *end; | |
/* 计算结束地址,不能越出pattern区的范围。比如左侧的pattern区,最长输出到meta->addr-1。 | |
对于右侧的pattern区,最长到右边保护区起始地址-1 */ | |
end = (const u8 *)(address < meta->addr ? min(show_until_addr, meta->addr) | |
: min(show_until_addr, PAGE_ALIGN(address))); | |
pr_cont("["); | |
for (cur = (const u8 *)address; cur < end; cur++) { | |
if (*cur == KFENCE_CANARY_PATTERN(cur)) | |
pr_cont(" ."); // 对于pattern一致的地址,输出 '.' | |
else if (no_hash_pointers) // 可以通过启动参数no_hash_pointers来设置为1 | |
pr_cont(" 0x%02x", *cur); | |
else /* Do not leak kernel memory in non-debug builds. */ | |
pr_cont(" !"); // 对于pattern不一致的地址,输出 '!' | |
} | |
pr_cont(" ]"); | |
} |
内存异常log分析
OOB错误
- 读左侧保护区导致的OOB: KFENCE_ERROR_OOB
示例:
size = kmalloc_cache_alignment(size); | |
buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT); | |
expect.addr = buf - 1; | |
READ_ONCE(*expect.addr); | |
KUNIT_EXPECT_TRUE(test, report_matches(&expect)); | |
test_free(buf); |
log:
================================================================== | |
BUG: KFENCE: out-of-bounds read in test_out_of_bounds_read+0xad/0x1f2 [kfence_test] | |
# 触发异常时的内核栈 | |
Out-of-bounds read at 0x000000008e1b5d12 (1B left of kfence-#109): | |
test_out_of_bounds_read+0xad/0x1f2 [kfence_test] | |
kunit_try_run_case+0x51/0x80 | |
kunit_generic_run_threadfn_adapter+0x16/0x30 | |
kthread+0x11a/0x140 | |
ret_from_fork+0x22/0x30 | |
# 分配object的调用栈 | |
kfence-#109 [0x00000000753194ac-0x000000000d237ced, size=32, cache=kmalloc-32] allocated by task 35779: | |
test_alloc+0xe9/0x36f [kfence_test] | |
test_out_of_bounds_read+0x86/0x1f2 [kfence_test] | |
kunit_try_run_case+0x51/0x80 | |
kunit_generic_run_threadfn_adapter+0x16/0x30 | |
kthread+0x11a/0x140 | |
ret_from_fork+0x22/0x30 | |
CPU: 5 PID: 35779 Comm: kunit_try_catch Kdump: loaded Not tainted 5.14.0+ #4 | |
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 | |
================================================================== |
- 读右侧保护区导致的OOB: KFENCE_ERROR_OOB
示例:
size = kmalloc_cache_alignment(size); | |
buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT); | |
expect.addr = buf + size; | |
READ_ONCE(*expect.addr); | |
KUNIT_EXPECT_TRUE(test, report_matches(&expect)); | |
test_free(buf); |
log:
================================================================== | |
BUG: KFENCE: out-of-bounds read in test_out_of_bounds_read+0x14a/0x1f2 [kfence_test] | |
# 触发异常的调用栈 | |
Out-of-bounds read at 0x0000000002d76451 (32B right of kfence-#111): | |
test_out_of_bounds_read+0x14a/0x1f2 [kfence_test] | |
kunit_try_run_case+0x51/0x80 | |
kunit_generic_run_threadfn_adapter+0x16/0x30 | |
kthread+0x11a/0x140 | |
ret_from_fork+0x22/0x30 | |
# 分配object的调用栈 | |
kfence-#111 [0x00000000432dce97-0x000000008d6138c3, size=32, cache=kmalloc-32] allocated by task 35779: | |
test_alloc+0xe9/0x36f [kfence_test] | |
test_out_of_bounds_read+0x140/0x1f2 [kfence_test] | |
kunit_try_run_case+0x51/0x80 | |
kunit_generic_run_threadfn_adapter+0x16/0x30 | |
kthread+0x11a/0x140 | |
ret_from_fork+0x22/0x30 | |
CPU: 5 PID: 35779 Comm: kunit_try_catch Kdump: loaded Tainted: G B 5.14.0+ #4 | |
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 | |
================================================================== |
- 写左侧保护区导致的OOB: KFENCE_ERROR_OOB
示例:
buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT); | |
expect.addr = buf - 1; | |
WRITE_ONCE(*expect.addr, 42); |
log:
================================================================== | |
BUG: KFENCE: out-of-bounds write in test_out_of_bounds_write+0x7a/0x116 [kfence_test] | |
# 触发异常的调用栈 | |
Out-of-bounds write at 0x000000003f50719f (1B left of kfence-#134): | |
test_out_of_bounds_write+0x7a/0x116 [kfence_test] | |
kunit_try_run_case+0x51/0x80 | |
kunit_generic_run_threadfn_adapter+0x16/0x30 | |
kthread+0x11a/0x140 | |
ret_from_fork+0x22/0x30 | |
# 分配object的调用栈 | |
kfence-#134 [0x0000000080436418-0x0000000052b079df, size=32, cache=kmalloc-32] allocated by task 35781: | |
test_alloc+0xe9/0x36f [kfence_test] | |
test_out_of_bounds_write+0x65/0x116 [kfence_test] | |
kunit_try_run_case+0x51/0x80 | |
kunit_generic_run_threadfn_adapter+0x16/0x30 | |
kthread+0x11a/0x140 | |
ret_from_fork+0x22/0x30 | |
CPU: 5 PID: 35781 Comm: kunit_try_catch Kdump: loaded Tainted: G B 5.14.0+ #4 | |
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 | |
================================================================== |
UAF
KFENCE_ERROR_UAF
示例:
expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); | |
test_free(expect.addr); | |
READ_ONCE(*expect.addr); |
log:
================================================================== | |
BUG: KFENCE: use-after-free read in test_use_after_free_read+0x89/0x10b [kfence_test] | |
# 触发UAF时的调用栈 | |
Use-after-free read at 0x0000000067fb284c (in kfence-#152): | |
test_use_after_free_read+0x89/0x10b [kfence_test] | |
kunit_try_run_case+0x51/0x80 | |
kunit_generic_run_threadfn_adapter+0x16/0x30 | |
kthread+0x11a/0x140 | |
ret_from_fork+0x22/0x30 | |
# 分配object的调用栈 | |
kfence-#152 [0x0000000067fb284c-0x00000000cd45daeb, size=32, cache=kmalloc-32] allocated by task 35783: | |
test_alloc+0xe9/0x36f [kfence_test] | |
test_use_after_free_read+0x63/0x10b [kfence_test] | |
kunit_try_run_case+0x51/0x80 | |
kunit_generic_run_threadfn_adapter+0x16/0x30 | |
kthread+0x11a/0x140 | |
ret_from_fork+0x22/0x30 | |
# 释放object的调用栈 | |
freed by task 35783: | |
test_use_after_free_read+0x85/0x10b [kfence_test] | |
kunit_try_run_case+0x51/0x80 | |
kunit_generic_run_threadfn_adapter+0x16/0x30 | |
kthread+0x11a/0x140 | |
ret_from_fork+0x22/0x30 | |
CPU: 7 PID: 35783 Comm: kunit_try_catch Kdump: loaded Tainted: G B 5.14.0+ #4 | |
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 | |
================================================================== |
pattern区不一致
- 右侧pattern区不一致:KFENCE_ERROR_CORRUPTION
示例:
buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT); | |
expect.addr = buf + size; | |
WRITE_ONCE(*expect.addr, 42); | |
test_free(buf); |
log:
================================================================== | |
BUG: KFENCE: memory corruption in test_corruption+0x9c/0x1cb [kfence_test] | |
# 输出pattern不一致的地址及其右侧一共16个地址(不超出右侧pattern区)的匹配结果,'!'表示不一致,'.'表示一致。 | |
Corrupted memory at 0x000000003b880c36 [ ! . . . . . . . . . . . . . . . ] (in kfence-#139): | |
test_corruption+0x9c/0x1cb [kfence_test] | |
kunit_try_run_case+0x51/0x80 | |
kunit_generic_run_threadfn_adapter+0x16/0x30 | |
kthread+0x11a/0x140 | |
ret_from_fork+0x22/0x30 | |
# 分配object的调用栈 | |
kfence-#139 [0x0000000084320c94-0x00000000ebf5c6c5, size=32, cache=kmalloc-32] allocated by task 35789: | |
test_alloc+0xe9/0x36f [kfence_test] | |
test_corruption+0x72/0x1cb [kfence_test] | |
kunit_try_run_case+0x51/0x80 | |
kunit_generic_run_threadfn_adapter+0x16/0x30 | |
kthread+0x11a/0x140 | |
ret_from_fork+0x22/0x30 | |
CPU: 5 PID: 35789 Comm: kunit_try_catch Kdump: loaded Tainted: G B 5.14.0+ #4 | |
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 | |
================================================================== |
- 左侧pattern区不一致:KFENCE_ERROR_CORRUPTION
示例:
buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT); | |
expect.addr = buf - 1; | |
WRITE_ONCE(*expect.addr, 42); | |
test_free(buf); |
log:
================================================================== | |
BUG: KFENCE: memory corruption in test_corruption+0x14e/0x1cb [kfence_test] | |
# 输出pattern不一致的地址及其右侧一共16个地址(不超出左侧pattern区)的匹配结果,'!'表示不一致,'.'表示一致。 | |
Corrupted memory at 0x00000000d7861e9d [ ! ] (in kfence-#155): | |
test_corruption+0x14e/0x1cb [kfence_test] | |
kunit_try_run_case+0x51/0x80 | |
kunit_generic_run_threadfn_adapter+0x16/0x30 | |
kthread+0x11a/0x140 | |
ret_from_fork+0x22/0x30 | |
kfence-#155 [0x000000009acdf655-0x00000000008cbfb7, size=32, cache=kmalloc-32] allocated by task 35789: | |
test_alloc+0xe9/0x36f [kfence_test] | |
test_corruption+0x124/0x1cb [kfence_test] | |
kunit_try_run_case+0x51/0x80 | |
kunit_generic_run_threadfn_adapter+0x16/0x30 | |
kthread+0x11a/0x140 | |
ret_from_fork+0x22/0x30 | |
CPU: 5 PID: 35789 Comm: kunit_try_catch Kdump: loaded Tainted: G B 5.14.0+ #4 | |
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 | |
================================================================== |
无效的释放
- 重复释放:KFENCE_ERROR_INVALID_FREE
示例:
expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); | |
test_free(expect.addr); | |
test_free(expect.addr); /* Double-free. */ |
log:
================================================================== | |
BUG: KFENCE: invalid free in test_double_free+0x9a/0x124 [kfence_test] | |
# 触发重复释放的调用栈 | |
Invalid free of 0x000000007fb6a8f8 (in kfence-#136): | |
test_double_free+0x9a/0x124 [kfence_test] | |
kunit_try_run_case+0x51/0x80 | |
kunit_generic_run_threadfn_adapter+0x16/0x30 | |
kthread+0x11a/0x140 | |
ret_from_fork+0x22/0x30 | |
# 分配object的调用栈 | |
kfence-#136 [0x000000007fb6a8f8-0x00000000d967e9cd, size=32, cache=test] allocated by task 35786: | |
test_alloc+0xdf/0x36f [kfence_test] | |
test_double_free+0x63/0x124 [kfence_test] | |
kunit_try_run_case+0x51/0x80 | |
kunit_generic_run_threadfn_adapter+0x16/0x30 | |
kthread+0x11a/0x140 | |
ret_from_fork+0x22/0x30 | |
# 释放object的调用栈 | |
freed by task 35786: | |
test_double_free+0x7b/0x124 [kfence_test] | |
kunit_try_run_case+0x51/0x80 | |
kunit_generic_run_threadfn_adapter+0x16/0x30 | |
kthread+0x11a/0x140 | |
ret_from_fork+0x22/0x30 | |
CPU: 5 PID: 35786 Comm: kunit_try_catch Kdump: loaded Tainted: G B 5.14.0+ #4 | |
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 | |
================================================================== |
- 申请和释放的地址不一致:KFENCE_ERROR_INVALID_FREE
示例:
buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); | |
expect.addr = buf + 1; /* Free on invalid address. */ | |
test_free(expect.addr); /* Invalid address free. */ | |
test_free(buf); /* No error. */ |
log:
================================================================== | |
BUG: KFENCE: invalid free in test_invalid_addr_free+0x8b/0x12b [kfence_test] | |
Invalid free of 0x0000000000b3e82d (in kfence-#124): | |
test_invalid_addr_free+0x8b/0x12b [kfence_test] | |
kunit_try_run_case+0x51/0x80 | |
kunit_generic_run_threadfn_adapter+0x16/0x30 | |
kthread+0x11a/0x140 | |
ret_from_fork+0x22/0x30 | |
kfence-#124 [0x000000002aecf77f-0x0000000046ff045a, size=32, cache=kmalloc-32] allocated by task 35787: | |
test_alloc+0xe9/0x36f [kfence_test] | |
test_invalid_addr_free+0x65/0x12b [kfence_test] | |
kunit_try_run_case+0x51/0x80 | |
kunit_generic_run_threadfn_adapter+0x16/0x30 | |
kthread+0x11a/0x140 | |
ret_from_fork+0x22/0x30 | |
CPU: 5 PID: 35787 Comm: kunit_try_catch Kdump: loaded Tainted: G B 5.14.0+ #4 | |
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 | |
================================================================== |
其他无法识别的内存错误
如触发缺页的OOB区域左侧和右侧的内存页都没有分配出去:KFENCE_ERROR_INVALID
示例:
READ_ONCE(__kfence_pool[10]); |
log:
================================================================== | |
BUG: KFENCE: invalid read in test_invalid_access+0x48/0xd0 [kfence_test] | |
Invalid read at 0x0000000023713263: | |
test_invalid_access+0x48/0xd0 [kfence_test] | |
kunit_try_run_case+0x51/0x80 | |
kunit_generic_run_threadfn_adapter+0x16/0x30 | |
kthread+0x11a/0x140 | |
ret_from_fork+0x22/0x30 | |
CPU: 5 PID: 35936 Comm: kunit_try_catch Kdump: loaded Tainted: G B 5.14.0+ #4 | |
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 | |
================================================================== |
debugfs调试节点
在/sys/kernel/debug/kfence
下面有两个用于查看kfence状态的节点:objects和stats
stats节点
# cat stats | |
enabled: 1 | |
currently allocated: 47 | |
total allocations: 2416 | |
total frees: 2369 | |
zombie allocations: 0 | |
total bugs: 21 |
含义
名字 | 含义 |
---|---|
enabled | kfence功能是否处于开启状态。可以通过内核启动参数开启,启动后可以通过模块参数关闭 |
currently allocated | kfence内存池中有多少个object被分配出去了 |
total allocations | 在kfence内存池中发生过object分配的总次数,单调递增 |
total frees | 在kfence内存池中发生过object释放的总次数,单调递增 |
zombie allocations | 当某个kmem_cache被销毁时,在kfence中与之对应的尚未释放的object个数 |
total bugs | kfence检测到的内存错误的次数 |
实现
static int stats_show(struct seq_file *seq, void *v) | |
{ | |
int i; | |
seq_printf(seq, "enabled: %i\n", READ_ONCE(kfence_enabled)); | |
for (i = 0; i < KFENCE_COUNTER_COUNT; i++) | |
seq_printf(seq, "%s: %ld\n", counter_names[i], atomic_long_read(&counters[i])); | |
return 0; | |
} | |
DEFINE_SHOW_ATTRIBUTE(stats); |
其中用到的统计数据定义如下:
/* Statistics counters for debugfs. */ | |
enum kfence_counter_id { | |
KFENCE_COUNTER_ALLOCATED, | |
KFENCE_COUNTER_ALLOCS, | |
KFENCE_COUNTER_FREES, | |
KFENCE_COUNTER_ZOMBIES, | |
KFENCE_COUNTER_BUGS, | |
KFENCE_COUNTER_COUNT, | |
}; | |
static atomic_long_t counters[KFENCE_COUNTER_COUNT]; | |
static const char *const counter_names[] = { | |
[KFENCE_COUNTER_ALLOCATED] = "currently allocated", | |
[KFENCE_COUNTER_ALLOCS] = "total allocations", | |
[KFENCE_COUNTER_FREES] = "total frees", | |
[KFENCE_COUNTER_ZOMBIES] = "zombie allocations", | |
[KFENCE_COUNTER_BUGS] = "total bugs", | |
}; |
objects节点
输出kfence中每个meta的信息,当前状态以及调用栈。
# cat objects | |
kfence-#0 [0xffff89c43b202000-0xffff89c43b202067, size=104, cache=kmalloc-128] allocated by task 8: | |
set_kthread_struct+0x30/0x40 | |
kthread+0x2e/0x140 | |
ret_from_fork+0x22/0x30 | |
--------------------------------- | |
kfence-#1 [0xffff89c43b204000-0xffff89c43b20400f, size=16, cache=kmalloc-16] allocated by task 1: | |
__smpboot_create_thread.part.9+0x3c/0x120 | |
smpboot_create_threads+0x67/0x90 | |
cpuhp_invoke_callback+0x105/0x400 | |
cpuhp_invoke_callback_range+0x40/0x80 | |
_cpu_up+0xd8/0x1e0 | |
cpu_up+0x85/0x90 | |
bringup_nonboot_cpus+0x4f/0x60 | |
smp_init+0x26/0x74 | |
kernel_init_freeable+0x10e/0x246 | |
kernel_init+0x16/0x120 | |
ret_from_fork+0x22/0x30 | |
--------------------------------- | |
... | |
kfence-#40 [0xffff89c43b252dc0-0xffff89c43b252fff, size=576, cache=inode_cache] allocated by task 531: | |
alloc_inode+0x87/0xa0 | |
new_inode_pseudo+0xb/0x50 | |
create_pipe_files+0x32/0x200 | |
__do_pipe_flags+0x2c/0xd0 | |
do_pipe2+0x2d/0xb0 | |
__x64_sys_pipe+0x10/0x20 | |
do_syscall_64+0x3a/0x80 | |
entry_SYSCALL_64_after_hwframe+0x44/0xae | |
freed by task 531: | |
destroy_inode+0x3b/0x70 | |
__dentry_kill+0xc5/0x150 | |
__fput+0xd9/0x230 | |
task_work_run+0x74/0xb0 | |
exit_to_user_mode_prepare+0x191/0x1a0 | |
syscall_exit_to_user_mode+0x19/0x30 | |
do_syscall_64+0x46/0x80 | |
entry_SYSCALL_64_after_hwframe+0x44/0xae | |
... | |
--------------------------------- | |
kfence-#254 unused | |
--------------------------------- |
含义
- 对于被分配出去且尚未释放的object,只显示分配栈。
- 对于当前处于free状态的object,既显示分配栈,也显示释放栈。处于zombie的object也属于free。
- 对于从来没有被分配出去过的object,显示unused
- 对于zombie的object,虽然是free的,但是已经不能再被分配了,因为对应的kmem_cache已经被销毁,所以cache会显示为
<destroyed>
实现
static int show_object(struct seq_file *seq, void *v) | |
{ | |
struct kfence_metadata *meta = &kfence_metadata[(long)v - 1]; | |
unsigned long flags; | |
raw_spin_lock_irqsave(&meta->lock, flags); | |
kfence_print_object(seq, meta); | |
raw_spin_unlock_irqrestore(&meta->lock, flags); | |
seq_puts(seq, "---------------------------------\n"); | |
return 0; | |
} |
- kfence_print_object
void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta) | |
{ | |
const int size = abs(meta->size); | |
const unsigned long start = meta->addr; | |
const struct kmem_cache *const cache = meta->cache; | |
lockdep_assert_held(&meta->lock); | |
if (meta->state == KFENCE_OBJECT_UNUSED) { // 尚未使用的meta | |
seq_con_printf(seq, "kfence-#%td unused\n", meta - kfence_metadata); | |
return; | |
} | |
seq_con_printf(seq, | |
"kfence-#%td [0x%p-0x%p" | |
", size=%d, cache=%s] allocated by task %d:\n", | |
meta - kfence_metadata, (void *)start, (void *)(start + size - 1), size, | |
(cache && cache->name) ? cache->name : "<destroyed>", meta->alloc_track.pid); | |
kfence_print_stack(seq, meta, true); // 输出meta对应的object被分配出去时的调用栈 | |
if (meta->state == KFENCE_OBJECT_FREED) { // 如果meta对应的object被释放了 | |
seq_con_printf(seq, "\nfreed by task %d:\n", meta->free_track.pid); | |
kfence_print_stack(seq, meta, false); // 输出meta对应的object被释放时的调用栈 | |
} | |
} |
测试框架
kfence提供了测试用例,在mm/kfence/kfence_test.c中。
static int __init kfence_test_init(void) | |
{ | |
/* 遍历内核中的tracepoint,在名为"console"的tracepoint上挂载一个hook函数 */ | |
for_each_kernel_tracepoint(register_tracepoints, NULL); | |
/* 执行测试用例 */ | |
return __kunit_test_suites_init(kfence_test_suites); | |
} |
- register_tracepoints
static void register_tracepoints(struct tracepoint *tp, void *ignore) | |
{ | |
check_trace_callback_type_console(probe_console); | |
if (!strcmp(tp->name, "console")) | |
WARN_ON(tracepoint_probe_register(tp, probe_console, NULL)); | |
} |
当kfence_report_error输出错误log时,"console"这个tracepoint会触发,然后会回调到probe_console,在probe_console中会过滤kfence_report_error中输出的错误log,并记录到observed,用于跟期望的错误类型比对,比对通过表示测试成功。
- probe_console
过滤kfence_report_error中输出的错误log,并记录到observed,用于跟期望的错误类型比对,比对通过表示测试成功。
/* Probe for console output: obtains observed lines of interest. */ | |
static void probe_console(void *ignore, const char *buf, size_t len) | |
{ | |
unsigned long flags; | |
int nlines; | |
spin_lock_irqsave(&observed.lock, flags); | |
nlines = observed.nlines; | |
if (strnstr(buf, "BUG: KFENCE: ", len) && strnstr(buf, "test_", len)) { | |
/* | |
* KFENCE report and related to the test. | |
* | |
* The provided @buf is not NUL-terminated; copy no more than | |
* @len bytes and let strscpy() add the missing NUL-terminator. | |
*/ | |
strscpy(observed.lines[0], buf, min(len + 1, sizeof(observed.lines[0]))); | |
nlines = 1; | |
} else if (nlines == 1 && (strnstr(buf, "at 0x", len) || strnstr(buf, "of 0x", len))) { | |
strscpy(observed.lines[nlines++], buf, min(len + 1, sizeof(observed.lines[0]))); | |
} | |
WRITE_ONCE(observed.nlines, nlines); /* Publish new nlines. */ | |
spin_unlock_irqrestore(&observed.lock, flags); | |
} |
- kfence_test_suites
记录了测试case的具体内容:
#define KFENCE_KUNIT_CASE(test_name) \ | |
{ .run_case = test_name, .name = #test_name }, \ | |
{ .run_case = test_name, .name = #test_name "-memcache" } | |
static struct kunit_case kfence_test_cases[] = { | |
KFENCE_KUNIT_CASE(test_out_of_bounds_read), | |
KFENCE_KUNIT_CASE(test_out_of_bounds_write), | |
KFENCE_KUNIT_CASE(test_use_after_free_read), | |
KFENCE_KUNIT_CASE(test_double_free), | |
KFENCE_KUNIT_CASE(test_invalid_addr_free), | |
KFENCE_KUNIT_CASE(test_corruption), | |
KFENCE_KUNIT_CASE(test_free_bulk), | |
KFENCE_KUNIT_CASE(test_init_on_free), | |
KUNIT_CASE(test_kmalloc_aligned_oob_read), | |
KUNIT_CASE(test_kmalloc_aligned_oob_write), | |
KUNIT_CASE(test_shrink_memcache), | |
KUNIT_CASE(test_memcache_ctor), | |
KUNIT_CASE(test_invalid_access), | |
KUNIT_CASE(test_gfpzero), | |
KUNIT_CASE(test_memcache_typesafe_by_rcu), | |
KUNIT_CASE(test_krealloc), | |
KUNIT_CASE(test_memcache_alloc_bulk), | |
{}, | |
}; | |
static struct kunit_suite kfence_test_suite = { | |
.name = "kfence", | |
.test_cases = kfence_test_cases, | |
.init = test_init, | |
.exit = test_exit, | |
}; | |
static struct kunit_suite *kfence_test_suites[] = { &kfence_test_suite, NULL }; |
以test_out_of_bounds_read为例:
static void test_out_of_bounds_read(struct kunit *test) | |
{ | |
size_t size = 32; | |
struct expect_report expect = { // 期望发生的结果 | |
.type = KFENCE_ERROR_OOB, // 期望发生的错误类型 | |
.fn = test_out_of_bounds_read, // 期望导致错误发生的函数 | |
.is_write = false, // 期望的读写方向,这里是读 | |
}; | |
char *buf; | |
setup_test_cache(test, size, 0, NULL); | |
/* | |
* If we don't have our own cache, adjust based on alignment, so that we | |
* actually access guard pages on either side. | |
*/ | |
if (!test_cache) | |
size = kmalloc_cache_alignment(size); | |
/* Test both sides. */ | |
// 从kfence中分配内存,构造访问左边保护页的OOB,返回的是object所在页的首地址 | |
buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT); | |
expect.addr = buf - 1; // 期望在哪个地址上发生OOB,地址减1就是左边保护页的结尾地址 | |
READ_ONCE(*expect.addr); // 触发OOB异常 | |
KUNIT_EXPECT_TRUE(test, report_matches(&expect)); // 调用report_matches比对实际发生的错误跟期望发生的错误是否一致 | |
test_free(buf); | |
// 从kfence中分配内存,构造访问右边保护页的OOB,返回的是object的首地址,此时object紧贴所在页的右边界(buf + size即右边保护页的首地址) | |
buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT); | |
expect.addr = buf + size; // 期望发生缺页的地址,地址加上size就是右边保护页的首地址 | |
READ_ONCE(*expect.addr); // 触发OOB异常 | |
KUNIT_EXPECT_TRUE(test, report_matches(&expect)); // 核对结果 | |
test_free(buf); | |
} |
- report_matches
static bool report_matches(const struct expect_report *r) | |
{ | |
bool ret = false; | |
unsigned long flags; | |
typeof(observed.lines) expect; | |
const char *end; | |
char *cur; | |
/* Doubled-checked locking. */ | |
if (!report_available()) | |
return false; | |
/* Generate expected report contents. */ | |
/* Title */ | |
cur = expect[0]; | |
end = &expect[0][sizeof(expect[0]) - 1]; | |
switch (r->type) { | |
case KFENCE_ERROR_OOB: | |
cur += scnprintf(cur, end - cur, "BUG: KFENCE: out-of-bounds %s", | |
get_access_type(r)); | |
break; | |
case KFENCE_ERROR_UAF: | |
cur += scnprintf(cur, end - cur, "BUG: KFENCE: use-after-free %s", | |
get_access_type(r)); | |
break; | |
case KFENCE_ERROR_CORRUPTION: | |
cur += scnprintf(cur, end - cur, "BUG: KFENCE: memory corruption"); | |
break; | |
case KFENCE_ERROR_INVALID: | |
cur += scnprintf(cur, end - cur, "BUG: KFENCE: invalid %s", | |
get_access_type(r)); | |
break; | |
case KFENCE_ERROR_INVALID_FREE: | |
cur += scnprintf(cur, end - cur, "BUG: KFENCE: invalid free"); | |
break; | |
} | |
scnprintf(cur, end - cur, " in %pS", r->fn); | |
/* The exact offset won't match, remove it; also strip module name. */ | |
cur = strchr(expect[0], '+'); | |
if (cur) | |
*cur = '\0'; | |
/* Access information */ | |
cur = expect[1]; | |
end = &expect[1][sizeof(expect[1]) - 1]; | |
switch (r->type) { | |
case KFENCE_ERROR_OOB: | |
cur += scnprintf(cur, end - cur, "Out-of-bounds %s at", get_access_type(r)); | |
break; | |
case KFENCE_ERROR_UAF: | |
cur += scnprintf(cur, end - cur, "Use-after-free %s at", get_access_type(r)); | |
break; | |
case KFENCE_ERROR_CORRUPTION: | |
cur += scnprintf(cur, end - cur, "Corrupted memory at"); | |
break; | |
case KFENCE_ERROR_INVALID: | |
cur += scnprintf(cur, end - cur, "Invalid %s at", get_access_type(r)); | |
break; | |
case KFENCE_ERROR_INVALID_FREE: | |
cur += scnprintf(cur, end - cur, "Invalid free of"); | |
break; | |
} | |
cur += scnprintf(cur, end - cur, " 0x%p", (void *)r->addr); | |
spin_lock_irqsave(&observed.lock, flags); | |
if (!report_available()) | |
goto out; /* A new report is being captured. */ | |
/* Finally match expected output to what we actually observed. */ | |
ret = strstr(observed.lines[0], expect[0]) && strstr(observed.lines[1], expect[1]); | |
out: | |
spin_unlock_irqrestore(&observed.lock, flags); | |
return ret; | |
} |
完。