bpftrace使用案例学习
参考手册
小技巧
- 读取内核全局变量的值(参考)
# bpftrace -qe 'BEGIN {printf("banner: %s\ntotalram: 0x%lx\n", str(kaddr("linux_banner")), *kaddr("_totalram_pages")); exit();}'
banner: Linux version 5.19.0+ (pengdl@ubuntu) (gcc (Ubuntu 9.4.0-1ubunt
totalram: 0xd3178
- 抓取地址被写时的调用栈 (参考)
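当排查内核中某个变量被谁在哪个位置修改时,可以使用这个方法。变量的地址可以通过/proc/kallsyms、crash工具、bpftrace或者内核日志获得。例如,用watchpoint探针监控对该地址的8字节写操作,并在触发时打印内核调用栈(把<变量地址>替换成实际的十六进制地址):
bpftrace -e 'watchpoint:<变量地址>:8:w { printf("%s\n", kstack); }'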
- 在某个内核模块的某个函数偏移处挂钩
bpftrace -e 'k:sched_walt:walt_cfs_replace_next_task_fair+0x10 {print(ustack);}'
上面的命令在内核模块sched_walt的函数walt_cfs_replace_next_task_fair的偏移量为0x10的位置加了一个kprobe探针,
从dynamic_events可以看到这个探针:
xxx:/sys/kernel/tracing # cat dynamic_events
p:kprobes/p_walt_cfs_replace_next_task_fair_10_1_bcc_14593 sched_walt:walt_cfs_replace_next_task_fair+16
- 自定义结构体
对于无法直接访问的内核结构体,可以用下面的方法自定义一个数据结构来获取自己关心的字段的值。
#!/usr/bin/env bpftrace
/*
struct worker_pool {
[0x0] spinlock_t lock;
[0x4] int cpu;
[0x8] int node;
[0xc] int id;
[0x10] unsigned int flags;
[0x18] unsigned long watchdog_ts;
[0x20] struct list_head worklist;
[0x30] int nr_workers;
[0x34] int nr_idle;
[0x38] struct list_head idle_list;
[0x48] struct timer_list idle_timer;
[0x90] struct timer_list mayday_timer;
[0xd8] struct hlist_head busy_hash[64];
[0x2d8] struct worker *manager;
[0x2e0] struct list_head workers;
[0x2f0] struct completion *detach_completion;
[0x2f8] struct ida worker_ida;
[0x310] struct workqueue_attrs *attrs;
[0x318] struct hlist_node hash_node;
[0x328] int refcnt;
[0x340] atomic_t nr_running;
[0x348] struct callback_head rcu;
}
*/
/*
 * Sparse overlay used in place of the kernel's worker_pool definition.
 * Each anonymous struct starts at offset 0 of the union and pads up to the
 * byte offset of the member(s) of interest, so only the fields this script
 * reads have to be declared.
 */
union worker_pool
{
    /* id lives at offset 0xc */
    struct {
        char pad_id[0xc];
        int id;
    };
    /* nr_workers at 0x30, immediately followed by nr_idle at 0x34 */
    struct {
        char pad_workers[0x30];
        int nr_workers;
        int nr_idle;
    };
    /* nr_running at 0x340, read here as a plain int */
    struct {
        char pad_running[0x340];
        int nr_running;
    };
};

/* Sparse view of a worker: only the pool pointer, placed at offset 0x40. */
struct worker {
    char pad_pool[0x40];
    union worker_pool *pool;
};
/*
 * On entry to create_worker, interpret arg0 as a worker_pool pointer and,
 * for the pool whose id is 4 only, print the worker counters together with
 * the kernel stack that led here.
 */
kprobe:create_worker
/((union worker_pool *)arg0)->id == 4/
{
    $pool = (union worker_pool *)arg0;
    printf("%s: nr_worker: %d, nr_idle: %d, nr_running: %d, %s\n",
           func, $pool->nr_workers, $pool->nr_idle, $pool->nr_running, kstack);
}
/*
 * On entry to destroy_worker, interpret arg0 as a worker pointer, follow its
 * pool pointer and, for the pool whose id is 4 only, print a timestamp, the
 * worker counters and the kernel stack.
 */
kprobe:destroy_worker
/((struct worker *)arg0)->pool->id == 4/
{
    $pool = ((struct worker *)arg0)->pool;
    time();
    printf(" %s: nr_worker: %d, nr_idle: %d, nr_running: %d, %s\n",
           func, $pool->nr_workers, $pool->nr_idle, $pool->nr_running, kstack);
}
内存泄漏
kmemleak.bt源码
#!/usr/bin/env bpftrace
/*
 * Allocation side. For every 128-byte allocation not made by bpftrace
 * itself, record the returned pointer and the allocating kernel stack in the
 * first free slot. The slot count comes from positional parameter 1 and
 * falls back to 4 when it is absent or 0. When all slots are occupied the
 * allocation is not recorded.
 */
t:kmem:kmalloc,
t:kmem:kmalloc_node,
t:kmem:kmem_cache_alloc,
t:kmem:kmem_cache_alloc_node
/comm != "bpftrace" && args->bytes_alloc == (uint64)128/
{
    $nslots = $1 != 0 ? $1 : 4;

    $slot = 0;
    while ($slot < $nslots) {
        /* A slot is free while it holds a null pointer. */
        if (@kmem_addr[$slot] == (uint64 *)0) {
            @alloc_stack[$slot] = kstack;
            @kmem_addr[$slot] = args->ptr;
            break;
        }
        $slot++;
    }
}
/*
 * Free side. If the pointer being freed is held in one of the slots, drop
 * that slot's pointer and stack so the slot becomes available again. The
 * slot count comes from positional parameter 1 and falls back to 4.
 */
t:kmem:kfree,
t:kmem:kmem_cache_free
{
    $nslots = $1 != 0 ? $1 : 4;

    $slot = 0;
    while ($slot < $nslots) {
        /* Skip empty slots first, so a null pointer never matches one. */
        if (@kmem_addr[$slot] != (uint64 *)0) {
            if (@kmem_addr[$slot] == args->ptr) {
                delete(@alloc_stack[$slot]);
                delete(@kmem_addr[$slot]);
                break;
            }
        }
        $slot++;
    }
}
/*
 * Every 5 seconds: print a timestamp, then each occupied slot's address
 * followed by the kernel stack recorded when it was allocated. Entries that
 * keep showing up across reports have not been freed yet.
 */
interval:s:5
{
    time();

    $nslots = $1 != 0 ? $1 : 4;
    $slot = 0;
    while ($slot < $nslots) {
        if (@kmem_addr[$slot] != 0) {
            printf("kmem_addr%d: 0x%lx", $slot, @kmem_addr[$slot]);
            printf("%s\n", @alloc_stack[$slot]);
        }
        $slot++;
    }
}
/*
 * Exit hook: empty both maps, first with clear() and then by deleting every
 * slot key explicitly.
 */
END {
    $nslots = $1 != 0 ? $1 : 4;

    clear(@alloc_stack);
    clear(@kmem_addr);

    $slot = 0;
    while ($slot < $nslots) {
        delete(@alloc_stack[$slot]);
        delete(@kmem_addr[$slot]);
        $slot++;
    }
}
用法:./kmemleak.bt [槽位数](槽位数缺省为4;脚本只跟踪大小为128字节的分配,每5秒打印一次尚未释放的地址及其分配时的调用栈)
监控软中断处理延迟
softirq_net_latency.bt
#!/usr/bin/bpftrace
/* SPDX-License-Identifier: GPL-2.0+
*
* Catching high network IRQ-to-softirq latency by measuring time
* between softirq "raise" until softirq function is called. Limited
* to network via softirq vector NET_RX_SOFTIRQ (3).
*
* This script was used for detecting latency issues described in blog:
* https://github.blog/2019-11-21-debugging-network-stalls-on-kubernetes/
*
* Output examples from this script can be seen in:
* https://bugzilla.redhat.com/show_bug.cgi?id=1795049#c8
*
* 26-Jan-2020 Jesper Dangaard Brouer Created this
* 27-Feb-2024 Jesper Dangaard Brouer Updated based on production usage
*/
BEGIN
{
    /* Cmdline arg#1: latency threshold in usec; 2000 when omitted or 0 */
    $usecs = $1 ? $1 : 2000;
    @threshold_usecs = $usecs;
    /* The probes compare against nanosecond deltas, so keep a ns copy */
    @threshold_ns = $usecs * 1000;

    /* Cmdline arg#2: non-zero enables measuring runtime of softirq func */
    @measure_runtime = $2;

    printf("Tracing softirq wait-time latency ... Hit Ctrl-C to end.\n");
    printf(" - Will report on latency above %d usecs (= %d ms)\n",
           $usecs, $usecs / 1000);
    if (@measure_runtime) {
        printf(" - Also record runtime of softirq func call\n");
    }
}
/*
 * Record when NET_RX_SOFTIRQ (vector 3) is raised on this CPU. Only the
 * first raise is stored, so the wait is measured up to the next
 * softirq_entry; later raises before that entry (possibly triggered by
 * other NICs) are ignored.
 */
tracepoint:irq:softirq_raise
/args->vec == 3 && @start[cpu] == 0/
{
    @start[cpu] = nsecs;
}
/*
 * NET_RX softirq handler starts running on this CPU: compute how long it
 * waited since the recorded raise, add it to the histogram, and report it
 * when it reaches the threshold.
 */
tracepoint:irq:softirq_entry
/args->vec == 3/
{
    $raised_at = @start[cpu];
    if ($raised_at > 0) {
        $wait_ns = nsecs - $raised_at;
        @softirq_wait_nanosec = hist($wait_ns);

        /* Report on "not-able-to-run" events over threshold.
         * (Recording kstack here was tried: no useful stack.)
         */
        if ($wait_ns >= @threshold_ns) {
            time("%H:%M:%S ");
            printf("High IRQ-to-softirq latency: %d usec (%d ms) on CPU:%d comm:%s\n",
                   $wait_ns / 1000, $wait_ns / 1000000, cpu, comm);
        }
    }
    /* Re-arm so the next raise on this CPU is recorded again */
    delete(@start[cpu]);

    /* Optionally stamp the handler start for the runtime measurement */
    if (@measure_runtime) {
        @runtime[cpu] = nsecs;
    }
}
/*
 * NET_RX softirq handler finished on this CPU: when runtime measurement is
 * enabled, compute how long the handler ran since softirq_entry, add it to
 * the histogram, and report it when it reaches the threshold.
 */
tracepoint:irq:softirq_exit
/args->vec == 3/
{
    if (@measure_runtime) {
        $entered_at = @runtime[cpu];
        if ($entered_at > 0) {
            $run_ns = nsecs - $entered_at;
            @runtime_nanosec = hist($run_ns);

            /* Report on "runtime" events over threshold */
            if ($run_ns >= @threshold_ns) {
                time("%H:%M:%S ");
                printf("Long softirq runtime: %d usec (%d ms) on CPU:%d comm:%s\n",
                       $run_ns / 1000, $run_ns / 1000000, cpu, comm);
            }
        }
    }
    delete(@runtime[cpu]);
}
END
{
    /* By default bpftrace prints all remaining maps at END; empty the
     * configuration maps so they are not part of that dump.
     */
    clear(@threshold_ns);
    clear(@threshold_usecs);
    clear(@measure_runtime);
}
语法
本文来自博客园,作者:dolinux,未经同意,禁止转载