bpftrace使用案例学习

参考手册

小技巧

  • 读取内核全局变量的值
# bpftrace -qe 'BEGIN {printf("banner: %s\ntotalram: 0x%lx\n", str(kaddr("linux_banner")), *kaddr("_totalram_pages")); exit();}'
banner: Linux version 5.19.0+ (pengdl@ubuntu) (gcc (Ubuntu 9.4.0-1ubunt
totalram: 0xd3178
  • 抓取地址被写时的调用栈

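当需要排查内核中某个变量是被谁、在什么位置修改的时候,可以使用这个方法。变量的地址可以通过/proc/kallsyms、crash工具、bpftrace或者内核日志获得。例如,可以使用bpftrace的watchpoint探针监控对该地址的写操作并打印内核调用栈:bpftrace -e 'watchpoint:0x<变量地址>:8:w { print(kstack); }'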

  • 在某个内核模块的某个函数偏移处挂钩
bpftrace -e 'k:sched_walt:walt_cfs_replace_next_task_fair+0x10 {print(ustack);}'

上面的命令在内核模块sched_walt的函数walt_cfs_replace_next_task_fair的偏移量为0x10的位置加了一个kprobe探针,
从tracefs的dynamic_events文件中可以看到这个探针的完整定义(偏移量0x10显示为十进制的16):

xxx:/sys/kernel/tracing # cat dynamic_events
p:kprobes/p_walt_cfs_replace_next_task_fair_10_1_bcc_14593 sched_walt:walt_cfs_replace_next_task_fair+16
  • 自定义结构体

对于无法直接访问的内核结构体,可以用下面的方法自定义一个数据结构来获取自己关心的字段的值。

#!/usr/bin/env bpftrace
/*
 * Hand-written partial mirrors of two kernel structures. Each mirror only
 * places padding in front of the members the probes read, so those members
 * can be accessed by offset without the real type definitions.
 *
 * Reference layout of the kernel's struct worker_pool from which the
 * offsets were taken ([offset] member):
 *
 * struct worker_pool {
 *     [0x0] spinlock_t lock;
 *     [0x4] int cpu;
 *     [0x8] int node;
 *     [0xc] int id;
 *    [0x10] unsigned int flags;
 *    [0x18] unsigned long watchdog_ts;
 *    [0x20] struct list_head worklist;
 *    [0x30] int nr_workers;
 *    [0x34] int nr_idle;
 *    [0x38] struct list_head idle_list;
 *    [0x48] struct timer_list idle_timer;
 *    [0x90] struct timer_list mayday_timer;
 *    [0xd8] struct hlist_head busy_hash[64];
 *   [0x2d8] struct worker *manager;
 *   [0x2e0] struct list_head workers;
 *   [0x2f0] struct completion *detach_completion;
 *   [0x2f8] struct ida worker_ida;
 *   [0x310] struct workqueue_attrs *attrs;
 *   [0x318] struct hlist_node hash_node;
 *   [0x328] int refcnt;
 *   [0x340] atomic_t nr_running;
 *   [0x348] struct callback_head rcu;
 * }
 *
 * NOTE(review): the offsets are hard-coded; they must be re-checked against
 * the kernel actually being traced.
 */

/* Overlapping views of one worker_pool; each view pads up to the wanted member. */
union worker_pool
{
    /* id at offset 0xc */
    struct {
        char pad_to_id[0xc];
        int id;
    };
    /* nr_workers at offset 0x30, nr_idle directly after it (0x34) */
    struct {
        char pad_to_nr_workers[0x30];
        int nr_workers;
        int nr_idle;
    };
    /* nr_running at offset 0x340 (atomic_t in the layout above, read here as int) */
    struct {
        char pad_to_nr_running[0x340];
        int nr_running;
    };
};

/*
 * Partial mirror of struct worker: only the back-pointer to its pool.
 * NOTE(review): assumes the pool pointer sits at offset 0x40 - confirm for
 * the target kernel.
 */
struct worker {
    char pad_to_pool[0x40];
    union worker_pool *pool;
};
/*
 * Runs on entry to create_worker(). arg0 is interpreted as a pointer to the
 * worker pool; for the pool whose id is 4, print the probed function name,
 * the pool's worker counters and the kernel stack.
 */
kprobe:create_worker
{
    $pool = (union worker_pool *)arg0;
    $pool_id = $pool->id;

    /* Report only for the pool with id 4 */
    if ($pool_id == 4) {
        printf("%s: nr_worker: %d, nr_idle: %d, nr_running: %d, %s\n",
               func,
               $pool->nr_workers,
               $pool->nr_idle,
               $pool->nr_running,
               kstack);
    }
}

/*
 * Runs on entry to destroy_worker(). arg0 is interpreted as a pointer to the
 * worker; its pool is reached through the worker's pool pointer. For the pool
 * whose id is 4, print a timestamp followed by the probed function name, the
 * pool's worker counters and the kernel stack.
 */
kprobe:destroy_worker
{
    $worker = (struct worker *)arg0;
    $pool = $worker->pool;

    /* Report only for the pool with id 4 */
    if ($pool->id == 4) {
        time();
        printf(" %s: nr_worker: %d, nr_idle: %d, nr_running: %d, %s\n",
               func,
               $pool->nr_workers,
               $pool->nr_idle,
               $pool->nr_running,
               kstack);
    }
}

内存泄漏

kmemleak.bt源码
#!/usr/bin/env bpftrace

/*
 * Allocation side: for events not issued by bpftrace itself, record the
 * returned pointer and the kernel stack of an allocation whose bytes_alloc
 * is exactly 128 in the first free tracking slot. If every slot is already
 * occupied the allocation is not recorded.
 */
tracepoint:kmem:kmalloc,
tracepoint:kmem:kmalloc_node,
tracepoint:kmem:kmem_cache_alloc,
tracepoint:kmem:kmem_cache_alloc_node
/comm != "bpftrace"/
{
    /* Number of tracking slots: first positional argument, 4 when absent or 0 */
    $nslots = ($1 != 0) ? $1 : 4;

    /* Only allocations of exactly 128 bytes (bytes_alloc) are tracked */
    if (args->bytes_alloc == (uint64)128) {
        $slot = 0;
        while ($slot < $nslots) {
            /* A slot with no recorded address is free: claim it and stop */
            if (@kmem_addr[$slot] == (uint64 *)0) {
                @alloc_stack[$slot] = kstack;
                @kmem_addr[$slot] = args->ptr;
                break;
            }
            $slot++;
        }
    }
}

/*
 * Free side: when the freed pointer matches a recorded allocation, release
 * that tracking slot (address and stack) so it can be reused.
 */
tracepoint:kmem:kfree,
tracepoint:kmem:kmem_cache_free
{
    /* Number of tracking slots: first positional argument, 4 when absent or 0 */
    $nslots = ($1 != 0) ? $1 : 4;

    $slot = 0;
    while ($slot < $nslots) {
        /* Skip empty slots so that a free of address 0 never matches one */
        if (@kmem_addr[$slot] != (uint64 *)0) {
            if (@kmem_addr[$slot] == args->ptr) {
                delete(@alloc_stack[$slot]);
                delete(@kmem_addr[$slot]);
                break;
            }
        }
        $slot++;
    }
}

/*
 * Every 5 seconds: print the current time, then for each occupied tracking
 * slot print the recorded address followed by the allocation stack. Entries
 * that stay listed across reports have not been freed yet.
 */
interval:s:5
{
    time();

    /* Number of tracking slots: first positional argument, 4 when absent or 0 */
    $nslots = ($1 != 0) ? $1 : 4;

    $slot = 0;
    while ($slot < $nslots) {
        if (@kmem_addr[$slot] != 0) {
            printf("kmem_addr%d: 0x%lx", $slot, @kmem_addr[$slot]);
            printf("%s\n", @alloc_stack[$slot]);
        }
        $slot++;
    }
}

/*
 * On exit: empty both tracking maps with clear(), then additionally delete
 * every slot key one by one.
 */
END
{
    clear(@alloc_stack);
    clear(@kmem_addr);

    /* Number of tracking slots: first positional argument, 4 when absent or 0 */
    $nslots = ($1 != 0) ? $1 : 4;

    $slot = 0;
    while ($slot < $nslots) {
        delete(@alloc_stack[$slot]);
        delete(@kmem_addr[$slot]);
        $slot++;
    }
}

用法:./kmemleak.bt [槽位数](槽位数默认为4;脚本只跟踪实际分配大小bytes_alloc为128字节的内存分配)

监控软中断处理延迟

来源:ksoftirqd延迟排查说明

softirq_net_latency.bt
#!/usr/bin/bpftrace
/* SPDX-License-Identifier: GPL-2.0+
 *
 * Catching high network IRQ-to-softirq latency by measuring time
 * between softirq "raise" until softirq function is called.  Limited
 * to network via softirq vector NET_RX_SOFTIRQ (3).
 *
 * This script was used for detecting latency issues described in blog:
 *  https://github.blog/2019-11-21-debugging-network-stalls-on-kubernetes/
 *
 * Output examples from this script can be seen in:
 *  https://bugzilla.redhat.com/show_bug.cgi?id=1795049#c8
 *
 * 26-Jan-2020	Jesper Dangaard Brouer	Created this
 * 27-Feb-2024	Jesper Dangaard Brouer	Updated based on production usage
 */
/*
 * Start-up: read the command-line arguments into global maps, derive the
 * threshold in nanoseconds and print a banner describing the configuration.
 */
BEGIN
{
	/* Cmdline arg#2: non-zero enables measuring the softirq func runtime */
	@measure_runtime = $2;

	/* Cmdline arg#1: latency threshold in usec; 2000 when absent or 0 */
	@threshold_usecs = 2000;
	if ($1 != 0) {
		@threshold_usecs = $1;
	}
	/* The probes compare against nanosecond timestamps */
	@threshold_ns = @threshold_usecs * 1000;

	printf("Tracing softirq wait-time latency ... Hit Ctrl-C to end.\n");
	printf(" - Will report on latency above %d usecs (= %d ms)\n",
	       @threshold_usecs, @threshold_usecs / 1000);
	if (@measure_runtime) {
		printf(" - Also record runtime of softirq func call\n");
	}
}

/*
 * Softirq raised for vector 3 (NET_RX_SOFTIRQ): remember when the first
 * raise on this CPU happened.
 */
tracepoint:irq:softirq_raise
/args->vec == 3/
{
	/* Keep only the earliest raise timestamp: the wait is measured from
	 * the first raise until softirq_entry, and later raises (possibly
	 * triggered by other NICs) must not reset it.
	 */
	if (@start[cpu] == 0) {
		@start[cpu] = nsecs;
	}
}

/*
 * Softirq function for vector 3 (NET_RX_SOFTIRQ) starts running: account the
 * time since the first raise on this CPU, report it when over the threshold,
 * and optionally start the runtime measurement.
 */
tracepoint:irq:softirq_entry
/args->vec == 3/
{
	$raised_at = @start[cpu];
	if ($raised_at > 0) {
		$wait_ns = nsecs - $raised_at;
		@softirq_wait_nanosec = hist($wait_ns);

		/* Report on "not-able-to-run" events over threshold.
		 * (Recording kstack here was tried and gave no useful stack:
		 *  @stack[cpu, $lat] = kstack;)
		 */
		if ($wait_ns >= @threshold_ns) {
			time("%H:%M:%S ");
			printf("High IRQ-to-softirq latency: %d usec (%d ms) on CPU:%d comm:%s\n",
			       $wait_ns / 1000, $wait_ns / 1000000, cpu, comm);
		}
	}
	/* Re-arm: the next raise on this CPU starts a new measurement */
	delete(@start[cpu]);

	/* Timestamp the start of the softirq function when runtime is wanted */
	if (@measure_runtime) {
		@runtime[cpu] = nsecs;
	}
}

/*
 * Softirq function for vector 3 (NET_RX_SOFTIRQ) finished: when runtime
 * measurement is enabled, account how long it ran on this CPU and report it
 * when over the threshold.
 */
tracepoint:irq:softirq_exit
/args->vec == 3/
{
	if (@measure_runtime) {
		$entered_at = @runtime[cpu];
		if ($entered_at > 0) {
			$run_ns = nsecs - $entered_at;
			@runtime_nanosec = hist($run_ns);

			/* Report on "runtime" events over threshold */
			if ($run_ns >= @threshold_ns) {
				time("%H:%M:%S ");
				printf("Long softirq runtime: %d usec (%d ms) on CPU:%d comm:%s\n",
				       $run_ns / 1000, $run_ns / 1000000, cpu, comm);
			}
		}
	}
	/* Drop the entry timestamp for this CPU */
	delete(@runtime[cpu]);
}

/*
 * Exit: by default bpftrace prints all remaining maps at END. Empty the maps
 * that hold configuration or per-CPU bookkeeping so that only the result
 * histograms (@softirq_wait_nanosec, @runtime_nanosec) are printed.
 */
END
{
	/* Configuration values set in BEGIN */
	clear(@measure_runtime);
	clear(@threshold_usecs);
	clear(@threshold_ns);

	/* In-flight per-CPU timestamps: a raise/entry without its matching
	 * entry/exit at the moment of exit would otherwise be dumped as raw
	 * nanosecond values.
	 */
	clear(@start);
	clear(@runtime);
}

语法

posted @ 2022-09-01 21:29  dolinux  阅读(631)  评论(0)  编辑  收藏  举报