linux bpf 学习记录
-
eBPF 介绍
BPF(Berkeley Packet Filter)使普通用户拥有了让内核执行用户代码并共享数据的能力。用户可以将 eBPF 指令字节码直接传输给内核,然后通过 socket 写事件来触发内核执行代码。并且用户空间和内核空间共享一块 map 内存,双方都对其拥有读写能力。
-
eBPF 虚拟指令系统
eBPF 指令属于 RISC,有 10 个通用虚拟寄存器(R0–R9)和 1 个只读的帧指针寄存器 R10,在 x86_64 上分别映射到如下物理寄存器:
R0 – rax R1 - rdi R2 - rsi R3 - rdx R4 - rcx R5 - r8 R6 - rbx R7 - r13 R8 - r14 R9 - r15 R10 – rbp(帧指针,frame pointer)
指令格式:
/*
 * Encoding of a single eBPF instruction: an 8-bit opcode, two 4-bit
 * register numbers, a 16-bit signed offset and a 32-bit signed immediate.
 */
struct bpf_insn {
    __u8  code;        /* opcode */
    __u8  dst_reg:4;   /* destination register number */
    __u8  src_reg:4;   /* source register number */
    __s16 off;         /* signed offset */
    __s32 imm;         /* signed immediate constant */
};
例如 x86 指令 mov edi, 0xffffffff 的实现:
/*
 * Build a 32-bit "move immediate" instruction (dst_reg = imm) as a
 * struct bpf_insn compound literal.
 *
 * DST: destination register number.
 * IMM: immediate value stored in the instruction's imm field.
 *
 * The opcode is BPF_ALU | BPF_MOV | BPF_K; src_reg and off are unused
 * and set to 0. Each backslash must be the last character on its line
 * so that it acts as a line continuation.
 */
#define BPF_MOV32_IMM(DST, IMM)                     \
    ((struct bpf_insn) {                            \
        .code    = BPF_ALU | BPF_MOV | BPF_K,       \
        .dst_reg = DST,                             \
        .src_reg = 0,                               \
        .off     = 0,                               \
        .imm     = IMM })
编写的话就是:BPF_MOV32_IMM(BPF_REG_1, 0xFFFFFFFF),对应字节码为:\xb4\x01\x00\x00\xff\xff\xff\xff(第二个字节的低 4 位是目的寄存器编号,BPF_REG_1 对应 0x01)。
-
eBPF 加载过程
- 用户调用 syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr)) 申请创建一个 map,在 attr 结构体中指定 map 的类型、大小、最大容量等属性。该系统调用进入内核的 sys_bpf,创建一个 map 数据结构,最终返回 map 的文件描述符,从而得到一块内核态和用户态共享的内存。
- 用户调用 syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)) 将用户编写的 bpf 代码加载进内核,attr 结构体中包含了指令数量、指令首地址、日志级别等属性。在加载前,内核会利用虚拟执行的方式来做安全性检验(包括对指令语法的检查、指令数量的检查、指令中的指针和立即数的范围及读写权限的检查),检查通过后程序被成功加载至内核。
- 用户通过调用 setsockopt(sockets[1], SOL_SOCKET, SO_ATTACH_BPF, &progfd, sizeof(progfd)) 将我们写的 BPF 程序绑定到指定的 socket 上,progfd 为上一步骤的返回值。
- 用户程序通过操作上一步骤的 socket 来触发 BPF 真正执行。
-
eBPF 代码执行过程
对 eBPF 指令的解释执行,最后都会进入 __bpf_prog_run 函数,这个函数实际上是自己用栈模拟了一个 eBPF 程序的栈和寄存器,所以 eBPF 程序的指令可以直接控制内核栈数据,这也为后续漏洞利用提供了方便。
-
eBPF 中的函数
-
BPF_MAP_CREATE
该函数用于创建一个新的 map 内存,返回一个新的文件描述符,并指向该内存。
/*
 * Create a new map by issuing the BPF_MAP_CREATE command.
 *
 * map_type:    kind of map to create.
 * key_size:    size of a key.
 * value_size:  size of a value.
 * max_entries: maximum number of elements the map may hold.
 *
 * Returns the result of bpf() unchanged (per bpf(2): a new file
 * descriptor referring to the map on success, -1 with errno set on error).
 */
int bpf_create_map(enum bpf_map_type map_type, unsigned int key_size,
                   unsigned int value_size, unsigned int max_entries)
{
    union bpf_attr attr = {
        .map_type    = map_type,
        .key_size    = key_size,
        .value_size  = value_size,
        .max_entries = max_entries
    };

    return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}
-
BPF_MAP_LOOKUP_ELEM
BPF_MAP_LOOKUP_ELEM 函数根据传入的 key 查找其对应的元素,并把找到的值写入 value 指向的缓冲区。
int bpf_lookup_elem(int fd, const void *key, void *value) { union bpf_attr attr = { .map_fd = fd, .key = ptr_to_u64(key), .value = ptr_to_u64(value), }; return bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); }
-
BPF_MAP_UPDATE_ELEM
BPF_MAP_UPDATE_ELEM 函数使用传入的 key 和 value 创建或者更新一个 map 中的元素:
int bpf_update_elem(int fd, const void *key, const void *value, uint64_t flags) { union bpf_attr attr = { .map_fd = fd, .key = ptr_to_u64(key), .value = ptr_to_u64(value), .flags = flags, }; return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); }
-
BPF_MAP_DELETE_ELEM
BPF_MAP_DELETE_ELEM 函数用于根据传入的 key 来删除一个元素:
int bpf_delete_elem(int fd, const void *key) { union bpf_attr attr = { .map_fd = fd, .key = ptr_to_u64(key), }; return bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr)); }
-
BPF_MAP_GET_NEXT_KEY
该函数根据传入的 key 值寻找对应的元素,然后通过 next_key 返回其下一个元素的 key:
int bpf_get_next_key(int fd, const void *key, void *next_key) { union bpf_attr attr = { .map_fd = fd, .key = ptr_to_u64(key), .next_key = ptr_to_u64(next_key), }; return bpf(BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr)); }
-
BPF_PROG_LOAD
该函数用于加载一个 eBPF 程序到内核,返回一个新的指向 eBPF 程序的文件描述符。
char bpf_log_buf[LOG_BUF_SIZE]; int bpf_prog_load(enum bpf_prog_type type, const struct bpf_insn *insns, int insn_cnt, const char *license) { union bpf_attr attr = { .prog_type = type, .insns = ptr_to_u64(insns), .insn_cnt = insn_cnt, .license = ptr_to_u64(license), .log_buf = ptr_to_u64(bpf_log_buf), .log_size = LOG_BUF_SIZE, .log_level = 1, }; return bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); }
-
-
eBPF 安全校验
在 BPF_PROG_LOAD 之后需要对 eBPF 进行安全检验,通过后才能继续执行,主要检测函数为 bpf_check。
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { char __user *log_ubuf = NULL; struct verifier_env *env; int ret = -EINVAL; // 指令条数判断 if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) return -E2BIG; /* 'struct verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ // 分配 verifier_env 空间 env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; env->prog = *prog; /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); if (attr->log_level || attr->log_buf || attr->log_size) { /* user requested verbose verifier output * and supplied buffer to store the verification trace */ log_level = attr->log_level; log_ubuf = (char __user *) (unsigned long) attr->log_buf; log_size = attr->log_size; log_len = 0; ret = -EINVAL; /* log_* values have to be sane */ if (log_size < 128 || log_size > UINT_MAX >> 8 || log_level == 0 || log_ubuf == NULL) goto free_env; ret = -ENOMEM; log_buf = vmalloc(log_size); if (!log_buf) goto free_env; } else { log_level = 0; } /* look for pseudo eBPF instructions that access map FDs and * replace them with actual map pointers */ // 将伪指令中操作 map_fd 的部分替换成 map 地址,注意这个地址是8字节的,因此在实现中用本指令的 imm 和下一条指令的 2 个 4 字节中存储了这个地址 /* store map pointer inside BPF_LD_IMM64 instruction insn[0].imm = (u32) (unsigned long) map; insn[1].imm = ((u64) (unsigned long) map) >> 32; */ ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; env->explored_states = kcalloc(env->prog->len, sizeof(struct verifier_state_list *), GFP_USER); ret = -ENOMEM; if (!env->explored_states) goto skip_full_check; // 控制流图检查死循环和不可能到达的跳转 ret = check_cfg(env); if (ret < 0) goto skip_full_check; env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); // 核心检查函数 ret = do_check(env); skip_full_check: while (pop_stack(env, NULL) >= 0); free_states(env); if (ret == 0) /* program is valid, convert *(u32*)(ctx + off) accesses */ ret = convert_ctx_accesses(env); if 
(log_level && log_len >= log_size - 1) { BUG_ON(log_len >= log_size); /* verifier log exceeded user supplied buffer */ ret = -ENOSPC; /* fall through to return what was recorded */ } /* copy verifier log back to user space including trailing zero */ if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) { ret = -EFAULT; goto free_log_buf; } if (ret == 0 && env->used_map_cnt) { /* if program passed verifier, update used_maps in bpf_prog_info */ env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, sizeof(env->used_maps[0]), GFP_KERNEL); if (!env->prog->aux->used_maps) { ret = -ENOMEM; goto free_log_buf; } memcpy(env->prog->aux->used_maps, env->used_maps, sizeof(env->used_maps[0]) * env->used_map_cnt); env->prog->aux->used_map_cnt = env->used_map_cnt; /* program is valid. Convert pseudo bpf_ld_imm64 into generic * bpf_ld_imm64 instructions */ convert_pseudo_ld_imm64(env); } free_log_buf: if (log_level) vfree(log_buf); free_env: if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_bpf_prog_info() will release them. */ release_maps(env); *prog = env->prog; kfree(env); mutex_unlock(&bpf_verifier_lock); return ret; }
其中主要通过 do_check 来根据不同的指令类型来做具体的合法性判断,使用的核心数据结构是:reg_state, bpf_reg_type 枚举变量用来表示寄存器的类型,初始化为 NOT_INIT。
/* types of values stored in eBPF registers
 *
 * Defined first so that struct reg_state and init_reg_state() below can
 * refer to the enum and its constants.
 */
enum bpf_reg_type {
    NOT_INIT = 0,             /* nothing was written into register */
    UNKNOWN_VALUE,            /* reg doesn't contain a valid pointer */
    PTR_TO_CTX,               /* reg points to bpf_context */
    CONST_PTR_TO_MAP,         /* reg points to struct bpf_map */
    PTR_TO_MAP_VALUE,         /* reg points to map element value */
    PTR_TO_MAP_VALUE_OR_NULL, /* points to map elem value or NULL */
    FRAME_PTR,                /* reg == frame_pointer */
    PTR_TO_STACK,             /* reg == frame_pointer + imm */
    CONST_IMM,                /* constant integer value */
};

/* State tracked for one register: its type plus a type-dependent payload. */
struct reg_state {
    enum bpf_reg_type type;
    union {
        /* valid when type == CONST_IMM | PTR_TO_STACK */
        int imm;

        /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
         *   PTR_TO_MAP_VALUE_OR_NULL
         */
        struct bpf_map *map_ptr;
    };
};

/*
 * Reset the first MAX_BPF_REG entries of regs to NOT_INIT with a cleared
 * payload, then mark BPF_REG_FP as FRAME_PTR and BPF_REG_1 as PTR_TO_CTX.
 */
static void init_reg_state(struct reg_state *regs)
{
    int i;

    for (i = 0; i < MAX_BPF_REG; i++) {
        regs[i].type = NOT_INIT;
        regs[i].imm = 0;
        regs[i].map_ptr = NULL;
    }

    /* frame pointer */
    regs[BPF_REG_FP].type = FRAME_PTR;

    /* 1st arg to a function */
    regs[BPF_REG_1].type = PTR_TO_CTX;
}
do_check:
static int do_check(struct verifier_env *env) { struct verifier_state *state = &env->cur_state; struct bpf_insn *insns = env->prog->insnsi; struct reg_state *regs = state->regs; int insn_cnt = env->prog->len; int insn_idx, prev_insn_idx = 0; int insn_processed = 0; bool do_print_state = false; init_reg_state(regs); insn_idx = 0; for (;;) { struct bpf_insn *insn; u8 class; int err; //指令条数检查 if (insn_idx >= insn_cnt) { verbose("invalid insn idx %d insn_cnt %d\n", insn_idx, insn_cnt); return -EFAULT; } insn = &insns[insn_idx]; class = BPF_CLASS(insn->code); //运行过的次数上限检查 if (++insn_processed > 32768) { verbose("BPF program is too large. Proccessed %d insn\n", insn_processed); return -E2BIG; } //检测该指令有无visit,主要通过env->explored_states的状态数组保存访问过的指令的状态 err = is_state_visited(env, insn_idx); if (err < 0) return err; if (err == 1) { /* found equivalent state, can prune the search */ if (log_level) { if (do_print_state) verbose("\nfrom %d to %d: safe\n", prev_insn_idx, insn_idx); else verbose("%d: safe\n", insn_idx); } goto process_bpf_exit; } if (log_level && do_print_state) { verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx); print_verifier_state(env); do_print_state = false; } if (log_level) { verbose("%d: ", insn_idx); print_bpf_insn(env, insn); } //计算指令ALU if (class == BPF_ALU || class == BPF_ALU64) { //检查具体指令的合法性,比如是否使用了保留的field,使用的寄存器编号是否超过了模拟寄存器的最大编号,寄存器是否可读/写,寄存器值是否是指针等,该函数后面详细解释 err = check_alu_op(env, insn); if (err) return err; //BPF_LDX指令 } else if (class == BPF_LDX) { enum bpf_reg_type src_reg_type; /* check for reserved fields is already done */ /* check src operand */ //检测源寄存器的编号是否超过最大编号,如果为操作数其是否初始化,是否是指针 err = check_reg_arg(regs, insn->src_reg, SRC_OP); if (err) return err; //检查目的寄存器 err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK); if (err) return err; // src_reg_type = regs[insn->src_reg].type; /* check that memory (src_reg + off) is readable, * the state of dst_reg will be updated by this func */ //检查源寄存器+off所指的地址是可读的 err = check_mem_access(env, 
insn->src_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, insn->dst_reg); if (err) return err; if (BPF_SIZE(insn->code) != BPF_W) { insn_idx++; continue; } if (insn->imm == 0) { /* saw a valid insn * dst_reg = *(u32 *)(src_reg + off) * use reserved 'imm' field to mark this insn */ insn->imm = src_reg_type;//判断出了一种指令类型,即地址取值指令 } //源类型非立即数 else if (src_reg_type != insn->imm && (src_reg_type == PTR_TO_CTX || insn->imm == PTR_TO_CTX)) { /* ABuser program is trying to use the same insn * dst_reg = *(u32*) (src_reg + off) * with different pointer types: * src_reg == ctx in one branch and * src_reg == stack|map in some other branch. * Reject it. */ verbose("same insn cannot be used with different pointers\n"); return -EINVAL; } //BPF_STX指令 } else if (class == BPF_STX) { enum bpf_reg_type dst_reg_type; if (BPF_MODE(insn->code) == BPF_XADD) { err = check_xadd(env, insn); if (err) return err; insn_idx++; continue; } /* check src1 operand */ err = check_reg_arg(regs, insn->src_reg, SRC_OP); if (err) return err; /* check src2 operand */ err = check_reg_arg(regs, insn->dst_reg, SRC_OP); if (err) return err; dst_reg_type = regs[insn->dst_reg].type; /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg); if (err) return err; if (insn->imm == 0) { insn->imm = dst_reg_type; } else if (dst_reg_type != insn->imm && (dst_reg_type == PTR_TO_CTX || insn->imm == PTR_TO_CTX)) { verbose("same insn cannot be used with different pointers\n"); return -EINVAL; } //BPF_ST指令 } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM || insn->src_reg != BPF_REG_0) { verbose("BPF_ST uses reserved fields\n"); return -EINVAL; } /* check src operand */ err = check_reg_arg(regs, insn->dst_reg, SRC_OP); if (err) return err; /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1); if (err) return err; 
//BPF_JMP指令 } else if (class == BPF_JMP) { u8 opcode = BPF_OP(insn->code); //直接跳转CALL if (opcode == BPF_CALL) { if (BPF_SRC(insn->code) != BPF_K || insn->off != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { verbose("BPF_CALL uses reserved fields\n"); return -EINVAL; } //在这个函数中会检查跳转的地址有无超过范围,函数的五个参数的参数类型(是否是key/value/map地址/stack_size等),更新返回值寄存器,更新reg_state等。 err = check_call(env, insn->imm); if (err) return err; } else if (opcode == BPF_JA) { if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { verbose("BPF_JA uses reserved fields\n"); return -EINVAL; } insn_idx += insn->off + 1; continue; } else if (opcode == BPF_EXIT) { if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { verbose("BPF_EXIT uses reserved fields\n"); return -EINVAL; } //r0保存返回值,bpf_exit为指令集合结束标志,在此之前检查有无写入值 /* eBPF calling convetion is such that R0 is used * to return the value from eBPF program. * Make sure that it's readable at this time * of bpf_exit, which means that program wrote * something into it earlier */ err = check_reg_arg(regs, BPF_REG_0, SRC_OP); if (err) return err; if (is_pointer_value(env, BPF_REG_0)) { verbose("R0 leaks addr as return value\n"); return -EACCES; } //遇到一个exit就结束一个分支,回退到分叉处执行另一个branch,类似于走迷宫遍历路径 process_bpf_exit: insn_idx = pop_stack(env, &prev_insn_idx); if (insn_idx < 0) { break; } else { do_print_state = true; continue; } } else { err = check_cond_jmp_op(env, insn, &insn_idx); if (err) return err; } } else if (class == BPF_LD) { u8 mode = BPF_MODE(insn->code); if (mode == BPF_ABS || mode == BPF_IND) { err = check_ld_abs(env, insn); if (err) return err; } else if (mode == BPF_IMM) { err = check_ld_imm(env, insn); if (err) return err; insn_idx++; } else { verbose("invalid BPF_LD mode\n"); return -EINVAL; } } else { verbose("unknown insn class %d\n", class); return -EINVAL; } insn_idx++; } return 0; }
-
eBPF 架构
-
Hello World
基于 libbpf-bootstrap 的结构实现 BPF:
相关依赖和初始化:
# Fetch libbpf-bootstrap, then pull in its submodules from inside the clone.
git clone https://github.com/libbpf/libbpf-bootstrap.git
cd libbpf-bootstrap
git submodule update --init --recursive
在 libbpf-bootstrap/examples/c/ 目录下写 helloworld.bpf.c(运行在内核态的 BPF 源码)和 helloworld.c(加载 BPF 到内核的用户态程序)。
helloworld.bpf.c
#include <linux/bpf.h> #include <bpf/bpf_helper.h> SEC("tracepoint/syscalls/sys_enter_execve") int bpf_prog(void *ctx) { char msg[] = "Hello, World"; bpf_printk("invoke bpf_prog: %s\n", msg); return 0; } char LICENSE[] SEC("license") = "Dual BSD/GPL";
在系统调用 execve 的埋点处注入 bpf_prog,使得每次调用 execve 的时候,都会回调 bpf_prog。
helloworld.c
#include <stdio.h> #include <unistd.h> #include <sys/resource.h> #include <bpf/libbpf.h> #include "helloworld.skel.h" static int libbpf_print_fn(enum libbpf_print_level, const char *format, va_list args) { return vfprintf(stderr, format, args); } int main(int argc, char **argv) { struct helloworld_bpf *skel; int err; libbpf_set_strict_mode(LIBBPF_STRICT_ALL); // Set uo libbpf errors and debug info callback libbpf_set_print(libbpf_print_fn); // Open BPF application skel = helloworld_bpf__open(); if (!skel) { fprintf(stderr, "Failed to open BPF skeleton\n"); return 1; } // Load & verify BPF programs err = helloworld_bpf__load(skel); if (err) { fprintf(stderr, "Failed to load and verify BPF skeleton\n"); goto cleanup; } // Attach tracepoint handler err = helloworld_bpf__attach(skel); if (err) { fprintf(stderr, "Failed to attach BPF skeleton\n"); goto cleanup; } printf("Successfully started! Please run `sudo cat /sys/kernel/debug/tracing/trace_pipe` " "to see output of the BPF programs.\n"); for (;;) { /* trigger our BPF program */ fprintf(stderr, "."); sleep(1); } cleanup: helloworld_bpf__destroy(skel); return -err; }
bpf 字节码被封装到 helloworld.skel.h 中,那么就是 open -> load -> attach -> destroy 依次执行即可。
编译运行:
在 Makefile 的 APPS 后加上 helloworld,cmake . 后直接 make helloworld 就行。
-
参考文献