linux bpf 学习记录

  • eBPF 介绍

BPF(Berkeley Packet Filter)使普通用户拥有了让内核执行用户代码并共享数据的能力。用户可以将 eBPF 指令字节码直接传输给内核,然后通过 socket 写事件来触发内核执行代码。并且用户空间和内核空间共享一个 map 内存,用户空间和内核空间都对其拥有读写能力。

  • eBPF 虚拟指令系统

    eBPF 指令属于 RISC 风格,有 11 个虚拟寄存器(R0–R10,其中 R10 为只读的帧指针),在 x86_64 上分别对应如下物理寄存器。

    R0 – rax
    R1 - rdi
    R2 - rsi
    R3 - rdx
    R4 - rcx
    R5 - r8
    R6 - rbx
    R7 - r13
    R8 - r14
    R9 - r15
    R10 – rbp(帧指针,frame pointer)
    

    指令格式:

    /*
     * A single eBPF instruction: an 8-bit opcode, two 4-bit register
     * numbers packed into bitfields, a signed 16-bit offset and a signed
     * 32-bit immediate.
     */
    struct bpf_insn {
        __u8    code;       /* opcode */
        __u8    dst_reg:4;  /* dest register */
        __u8    src_reg:4;  /* source register */
        __s16   off;        /* signed offset */
        __s32   imm;        /* signed immediate constant */
    };
    

    例如 x86 指令:mov edi 0xffffffff 的实现:

    /*
     * Expands to a struct bpf_insn compound literal whose opcode is
     * BPF_ALU | BPF_MOV | BPF_K, with DST as destination register and IMM
     * as the immediate; src_reg and off are zero.
     */
    #define BPF_MOV32_IMM(DST, IMM) \
        ((struct bpf_insn) { \
            .code = BPF_ALU | BPF_MOV | BPF_K, \
            .dst_reg = DST, \
            .src_reg = 0, \
            .off = 0, \
            .imm = IMM })
    

    编写的话就是:BPF_MOV32_IMM(BPF_REG_1, 0xFFFFFFFF),对应字节码为:\xb4\x01\x00\x00\xff\xff\xff\xff(0xb4 = BPF_ALU | BPF_MOV | BPF_K,第二个字节的低 4 位是 dst_reg,高 4 位是 src_reg)。

  • eBPF 加载过程

    1. 用户调用 syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr)) 申请创建一个 map,在 attr 结构体中指定 map 的类型、大小、最大容量等属性。然后调用 sys_bpf 进而执行系统调用 syscall(__NR_bpf, BPF_MAP_CREATE, attr, size); 创建一个 map 数据结构,最终返回 map 的文件描述符。从而得到一个内核态和用户态的共享内存。
    2. 用户调用 syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)) 将用户编写的 bpf 代码加载进内核,attr 结构体中包含了指令数量、指令首地址、日志级别等属性,在加载前会利用虚拟执行的方式来做安全性检验(包括对指令语法的检查、指令数量的检查、指令中的指针和立即数的范围及读写权限的检查),检查通过后程序被成功加载至内核。
    3. 用户通过调用 setsockopt(sockets[1], SOL_SOCKET, SO_ATTACH_BPF, &progfd, sizeof(progfd)) 将我们写的 BPF 程序绑定到指定的 socket 上,progfd 为上一步骤的返回值。
    4. 用户程序通过操作上一步骤的 socket 来触发 BPF 真正执行。
  • eBPF 代码执行过程

    对 eBPF 指令的解释执行,最后都会进入 __bpf_prog_run 函数,这个函数实际上是自己用栈模拟了一个 eBPF 程序的栈和寄存器,所以 eBPF 程序的指令可以直接控制内核栈数据,这也为后续漏洞利用提供了方便。

  • eBPF 中的函数

    • BPF_MAP_CREATE

      该函数用于创建一个新的 map 内存,返回一个新的文件描述符,并指向该内存。

      /*
       * Issue the BPF_MAP_CREATE command.
       *
       * map_type, key_size, value_size and max_entries are copied into the
       * same-named fields of a union bpf_attr, which is passed to bpf()
       * together with its size.
       *
       * Returns the result of bpf() unchanged; the accompanying text
       * describes it as a new file descriptor referring to the map.
       */
      int bpf_create_map(enum bpf_map_type map_type,
                         unsigned int key_size,
                         unsigned int value_size,
                         unsigned int max_entries)
      {
          union bpf_attr attr = {
              .map_type    = map_type,
              .key_size    = key_size,
              .value_size  = value_size,
              .max_entries = max_entries
          };

          return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
      }
      
    • BPF_MAP_LOOKUP_ELEM

      BPF_MAP_LOOKUP_ELEM 函数根据传入的 key 执行寻找其对应的元素。

      int bpf_lookup_elem(int fd, const void *key, void *value)
      {
          union bpf_attr attr = {
              .map_fd = fd,
              .key    = ptr_to_u64(key),
              .value  = ptr_to_u64(value),
          };
      
          return bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
      }
      
    • BPF_MAP_UPDATE_ELEM

      BPF_MAP_UPDATE_ELEM 函数使用传入的 key 和 value 创建或者更新一个 map 中的元素:

      int bpf_update_elem(int fd, const void *key, const void *value,
                          uint64_t flags)
      {
          union bpf_attr attr = {
              .map_fd = fd,
              .key    = ptr_to_u64(key),
              .value  = ptr_to_u64(value),
              .flags  = flags,
          };
      
          return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
      }
      
    • BPF_MAP_DELETE_ELEM

      BPF_MAP_DELETE_ELEM 函数用于根据传入的 key 来删除一个元素:

      int bpf_delete_elem(int fd, const void *key)
      {
          union bpf_attr attr = {
              .map_fd = fd,
              .key    = ptr_to_u64(key),
          };
      
          return bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
      }
      
    • BPF_MAP_GET_NEXT_KEY

      该函数根据传入的 key 值寻找对应的元素,然后把其下一个元素的 key 写入 next_key:

      int bpf_get_next_key(int fd, const void *key, void *next_key)
      {
          union bpf_attr attr = {
              .map_fd   = fd,
              .key      = ptr_to_u64(key),
              .next_key = ptr_to_u64(next_key),
          };
      
          return bpf(BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
      }
      
    • BPF_PROG_LOAD

      该函数用于加载一个 eBPF 程序到内核,返回一个新的指向 eBPF 程序的文件描述符。

      char bpf_log_buf[LOG_BUF_SIZE];
      
      int bpf_prog_load(enum bpf_prog_type type,
      const struct bpf_insn *insns, int insn_cnt,
      const char *license)
      {
          union bpf_attr attr = {
              .prog_type = type,
              .insns     = ptr_to_u64(insns),
              .insn_cnt  = insn_cnt,
              .license   = ptr_to_u64(license),
              .log_buf   = ptr_to_u64(bpf_log_buf),
              .log_size  = LOG_BUF_SIZE,
              .log_level = 1,
          };
      
          return bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
      }
      
  • eBPF 安全校验

    在 BPF_PROG_LOAD 之后需要对 eBPF 进行安全检验,通过后才能继续执行,主要检测函数为 bpf_check。

    /*
     * Verify an eBPF program before it is accepted.
     *
     * Steps, in order: range-check the instruction count, allocate a
     * verifier_env, take bpf_verifier_lock, optionally set up the log
     * buffer requested through attr, replace map fds with map pointers,
     * run check_cfg() and do_check(), convert ctx accesses on success,
     * copy the log back to user space, and record the used maps in the
     * program's aux data.
     *
     * *prog is written back from env->prog before returning.
     * Returns 0 on success or a negative errno value.
     */
    int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
    {
        char __user *log_ubuf = NULL;
        struct verifier_env *env;
        int ret = -EINVAL;
        // reject programs whose instruction count is out of range
        if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS)
            return -E2BIG;

        /* 'struct verifier_env' can be global, but since it's not small,
         * allocate/free it every time bpf_check() is called
         */
        // allocate the verifier_env
        env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
        if (!env)
            return -ENOMEM;

        env->prog = *prog;

        /* grab the mutex to protect few globals used by verifier */
        mutex_lock(&bpf_verifier_lock);

        if (attr->log_level || attr->log_buf || attr->log_size) {
            /* user requested verbose verifier output
             * and supplied buffer to store the verification trace
             */
            log_level = attr->log_level;
            log_ubuf = (char __user *) (unsigned long) attr->log_buf;
            log_size = attr->log_size;
            log_len = 0;

            ret = -EINVAL;
            /* log_* values have to be sane */
            if (log_size < 128 || log_size > UINT_MAX >> 8 ||
                log_level == 0 || log_ubuf == NULL)
                goto free_env;

            ret = -ENOMEM;
            log_buf = vmalloc(log_size);
            if (!log_buf)
                goto free_env;
        } else {
            log_level = 0;
        }
        /* look for pseudo eBPF instructions that access map FDs and
         * replace them with actual map pointers
         */
        // The map_fd operand of each pseudo instruction is replaced by the
        // map's address. The address is 8 bytes wide, so it is split across
        // two 4-byte imm fields: this instruction's and the next one's.
        /* store map pointer inside BPF_LD_IMM64 instruction
                insn[0].imm = (u32) (unsigned long) map;
                insn[1].imm = ((u64) (unsigned long) map) >> 32;
        */
        ret = replace_map_fd_with_map_ptr(env);
        if (ret < 0)
            goto skip_full_check;

        env->explored_states = kcalloc(env->prog->len,
                           sizeof(struct verifier_state_list *),
                           GFP_USER);
        ret = -ENOMEM;
        if (!env->explored_states)
            goto skip_full_check;
        // control-flow-graph check for infinite loops and unreachable jumps
        ret = check_cfg(env);
        if (ret < 0)
            goto skip_full_check;
        env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
        // the core checking routine
        ret = do_check(env);

    skip_full_check:
        /* drain any branch states still queued, then free explored states */
        while (pop_stack(env, NULL) >= 0);
        free_states(env);

        if (ret == 0)
            /* program is valid, convert *(u32*)(ctx + off) accesses */
            ret = convert_ctx_accesses(env);

        if (log_level && log_len >= log_size - 1) {
            BUG_ON(log_len >= log_size);
            /* verifier log exceeded user supplied buffer */
            ret = -ENOSPC;
            /* fall through to return what was recorded */
        }

        /* copy verifier log back to user space including trailing zero */
        if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
            ret = -EFAULT;
            goto free_log_buf;
        }

        if (ret == 0 && env->used_map_cnt) {
            /* if program passed verifier, update used_maps in bpf_prog_info */
            env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
                                  sizeof(env->used_maps[0]),
                                  GFP_KERNEL);

            if (!env->prog->aux->used_maps) {
                ret = -ENOMEM;
                goto free_log_buf;
            }

            memcpy(env->prog->aux->used_maps, env->used_maps,
                   sizeof(env->used_maps[0]) * env->used_map_cnt);
            env->prog->aux->used_map_cnt = env->used_map_cnt;

            /* program is valid. Convert pseudo bpf_ld_imm64 into generic
             * bpf_ld_imm64 instructions
             */
            convert_pseudo_ld_imm64(env);
        }

    free_log_buf:
        if (log_level)
            vfree(log_buf);
    free_env:
        if (!env->prog->aux->used_maps)
            /* if we didn't copy map pointers into bpf_prog_info, release
             * them now. Otherwise free_bpf_prog_info() will release them.
             */
            release_maps(env);
        *prog = env->prog;
        kfree(env);
        mutex_unlock(&bpf_verifier_lock);
        return ret;
    }
    

    其中主要通过 do_check 来根据不同的指令类型来做具体的合法性判断,使用的核心数据结构是:reg_state, bpf_reg_type 枚举变量用来表示寄存器的类型,初始化为 NOT_INIT。

    /*
     * Verifier's view of one register: a type tag plus a payload whose
     * interpretation depends on that tag. imm and map_ptr share storage.
     */
    struct reg_state {
        enum bpf_reg_type type;
        union {
            /* valid when type == CONST_IMM | PTR_TO_STACK */
            int imm;

            /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
             *   PTR_TO_MAP_VALUE_OR_NULL
             */
            struct bpf_map *map_ptr;
        };
    };
    static void init_reg_state(struct reg_state *regs)
    {
        int i;
    
        for (i = 0; i < MAX_BPF_REG; i++) {
            regs[i].type = NOT_INIT;
            regs[i].imm = 0;
            regs[i].map_ptr = NULL;
        }
    
        /* frame pointer */
        regs[BPF_REG_FP].type = FRAME_PTR;
    
        /* 1st arg to a function */
        regs[BPF_REG_1].type = PTR_TO_CTX;
    }
    /* types of values stored in eBPF registers
     *
     * NOT_INIT is explicitly 0; the remaining enumerators take consecutive
     * values in declaration order.
     */
    enum bpf_reg_type {
        NOT_INIT = 0,        /* nothing was written into register */
        UNKNOWN_VALUE,       /* reg doesn't contain a valid pointer */
        PTR_TO_CTX,      /* reg points to bpf_context */
        CONST_PTR_TO_MAP,    /* reg points to struct bpf_map */
        PTR_TO_MAP_VALUE,    /* reg points to map element value */
        PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
        FRAME_PTR,       /* reg == frame_pointer */
        PTR_TO_STACK,        /* reg == frame_pointer + imm */
        CONST_IMM,       /* constant integer value */
    };
    

    do_check:

    /*
     * Walk the program instruction by instruction, starting at index 0,
     * and apply the per-class checks (ALU, LDX, STX, ST, JMP, LD).
     *
     * A path ends at BPF_EXIT or when is_state_visited() reports an
     * equivalent state; the next path to explore is then taken from
     * pop_stack(). The walk finishes when pop_stack() returns a negative
     * index.
     *
     * Returns 0 when every explored path passed, otherwise the first
     * negative errno produced by a check.
     */
    static int do_check(struct verifier_env *env)
    {
        struct verifier_state *state = &env->cur_state;
        struct bpf_insn *insns = env->prog->insnsi;
        struct reg_state *regs = state->regs;
        int insn_cnt = env->prog->len;
        int insn_idx, prev_insn_idx = 0;
        int insn_processed = 0;
        bool do_print_state = false;

        init_reg_state(regs);
        insn_idx = 0;
        for (;;) {
            struct bpf_insn *insn;
            u8 class;
            int err;
            // instruction index must stay inside the program
            if (insn_idx >= insn_cnt) {
                verbose("invalid insn idx %d insn_cnt %d\n",
                    insn_idx, insn_cnt);
                return -EFAULT;
            }

            insn = &insns[insn_idx];
            class = BPF_CLASS(insn->code);
            // upper bound on the number of instructions processed
            if (++insn_processed > 32768) {
                verbose("BPF program is too large. Proccessed %d insn\n",
                    insn_processed);
                return -E2BIG;
            }
            // check whether this instruction was already visited; the states
            // of visited instructions are kept in env->explored_states
            err = is_state_visited(env, insn_idx);
            if (err < 0)
                return err;
            if (err == 1) {
                /* found equivalent state, can prune the search */
                if (log_level) {
                    if (do_print_state)
                        verbose("\nfrom %d to %d: safe\n",
                            prev_insn_idx, insn_idx);
                    else
                        verbose("%d: safe\n", insn_idx);
                }
                goto process_bpf_exit;
            }

            if (log_level && do_print_state) {
                verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
                print_verifier_state(env);
                do_print_state = false;
            }

            if (log_level) {
                verbose("%d: ", insn_idx);
                print_bpf_insn(env, insn);
            }
            // arithmetic (ALU) instructions
            if (class == BPF_ALU || class == BPF_ALU64) {
                // validates the individual instruction, e.g. reserved fields
                // in use, register numbers beyond the emulated register set,
                // register readability/writability, pointer-valued registers;
                // this function is explained in detail later
                err = check_alu_op(env, insn);
                if (err)
                    return err;
            // BPF_LDX instruction
            } else if (class == BPF_LDX) {
                enum bpf_reg_type src_reg_type;

                /* check for reserved fields is already done */

                /* check src operand */
                // checks that the source register number is in range and, as
                // an operand, whether it is initialised / holds a pointer
                err = check_reg_arg(regs, insn->src_reg, SRC_OP);
                if (err)
                    return err;
                // check the destination register
                err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
                if (err)
                    return err;
                // remember the source type before check_mem_access() runs
                src_reg_type = regs[insn->src_reg].type;

                /* check that memory (src_reg + off) is readable,
                 * the state of dst_reg will be updated by this func
                 */
                // check that the address src_reg + off is readable
                err = check_mem_access(env, insn->src_reg, insn->off,
                               BPF_SIZE(insn->code), BPF_READ,
                               insn->dst_reg);
                if (err)
                    return err;

                /* only BPF_W sized loads get the imm marking below */
                if (BPF_SIZE(insn->code) != BPF_W) {
                    insn_idx++;
                    continue;
                }

                if (insn->imm == 0) {
                    /* saw a valid insn
                     * dst_reg = *(u32 *)(src_reg + off)
                     * use reserved 'imm' field to mark this insn
                     */
                    insn->imm = src_reg_type;// a load-from-address insn was recognised; record its pointer type

                }
                // insn already marked: reject when the recorded type differs
                // and either of the two is PTR_TO_CTX
                else if (src_reg_type != insn->imm &&
                       (src_reg_type == PTR_TO_CTX ||
                        insn->imm == PTR_TO_CTX)) {
                    /* ABuser program is trying to use the same insn
                     * dst_reg = *(u32*) (src_reg + off)
                     * with different pointer types:
                     * src_reg == ctx in one branch and
                     * src_reg == stack|map in some other branch.
                     * Reject it.
                     */
                    verbose("same insn cannot be used with different pointers\n");
                    return -EINVAL;
                }
            // BPF_STX instruction
            } else if (class == BPF_STX) {
                enum bpf_reg_type dst_reg_type;

                if (BPF_MODE(insn->code) == BPF_XADD) {
                    err = check_xadd(env, insn);
                    if (err)
                        return err;
                    insn_idx++;
                    continue;
                }

                /* check src1 operand */
                err = check_reg_arg(regs, insn->src_reg, SRC_OP);
                if (err)
                    return err;
                /* check src2 operand */
                err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
                if (err)
                    return err;

                dst_reg_type = regs[insn->dst_reg].type;

                /* check that memory (dst_reg + off) is writeable */
                err = check_mem_access(env, insn->dst_reg, insn->off,
                               BPF_SIZE(insn->code), BPF_WRITE,
                               insn->src_reg);
                if (err)
                    return err;

                /* same imm marking scheme as the BPF_LDX branch above */
                if (insn->imm == 0) {
                    insn->imm = dst_reg_type;
                } else if (dst_reg_type != insn->imm &&
                       (dst_reg_type == PTR_TO_CTX ||
                        insn->imm == PTR_TO_CTX)) {
                    verbose("same insn cannot be used with different pointers\n");
                    return -EINVAL;
                }
            // BPF_ST instruction
            } else if (class == BPF_ST) {
                if (BPF_MODE(insn->code) != BPF_MEM ||
                    insn->src_reg != BPF_REG_0) {
                    verbose("BPF_ST uses reserved fields\n");
                    return -EINVAL;
                }
                /* check src operand */
                err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
                if (err)
                    return err;

                /* check that memory (dst_reg + off) is writeable */
                err = check_mem_access(env, insn->dst_reg, insn->off,
                               BPF_SIZE(insn->code), BPF_WRITE,
                               -1);
                if (err)
                    return err;
            // BPF_JMP instruction
            } else if (class == BPF_JMP) {
                u8 opcode = BPF_OP(insn->code);
                // BPF_CALL
                if (opcode == BPF_CALL) {
                    if (BPF_SRC(insn->code) != BPF_K ||
                        insn->off != 0 ||
                        insn->src_reg != BPF_REG_0 ||
                        insn->dst_reg != BPF_REG_0) {
                        verbose("BPF_CALL uses reserved fields\n");
                        return -EINVAL;
                    }
                    // check_call() checks that the call target is in range,
                    // the types of the five arguments (key / value / map
                    // address / stack_size, ...), and updates the return
                    // register and reg_state
                    err = check_call(env, insn->imm);
                    if (err)
                        return err;

                } else if (opcode == BPF_JA) {
                    if (BPF_SRC(insn->code) != BPF_K ||
                        insn->imm != 0 ||
                        insn->src_reg != BPF_REG_0 ||
                        insn->dst_reg != BPF_REG_0) {
                        verbose("BPF_JA uses reserved fields\n");
                        return -EINVAL;
                    }

                    /* unconditional jump: off is relative to the next insn */
                    insn_idx += insn->off + 1;
                    continue;

                } else if (opcode == BPF_EXIT) {
                    if (BPF_SRC(insn->code) != BPF_K ||
                        insn->imm != 0 ||
                        insn->src_reg != BPF_REG_0 ||
                        insn->dst_reg != BPF_REG_0) {
                        verbose("BPF_EXIT uses reserved fields\n");
                        return -EINVAL;
                    }
                    // r0 holds the return value and bpf_exit ends the
                    // instruction stream; check r0 was written before this
                    /* eBPF calling convetion is such that R0 is used
                     * to return the value from eBPF program.
                     * Make sure that it's readable at this time
                     * of bpf_exit, which means that program wrote
                     * something into it earlier
                     */
                    err = check_reg_arg(regs, BPF_REG_0, SRC_OP);
                    if (err)
                        return err;

                    if (is_pointer_value(env, BPF_REG_0)) {
                        verbose("R0 leaks addr as return value\n");
                        return -EACCES;
                    }
                    // each exit ends one branch; go back to the fork point
                    // and explore the other branch, like walking every path
                    // of a maze
    process_bpf_exit:
                    insn_idx = pop_stack(env, &prev_insn_idx);
                    if (insn_idx < 0) {
                        break;
                    } else {
                        do_print_state = true;
                        continue;
                    }
                } else {
                    err = check_cond_jmp_op(env, insn, &insn_idx);
                    if (err)
                        return err;
                }
            } else if (class == BPF_LD) {
                u8 mode = BPF_MODE(insn->code);

                if (mode == BPF_ABS || mode == BPF_IND) {
                    err = check_ld_abs(env, insn);
                    if (err)
                        return err;

                } else if (mode == BPF_IMM) {
                    err = check_ld_imm(env, insn);
                    if (err)
                        return err;

                    /* extra increment: this insn occupies two slots */
                    insn_idx++;
                } else {
                    verbose("invalid BPF_LD mode\n");
                    return -EINVAL;
                }
            } else {
                verbose("unknown insn class %d\n", class);
                return -EINVAL;
            }

            insn_idx++;
        }

        return 0;
    }
    
  • eBPF 架构

    image-20220811165927132

  • Hello World

    基于 libbpf-bootstrap 的结构实现 BPF:

    image-20220812095706517

    相关依赖和初始化:

    # Fetch libbpf-bootstrap, then initialise its submodules from inside the
    # cloned repository (the submodule command must run within the repo).
    git clone https://github.com/libbpf/libbpf-bootstrap.git
    cd libbpf-bootstrap
    git submodule update --init --recursive
    

    在 libbpf-bootstrap/examples/ 目录下写 helloworld.bpf.c(运行在内核态的 BPF 源码)和 helloworld.c(加载 BPF 到内核的用户态程序)

    helloworld.bpf.c

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    /*
     * Program placed in the "tracepoint/syscalls/sys_enter_execve" section.
     * Each invocation emits one bpf_printk() line containing msg and
     * returns 0. ctx is unused.
     */
    SEC("tracepoint/syscalls/sys_enter_execve")
    int bpf_prog(void *ctx) {
        char msg[] = "Hello, World";
        bpf_printk("invoke bpf_prog: %s\n", msg);
        return 0;
    }

    /* License string placed in the "license" section. */
    char LICENSE[] SEC("license") = "Dual BSD/GPL";
    

    在系统调用 execve 的埋点处注入 bpf_prog,使得每次调用 execve 的时候,都会回调 bpf_prog。

    helloworld.c

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/resource.h>
    #include <bpf/libbpf.h>
    #include "helloworld.skel.h"
    
    static int libbpf_print_fn(enum libbpf_print_level, const char *format, va_list args)
    {
        return vfprintf(stderr, format, args);
    }
    
    /*
     * Open, load and attach the helloworld BPF skeleton, then print a dot
     * to stderr once per second forever.
     *
     * Returns 1 if the skeleton cannot be opened. If load or attach fails,
     * the skeleton is destroyed and the negated error code is returned.
     */
    int main(int argc, char **argv)
    {
        struct helloworld_bpf *obj;
        int rc;

        libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
        /* Set up libbpf errors and debug info callback */
        libbpf_set_print(libbpf_print_fn);

        /* Open BPF application */
        obj = helloworld_bpf__open();
        if (obj == NULL) {
            fprintf(stderr, "Failed to open BPF skeleton\n");
            return 1;
        }

        /* Load & verify BPF programs */
        rc = helloworld_bpf__load(obj);
        if (rc != 0) {
            fprintf(stderr, "Failed to load and verify BPF skeleton\n");
        } else {
            /* Attach tracepoint handler */
            rc = helloworld_bpf__attach(obj);
            if (rc != 0) {
                fprintf(stderr, "Failed to attach BPF skeleton\n");
            } else {
                printf("Successfully started! Please run `sudo cat /sys/kernel/debug/tracing/trace_pipe` "
                       "to see output of the BPF programs.\n");

                while (1) {
                    /* trigger our BPF program */
                    fprintf(stderr, ".");
                    sleep(1);
                }
            }
        }

        /* reached only after a load or attach failure */
        helloworld_bpf__destroy(obj);
        return -rc;
    }
    

    bpf 字节码被封装到 helloworld.skel.h 中,那么就是 open -> load -> attach -> destroy 依次执行即可。

    编译运行:

    在 Makefile 的 APPS 后加上 helloworld,cmake . 后直接 make helloworld 就行。

    image-20220812133335480

  • 参考文献

posted @ 2022-12-29 12:34  moon_flower  阅读(394)  评论(0编辑  收藏  举报