sockmap/eBPF

  This is how to use SOCKMAP: SOCKMAP or specifically "BPF_MAP_TYPE_SOCKMAP", is a type of an eBPF map. This map is an "array" - indices are integers. All this is pretty standard. The magic is in the map values - they must be TCP socket descriptors.

 copy from:https://blog.cloudflare.com/sockmap-tcp-splicing-of-the-future/

也就是eBPF程序必须attach一个map,不是attach一个socket。so how to use SOCKMAP ?

sock_map = bpf_create_map(BPF_MAP_TYPE_SOCKMAP, sizeof(int), sizeof(int), 2, 0)

prog_parser = bpf_load_program(BPF_PROG_TYPE_SK_SKB, ...)
prog_verdict = bpf_load_program(BPF_PROG_TYPE_SK_SKB, ...)
bpf_prog_attach(prog_parser, sock_map, BPF_SK_SKB_STREAM_PARSER)
bpf_prog_attach(prog_verdict, sock_map, BPF_SK_SKB_STREAM_VERDICT)
  • 先看看 bpf_create_map的作用: 创建一个map内存块 
  • BPF map的应用场景有几种:

    • BPF程序和用户态的交互:BPF程序运行完,得到的结果存储到map中,供用户态访问;
    • BPF程序内部交互:如果BPF程序内部需要用全局变量来交互,但是由于安全原因BPF程序不允许访问全局变量,可以使用map来充当全局变量;
    • BPF Tail call:Tail call是一个BPF程序跳转到另一BPF程序,BPF程序首先通过BPF_MAP_TYPE_PROG_ARRAY类型的map来知道另一个BPF程序的指针,然后调用tail_call()的helper function来执行Tail call。
    • BPF程序和内核态的交互:和BPF程序以外的内核程序交互,也可以使用map作为中介;
    • Map 类型(map_type),就是上文提到的各种 Map 类型
    • Map 的键大小(key_size),以字节为单位
    • Map 的值大小(value_size),以字节为单位
    • Map 的元素最大容量(max_entries),个数为单位
复制代码
{
    struct { /* anonymous struct used by BPF_MAP_CREATE command */
        __u32    map_type;    /* one of enum bpf_map_type */
        __u32    key_size;    /* size of key in bytes */
        __u32    value_size;    /* size of value in bytes */
        __u32    max_entries;    /* max number of entries in a map */
        __u32    map_flags;    /* BPF_MAP_CREATE related
                     * flags defined above.
                     */
        __u32    inner_map_fd;    /* fd pointing to the inner map */
        __u32    numa_node;    /* numa node (effective only if
                     * BPF_F_NUMA_NODE is set).
                     */
        char    map_name[BPF_OBJ_NAME_LEN];
        __u32    map_ifindex;    /* ifindex of netdev to create on */
        __u32    btf_fd;        /* fd pointing to a BTF type data */
        __u32    btf_key_type_id;    /* BTF type_id of the key */
        __u32    btf_value_type_id;    /* BTF type_id of the value */
        __u32    btf_vmlinux_value_type_id;/* BTF type_id of a kernel-
                           * struct stored as the
                           * map value
                           */
    };
    ---------------------------
}
复制代码

 

复制代码
/*
 * bpf_create_map() - create a BPF map from the four basic attributes.
 *
 * @map_type:    kind of map (e.g. BPF_MAP_TYPE_SOCKMAP, BPF_MAP_TYPE_HASH,
 *               BPF_MAP_TYPE_ARRAY, ...)
 * @key_size:    size of a key
 * @value_size:  size of a value
 * @max_entries: maximum number of key/value pairs
 * @map_flags:   map creation flags
 *
 * Packs the arguments into a bpf_create_map_attr and returns whatever
 * bpf_create_map_xattr() returns.
 */
int bpf_create_map(enum bpf_map_type map_type, int key_size,
           int value_size, int max_entries, __u32 map_flags)
{
    /* Members not named here are zero-initialised. */
    struct bpf_create_map_attr create_attr = {
        .map_type    = map_type,
        .map_flags   = map_flags,
        .key_size    = key_size,
        .value_size  = value_size,
        .max_entries = max_entries,
    };

    return bpf_create_map_xattr(&create_attr);
}
复制代码
复制代码
/*
 * bpf_create_map_xattr() - create a BPF map from a full attribute set.
 *
 * Translates @create_attr into a zeroed union bpf_attr and issues the
 * bpf() system call with the BPF_MAP_CREATE command (other commands of the
 * same system call include BPF_MAP_UPDATE_ELEM and BPF_MAP_DELETE_ELEM).
 * Returns the result of sys_bpf().
 */
int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr)
{
    const char *map_name = create_attr->name;
    union bpf_attr attr;

    memset(&attr, 0, sizeof(attr));

    /* Basic geometry of the map. */
    attr.map_type = create_attr->map_type;
    attr.key_size = create_attr->key_size;
    attr.value_size = create_attr->value_size;
    attr.max_entries = create_attr->max_entries;
    attr.map_flags = create_attr->map_flags;

    /* Optional name: copy at most BPF_OBJ_NAME_LEN - 1 bytes so that the
     * zeroed buffer always keeps a terminating NUL.
     */
    if (map_name)
        memcpy(attr.map_name, map_name,
               min(strlen(map_name), BPF_OBJ_NAME_LEN - 1));

    attr.numa_node = create_attr->numa_node;
    attr.map_ifindex = create_attr->map_ifindex;

    /* BTF description of key and value. */
    attr.btf_fd = create_attr->btf_fd;
    attr.btf_key_type_id = create_attr->btf_key_type_id;
    attr.btf_value_type_id = create_attr->btf_value_type_id;

    /* Only one of these two fields is filled in, depending on map type. */
    if (attr.map_type != BPF_MAP_TYPE_STRUCT_OPS)
        attr.inner_map_fd = create_attr->inner_map_fd;
    else
        attr.btf_vmlinux_value_type_id =
            create_attr->btf_vmlinux_value_type_id;

    return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}
View Code
复制代码

可以看到 实际上 会调用一个map_create 函数 分配内存 并初始化一个map

复制代码
/*
 * map_create() - kernel-side handling of map creation (abridged listing).
 *
 * Validates the attributes, lets find_and_alloc_map() build the map,
 * initialises its reference counts, then assigns it an id and a file
 * descriptor. Returns the result of bpf_map_new_fd() on success, or a
 * negative errno on failure.
 *
 * NOTE: the dashed line marks code omitted from this listing, and the
 * free_map / free_map_sec labels targeted by the gotos are not shown.
 */
static int map_create(union bpf_attr *attr)
{
    int numa_node = bpf_map_attr_numa_node(attr);
    struct bpf_map_memory mem;
    struct bpf_map *map;
    int f_flags;
    int err;

    err = CHECK_ATTR(BPF_MAP_CREATE);
    if (err)
        return -EINVAL;

    /* btf_vmlinux_value_type_id is only accepted for STRUCT_OPS maps and
     * excludes the key/value BTF ids; a key BTF id needs a value BTF id.
     */
    if (attr->btf_vmlinux_value_type_id) {
        if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
            attr->btf_key_type_id || attr->btf_value_type_id)
            return -EINVAL;
    } else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
        return -EINVAL;
    }

    f_flags = bpf_get_file_flag(attr->map_flags);
    if (f_flags < 0)
        return f_flags;

    /* A requested NUMA node must be in range and online. */
    if (numa_node != NUMA_NO_NODE &&
        ((unsigned int)numa_node >= nr_node_ids ||
         !node_online(numa_node)))
        return -EINVAL;

    /* find map type and init map: hashtable vs rbtree vs bloom vs ...
     * (this is the step that allocates the map's memory)
     */
    map = find_and_alloc_map(attr);
    if (IS_ERR(map))
        return PTR_ERR(map);

    err = bpf_obj_name_cpy(map->name, attr->map_name,
                   sizeof(attr->map_name));
    if (err < 0)
        goto free_map;

    atomic64_set(&map->refcnt, 1);
    atomic64_set(&map->usercnt, 1);
    mutex_init(&map->freeze_mutex);

    map->spin_lock_off = -EINVAL;
    ----------------------------------------------

    err = bpf_map_alloc_id(map); /* associate the map with an id */
    if (err)
        goto free_map_sec;

    err = bpf_map_new_fd(map, f_flags); /* associate the map with an fd ("everything is a file") */
    if (err < 0) {
        /* failed to allocate fd.
         * bpf_map_put_with_uref() is needed because the above
         * bpf_map_alloc_id() has published the map
         * to the userspace and the userspace may
         * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
         */
        bpf_map_put_with_uref(map);
        return err;
    }

    return err;
}
复制代码

map_create 会调用:对应map_type的ops去分配内存等

以map_array为例:

复制代码
/* Operation table for array maps: one callback per map operation. */
static const struct bpf_map_ops array_ops = {
    .map_alloc = array_map_alloc,               /* create the map */
    .map_free = array_map_free,
    .map_get_next_key = array_map_get_next_key,
    .map_lookup_elem = array_map_lookup_elem,
    .map_update_elem = array_map_update_elem,
    .map_delete_elem = array_map_delete_elem,
};

/* Binds BPF_MAP_TYPE_ARRAY to the array_ops operation table. */
static struct bpf_map_type_list array_type __read_mostly = {
    .ops = &array_ops,
    .type = BPF_MAP_TYPE_ARRAY,
};


/*
 * array_map_alloc() - allocate and initialise an array map.
 *
 * Validates @attr, computes the element and total sizes, allocates one
 * bpf_array through bpf_map_area_alloc() and copies the attributes into the
 * embedded struct bpf_map. For BPF_MAP_TYPE_PERCPU_ARRAY the per-CPU
 * storage is allocated as well.
 *
 * Returns a pointer to the embedded map, or ERR_PTR(-EINVAL / -E2BIG /
 * -ENOMEM) on failure.
 */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
    bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
    u32 elem_size, index_mask, max_entries;
    bool unpriv = !capable(CAP_SYS_ADMIN);
    struct bpf_array *array;
    u64 array_size, mask64;

    /* check sanity of attributes */
    if (attr->max_entries == 0 || attr->key_size != 4 ||
        attr->value_size == 0 || attr->map_flags)
        return ERR_PTR(-EINVAL);

    if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1))
        /* if value_size is bigger, the user space won't be able to
         * access the elements.
         */
        return ERR_PTR(-E2BIG);

    /* (1.1.1) Compute the per-element value size. The key needs neither a
     * size computation nor storage, because the key is simply the index.
     */
    elem_size = round_up(attr->value_size, 8);

    max_entries = attr->max_entries;

    /* On 32 bit archs roundup_pow_of_two() with max_entries that has
     * upper most bit set in u32 space is undefined behavior due to
     * resulting 1U << 32, so do it manually here in u64 space.
     */
    mask64 = fls_long(max_entries - 1);
    mask64 = 1ULL << mask64;
    mask64 -= 1;

    index_mask = mask64;
    if (unpriv) {
        /* round up array size to nearest power of 2,
         * since cpu will speculate within index_mask limits
         */
        max_entries = index_mask + 1;
        /* Check for overflows. */
        if (max_entries < attr->max_entries)
            return ERR_PTR(-E2BIG);
    }

    /* (1.1.2) Compute the total size of the bpf_array header plus the
     * value array; bpf_array embeds the generic struct bpf_map.
     */
    array_size = sizeof(*array);
    if (percpu)
        array_size += (u64) max_entries * sizeof(void *);
    else
        array_size += (u64) max_entries * elem_size;

    /* make sure there is no u32 overflow later in round_up() */
    if (array_size >= U32_MAX - PAGE_SIZE)
        return ERR_PTR(-ENOMEM);

    /* allocate all map elements and zero-initialize them */
    /* (1.1.3) Allocate the bpf_array according to the total size. */
    array = bpf_map_area_alloc(array_size);
    if (!array)
        return ERR_PTR(-ENOMEM);
    array->index_mask = index_mask;
    array->map.unpriv_array = unpriv;

    /* copy mandatory map attributes */
    /* (1.1.4) Copy the fields of attr into array->map. */
    array->map.map_type = attr->map_type;
    array->map.key_size = attr->key_size;
    array->map.value_size = attr->value_size;
    array->map.max_entries = attr->max_entries;
    array->elem_size = elem_size;

    if (!percpu)
        goto out;

    /* Per-CPU maps: add the storage needed on every possible CPU. */
    array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();

    if (array_size >= U32_MAX - PAGE_SIZE ||
        elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
        bpf_map_area_free(array);
        return ERR_PTR(-ENOMEM);
    }
out:
    /* Record the size in pages, rounded up. */
    array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;

    return &array->map;
}
View Code
复制代码

 

bpf_load_program:用BPF_PROG_LOAD命令进行bpf系统调用加载 BPF 程序到内核中

  • 拷贝程序到内核;
  • 校验它的安全性;
  • 如果可能对它进行JIT编译;
  • 然后分配一个文件句柄fd给它

完成这一切后,后续再把这段BPF程序挂载到需要运行的钩子上面。

复制代码
/*
 * bpf_prog_load() - kernel-side handling of the BPF_PROG_LOAD command
 * (abridged listing).
 *
 * Checks flags, license, instruction count and capabilities, allocates a
 * bpf_prog, copies the instructions in from user space, runs bpf_check(),
 * selects the runtime, then assigns an id and a file descriptor.
 * Returns the result of bpf_prog_new_fd() on success or a negative errno.
 *
 * NOTE: the error labels targeted by the gotos (free_prog, free_prog_sec,
 * free_prog_nouncharge, free_used_maps) are not part of this listing; the
 * dashed line near the end marks the omission.
 */
static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
{
    enum bpf_prog_type type = attr->prog_type;
    struct bpf_prog *prog;
    int err;
    char license[128];
    bool is_gpl;

    if (CHECK_ATTR(BPF_PROG_LOAD))
        return -EINVAL;

    /* Reject any prog_flags bit outside the known set. */
    if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
                 BPF_F_ANY_ALIGNMENT |
                 BPF_F_TEST_STATE_FREQ |
                 BPF_F_TEST_RND_HI32))
        return -EINVAL;

    if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
        (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
        !bpf_capable())
        return -EPERM;

    /* copy eBPF program license from user space:
     * the license string at address attr->license is copied into the
     * kernel buffer.
     */
    if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
                  sizeof(license) - 1) < 0)
        return -EFAULT;
    license[sizeof(license) - 1] = 0;

    /* eBPF programs must be GPL compatible to use GPL-ed functions;
     * record whether the license is GPL compatible.
     */
    is_gpl = license_is_gpl_compatible(license);
    /* Reject empty programs and programs above the instruction limit:
     * BPF_COMPLEXITY_LIMIT_INSNS when bpf_capable(), BPF_MAXINSNS otherwise.
     */
    if (attr->insn_cnt == 0 ||
        attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
        return -E2BIG;
    /* Every program type other than BPF_PROG_TYPE_SOCKET_FILTER and
     * BPF_PROG_TYPE_CGROUP_SKB requires bpf_capable().
     */
    if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
        type != BPF_PROG_TYPE_CGROUP_SKB &&
        !bpf_capable())
        return -EPERM;
    /* Networking program types (cgroup, sock, ...) additionally need
     * CAP_NET_ADMIN or CAP_SYS_ADMIN.
     */
    if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN))
        return -EPERM;
    if (is_perfmon_prog_type(type) && !perfmon_capable())
        return -EPERM;

    bpf_prog_load_fixup_attach_type(attr);
    if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
                       attr->attach_btf_id,
                       attr->attach_prog_fd))
        return -EINVAL;

    /* plain bpf_prog allocation: the bpf_prog (and its aux data) is sized
     * from the number of BPF instructions.
     */
    prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
    if (!prog)
        return -ENOMEM;

    prog->expected_attach_type = attr->expected_attach_type;
    prog->aux->attach_btf_id = attr->attach_btf_id;
    if (attr->attach_prog_fd) {
        struct bpf_prog *tgt_prog;

        tgt_prog = bpf_prog_get(attr->attach_prog_fd);
        if (IS_ERR(tgt_prog)) {
            err = PTR_ERR(tgt_prog);
            goto free_prog_nouncharge;
        }
        prog->aux->linked_prog = tgt_prog;
    }

    prog->aux->offload_requested = !!attr->prog_ifindex;

    err = security_bpf_prog_alloc(prog->aux);
    if (err)
        goto free_prog_nouncharge;

    err = bpf_prog_charge_memlock(prog);
    if (err)
        goto free_prog_sec;

    prog->len = attr->insn_cnt;

    err = -EFAULT; /* copy the BPF instructions from user address attr->insns to prog->insns */
    if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),
               bpf_prog_insn_size(prog)) != 0)
        goto free_prog;

    prog->orig_prog = NULL;
    prog->jited = 0;

    atomic64_set(&prog->aux->refcnt, 1);
    prog->gpl_compatible = is_gpl ? 1 : 0;

    if (bpf_prog_is_dev_bound(prog->aux)) {
        err = bpf_prog_offload_init(prog, attr);
        if (err)
            goto free_prog;
    }

    /* find program type: socket_filter vs tracing_filter
     * Author's note: looks up the bpf_prog_types entry for attr->prog_type
     * and installs its ops (a set of callbacks) in the program's aux data;
     * find_prog_type() itself is not shown here.
     */
    err = find_prog_type(type, prog);
    if (err < 0)
        goto free_prog;

    prog->aux->load_time = ktime_get_boottime_ns();
    err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
                   sizeof(attr->prog_name));
    if (err < 0)
        goto free_prog;

    /* run eBPF verifier: check the program before it is accepted */
    err = bpf_check(&prog, attr, uattr);
    if (err < 0)
        goto free_used_maps;
    /* Select the runtime; per the author's note this is where JIT
     * translation of the program is attempted.
     */
    prog = bpf_prog_select_runtime(prog, &err);
    if (err < 0)
        goto free_used_maps;
    /* Assign an id to the program. */
    err = bpf_prog_alloc_id(prog);
    if (err)
        goto free_used_maps;

    /* Upon success of bpf_prog_alloc_id(), the BPF prog is
     * effectively publicly exposed. However, retrieving via
     * bpf_prog_get_fd_by_id() will take another reference,
     * therefore it cannot be gone underneath us.
     *
     * Only for the time /after/ successful bpf_prog_new_fd()
     * and before returning to userspace, we might just hold
     * one reference and any parallel close on that fd could
     * rip everything out. Hence, below notifications must
     * happen before bpf_prog_new_fd().
     *
     * Also, any failure handling from this point onwards must
     * be using bpf_prog_put() given the program is exposed.
     */
    bpf_prog_kallsyms_add(prog);
    perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
    bpf_audit_prog(prog, BPF_AUDIT_LOAD);
    /* Allocate a file descriptor for the program. */
    err = bpf_prog_new_fd(prog);
    if (err < 0)
        bpf_prog_put(prog);
    return err;
--------------------------------
}
复制代码

 

bpf_prog_attach:如何把我的bpf程序,attach到这些类型上:
重定向程序作为BPF_SK_SKB_STREAM_VERDICT附加到sockmap; 它应返回bpf_sk_redirect_map()的结果。
一个strparser程序通过BPF_SK_SKB_STREAM_PARSER附加,并且应返回已解析数据的长度。

能够获取什么样的context?

指向包含包元数据/数据的结构 __sk_buff 的指针。但是,sk_skb 程序类型可以访问更多字段,可用的额外字段集记录在 include/linux/bpf.h 中(具体字段请参见该头文件)。

什么时候会运行?
可以通过把 BPF_SK_SKB_STREAM_PARSER 程序附加到 sockmap 上,来把一个 stream parser 附加到 socket 上;之后,当该 socket 接收数据时,解析程序会通过 kernel/bpf/sockmap.c 中的 smap_parse_func_strparser() 被执行。BPF_SK_SKB_STREAM_VERDICT 程序同样附加到 sockmap 上,它通过 smap_verdict_func() 来执行。
复制代码
/*  bpf_load_program

bpf_prog_attach(verdict_prog, map_fd, BPF_SMAP_STREAM_VERDICT, 0);

int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type,
            unsigned int flags)
{
    DECLARE_LIBBPF_OPTS(bpf_prog_attach_opts, opts,
        .flags = flags,
    );

    return bpf_prog_attach_xattr(prog_fd, target_fd, type, &opts);
}

int bpf_prog_attach_xattr(int prog_fd, int target_fd,
              enum bpf_attach_type type,
              const struct bpf_prog_attach_opts *opts)
{
    union bpf_attr attr;

    if (!OPTS_VALID(opts, bpf_prog_attach_opts))
        return -EINVAL;

    memset(&attr, 0, sizeof(attr));
    attr.target_fd       = target_fd;
    attr.attach_bpf_fd = prog_fd;
    attr.attach_type   = type;
    attr.attach_flags  = OPTS_GET(opts, flags, 0);
    attr.replace_bpf_fd = OPTS_GET(opts, replace_prog_fd, 0);

    return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
}
复制代码
int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
             struct bpf_prog *old, u32 which)
{
    struct sk_psock_progs *progs = sock_map_progs(map);
    struct bpf_prog **pprog;

    switch (which) {
------------------------------------------
    case BPF_SK_SKB_STREAM_PARSER:
        pprog = &progs->skb_parser;
        break;
    case BPF_SK_SKB_STREAM_VERDICT:
        pprog = &progs->skb_verdict;
        break;

    }
    psock_set_prog(pprog, prog);
    return 0;
}
复制代码

 


int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
{
    u32 ufd = attr->target_fd;
    struct bpf_map *map;
    struct fd f;
    int ret;

    if (attr->attach_flags || attr->replace_bpf_fd)
        return -EINVAL;

    f = fdget(ufd);
    map = __bpf_map_get(f);
    if (IS_ERR(map))
        return PTR_ERR(map);
    ret = sock_map_prog_update(map, prog, NULL, attr->attach_type);---//找到对应的sk_psock_progs  并更新
    fdput(f);
    return ret;
}

*/
static int bpf_prog_attach(const union bpf_attr *attr)
{
    enum bpf_prog_type ptype;BPF_SOCK_STREAM_VERDICT
    struct bpf_prog *prog = NULL;
    int ret;

    if (CHECK_ATTR(BPF_PROG_ATTACH))
        return -EINVAL;

    if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
        return -EINVAL;
    //------BPF_SK_SKB_STREAM_VERDICT-------> transmit -----BPF_PROG_TYPE_SK_SKB  也就是attach type 转换为 prog-type
    ptype = attach_type_to_prog_type(attr->attach_type);
    if (ptype == BPF_PROG_TYPE_UNSPEC)
        return -EINVAL;

    prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
    if (IS_ERR(prog))
        return PTR_ERR(prog);

    if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) {
        bpf_prog_put(prog);
        return -EINVAL;
    }
/*
    const struct bpf_verifier_ops sk_skb_verifier_ops = {
        .get_func_proto     = sk_skb_func_proto,--------------bpf_sk_redirect_map_proto----------bpf_msg_redirect_map
        .is_valid_access    = sk_skb_is_valid_access,
        .convert_ctx_access = sk_skb_convert_ctx_access,
        .gen_prologue       = sk_skb_prologue,
    };

    */
    switch (ptype) {
    case BPF_PROG_TYPE_SK_SKB:
    case BPF_PROG_TYPE_SK_MSG:
        ret = sock_map_get_from_fd(attr, prog);// 根据target_fd 找到 map  并关联对应map
        break;
    case BPF_PROG_TYPE_LIRC_MODE2:
        ret = lirc_prog_attach(attr, prog);
        break;
    case BPF_PROG_TYPE_FLOW_DISSECTOR:
        ret = netns_bpf_prog_attach(attr, prog);
        break;
    case BPF_PROG_TYPE_CGROUP_DEVICE:
    case BPF_PROG_TYPE_CGROUP_SKB:
    case BPF_PROG_TYPE_CGROUP_SOCK:
    case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
    case BPF_PROG_TYPE_CGROUP_SOCKOPT:
    case BPF_PROG_TYPE_CGROUP_SYSCTL:
    case BPF_PROG_TYPE_SOCK_OPS:
        ret = cgroup_bpf_prog_attach(attr, ptype, prog);
        break;
    default:
        ret = -EINVAL;
    }

    if (ret)
        bpf_prog_put(prog);
    return ret;
}
复制代码

established sock_map eBPF map, with two eBPF programs attached: parser and verdict. 
The next step is to add a TCP socket descriptor to this map

int val = fd;
bpf_map_update_elem(sock_map, &idx, &val, BPF_ANY);
bpf_map_update_elem: 将fd socket 和map相关联

会执行系统调用 bpf(BPF_MAP_UPDATE_ELEM,-----) 最后调用map_update_elem 函数处理

复制代码
/*
 * map_update_elem() - kernel-side handling of BPF_MAP_UPDATE_ELEM
 * (heavily abridged listing).
 *
 * Resolves attr->map_fd to its struct bpf_map and passes the key/value pair
 * to bpf_map_update_value().
 *
 * NOTE: dashed lines mark omitted code; the declarations of f, map, key,
 * value and err, the copy of key/value from user space and the return
 * statement are not shown.
 */
static int map_update_elem(union bpf_attr *attr)
{
    void __user *ukey = u64_to_user_ptr(attr->key); /* user pointer to the key, i.e. the index */
    void __user *uvalue = u64_to_user_ptr(attr->value); /* user pointer to the value, e.g. the fd of the socket to act on */
    int ufd = attr->map_fd;
-----------------------
    f = fdget(ufd); /* map_fd -> file */
    map = __bpf_map_get(f); /* file -> map object (author's note: f.file->private_data) */
   ------------------------------
  ----------------------------------/* store the key/value pair in the map */
    err = bpf_map_update_value(map, f, key, value, attr->flags);

}
复制代码
复制代码
/*
 * bpf_map_update_value() - store @key/@value in @map (abridged listing).
 *
 * Device-bound maps are handed to the offload path. CPUMAP, SOCKHASH,
 * SOCKMAP and STRUCT_OPS maps call the map's own map_update_elem
 * operation directly (sock_map_update_elem for a SOCKMAP).
 *
 * Returns the result of the selected update routine.
 */
static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
                void *value, __u64 flags)
{
    int err;

    /* Need to create a kthread, thus must support schedule */
    if (bpf_map_is_dev_bound(map)) {
        return bpf_map_offload_update_elem(map, key, value, flags);
    } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
           map->map_type == BPF_MAP_TYPE_SOCKHASH ||
           map->map_type == BPF_MAP_TYPE_SOCKMAP || /* -> sock_map_update_elem */
           map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
        return map->ops->map_update_elem(map, key, value, flags);
    }

    /* ... handling of the remaining map types, which sets err, is omitted
     * from this listing ...
     */

    return err;
}
复制代码

以sock_map_update_elem 为例查看

复制代码
/*
 * sock_map_update_elem() - map_update_elem operation of a SOCKMAP
 * (abridged listing).
 *
 * @key holds the slot index and @value a socket file descriptor (read as
 * u64 or u32 depending on the map's value_size). The descriptor is
 * resolved to its struct sock, which is then stored in the map through
 * sock_map_update_common().
 *
 * NOTE: dashed lines mark omitted code (error checks, cleanup and the
 * final return are not shown).
 */
static int sock_map_update_elem(struct bpf_map *map, void *key,
                void *value, u64 flags)
{
    u32 idx = *(u32 *)key;
    struct socket *sock;
    struct sock *sk;
    int ret;
    u64 ufd;

    if (map->value_size == sizeof(u64))
        ufd = *(u64 *)value;
    else
        ufd = *(u32 *)value;
---------------------------
    sock = sockfd_lookup(ufd, &ret); /* value (socket fd) -> struct socket */
    ----------
    sk = sock->sk; /* struct socket -> its network-layer struct sock */
    -----------
    ret = sock_map_update_common(map, idx, sk, flags);

}
复制代码
复制代码
/*
 * sock_map_update_common() - install @sk in slot @idx of a sockmap
 * (abridged listing).
 *
 * Allocates a link, lets sock_map_link() set the socket up for the map's
 * programs, then, under stab->lock, replaces the slot's previous socket
 * with @sk.
 *
 * Returns 0.
 *
 * NOTE(review): this listing leaves out the error handling for
 * sk_psock_init_link() and sock_map_link() as well as the checks on @flags
 * and @idx; compare with net/core/sock_map.c before reusing it.
 */
static int sock_map_update_common(struct bpf_map *map, u32 idx,
                  struct sock *sk, u64 flags)
{
    struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
    struct sk_psock_link *link;
    struct sk_psock *psock;
    struct sock *osk;
    int ret;

    link = sk_psock_init_link(); /* allocates the link */

    /* Only sockets we can redirect into/from in BPF need to hold
     * refs to parser/verdict progs and have their sk_data_ready
     * and sk_write_space callbacks overridden.
     */
    ret = sock_map_link(map, &stab->progs, sk);

    psock = sk_psock(sk);
    WARN_ON_ONCE(!psock);

    raw_spin_lock_bh(&stab->lock);
    osk = stab->sks[idx];

    sock_map_add_link(psock, link, map, &stab->sks[idx]);
    stab->sks[idx] = sk;
    /* Only drop the previous occupant when the slot actually had one
     * (presumably an unused slot holds NULL -- confirm against upstream).
     */
    if (osk)
        sock_map_unref(osk, &stab->sks[idx]);
    /* Release the lock taken above; without this the function would return
     * with stab->lock still held.
     */
    raw_spin_unlock_bh(&stab->lock);
    return 0;
}

/*
 * sock_map_link() - prepare @sk for use in a sockmap (abridged listing).
 *
 * Reads the programs attached to the map, creates the psock for @sk,
 * switches the socket's proto operations to the BPF variants and, when both
 * a parser and a verdict program are present, initialises and starts the
 * stream parser.
 *
 * Returns 0 on success.
 *
 * NOTE: comments of the form "... omitted ..." mark code left out of this
 * listing. The out_progs / out_drop error labels and the write_lock_bh()
 * matching the write_unlock_bh() below are among the omitted parts.
 */
static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
             struct sock *sk)
{
    struct bpf_prog *msg_parser, *skb_parser, *skb_verdict;
    struct sk_psock *psock;
    bool skb_progs;
    int ret;

    /* These pointers are assigned in sock_map_prog_update(). */
    skb_verdict = READ_ONCE(progs->skb_verdict);
    skb_parser = READ_ONCE(progs->skb_parser);
    skb_progs = skb_parser && skb_verdict;
    /* ... omitted ... */

    msg_parser = READ_ONCE(progs->msg_parser);
    /* ... omitted ... */

    psock = sock_map_psock_get_checked(sk);
    if (IS_ERR(psock)) {
        ret = PTR_ERR(psock);
        goto out_progs;
    }
    /* ... omitted ... */
    psock = sk_psock_init(sk, map->numa_node);
    /* Author's note: this associates sk with the psock -- the psock is
     * created and psock->sk = sk.
     */
    /* ... omitted ... */
    /* Author's note: mainly replaces sk->sk_prot, i.e. swaps the socket's
     * proto operations for the BPF ones.
     */
    ret = sock_map_init_proto(sk, psock);
    if (ret < 0)
        goto out_drop;


    if (skb_progs && !psock->parser.enabled) {
        ret = sk_psock_init_strp(sk, psock); /* installs the strparser callbacks */
        if (ret) {
            write_unlock_bh(&sk->sk_callback_lock);
            goto out_drop;
        }
        psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
        psock_set_prog(&psock->progs.skb_parser, skb_parser);
        /* Installs the psock's sk_data_ready handler on the socket. */
        sk_psock_start_strp(sk, psock);
    }

    return 0;
}

/*
 * sk_psock_start_strp() - hook the psock into the socket's callbacks.
 *
 * Does nothing when the parser is already enabled. Otherwise saves the
 * socket's current sk_data_ready handler, installs the psock's data-ready
 * and write-space handlers, and marks the parser enabled.
 */
void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
{
    struct sk_psock_parser *strp_state = &psock->parser;

    if (!strp_state->enabled) {
        /* Keep the original handler in the parser state. */
        strp_state->saved_data_ready = sk->sk_data_ready;

        sk->sk_data_ready = sk_psock_strp_data_ready;
        sk->sk_write_space = sk_psock_write_space;
        strp_state->enabled = true;
    }
}

int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
{
    static const struct strp_callbacks cb = {
        .rcv_msg    = sk_psock_strp_read,
        .read_sock_done    = sk_psock_strp_read_done,
        .parse_msg    = sk_psock_strp_parse,
    };

    psock->parser.enabled = false;
    return strp_init(&psock->parser.strp, sk, &cb);
}

设置strparser cb 回调函数
/*
 * strp_init() - initialise a stream parser (abridged listing).
 *
 * Zeroes @strp, binds it to @sk and copies the callbacks from @cb. The
 * lock, unlock, read_sock_done and abort_parser callbacks fall back to
 * defaults when @cb leaves them NULL; rcv_msg and parse_msg are copied
 * as given. Also initialises the message-timeout delayed work and the
 * parser's work item. Returns 0.
 *
 * NOTE: dashed lines mark code omitted from this listing.
 */
int strp_init(struct strparser *strp, struct sock *sk,
          const struct strp_callbacks *cb)
{
--------------------
    /* The sk (sock) arg determines the mode of the stream parser.
     *
     * If the sock is set then the strparser is in receive callback mode.
     * The upper layer calls strp_data_ready to kick receive processing
     * and strparser calls the read_sock function on the socket to
     * get packets.
     *
     * If the sock is not set then the strparser is in general mode.
     * The upper layer calls strp_process for each skb to be parsed.
     */
---------------
    memset(strp, 0, sizeof(*strp));

    strp->sk = sk;

    /* "a ? : b" yields a when it is non-NULL and b otherwise. */
    strp->cb.lock = cb->lock ? : strp_sock_lock;
    strp->cb.unlock = cb->unlock ? : strp_sock_unlock;
    strp->cb.rcv_msg = cb->rcv_msg;
    strp->cb.parse_msg = cb->parse_msg;
    strp->cb.read_sock_done = cb->read_sock_done ? : default_read_sock_done;
    strp->cb.abort_parser = cb->abort_parser ? : strp_abort_strp;

    INIT_DELAYED_WORK(&strp->msg_timer_work, strp_msg_timeout);
    INIT_WORK(&strp->work, strp_work);

    return 0;
}

/*
 * tcp_bpf_rebuild_protos() - derive the BPF proto variants from @base.
 *
 * The TCP_BPF_BASE entry is a copy of @base with the unhash, close, recvmsg
 * and stream_memory_read operations replaced. The TCP_BPF_TX entry is a
 * copy of that BASE entry with sendmsg and sendpage replaced as well.
 */
static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
                   struct proto *base)
{
    struct proto *base_cfg = &prot[TCP_BPF_BASE];
    struct proto *tx_cfg = &prot[TCP_BPF_TX];

    /* Receive side: start from the stock proto. */
    *base_cfg = *base;
    base_cfg->unhash = sock_map_unhash;
    base_cfg->close = sock_map_close;
    base_cfg->recvmsg = tcp_bpf_recvmsg;
    base_cfg->stream_memory_read = tcp_bpf_stream_read;

    /* Transmit side: everything from BASE plus the send hooks. */
    *tx_cfg = *base_cfg;
    tx_cfg->sendmsg = tcp_bpf_sendmsg;
    tx_cfg->sendpage = tcp_bpf_sendpage;
}


/*
 * tcp_bpf_get_proto() - pick the BPF proto table entry for @sk.
 *
 * The entry is selected by address family (IPv6 vs. IPv4) and by whether
 * the psock has a msg_parser program (TX vs. BASE configuration). When the
 * psock has no saved proto yet, the socket's current proto is checked
 * first; ERR_PTR(-EINVAL) is returned if that check fails.
 */
struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock)
{
    int family;
    int config;

    if (sk->sk_family == AF_INET6)
        family = TCP_BPF_IPV6;
    else
        family = TCP_BPF_IPV4;

    if (psock->progs.msg_parser)
        config = TCP_BPF_TX;
    else
        config = TCP_BPF_BASE;

    if (!psock->sk_proto) {
        struct proto *cur_ops = READ_ONCE(sk->sk_prot);

        if (tcp_bpf_assert_proto_ops(cur_ops))
            return ERR_PTR(-EINVAL);

        tcp_bpf_check_v6_needs_rebuild(sk, cur_ops);
    }

    return &tcp_bpf_prots[family][config];
}

static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock)
{
    struct proto *prot;

    switch (sk->sk_type) {
    case SOCK_STREAM:
        prot = tcp_bpf_get_proto(sk, psock);
        break;

    case SOCK_DGRAM:
        prot = udp_bpf_get_proto(sk, psock);
        break;


    sk_psock_update_proto(sk, psock, prot);
    return 0;
}
复制代码
From now on, each time our socket sd receives a packet,
prog_parser and prog_verdict are called

复制代码
/*
 * Stream parser program: reports the length of the next message.
 * Here the whole skb is treated as one message, so it returns skb->len.
 */
SEC("prog_parser")
int _prog_parser(struct __sk_buff *skb)
{
    int msg_len = skb->len;

    return msg_len;
}

/*
 * Verdict program: asks for the skb to be redirected to the socket stored
 * in sock_map under index 0, passing 0 as flags, and returns the helper's
 * result.
 */
SEC("prog_verdict")
int _prog_verdict(struct __sk_buff *skb)
{
    const uint32_t target_slot = 0;

    return bpf_sk_redirect_map(skb, &sock_map, target_slot, 0);
}
复制代码

bpf_sk_redirect_map tells the kernel: for the received packet, please oh please redirect it from a receive queue of some socket to a transmit queue of the socket living in sock_map under index 0. In our case, these are the same sockets! Here we achieved exactly what the echo server is supposed to do, but purely in eBPF.

复制代码
/*
 * Helper prototype for bpf_sk_redirect_map(): names the implementing
 * function and the return/argument types of the helper. The four arguments
 * line up with the call bpf_sk_redirect_map(skb, &sock_map, idx, 0) made by
 * the verdict program: program context, map, key and flags.
 */
const struct bpf_func_proto bpf_sk_redirect_map_proto = {
    .func           = bpf_sk_redirect_map,
    .gpl_only       = false,
    .ret_type       = RET_INTEGER,
    .arg1_type    = ARG_PTR_TO_CTX,          /* program context */
    .arg2_type      = ARG_CONST_MAP_PTR,     /* the map */
    .arg3_type      = ARG_ANYTHING,          /* key */
    .arg4_type      = ARG_ANYTHING,          /* flags */
};

/*
 * bpf_msg_redirect_map() - helper that selects the redirect target of a
 * sk_msg.
 *
 * Looks up the socket stored under @key in @map and records it, together
 * with @flags, in @msg. Returns SK_PASS on success and SK_DROP when @flags
 * has bits other than BPF_F_INGRESS set, when the slot is empty, or when
 * redirecting to that socket is not allowed.
 *
 * NOTE(review): the prototype listed just before this function refers to
 * bpf_sk_redirect_map, while this is the sk_msg variant.
 */
BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg,
       struct bpf_map *, map, u32, key, u64, flags)
{
    struct sock *target;

    if (unlikely(flags & ~(BPF_F_INGRESS)))
        return SK_DROP;

    target = __sock_map_lookup_elem(map, key);
    if (unlikely(!target))
        return SK_DROP;
    if (unlikely(!sock_map_redirect_allowed(target)))
        return SK_DROP;

    msg->flags = flags;
    msg->sk_redir = target;
    return SK_PASS;
}
复制代码


参考学习:
eBPF学习用例:
Linux 内核观测技术 BPF书籍
https://davidlovezoe.club/wordpress/archives/862
http://arthurchiao.art/blog/cilium-life-of-a-packet-pod-to-service-zh/
https://switch-router.gitee.io/blog/strparser/

https://davidlovezoe.club/wordpress/archives/963

https://patchwork.ozlabs.org/project/netdev/patch/20170816053247.15445.69312.stgit@john-Precision-Tower-5810/

https://jishuin.proginn.com/p/763bfbd2bc4e

https://blog.csdn.net/pwl999/article/details/82884882

https://github.com/zoidbergwill/awesome-ebpf

https://patchwork.ozlabs.org/project/netdev/patch/20170816053247.15445.69312.stgit@john-Precision-Tower-5810/

https://switch-router.gitee.io/blog/strparser/

https://blogs.oracle.com/linux/notes-on-bpf-1

总结:

 

  • eBPF程序处理截获报文的例子:psock。psock 使用 strparser,将数据包的控制权转移到 eBPF 处理程序,用户可以在 eBPF 程序里完成网络报文的重定向;sockmap 建立在 psock 之上,而 psock 的底层则是 strparser

strparser 的工作原理

核心数据结构:struct strparser 是 strparser 框架的核心数据结构,它绑定(attach)一个 TCP sock 结构 sk 和一组回调函数 cb
/*
 * Core data structure of the strparser framework (abridged): the socket the
 * parser is attached to and its set of callbacks.
 */
struct strparser {
    struct sock *sk;            /* socket the parser is attached to */
    // code omitted ....
    struct strp_callbacks cb;   /* callback set */
};

回调函数一共有以下六个:

/*
 * Callback set of a stream parser (six callbacks in total).
 * parse_msg and rcv_msg are copied as given by strp_init(); the other four
 * fall back to defaults when left NULL.
 */
struct strp_callbacks {
    /* Called by the framework when data arrives; extracts the length of
     * the next application-layer message.
     */
    int (*parse_msg)(struct strparser *strp, struct sk_buff *skb);
    /* Called with the skb of a parsed message. */
    void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb);
    int (*read_sock_done)(struct strparser *strp, int err);
    void (*abort_parser)(struct strparser *strp, int err);
    void (*lock)(struct strparser *strp);
    void (*unlock)(struct strparser *strp);
};

parse_msg() 在 strparser 收到报文时被框架调用。它用于从报文中提取下一个应用层消息(message)的长度。一个 TCP 报文里可能不止一个应用层消息,而 parse_msg() 就是提供给使用者去识别各个消息的手段

strparser 截获报文

正常情况下,内核 TCP 层处理报文后,会调用 sock->sk_data_ready(sk) , 它的默认动作是 wake up 一个用户态进程.

/*
 * tcp_data_ready() - abridged: once the (omitted) TCP processing is done,
 * the socket's sk_data_ready callback is invoked.
 */
void tcp_data_ready(struct sock *sk)
{
    const struct tcp_sock *tp = tcp_sk(sk);
    // code omitted
    sk->sk_data_ready(sk);  /* whichever handler is currently installed on the socket */
}
我们期望报文能进入 strparser,但报文显然不会平白无故地进入 strparser,因此,我们需要在报文的上送路径上动一些手脚:替换掉 sk->sk_data_ready 函数。以 KTLS 为例:
复制代码
static int tls_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len){
    // code omitted
    tsk->saved_sk_data_ready = tsk->socket->sk->sk_data_ready;
    tsk->saved_sk_write_space = tsk->socket->sk->sk_write_space;sk_write_space
    tsk->socket->sk->sk_data_ready = tls_data_ready; 
    tsk->socket->sk->sk_write_space = tls_write_space;
    tsk->socket->sk->sk_user_data = tsk;     
    // code omitted
}
复制代码

在 psock 的例子中, sk_psock_strp_data_ready() 被赋值到 sk->sk_data_ready

复制代码
/*
 * sk_psock_start_strp() - abridged: the psock counterpart of the callback
 * replacement. The socket's current sk_data_ready is saved in the parser
 * state, then the psock's data-ready and write-space handlers are installed
 * and the parser is marked enabled.
 */
void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
{
    struct sk_psock_parser *parser = &psock->parser;
    // code omitted
    parser->saved_data_ready = sk->sk_data_ready;   /* keep the original handler */
    sk->sk_data_ready = sk_psock_strp_data_ready;
    sk->sk_write_space = sk_psock_write_space;
    parser->enabled = true;
}
复制代码

替换之后,当有 TCP 报文准备上送时,用户定义的 sk->sk_data_ready 函数就会被调用,在该函数中,KTLS/psock 需要调用框架函数 strp_data_ready() 将报文转交给 strparser 框架。

对 KTLS

复制代码
/*
 * tls_data_ready() - KTLS sk_data_ready handler: looks up the socket's
 * software RX context and hands processing to its stream parser through
 * strp_data_ready().
 */
static void tls_data_ready(struct sock *sk)
{
    struct tls_sw_context_rx *rx_ctx = tls_sw_ctx_rx(tls_get_ctx(sk));

    strp_data_ready(&rx_ctx->strp);
}
View Code
复制代码

对 psock

复制代码
/*
 * sk_psock_strp_data_ready() - psock sk_data_ready handler.
 *
 * Inside an RCU read-side section, looks up the socket's psock and, if
 * there is one, calls strp_data_ready() on its stream parser while holding
 * sk_callback_lock for writing.
 */
static void sk_psock_strp_data_ready(struct sock *sk)
{
    struct sk_psock *psock;

    rcu_read_lock();
    psock = sk_psock(sk);
    if (unlikely(!psock))
        goto out_rcu;

    write_lock_bh(&sk->sk_callback_lock);
    strp_data_ready(&psock->parser.strp);
    write_unlock_bh(&sk->sk_callback_lock);

out_rcu:
    rcu_read_unlock();
}
复制代码
strparser 处理报文

strparser 框架拿到报文之后,通常会依次调用用户设置的 parse_msg 和 rcv_msg 回调函数,用户在回调函数里决定报文应该何去何从

strp_data_ready
  |- strp_read_sock
    |- tcp_read_sock
       |- strp_recv
         |- __strp_recv
           |- strp->cb.parse_msg(strp, head)
           ...
           |- strp->cb.rcv_msg(strp, head);
复制代码
比如对 KTLS, 就是将报文上送给应用层(AF_KTLS socket)

/*
 * tls_queue() - abridged KTLS callback taking the parser and an skb: the
 * TLS socket is fetched from the underlying socket's sk_user_data and the
 * skb is queued on it with sock_queue_rcv_skb().
 *
 * NOTE: omitted code includes the declaration of ret and the handling of
 * its value.
 */
static void tls_queue(struct strparser *strp, struct sk_buff *skb)
{
    struct tls_sock *tsk;
    
    // code omitted
    tsk = strp->sk->sk_user_data;   /* stored there by tls_bind() */
    // code omitted
    
    ret = sock_queue_rcv_skb((struct sock *)tsk, skb);
    // code omitted
}
复制代码

而对于 psock, 则是运行 eBPF 程序,得到动作(verdict)。

复制代码
/*
 * sk_psock_strp_read() - psock rcv_msg callback: run the verdict program on
 * a parsed message and apply the result.
 *
 * The verdict defaults to __SK_DROP, so the skb is dropped when no verdict
 * program is attached. Otherwise the program runs on the skb and its return
 * value is combined with the redirect target recorded on the skb to form
 * the final verdict, which sk_psock_verdict_apply() then acts on.
 */
static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
{
    struct sk_psock *psock = sk_psock_from_strp(strp);
    struct bpf_prog *prog;
    int ret = __SK_DROP;

    rcu_read_lock();
    prog = READ_ONCE(psock->progs.skb_verdict);
    if (likely(prog)) {
        skb_orphan(skb);
        /* Clear any stale redirect target before running the program. */
        tcp_skb_bpf_redirect_clear(skb);
        /* Author's note: yields SK_PASS when the program redirected. */
        ret = sk_psock_bpf_run(psock, prog, skb);
        ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
    }
    rcu_read_unlock();
    sk_psock_verdict_apply(psock, skb, ret);
}

strparser 这个框架并不限定如何处理报文,而只是在内核层面提供给了用户一个提前处理 TCP 报文的时机和一组回调函数,用户通过不同的回调函数可以实现不同的逻辑。

https://switch-router.gitee.io/blog/strparser/-----------------------------------------*************************------------------------------------------------------





 

posted @   codestacklinuxer  阅读(1011)  评论(0编辑  收藏  举报
编辑推荐:
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
阅读排行:
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· SQL Server 2025 AI相关能力初探
· AI编程工具终极对决:字节Trae VS Cursor,谁才是开发者新宠?
· 开源Multi-agent AI智能体框架aevatar.ai,欢迎大家贡献代码
· Manus重磅发布:全球首款通用AI代理技术深度解析与实战指南
点击右上角即可分享
微信分享提示