sockmap/eBPF
This is how to use SOCKMAP: SOCKMAP or specifically "BPF_MAP_TYPE_SOCKMAP", is a type of an eBPF map. This map is an "array" - indices are integers. All this is pretty standard. The magic is in the map values - they must be TCP socket descriptors.
copy from:https://blog.cloudflare.com/sockmap-tcp-splicing-of-the-future/
也就是eBPF程序必须attach一个map,不是attach一个socket。so how to use SOCKMAP ?
sock_map = bpf_create_map(BPF_MAP_TYPE_SOCKMAP, sizeof(int), sizeof(int), 2, 0)
prog_parser = bpf_load_program(BPF_PROG_TYPE_SK_SKB, ...)
prog_verdict = bpf_load_program(BPF_PROG_TYPE_SK_SKB, ...)
bpf_prog_attach(prog_parser, sock_map, BPF_SK_SKB_STREAM_PARSER)
bpf_prog_attach(prog_verdict, sock_map, BPF_SK_SKB_STREAM_VERDICT)
-
先看看 bpf_create_map的作用: 创建一个map内存块
-
BPF map的应用场景有几种:
- BPF程序和用户态的交互:BPF程序运行完,得到的结果存储到map中,供用户态访问;
- BPF程序内部交互:如果BPF程序内部需要用全局变量来交互,但是由于安全原因BPF程序不允许访问全局变量,可以使用map来充当全局变量;
- BPF Tail call:Tail call是一个BPF程序跳转到另一BPF程序,BPF程序首先通过BPF_MAP_TYPE_PROG_ARRAY类型的map来知道另一个BPF程序的指针,然后调用tail_call()的helper function来执行Tail call。
- BPF程序和内核态的交互:和BPF程序以外的内核程序交互,也可以使用map作为中介;
-
- Map 类型(map_type),就是上文提到的各种 Map 类型
- Map 的键大小(key_size),以字节为单位
- Map 的值大小(value_size),以字节为单位
- Map 的元素最大容量(max_entries),以个数为单位
{ struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ __u32 key_size; /* size of key in bytes */ __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ __u32 map_flags; /* BPF_MAP_CREATE related * flags defined above. */ __u32 inner_map_fd; /* fd pointing to the inner map */ __u32 numa_node; /* numa node (effective only if * BPF_F_NUMA_NODE is set). */ char map_name[BPF_OBJ_NAME_LEN]; __u32 map_ifindex; /* ifindex of netdev to create on */ __u32 btf_fd; /* fd pointing to a BTF type data */ __u32 btf_key_type_id; /* BTF type_id of the key */ __u32 btf_value_type_id; /* BTF type_id of the value */ __u32 btf_vmlinux_value_type_id;/* BTF type_id of a kernel- * struct stored as the * map value */ }; --------------------------- }
/*
 * bpf_create_map - create a BPF map from the basic creation parameters.
 *
 * Packs the arguments into a zero-initialised struct bpf_create_map_attr and
 * hands it to bpf_create_map_xattr(); that function's return value is passed
 * through unchanged.
 *
 * @map_type:    map type, e.g. BPF_MAP_TYPE_SOCKMAP, BPF_MAP_TYPE_HASH,
 *               BPF_MAP_TYPE_ARRAY and so on
 * @key_size:    size of the key of each key/value pair
 * @value_size:  size of the value of each key/value pair
 * @max_entries: maximum number of key/value pairs in the map
 * @map_flags:   map flag bits
 */
int bpf_create_map(enum bpf_map_type map_type, int key_size,
                   int value_size, int max_entries, __u32 map_flags)
{
    struct bpf_create_map_attr map_attr = {};

    map_attr.map_type = map_type;
    map_attr.map_flags = map_flags;
    map_attr.key_size = key_size;
    map_attr.value_size = value_size;
    map_attr.max_entries = max_entries;

    return bpf_create_map_xattr(&map_attr);
}

int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr) { union bpf_attr attr; memset(&attr, '\0', sizeof(attr)); // 完成 bpf_attr的赋值初始化 attr.map_type = create_attr->map_type; attr.key_size = create_attr->key_size; attr.value_size = create_attr->value_size; attr.max_entries = create_attr->max_entries; attr.map_flags = create_attr->map_flags; if (create_attr->name) memcpy(attr.map_name, create_attr->name, min(strlen(create_attr->name), BPF_OBJ_NAME_LEN - 1)); attr.numa_node = create_attr->numa_node; attr.btf_fd = create_attr->btf_fd; attr.btf_key_type_id = create_attr->btf_key_type_id; attr.btf_value_type_id = create_attr->btf_value_type_id; attr.map_ifindex = create_attr->map_ifindex; if (attr.map_type == BPF_MAP_TYPE_STRUCT_OPS) attr.btf_vmlinux_value_type_id = create_attr->btf_vmlinux_value_type_id; else attr.inner_map_fd = create_attr->inner_map_fd; //调用bpf 系统调用 创建 一个map bpf 第一个参数为命令参数,比如: BPF_MAP_CREATE BPF_MAP_UPDATE_ELEM BPF_MAP_DELETE_ELEM return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); }
可以看到 实际上 会调用一个map_create 函数 分配内存 并初始化一个map
static int map_create(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct bpf_map_memory mem; struct bpf_map *map; int f_flags; int err; err = CHECK_ATTR(BPF_MAP_CREATE); if (err) return -EINVAL; if (attr->btf_vmlinux_value_type_id) { if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS || attr->btf_key_type_id || attr->btf_value_type_id) return -EINVAL; } else if (attr->btf_key_type_id && !attr->btf_value_type_id) { return -EINVAL; } f_flags = bpf_get_file_flag(attr->map_flags); if (f_flags < 0) return f_flags; if (numa_node != NUMA_NO_NODE && ((unsigned int)numa_node >= nr_node_ids || !node_online(numa_node))) return -EINVAL; /* find map type and init map: hashtable vs rbtree vs bloom vs ... 分配内存使用 */ map = find_and_alloc_map(attr); if (IS_ERR(map)) return PTR_ERR(map); err = bpf_obj_name_cpy(map->name, attr->map_name, sizeof(attr->map_name)); if (err < 0) goto free_map; atomic64_set(&map->refcnt, 1); atomic64_set(&map->usercnt, 1); mutex_init(&map->freeze_mutex); map->spin_lock_off = -EINVAL; ---------------------------------------------- err = bpf_map_alloc_id(map); // 将map 和 idx-id 相关联索引 if (err) goto free_map_sec; err = bpf_map_new_fd(map, f_flags);// 将map 和fd 关联 一切皆文件 if (err < 0) { /* failed to allocate fd. * bpf_map_put_with_uref() is needed because the above * bpf_map_alloc_id() has published the map * to the userspace and the userspace may * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. */ bpf_map_put_with_uref(map); return err; } return err; }
map_create 会调用:对应map_type的ops去分配内存等
以map_array为例:

/* Operations table for the array map; every slot points at an array_map_* function. */
static const struct bpf_map_ops array_ops = {
    .map_alloc = array_map_alloc,
    .map_free = array_map_free,
    .map_get_next_key = array_map_get_next_key,
    .map_lookup_elem = array_map_lookup_elem,
    .map_update_elem = array_map_update_elem,
    .map_delete_elem = array_map_delete_elem,
};

/* Binds array_ops to the BPF_MAP_TYPE_ARRAY map type. */
static struct bpf_map_type_list array_type __read_mostly = {
    .ops = &array_ops,
    .type = BPF_MAP_TYPE_ARRAY,
};

/*
 * array_map_alloc - validate the attributes and allocate a struct bpf_array.
 *
 * Returns a pointer to the embedded struct bpf_map on success, or
 * ERR_PTR(-EINVAL / -E2BIG / -ENOMEM) on failure.
 */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
    bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
    u32 elem_size, index_mask, max_entries;
    bool unpriv = !capable(CAP_SYS_ADMIN);
    struct bpf_array *array;
    u64 array_size, mask64;

    /* check sanity of attributes */
    if (attr->max_entries == 0 || attr->key_size != 4 ||
        attr->value_size == 0 || attr->map_flags)
        return ERR_PTR(-EINVAL);

    if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1))
        /* if value_size is bigger, the user space won't be able to
         * access the elements.
         */
        return ERR_PTR(-E2BIG);

    /* (1.1.1) Compute the value size; the key size needs neither computing
     * nor storing, because here the key is directly the index.
     */
    elem_size = round_up(attr->value_size, 8);

    max_entries = attr->max_entries;

    /* On 32 bit archs roundup_pow_of_two() with max_entries that has
     * upper most bit set in u32 space is undefined behavior due to
     * resulting 1U << 32, so do it manually here in u64 space.
     */
    mask64 = fls_long(max_entries - 1);
    mask64 = 1ULL << mask64;
    mask64 -= 1;

    index_mask = mask64;
    if (unpriv) {
        /* round up array size to nearest power of 2,
         * since cpu will speculate within index_mask limits
         */
        max_entries = index_mask + 1;
        /* Check for overflows. */
        if (max_entries < attr->max_entries)
            return ERR_PTR(-E2BIG);
    }

    /* (1.1.2) Compute the total size of bpf_array plus the value array;
     * bpf_array contains the generic map structure bpf_map.
     */
    array_size = sizeof(*array);
    if (percpu)
        array_size += (u64) max_entries * sizeof(void *);
    else
        array_size += (u64) max_entries * elem_size;

    /* make sure there is no u32 overflow later in round_up() */
    if (array_size >= U32_MAX - PAGE_SIZE)
        return ERR_PTR(-ENOMEM);

    /* allocate all map elements and zero-initialize them */
    /* (1.1.3) Allocate the bpf_array space according to the total size. */
    array = bpf_map_area_alloc(array_size);
    if (!array)
        return ERR_PTR(-ENOMEM);
    array->index_mask = index_mask;
    array->map.unpriv_array = unpriv;

    /* copy mandatory map attributes */
    /* (1.1.4) Copy attr into array->map. */
    array->map.map_type = attr->map_type;
    array->map.key_size = attr->key_size;
    array->map.value_size = attr->value_size;
    array->map.max_entries = attr->max_entries;
    array->elem_size = elem_size;

    if (!percpu)
        goto out;

    /* Per-CPU maps: also account for one value array per possible CPU. */
    array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();

    if (array_size >= U32_MAX - PAGE_SIZE ||
        elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
        bpf_map_area_free(array);
        return ERR_PTR(-ENOMEM);
    }
out:
    /* Record the footprint in pages, rounded up to a whole page. */
    array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;

    return &array->map;
}
bpf_load_program:用 BPF_PROG_LOAD 命令进行 bpf 系统调用,加载 BPF 程序到内核中:
- 拷贝程序到内核;
- 校验它的安全性;
- 如果可能对它进行JIT编译;
- 然后分配一个文件句柄fd给它
完成这一切后,后续再把这段BPF程序挂载到需要运行的钩子上面。
static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) { enum bpf_prog_type type = attr->prog_type; struct bpf_prog *prog; int err; char license[128]; bool is_gpl; if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT | BPF_F_TEST_STATE_FREQ | BPF_F_TEST_RND_HI32)) return -EINVAL; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && !bpf_capable()) return -EPERM; /* copy eBPF program license from user space 根据attr->license地址,从用户空间拷贝license字符串到内核 */ if (strncpy_from_user(license, u64_to_user_ptr(attr->license), sizeof(license) - 1) < 0) return -EFAULT; license[sizeof(license) - 1] = 0; /* eBPF programs must be GPL compatible to use GPL-ed functions 判断license是否符合GPL协议*/ is_gpl = license_is_gpl_compatible(license); //判断BPF的总指令数是否超过BPF_MAXINSNS(4k) if (attr->insn_cnt == 0 || attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) return -E2BIG; //对BPF_PROG_TYPE_SOCKET_FILTER和BPF_PROG_TYPE_CGROUP_SKB以外的BPF程序加载,需要管理员权限 if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && !bpf_capable()) return -EPERM; //对 CGROUP SOCK等需要admin 权限 或者 对应net 空间的权限 if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN)) return -EPERM; if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; bpf_prog_load_fixup_attach_type(attr); if (bpf_prog_load_check_attach(type, attr->expected_attach_type, attr->attach_btf_id, attr->attach_prog_fd)) return -EINVAL; /* plain bpf_prog allocation 根据BPF指令数分配bpf_prog空间,和bpf_prog->aux空间*/ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); if (!prog) return -ENOMEM; prog->expected_attach_type = attr->expected_attach_type; prog->aux->attach_btf_id = attr->attach_btf_id; if (attr->attach_prog_fd) { struct bpf_prog *tgt_prog; tgt_prog = bpf_prog_get(attr->attach_prog_fd); if (IS_ERR(tgt_prog)) { err = PTR_ERR(tgt_prog); goto 
free_prog_nouncharge; } prog->aux->linked_prog = tgt_prog; } prog->aux->offload_requested = !!attr->prog_ifindex; err = security_bpf_prog_alloc(prog->aux); if (err) goto free_prog_nouncharge; err = bpf_prog_charge_memlock(prog); if (err) goto free_prog_sec; prog->len = attr->insn_cnt; err = -EFAULT;//把BPF代码从用户空间地址attr->insns,拷贝到内核空间地址prog->insns if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns), bpf_prog_insn_size(prog)) != 0) goto free_prog; prog->orig_prog = NULL; prog->jited = 0; atomic64_set(&prog->aux->refcnt, 1); prog->gpl_compatible = is_gpl ? 1 : 0; if (bpf_prog_is_dev_bound(prog->aux)) { err = bpf_prog_offload_init(prog, attr); if (err) goto free_prog; } /* find program type: socket_filter vs tracing_filter 根据attr->prog_type指定的type值,找到对应的bpf_prog_types, 给bpf_prog->aux->ops赋值,这个ops是一个函数操作集*/ err = find_prog_type(type, prog); if (err < 0) goto free_prog; prog->aux->load_time = ktime_get_boottime_ns(); err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, sizeof(attr->prog_name)); if (err < 0) goto free_prog; /* run eBPF verifier 使用verifer对BPF程序进行合法性扫描 */ err = bpf_check(&prog, attr, uattr); if (err < 0) goto free_used_maps; /*尝试对BPF程序进行JIT转换*/ prog = bpf_prog_select_runtime(prog, &err); if (err < 0) goto free_used_maps; //给BPF程序分配关联一个idx id索引 err = bpf_prog_alloc_id(prog); if (err) goto free_used_maps; /* Upon success of bpf_prog_alloc_id(), the BPF prog is * effectively publicly exposed. However, retrieving via * bpf_prog_get_fd_by_id() will take another reference, * therefore it cannot be gone underneath us. * * Only for the time /after/ successful bpf_prog_new_fd() * and before returning to userspace, we might just hold * one reference and any parallel close on that fd could * rip everything out. Hence, below notifications must * happen before bpf_prog_new_fd(). * * Also, any failure handling from this point onwards must * be using bpf_prog_put() given the program is exposed. 
*/ bpf_prog_kallsyms_add(prog); perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); bpf_audit_prog(prog, BPF_AUDIT_LOAD); //给BPF程序分配一个文件句柄fd err = bpf_prog_new_fd(prog); if (err < 0) bpf_prog_put(prog); return err; -------------------------------- }
bpf_prog_attach:如何把我的bpf程序,attach到这些类型上:
重定向程序作为BPF_SK_SKB_STREAM_VERDICT附加到sockmap; 它应返回bpf_sk_redirect_map()的结果。
一个strparser程序通过BPF_SK_SKB_STREAM_PARSER附加,并且应返回已解析数据的长度。
能够获取什么样的context?
指向包含包元数据/数据的结构 __sk_buff 的指针。但是,sk_skb 程序类型可以访问更多字段。可用的额外字段集记录在 include/linux/bpf.h 中,如下所示:
什么时候会运行?
可以通过把 BPF_SK_SKB_STREAM_PARSER 附加到 sockmap 上来把一个 stream parser 附加到一个 socket 上;然后,当 socket 通过 kernel/bpf/sockmap.c 中的 smap_parse_func_strparser() 接收数据时,该程序就会执行。BPF_SK_SKB_STREAM_VERDICT 也会附加到 sockmap 上,它通过 smap_verdict_func() 来执行。
/* bpf_load_program bpf_prog_attach(verdict_prog, map_fd, BPF_SMAP_STREAM_VERDICT, 0); int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, unsigned int flags) { DECLARE_LIBBPF_OPTS(bpf_prog_attach_opts, opts, .flags = flags, ); return bpf_prog_attach_xattr(prog_fd, target_fd, type, &opts); } int bpf_prog_attach_xattr(int prog_fd, int target_fd, enum bpf_attach_type type, const struct bpf_prog_attach_opts *opts) { union bpf_attr attr; if (!OPTS_VALID(opts, bpf_prog_attach_opts)) return -EINVAL; memset(&attr, 0, sizeof(attr)); attr.target_fd = target_fd; attr.attach_bpf_fd = prog_fd; attr.attach_type = type; attr.attach_flags = OPTS_GET(opts, flags, 0); attr.replace_bpf_fd = OPTS_GET(opts, replace_prog_fd, 0); return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)); }
int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog **pprog; switch (which) { ------------------------------------------ case BPF_SK_SKB_STREAM_PARSER: pprog = &progs->skb_parser; break; case BPF_SK_SKB_STREAM_VERDICT: pprog = &progs->skb_verdict; break; } psock_set_prog(pprog, prog); return 0; }
int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) { u32 ufd = attr->target_fd; struct bpf_map *map; struct fd f; int ret; if (attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); ret = sock_map_prog_update(map, prog, NULL, attr->attach_type);---//找到对应的sk_psock_progs 并更新 fdput(f); return ret; } */ static int bpf_prog_attach(const union bpf_attr *attr) { enum bpf_prog_type ptype;BPF_SOCK_STREAM_VERDICT struct bpf_prog *prog = NULL; int ret; if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; if (attr->attach_flags & ~BPF_F_ATTACH_MASK) return -EINVAL; //------BPF_SK_SKB_STREAM_VERDICT-------> transmit -----BPF_PROG_TYPE_SK_SKB 也就是attach type 转换为 prog-type ptype = attach_type_to_prog_type(attr->attach_type); if (ptype == BPF_PROG_TYPE_UNSPEC) return -EINVAL; prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) { bpf_prog_put(prog); return -EINVAL; } /* const struct bpf_verifier_ops sk_skb_verifier_ops = { .get_func_proto = sk_skb_func_proto,--------------bpf_sk_redirect_map_proto----------bpf_msg_redirect_map .is_valid_access = sk_skb_is_valid_access, .convert_ctx_access = sk_skb_convert_ctx_access, .gen_prologue = sk_skb_prologue, }; */ switch (ptype) { case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: ret = sock_map_get_from_fd(attr, prog);// 根据target_fd 找到 map 并关联对应map break; case BPF_PROG_TYPE_LIRC_MODE2: ret = lirc_prog_attach(attr, prog); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: ret = netns_bpf_prog_attach(attr, prog); break; case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: ret = cgroup_bpf_prog_attach(attr, ptype, prog); break; default: ret = -EINVAL; } if (ret) 
bpf_prog_put(prog); return ret; }
At this point we have an established sock_map eBPF map, with two eBPF programs attached: parser and verdict. The next step is to add a TCP socket descriptor to this map:
int val = fd; bpf_map_update_elem(sock_map, &idx, &val, BPF_ANY);
bpf_map_update_elem: 将fd socket 和map相关联
会执行系统调用 bpf(BPF_MAP_UPDATE_ELEM,-----) 最后调用map_update_elem 函数处理
static int map_update_elem(union bpf_attr *attr) { void __user *ukey = u64_to_user_ptr(attr->key);// 对应idx 索引 void __user *uvalue = u64_to_user_ptr(attr->value);//对应 键值 value 比如 需要执行动作的socket--fd int ufd = attr->map_fd; ----------------------- f = fdget(ufd);// map_fd--->file--->对应的map 内存 map = __bpf_map_get(f);// map_fd--->file--->对应的map 内存 f.file->private_data; ------------------------------ ----------------------------------// 将 key value 更新到map 中 err = bpf_map_update_value(map, f, key, value, attr->flags); }
static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, void *value, __u64 flags) { int err; /* Need to create a kthread, thus must support schedule */ if (bpf_map_is_dev_bound(map)) { return bpf_map_offload_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || map->map_type == BPF_MAP_TYPE_SOCKHASH || map->map_type == BPF_MAP_TYPE_SOCKMAP ||//sock_map_update_elem map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { return map->ops->map_update_elem(map, key, value, flags);// } //------------------ return err;
以sock_map_update_elem 为例查看
static int sock_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { u32 idx = *(u32 *)key; struct socket *sock; struct sock *sk; int ret; u64 ufd; if (map->value_size == sizeof(u64)) ufd = *(u64 *)value; else ufd = *(u32 *)value; --------------------------- sock = sockfd_lookup(ufd, &ret);// 根据value:sockt-fd 找到对应的struct socket ---------- sk = sock->sk;//sock---对应的net sk 结构体 ----------- ret = sock_map_update_common(map, idx, sk, flags); }
static int sock_map_update_common(struct bpf_map *map, u32 idx, struct sock *sk, u64 flags) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct sk_psock_link *link; struct sk_psock *psock; struct sock *osk; int ret; link = sk_psock_init_link();//分配内存 /* Only sockets we can redirect into/from in BPF need to hold * refs to parser/verdict progs and have their sk_data_ready * and sk_write_space callbacks overridden. */ ret = sock_map_link(map, &stab->progs, sk); psock = sk_psock(sk); WARN_ON_ONCE(!psock); raw_spin_lock_bh(&stab->lock); osk = stab->sks[idx]; sock_map_add_link(psock, link, map, &stab->sks[idx]); stab->sks[idx] = sk; sock_map_unref(osk, &stab->sks[idx]); return 0; } static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, struct sock *sk) { struct bpf_prog *msg_parser, *skb_parser, *skb_verdict; struct sk_psock *psock; bool skb_progs; int ret; skb_verdict = READ_ONCE(progs->skb_verdict);-------赋值见-sock_map_prog_update skb_parser = READ_ONCE(progs->skb_parser); skb_progs = skb_parser && skb_verdict; --------------------- msg_parser = READ_ONCE(progs->msg_parser); ------------------ psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { ret = PTR_ERR(psock); goto out_progs; } ------------------- psock = sk_psock_init(sk, map->numa_node); 将sk 和psock 相关联:创建psock ;psock->sk = sk; --------------------- //主要是sk->sk_prot=ops 替换sk 的ops 函数;替换为bpf_ops ret = sock_map_init_proto(sk, psock); if (ret < 0) goto out_drop; if (skb_progs && !psock->parser.enabled) { ret = sk_psock_init_strp(sk, psock);//设置strparser cb 回调函数 if (ret) { write_unlock_bh(&sk->sk_callback_lock); goto out_drop; } psock_set_prog(&psock->progs.skb_verdict, skb_verdict); psock_set_prog(&psock->progs.skb_parser, skb_parser); //设置 sk 的data_ready 数据到达唤醒函数 sk_psock_start_strp(sk, psock); } return 0; } void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { struct sk_psock_parser *parser = &psock->parser; if (parser->enabled) return; 
parser->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = sk_psock_strp_data_ready; sk->sk_write_space = sk_psock_write_space; parser->enabled = true; } int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) { static const struct strp_callbacks cb = { .rcv_msg = sk_psock_strp_read, .read_sock_done = sk_psock_strp_read_done, .parse_msg = sk_psock_strp_parse, }; psock->parser.enabled = false; return strp_init(&psock->parser.strp, sk, &cb); } 设置strparser cb 回调函数 int strp_init(struct strparser *strp, struct sock *sk, const struct strp_callbacks *cb) { -------------------- /* The sk (sock) arg determines the mode of the stream parser. * * If the sock is set then the strparser is in receive callback mode. * The upper layer calls strp_data_ready to kick receive processing * and strparser calls the read_sock function on the socket to * get packets. * * If the sock is not set then the strparser is in general mode. * The upper layer calls strp_process for each skb to be parsed. */ --------------- memset(strp, 0, sizeof(*strp)); strp->sk = sk; strp->cb.lock = cb->lock ? : strp_sock_lock; strp->cb.unlock = cb->unlock ? : strp_sock_unlock; strp->cb.rcv_msg = cb->rcv_msg; strp->cb.parse_msg = cb->parse_msg; strp->cb.read_sock_done = cb->read_sock_done ? : default_read_sock_done; strp->cb.abort_parser = cb->abort_parser ? 
: strp_abort_strp; INIT_DELAYED_WORK(&strp->msg_timer_work, strp_msg_timeout); INIT_WORK(&strp->work, strp_work); return 0; } static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], struct proto *base) { prot[TCP_BPF_BASE] = *base; prot[TCP_BPF_BASE].unhash = sock_map_unhash; prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; prot[TCP_BPF_BASE].stream_memory_read = tcp_bpf_stream_read; prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage; } struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) { int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; if (!psock->sk_proto) { struct proto *ops = READ_ONCE(sk->sk_prot); if (tcp_bpf_assert_proto_ops(ops)) return ERR_PTR(-EINVAL); tcp_bpf_check_v6_needs_rebuild(sk, ops); } return &tcp_bpf_prots[family][config]; } static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) { struct proto *prot; switch (sk->sk_type) { case SOCK_STREAM: prot = tcp_bpf_get_proto(sk, psock); break; case SOCK_DGRAM: prot = udp_bpf_get_proto(sk, psock); break; sk_psock_update_proto(sk, psock, prot); return 0; }
From now on, each time our socket sd receives a packet, prog_parser and prog_verdict are called:
/* Stream parser program: reports the full skb length as the parsed length. */
SEC("prog_parser")
int _prog_parser(struct __sk_buff *skb)
{
    return skb->len;
}

/*
 * Verdict program: returns the result of redirecting the skb to the socket
 * stored in sock_map at index 0 (flags argument 0).
 */
SEC("prog_verdict")
int _prog_verdict(struct __sk_buff *skb)
{
    const uint32_t slot = 0;

    return bpf_sk_redirect_map(skb, &sock_map, slot, 0);
}
bpf_sk_redirect_map tells the kernel: for the received packet, please oh please redirect it from a receive queue of some socket, to a transmit queue of the socket living in sock_map under index 0. In our case, these are the same sockets! Here we achieved exactly what the echo server is supposed to do, but purely in eBPF.
/*
 * Helper prototype for bpf_sk_redirect_map: not GPL-only, returns an integer,
 * and takes (context pointer, constant map pointer, anything, anything).
 */
const struct bpf_func_proto bpf_sk_redirect_map_proto = {
    .func = bpf_sk_redirect_map,
    .gpl_only = false,
    .ret_type = RET_INTEGER,
    .arg1_type = ARG_PTR_TO_CTX,
    .arg2_type = ARG_CONST_MAP_PTR,
    .arg3_type = ARG_ANYTHING,
    .arg4_type = ARG_ANYTHING,
};

/*
 * NOTE(review): the prototype above names bpf_sk_redirect_map, but the helper
 * body quoted here is bpf_msg_redirect_map, which operates on a struct sk_msg.
 * Confirm which helper was meant to be shown.
 *
 * Looks up the socket stored in @map at @key and records it, together with
 * @flags, on @msg. Returns SK_PASS on success and SK_DROP otherwise.
 */
BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg,
           struct bpf_map *, map, u32, key, u64, flags)
{
    struct sock *sk;

    /* Any flag bit other than BPF_F_INGRESS is rejected. */
    if (unlikely(flags & ~(BPF_F_INGRESS)))
        return SK_DROP;

    sk = __sock_map_lookup_elem(map, key);
    /* Drop when the slot is empty or the redirect check fails for the socket. */
    if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
        return SK_DROP;

    msg->flags = flags;
    msg->sk_redir = sk;
    return SK_PASS;
}
参考学习:
eBPF学习用例:
Linux 内核观测技术 BPF书籍
https://davidlovezoe.club/wordpress/archives/862
http://arthurchiao.art/blog/cilium-life-of-a-packet-pod-to-service-zh/
https://switch-router.gitee.io/blog/strparser/
https://davidlovezoe.club/wordpress/archives/963
https://patchwork.ozlabs.org/project/netdev/patch/20170816053247.15445.69312.stgit@john-Precision-Tower-5810/
https://jishuin.proginn.com/p/763bfbd2bc4e
https://blog.csdn.net/pwl999/article/details/82884882
https://github.com/zoidbergwill/awesome-ebpf
https://patchwork.ozlabs.org/project/netdev/patch/20170816053247.15445.69312.stgit@john-Precision-Tower-5810/
https://switch-router.gitee.io/blog/strparser/
https://blogs.oracle.com/linux/notes-on-bpf-1
总结:
- eBPF程序处理截获报文的例子:psock。psock 使用 strparser,将数据包的控制权转移到 eBPF 处理程序,用户可以在 eBPF 程序里完成网络报文的重定向;sockmap 建立在 psock 之上,而 psock 的底层则是 strparser
strparser 的工作原理
核心数据结构:struct strparser 是 strparser 框架的核心数据结构,它绑定(attach)一个 TCP sock 结构 sk 和一组回调函数 cb
struct strparser { struct sock *sk; // code omitted .... struct strp_callbacks cb; };
回调函数一共有以下六个:
struct sk_buff;
struct strparser;

/*
 * The six callbacks held by a strparser.
 *
 * parse_msg is called by the framework when the strparser receives data and
 * is used to extract the length of the next application-layer message.
 * Each callback receives the owning strparser as its first argument.
 */
struct strp_callbacks {
    int (*parse_msg)(struct strparser *strp, struct sk_buff *skb);
    void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb);
    int (*read_sock_done)(struct strparser *strp, int err);
    void (*abort_parser)(struct strparser *strp, int err);
    void (*lock)(struct strparser *strp);
    void (*unlock)(struct strparser *strp);
};
parse_msg() 在 strparser 收到报文时被框架调用。它用于从报文中提取下一个应用层消息(message)的长度。一个 TCP 报文里可能不止一个应用层消息,而 parse_msg() 就是提供给使用者去识别各个消息的手段
strparser 截获报文
正常情况下,内核 TCP 层处理报文后,会调用 sock->sk_data_ready(sk) , 它的默认动作是 wake up 一个用户态进程.
void tcp_data_ready(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); // code omitted sk->sk_data_ready(sk); }
我们期望报文能进入 strparser,但报文显然不会平白无故地进入 strparser,因此,我们需要在报文的上送路径上动一些手脚:替换掉 sk->sk_data_ready 函数
static int tls_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len){ // code omitted tsk->saved_sk_data_ready = tsk->socket->sk->sk_data_ready; tsk->saved_sk_write_space = tsk->socket->sk->sk_write_space;sk_write_space tsk->socket->sk->sk_data_ready = tls_data_ready; tsk->socket->sk->sk_write_space = tls_write_space; tsk->socket->sk->sk_user_data = tsk; // code omitted }
在 psock 的例子中, sk_psock_strp_data_ready() 被赋值到 sk->sk_data_ready
void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { struct sk_psock_parser *parser = &psock->parser; // code omitted parser->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = sk_psock_strp_data_ready; sk->sk_write_space = sk_psock_write_space; parser->enabled = true; }
替换之后,当有 TCP 报文准备上送时,用户定义的 sk->sk_data_ready 函数就会被调用,在该函数中,KTLS/psock 需要调用框架函数 strp_data_ready() 将报文转交给 strparser 框架。
对 KTLS

/*
 * KTLS sk_data_ready replacement: resolves the socket's software RX context
 * and passes its strparser to strp_data_ready().
 */
static void tls_data_ready(struct sock *sk)
{
    struct tls_sw_context_rx *rx = tls_sw_ctx_rx(tls_get_ctx(sk));

    strp_data_ready(&rx->strp);
}
对 psock
/*
 * psock sk_data_ready replacement: when the socket has a psock, pass its
 * strparser to strp_data_ready() with sk_callback_lock write-held, all
 * inside an RCU read-side section.
 */
static void sk_psock_strp_data_ready(struct sock *sk)
{
    struct sk_psock *ps;

    rcu_read_lock();
    ps = sk_psock(sk);
    if (unlikely(!ps))
        goto unlock;

    write_lock_bh(&sk->sk_callback_lock);
    strp_data_ready(&ps->parser.strp);
    write_unlock_bh(&sk->sk_callback_lock);
unlock:
    rcu_read_unlock();
}
strparser 处理报文
strparser 框架拿到报文之后,通常会依次调用用户设置的 parse_msg 和 rcv_msg 回调函数,用户在回调函数里决定报文应该何去何从
strp_data_ready
  |- strp_read_sock
    |- tcp_read_sock
      |- strp_recv
        |- __strp_recv
          |- strp->cb.parse_msg(strp, head)
          ...
          |- strp->cb.rcv_msg(strp, head);
比如对 KTLS, 就是将报文上送给应用层(AF_KTLS socket) static void tls_queue(struct strparser *strp, struct sk_buff *skb) { struct tls_sock *tsk; // code omitted tsk = strp->sk->sk_user_data; // code omitted ret = sock_queue_rcv_skb((struct sock *)tsk, skb); // code omitted }
而对于 psock, 则是运行 eBPF 程序,得到动作(verdict)。
/*
 * sk_psock_strp_read - rcv_msg callback used by psock.
 *
 * Runs the skb_verdict program attached to the psock (if any) on @skb and
 * applies the resulting verdict; without a program the verdict stays
 * __SK_DROP.
 */
static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
{
    struct sk_psock *psock = sk_psock_from_strp(strp);
    struct bpf_prog *prog;
    int ret = __SK_DROP;

    rcu_read_lock();
    prog = READ_ONCE(psock->progs.skb_verdict);
    if (likely(prog)) {
        skb_orphan(skb);
        tcp_skb_bpf_redirect_clear(skb);
        ret = sk_psock_bpf_run(psock, prog, skb);
        /* original note: if we redirect, return SK_PASS */
        ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
    }
    rcu_read_unlock();
    sk_psock_verdict_apply(psock, skb, ret);
}
strparser 这个框架并不限定如何处理报文,而只是在内核层面提供给了用户一个提前处理 TCP 报文的时机和一组回调函数,用户通过不同的回调函数可以实现不同的逻辑。
https://switch-router.gitee.io/blog/strparser/-----------------------------------------*************************------------------------------------------------------
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· SQL Server 2025 AI相关能力初探
· AI编程工具终极对决:字节Trae VS Cursor,谁才是开发者新宠?
· 开源Multi-agent AI智能体框架aevatar.ai,欢迎大家贡献代码
· Manus重磅发布:全球首款通用AI代理技术深度解析与实战指南