scheduler bpf
2021年,Roman Gushchin发出了一组patch,提出scheduler bpf的框架,使得用户可以控制调度器的行为。链接:https://lwn.net/Articles/869433/
这组patch的重点在于模仿BPF LSM,通过插入hook点来改变kernel原有的执行逻辑。
BPF LSM是bpf的一种程序类型,它的一个特点是可以改变被probe的函数的返回值,也就是用注入代码的返回值替换被probe函数原来的返回值,再加上一些返回值判断代码,就可以改变内核函数原有的执行逻辑。
scheduler bpf采用了相同的技术。LSM为安全模块服务,scheduler bpf为调度相关的逻辑服务。
使用方法:
首先要将这组patch apply到kernel中。因为这组patch只是一个框架,并没有实际的hook点应用到内核,因此你可以自己添加一些hook点,打开CONFIG_DEBUG_INFO_BTF,编译安装kernel,重启生效。剩下的工作需要在用户态完成。作者给了一个使用方法,链接为https://github.com/rgushchin/atc。按照说明,修改Makefile中的“TREE”变量为本地kernel源码路径,然后编译。期间可能会遇到不兼容的问题,我们可以修改带有bpf后缀的源码,尽量简化直至编译成功。依赖项包括一个vmlinux.h文件和由本地kernel源码编译得到的bpftool工具。
bpftool可以在kernel源码下tools/bpf/bpftool直接make生成。
vmlinux.h的生成方法:
bpftool btf dump file /sys/kernel/btf/vmlinux format c > vmlinux.h
这依赖于本地kernel开启btf config。
有关bpf sched hook的添加可以参考欧拉kernel的5.10分支。我将一个示例patch贴在下面。
diff --git a/include/linux/sched.h b/include/linux/sched.h index 0981c127f261..7ef4efd8cddb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2406,5 +2406,15 @@ struct sched_affine_ctx { KABI_RESERVE(3) KABI_RESERVE(4) }; + +struct sched_migrate_node { + int src_cpu; + int dst_cpu; + + KABI_RESERVE(1) + KABI_RESERVE(2) + KABI_RESERVE(3) + KABI_RESERVE(4) +}; #endif #endif diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h index 818b1244a018..e2519a00aa6b 100644 --- a/include/linux/sched_hook_defs.h +++ b/include/linux/sched_hook_defs.h @@ -10,3 +10,5 @@ BPF_SCHED_HOOK(void, (void) 0, cfs_dequeue_task, struct rq *rq, struct task_stru BPF_SCHED_HOOK(int, -1, cfs_select_rq, struct sched_migrate_ctx *ctx) BPF_SCHED_HOOK(int, -1, cfs_wake_affine, struct sched_affine_ctx *ctx) BPF_SCHED_HOOK(int, -1, cfs_select_rq_exit, struct sched_migrate_ctx *ctx) +BPF_SCHED_HOOK(int, -1, cfs_can_migrate_task, struct task_struct *p, + struct sched_migrate_node *migrate_node) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 55b607564bd9..073c0cf35d3a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9486,9 +9486,23 @@ static int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot; +#ifdef CONFIG_BPF_SCHED + struct sched_migrate_node migrate_node; + int ret; +#endif lockdep_assert_rq_held(env->src_rq); +#ifdef CONFIG_BPF_SCHED + if (bpf_sched_enabled()) { + migrate_node.src_cpu = env->src_cpu; + migrate_node.dst_cpu = env->dst_cpu; + ret = bpf_sched_cfs_can_migrate_task(p, &migrate_node); + if (ret > 0) + return ret - 1; + } +#endif + /* * We do not migrate tasks that are: * 1) throttled_lb_pair, or
使用成功的一个bpf代码示例。
atc.bpf.c
#include "vmlinux.h"
#include <linux/types.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* License string placed in the "license" section of the BPF object. */
char LICENSE[] SEC("license") = "Dual BSD/GPL";

/*
 * Global variables kept from the upstream atc example.
 * None of them is referenced by the program below.
 */
unsigned long tgidpid = 0;
unsigned long cgid = 0;
unsigned long allret = 0;
unsigned long max_exec_slice = 0;

/* Sentinel value; not used by the program below. */
#define INVALID_RET ((unsigned long) -1L)

/*
 * debug() expands to nothing. To get trace output instead, swap the
 * definition for the commented-out one that forwards to bpf_printk().
 */
//#define debug(args...) bpf_printk(args)
#define debug(args...)

/*
 * Minimal program for the cfs_can_migrate_task scheduler hook.
 *
 * It always returns -1. With the example kernel patch shown earlier in this
 * document, the caller only overrides its own decision when the hook returns
 * a value > 0 (it then returns ret - 1), so -1 leaves the kernel's default
 * can_migrate_task() logic in charge.
 */
SEC("sched/cfs_can_migrate_task")
int BPF_PROG(my_can_migrate_task, struct task_struct *p,
	     struct sched_migrate_node *migrate_node)
{
	return -1;
}
atc.c
// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) #include <stdio.h> #include <unistd.h> #include <signal.h> #include <stdlib.h> #include <dirent.h> #include <ctype.h> #include <sys/stat.h> #include <sys/types.h> #include <sys/wait.h> #include <sys/resource.h> #include <bpf/libbpf.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include "atc.skel.h" static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) { return vfprintf(stderr, format, args); } static void bump_memlock_rlimit(void) { struct rlimit rlim_new = { .rlim_cur = RLIM_INFINITY, .rlim_max = RLIM_INFINITY, }; if (setrlimit(RLIMIT_MEMLOCK, &rlim_new)) { fprintf(stderr, "Failed to increase RLIMIT_MEMLOCK limit!\n"); exit(1); } } int main(int argc, char **argv) { struct atc_bpf *skel; char msg[128] = {0}; int err, i; libbpf_set_print(libbpf_print_fn); bump_memlock_rlimit(); skel = atc_bpf__open(); if (!skel) { fprintf(stderr, "Failed to open BPF skeleton\n"); return 1; } err = atc_bpf__load(skel); if (err) { fprintf(stderr, "Failed to load and verify BPF skeleton\n"); goto cleanup; } err = atc_bpf__attach(skel); if (err) { fprintf(stderr, "Failed to attach BPF skeleton\n"); goto cleanup; } printf("%s\n", msg); for (;;) sleep(1); cleanup: atc_bpf__destroy(skel); if (child) wait(NULL); return -err; return 1; }