scheduler bpf

2021年,Roman Gushchin发出了一组patch,提出scheduler bpf的框架,使得用户可以控制调度器的行为。链接:https://lwn.net/Articles/869433/

这组patch的重点在于模仿BPF LSM通过插入hook点,改变原有kernel执行逻辑。

BPF LSM是bpf的一种程序类型,它的一个特点是可以改变被probe的函数的返回值,也就是用注入代码的返回值替换原来被probe函数的返回值,再加上一些返回值判断代码,就可以改变内核函数原有的执行逻辑。

scheduler bpf采用了相同的技术。LSM为安全模块服务,scheduler bpf为调度相关的逻辑服务。

使用方法:

首先要将这组patch apply到kernel中。因为这组patch只是一个框架,并没有实际的hook点应用到内核,因此你可以自己添加一些hook点,打开CONFIG_DEBUG_INFO_BTF,编译安装kernel,重启生效。剩下的工作需要在用户态完成。作者给了一个使用方法,链接为https://github.com/rgushchin/atc。按照说明,修改Makefile中的“TREE”变量为本地kernel源码路径,然后编译。期间可能会遇到不兼容的问题,我们可以修改带有bpf后缀的源码,尽量简化直至编译成功。依赖项包括一个vmlinux.h文件和由本地kernel源码编译得到的bpftool工具。

bpftool可以在kernel源码下tools/bpf/bpftool直接make生成。

vmlinux.h的生成方法:

bpftool btf dump file /sys/kernel/btf/vmlinux format c > vmlinux.h

这依赖于本地kernel开启btf config。

有关bpf sched hook的添加可以参考欧拉kernel的5.10分支。我将一个示例patch贴在下面。

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0981c127f261..7ef4efd8cddb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2406,5 +2406,15 @@ struct sched_affine_ctx {
        KABI_RESERVE(3)
        KABI_RESERVE(4)
 };
+
+struct sched_migrate_node {
+       int src_cpu;
+       int dst_cpu;
+
+       KABI_RESERVE(1)
+       KABI_RESERVE(2)
+       KABI_RESERVE(3)
+       KABI_RESERVE(4)
+};
 #endif
 #endif
diff --git a/include/linux/sched_hook_defs.h b/include/linux/sched_hook_defs.h
index 818b1244a018..e2519a00aa6b 100644
--- a/include/linux/sched_hook_defs.h
+++ b/include/linux/sched_hook_defs.h
@@ -10,3 +10,5 @@ BPF_SCHED_HOOK(void, (void) 0, cfs_dequeue_task, struct rq *rq, struct task_stru
 BPF_SCHED_HOOK(int, -1, cfs_select_rq, struct sched_migrate_ctx *ctx)
 BPF_SCHED_HOOK(int, -1, cfs_wake_affine, struct sched_affine_ctx *ctx)
 BPF_SCHED_HOOK(int, -1, cfs_select_rq_exit, struct sched_migrate_ctx *ctx)
+BPF_SCHED_HOOK(int, -1, cfs_can_migrate_task, struct task_struct *p,
+       struct sched_migrate_node *migrate_node)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 55b607564bd9..073c0cf35d3a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9486,9 +9486,23 @@ static
 int can_migrate_task(struct task_struct *p, struct lb_env *env)
 {
        int tsk_cache_hot;
+#ifdef CONFIG_BPF_SCHED
+       struct sched_migrate_node migrate_node;
+       int ret;
+#endif

        lockdep_assert_rq_held(env->src_rq);

+#ifdef CONFIG_BPF_SCHED
+       if (bpf_sched_enabled()) {
+               migrate_node.src_cpu = env->src_cpu;
+               migrate_node.dst_cpu = env->dst_cpu;
+               ret = bpf_sched_cfs_can_migrate_task(p, &migrate_node);
+               if (ret > 0)
+                       return ret - 1;
+       }
+#endif
+
        /*
         * We do not migrate tasks that are:
         * 1) throttled_lb_pair, or

使用成功的一个bpf代码示例。

atc.bpf.c

#include "vmlinux.h"
#include <linux/types.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* License string placed in the "license" section of the BPF object. */
char LICENSE[] SEC("license") = "Dual BSD/GPL";

/*
 * Zero-initialised globals. The only program in this file does not read or
 * write any of them.
 * NOTE(review): presumably left over from the upstream atc example -- confirm
 * nothing else depends on them before removing.
 */
unsigned long tgidpid = 0;
unsigned long cgid = 0;
unsigned long allret = 0;
unsigned long max_exec_slice = 0;

/* All-ones sentinel value; not referenced by the program below. */
#define INVALID_RET ((unsigned long) -1L)

/*
 * Debug tracing is compiled out: debug() expands to nothing. Enable the
 * bpf_printk variant below (and drop the empty one) to turn it on.
 */
//#define debug(args...) bpf_printk(args)
#define debug(args...)

/*
 * Program for the cfs_can_migrate_task scheduler hook.
 *
 * Ignores both arguments and always returns -1. With the kernel-side hook
 * shown in the fair.c patch, only a return value > 0 overrides
 * can_migrate_task(), so -1 leaves the kernel's own decision in place.
 */
SEC("sched/cfs_can_migrate_task")
int BPF_PROG(my_can_migrate_task, struct task_struct *p, struct sched_migrate_node *migrate_node)
{
        const int defer_to_kernel = -1;

        return defer_to_kernel;
}

atc.c

// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)

#include <stdio.h>
#include <unistd.h>
#include <signal.h>
#include <stdlib.h>
#include <dirent.h>
#include <ctype.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/resource.h>
#include <bpf/libbpf.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include "atc.skel.h"

/*
 * Print callback handed to libbpf_set_print(): writes every message to
 * stderr, whatever its level, and returns vfprintf()'s result.
 */
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
        int written;

        (void)level;    /* no filtering by level */
        written = vfprintf(stderr, format, args);

        return written;
}

/*
 * Raise both the soft and hard RLIMIT_MEMLOCK limits to RLIM_INFINITY.
 * If setrlimit() fails, report it on stderr and exit with status 1.
 */
static void bump_memlock_rlimit(void)
{
        struct rlimit unlimited;

        unlimited.rlim_cur = RLIM_INFINITY;
        unlimited.rlim_max = RLIM_INFINITY;

        if (setrlimit(RLIMIT_MEMLOCK, &unlimited) == 0)
                return;

        fprintf(stderr, "Failed to increase RLIMIT_MEMLOCK limit!\n");
        exit(1);
}

/*
 * Open, load and attach the atc BPF skeleton, then sleep forever so the
 * process (and the skeleton it holds) stays alive until it is killed.
 *
 * Command-line arguments are not used.
 *
 * Returns 1 if the skeleton cannot be opened. If loading or attaching
 * fails, the skeleton is destroyed and the negated error code is returned.
 * On success the function never returns.
 */
int main(int argc, char **argv)
{
        struct atc_bpf *skel;
        char msg[128] = {0};
        int err;

        (void)argc;
        (void)argv;

        libbpf_set_print(libbpf_print_fn);
        bump_memlock_rlimit();

        skel = atc_bpf__open();
        if (!skel) {
                fprintf(stderr, "Failed to open BPF skeleton\n");
                return 1;
        }

        err = atc_bpf__load(skel);
        if (err) {
                fprintf(stderr, "Failed to load and verify BPF skeleton\n");
                goto cleanup;
        }

        err = atc_bpf__attach(skel);
        if (err) {
                fprintf(stderr, "Failed to attach BPF skeleton\n");
                goto cleanup;
        }

        /* msg is never filled in, so this prints an empty line. */
        printf("%s\n", msg);

        for (;;)
                sleep(1);

cleanup:
        /*
         * Only reached through the error gotos above. This program never
         * forks, so there is no child process to wait for.
         */
        atc_bpf__destroy(skel);
        return -err;
}

 

posted on 2024-12-06 11:12  半山随笔  阅读(10)  评论(0编辑  收藏  举报

导航