sched-ext is a new scheduling class introduced in the Linux kernel that provides a mechanism to implement scheduling policies as BPF (Berkeley Packet Filter) programs [1]. Such programs can also be connected to user-space counterparts to defer scheduling decisions to regular user-space processes.
State of the art
The idea of "pluggable" schedulers is not new, it was initially proposed in 2004 [2], but at that time it was strongly rejected, to prioritize the creation of a single generic scheduler (one to rule them all), that ended up being the “completely fair scheduler” (CFS).
However, with BPF and the sched-ext scheduling class, we now have the possibility to easily and quickly implement and test scheduling policies, making the “pluggable” approach an effective tool for easy experimentation.
What is the main benefit of sched-ext?
The ability to implement custom scheduling policies via BPF greatly lowers the difficulty of testing new scheduling ideas (much easier than changing CFS or replacing it with a different scheduler). With this feature researchers or developers can test their own scheduler in a safe way, without even needing to reboot the system.
How to use sched-ext in Ubuntu?
Unfortunately sched-ext is not yet available in the upstream Linux kernel, at the moment it is only available as a patch set in the Linux kernel mailing list [3] (and it is unlikely to be applied upstream in the near future, because there are still some concerns and potential issues that need to be addressed).
However, it is possible to use an experimental version of the Ubuntu linux-unstable kernel [4][5] that includes the sched-ext patch set (keep in mind that this kernel is very experimental, do not use it in production!).
How to implement a custom scheduler?
The following example implements a “toy” CPU scheduler that passes all the scheduling “enqueue” events to a user-space task that processes them in a FIFO way and sends the “dispatch” events back to the kernel.
First of all let’s implement the BPF program:
/* SPDX-License-Identifier: GPL-2.0 *//*
* Copyright 2023 Canonical Ltd.
*/#include"scx_common.bpf.h"#include"scx_toy.h"char _license[] SEC("license") = "GPL";
/*
* This contains the PID of the scheduler task itself (initialized in
* scx_toy.c).
*/constvolatile s32 usersched_pid;
/* Set when the user-space scheduler needs to run */staticbool usersched_needed;
/* Notify the user-space counterpart when the BPF program exits */structuser_exit_info uei;
/* Enqueues statistics */
u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues;
/*
* BPF map to store enqueue events.
*
* The producer of this map is this BPF program, the consumer is the user-space
* scheduler task.
*/struct {
__uint(type, BPF_MAP_TYPE_QUEUE);
__uint(max_entries, MAX_TASKS);
__type(value, struct scx_toy_enqueued_task);
} enqueued SEC(".maps");
/*
* BPF map to store dispatch events.
*
* The producer of this map is the user-space scheduler task, the consumer is
* this BPF program.
*/struct {
__uint(type, BPF_MAP_TYPE_QUEUE);
__uint(max_entries, MAX_TASKS);
__type(value, s32);
} dispatched SEC(".maps");
/* Return true if the target task "p" is a kernel thread */staticinlineboolis_kthread(conststruct task_struct *p){
return !!(p->flags & PF_KTHREAD);
}
/* Return true if the target task "p" is the user-space scheduler task */staticboolis_usersched_task(conststruct task_struct *p){
return p->pid == usersched_pid;
}
/*
* Dispatch user-space scheduler directly.
*/staticvoiddispatch_user_scheduler(void){
structtask_struct *p;
if (!usersched_needed)
return;
p = bpf_task_from_pid(usersched_pid);
if (!p)
return;
usersched_needed = false;
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
bpf_task_release(p);
}
voidBPF_STRUCT_OPS(toy_enqueue, struct task_struct *p, u64 enq_flags){
structscx_toy_enqueued_task task = {
.pid = p->pid,
};
/*
* User-space scheduler will be dispatched only when needed from
* toy_dispatch(), so we can skip it here.
*/if (is_usersched_task(p))
return;
if (is_kthread(p)) {
/*
* We want to dispatch kernel threads and the scheduler task
* directly here for efficiency reasons, rather than passing
* the events to the user-space scheduler counterpart.
*/
__sync_fetch_and_add(&nr_kernel_enqueues, 1);
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
return;
}
if (bpf_map_push_elem(&enqueued, &task, 0)) {
/*
* We couldn't push the task to the "enqueued" map, dispatch
* the event here and register the failure in the failure
* counter.
*/
__sync_fetch_and_add(&nr_failed_enqueues, 1);
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
} else {
/*
* Enqueue event will be processed and task will be dispatched
* in user-space by the scheduler task.
*/
__sync_fetch_and_add(&nr_user_enqueues, 1);
}
}
voidBPF_STRUCT_OPS(toy_dispatch, s32 cpu, struct task_struct *prev){
structtask_struct *p;
s32 pid;
dispatch_user_scheduler();
/*
* Get a dispatch event from user-space and dispatch the corresponding
* task.
*/if (bpf_map_pop_elem(&dispatched, &pid))
return;
p = bpf_task_from_pid(pid);
if (!p)
return;
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
bpf_task_release(p);
}
s32 BPF_STRUCT_OPS(toy_init){
/* Apply the "toy" scheduling class for all the tasks in the system */scx_bpf_switch_all();
return0;
}
voidBPF_STRUCT_OPS(toy_exit, struct scx_exit_info *ei){
/* Notify user-space counterpart that the BPF program terminated */uei_record(&uei, ei);
}
SEC(".struct_ops.link")
structsched_ext_ops toy_ops = {
.enqueue = (void *)toy_enqueue,
.dispatch = (void *)toy_dispatch,
.init = (void *)toy_init,
.exit = (void *)toy_exit,
.name = "toy",
};
Then we can implement the user-space counterpart, that will intercept the “enqueue” events and will dispatch them back to the kernel:
/* SPDX-License-Identifier: GPL-2.0 *//*
* Copyright 2023 Canonical Ltd.
*/#define _GNU_SOURCE#include<stdio.h>#include<unistd.h>#include<sched.h>#include<signal.h>#include<assert.h>#include<libgen.h>#include<pthread.h>#include<bpf/bpf.h>#include<sys/mman.h>#include<sys/queue.h>#include<sys/syscall.h>#include"user_exit_info.h"#include"scx_toy.skel.h"#include"scx_toy.h"constchar help_fmt[] =
"A toy sched_ext scheduler.\n""\n""See the top-level comment in .bpf.c for more details.\n""\n""Usage: %s\n""\n"" -h Display this help and exit\n";
staticvolatileint exit_req;
/*
* Descriptors used to communicate enqueue and dispatch event with the BPF
* program.
*/staticint enqueued_fd, dispatched_fd;
staticstructscx_toy *skel;
staticvoidsigint_handler(int dummy){
exit_req = 1;
}
/* Thread that periodically prints enqueue statistics */staticvoid *run_stats_printer(void *arg){
while (!exit_req) {
__u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues, total;
nr_failed_enqueues = skel->bss->nr_failed_enqueues;
nr_kernel_enqueues = skel->bss->nr_kernel_enqueues;
nr_user_enqueues = skel->bss->nr_user_enqueues;
total = nr_failed_enqueues + nr_kernel_enqueues + nr_user_enqueues;
printf("\e[1;1H\e[2J");
printf("o-----------------------o\n");
printf("| BPF SCHED ENQUEUES |\n");
printf("|-----------------------|\n");
printf("| kern: %10llu |\n", nr_kernel_enqueues);
printf("| user: %10llu |\n", nr_user_enqueues);
printf("| failed: %10llu |\n", nr_failed_enqueues);
printf("| -------------------- |\n");
printf("| total: %10llu |\n", total);
printf("o-----------------------o\n\n");
sleep(1);
}
returnNULL;
}
staticintspawn_stats_thread(void){
pthread_t stats_printer;
returnpthread_create(&stats_printer, NULL, run_stats_printer, NULL);
}
/* Send a dispatch event to the BPF program */staticintdispatch_task(s32 pid){
int err;
err = bpf_map_update_elem(dispatched_fd, NULL, &pid, 0);
if (err) {
fprintf(stderr, "Failed to dispatch task %d\n", pid);
exit_req = 1;
}
return err;
}
/* Receive all the enqueue events from the BPF program */staticvoiddrain_enqueued_map(void){
structscx_toy_enqueued_task task;
while (!bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task))
dispatch_task(task.pid);
}
/*
* Scheduler main loop: get enqueue events from the BPF program, process them
* (no-op) and send dispatch events to the BPF program.
*/staticvoidsched_main_loop(void){
while (!exit_req && !uei_exited(&skel->bss->uei)) {
drain_enqueued_map();
sched_yield();
}
}
intmain(int argc, char **argv){
structbpf_link *link;
u32 opt;
int err;
signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler);
libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
skel = scx_toy__open();
assert(skel);
skel->rodata->usersched_pid = getpid();
assert(skel->rodata->usersched_pid > 0);
while ((opt = getopt(argc, argv, "h")) != -1) {
switch (opt) {
default:
fprintf(stderr, help_fmt, basename(argv[0]));
return opt != 'h';
}
}
/*
* It's not always safe to allocate in a user space scheduler, as an
* enqueued task could hold a lock that we require in order to be able
* to allocate.
*/
err = mlockall(MCL_CURRENT | MCL_FUTURE);
if (err) {
fprintf(stderr, "Failed to prefault and lock address space: %s\n",
strerror(err));
return err;
}
assert(!scx_toy__load(skel));
/* Initialize file descriptors to communicate with the BPF program */
enqueued_fd = bpf_map__fd(skel->maps.enqueued);
dispatched_fd = bpf_map__fd(skel->maps.dispatched);
assert(enqueued_fd > 0);
assert(dispatched_fd > 0);
/* Start the thread to periodically print enqueue statistics */
err = spawn_stats_thread();
if (err) {
fprintf(stderr, "Failed to spawn stats thread: %s\n", strerror(err));
goto destroy_skel;
}
/* Register BPF program */
link = bpf_map__attach_struct_ops(skel->maps.toy_ops);
assert(link);
/* Call the scheduler main loop */sched_main_loop();
/* Unregister the BPF program and exit */bpf_link__destroy(link);
uei_print(&skel->bss->uei);
scx_toy__destroy(skel);
return0;
destroy_skel:
scx_toy__destroy(skel);
exit_req = 1;
return err;
}
To test the “toy” scheduler install the latest kernel from `ppa:arighi/sched-ext` with all the required build dependencies, following the steps documented at [5]).
Then you can load this “toy” scheduling class and replace the default CPU scheduler in Linux simply by running the following command:
$ sudo ./scx_toy
All the events that are affecting kernel threads or the scheduler task itself will be processed in kernel-space (for efficiency reasons), while the rest of the user-space tasks will be processed by the scheduler task (`scx_toy`).
The program will output some statistics of the “enqueue” and “dispatch” events done in user-space and kernel-space.
To unregister the “toy” scheduler and restore the default CFS scheduler we can simply press CTRL+c, that will also stop the scheduler task:
The sched-ext patch set has been written by Tejun Heo, David Vernet, Josh Don and Barret Rhoden (with multiple contributions from the kernel community).