Docker Namespace CGROUP

Namespace

Linux 初始化 init 进程(进程号为 1)时会为每个 namespaces 类型创建一个实例。后面其它进程可以创建新的 namespaces 或加入已有的 namespaces。namespace 存在父子嵌套关系。

每个进程都有 /proc/[pid]/ns/ 目录，查看 1 号进程的 ns：sudo ls -l /proc/1/ns，查看当前进程 ns：ls -l /proc/$$/ns，readlink /proc/$$/ns/uts。

/proc/sys/user 目录下的文件记录了各 namespace 的相关限制。通常 namespace 的生命周期与最后一个进程的终止有关，但也有特殊情况(例如 namespace fd 被占用着)。

Namespaces in operation：https://lwn.net/Articles/531114

clone() 创建一个新进程，可通过特定入参达到隔离
unshare() 使某进程脱离某个 namespace
setns() 把某进程加入到某个 namespace

#define _GNU_SOURCE

#include <stdio.h>
#include <sched.h>
#include <signal.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/wait.h>

/* Stack handed to clone() for the child: 1 MiB, statically allocated. */
#define STACK_SIZE (1024 * 1024)
static char container_stack[STACK_SIZE];

/* argv of the program the child execs; the trailing NULL terminates the list. */
char *const container_args[] = {"/bin/bash", NULL};

/*
 * Entry point of the cloned child: prints its parent PID and its own PID,
 * then replaces itself with the shell named in container_args.
 *
 * arg: unused.
 * Returns 1 only when execv() fails (on success it never returns).
 */
int container_main(void *arg) {
  (void) arg;
  printf("Container - PID [%5d]\n", getppid());
  printf("Container - ID  [%5d]\n", getpid());
  /* Run a shell so one can inspect whether resources are isolated in this process. */
  execv(container_args[0], container_args);
  /* Only reached when execv() failed: report why before the generic message. */
  perror("execv");
  printf("Container - Something's wrong!\n");
  return 1;
}

/*
 * Starts container_main in a child created with clone() (no namespace flags
 * yet, only SIGCHLD) and waits for it to finish.
 *
 * Returns 0 on success, 1 if the child could not be created.
 */
int main() {
  printf("Parent - PID [%5d]\n", getppid()); /* parent process ID */
  /* getpid() matches the "process ID" label and what the child prints;
     gettid() would additionally require glibc >= 2.30. */
  printf("Parent - ID  [%5d]\n", getpid()); /* process ID */
  printf("Parent - start a container!\n");
  /* Flush before clone() so the child does not inherit pending stdio output. */
  fflush(stdout);
  int flags = SIGCHLD;
  /* clone() is given the END of the buffer because the stack grows downwards. */
  int container_pid = clone(container_main, container_stack + STACK_SIZE, flags, NULL);
  if (container_pid == -1) {
    perror("clone");
    return 1;
  }
  /* Wait for the child to terminate. */
  if (waitpid(container_pid, NULL, 0) == -1) perror("waitpid");
  printf("Parent - container stopped!\n");
  return 0;
}

View Code

uts_namespaces

#define _GNU_SOURCE

#include <stdio.h>
#include <sched.h>
#include <signal.h>
#include <unistd.h>
#include <sys/wait.h>

/* Stack handed to clone() for the child: 1 MiB, statically allocated. */
#define STACK_SIZE (1024 * 1024)
static char container_stack[STACK_SIZE];

/*
 * Child entry point for the UTS namespace demo: sets the hostname inside the
 * new namespace and prints it back.
 *
 * arg: unused.
 * Returns 0.
 */
int container_main(void *arg) {
  (void) arg;
  static const char name[] = "container";
  /* The length excludes the terminating NUL; changing it here does not affect the parent. */
  if (sethostname(name, sizeof(name) - 1) != 0) perror("sethostname");
  char hostname[100] = {0};
  /* Keep the last byte as NUL in case the name is truncated. */
  if (gethostname(hostname, sizeof(hostname) - 1) != 0) perror("gethostname");
  printf("Container - hostname: %s\n", hostname);
  /* The clone child exits without running stdio cleanup, so flush explicitly. */
  fflush(stdout);
  return 0;
}

/*
 * Runs container_main in a new UTS namespace, waits for it, then prints the
 * parent's hostname to show it was not changed by the child.
 *
 * Returns 0 on success, 1 if clone() fails (e.g. when not run as root).
 */
int main() {
  int flags = CLONE_NEWUTS | SIGCHLD; /* UTS Namespace - CLONE_NEWUTS */
  int container_pid = clone(container_main, container_stack + STACK_SIZE, flags, NULL);
  if (container_pid == -1) {
    perror("clone");
    return 1;
  }
  if (waitpid(container_pid, NULL, 0) == -1) perror("waitpid");
  char hostname[100] = {0};
  if (gethostname(hostname, sizeof(hostname) - 1) != 0) perror("gethostname");
  printf("Parent - hostname: %s\n", hostname);
  return 0;
}

View Code

运行需 root 权限(除 user namespace 外，创建其它类型 namespace 都要 CAP_SYS_ADMIN 的 capability)，子进程 hostname 变成了 container。

ipc_namespaces

IPC 是 Unix/Linux 下进程间通信的一种方式，只有在同一个 Namespace 下的进程才能相互通信。IPC 需要有一个全局的 ID，Namespace 需要对这个 ID 隔离，不能让别的 Namespace 的进程看到。

创建：ipcmk -Q，查看：ipcs

#define _GNU_SOURCE

#include <stdio.h>
#include <sched.h>
#include <signal.h>
#include <unistd.h>
#include <sys/wait.h>

/* Stack handed to clone() for the child: 1 MiB, statically allocated. */
#define STACK_SIZE (1024 * 1024)
static char container_stack[STACK_SIZE];

/* argv of the shell the child execs; the trailing NULL terminates the list. */
char *const container_args[] = {"/bin/bash", NULL};

/*
 * Child entry point for the IPC namespace demo: replaces itself with the
 * shell named in container_args.
 *
 * arg: unused.
 * Returns 1 only when execv() fails.
 */
int container_main(void *arg) {
  (void) arg;
  execv(container_args[0], container_args);
  /* Only reached when execv() failed: report why before the generic message. */
  perror("execv");
  printf("Container - Something's wrong!\n");
  return 1;
}

/*
 * Runs container_main in a new IPC namespace and waits for it.
 *
 * Returns 0 on success, 1 if clone() fails (e.g. when not run as root).
 */
int main() {
  int flags = CLONE_NEWIPC | SIGCHLD; /* IPC Namespace - CLONE_NEWIPC */
  int container_pid = clone(container_main, container_stack + STACK_SIZE, flags, NULL);
  if (container_pid == -1) {
    perror("clone");
    return 1;
  }
  if (waitpid(container_pid, NULL, 0) == -1) perror("waitpid");
  return 0;
}

View Code

在子进程 shell 中 ipcs 无法查看到全局的 IPC Queue。

pid_namespaces

#define _GNU_SOURCE

#include <stdio.h>
#include <sched.h>
#include <signal.h>
#include <unistd.h>
#include <sys/wait.h>

/* Stack handed to clone() for the child: 1 MiB, statically allocated. */
#define STACK_SIZE (1024 * 1024)
static char container_stack[STACK_SIZE];

/* argv of the shell the child execs; the trailing NULL terminates the list. */
char *const container_args[] = {"/bin/bash", NULL};

/*
 * Child entry point for the PID namespace demo: prints its PID as seen from
 * inside the new namespace, then replaces itself with a shell.
 *
 * arg: unused.
 * Returns 1 only when execv() fails.
 */
int container_main(void *arg) {
  (void) arg;
  printf("Container [%5d] - inside the container!\n", getpid());
  execv(container_args[0], container_args);
  /* Only reached when execv() failed: report why before the generic message. */
  perror("execv");
  printf("Container - Something's wrong!\n");
  return 1;
}

/*
 * Runs container_main in a new PID namespace and waits for it.
 *
 * Returns 0 on success, 1 if clone() fails (e.g. when not run as root).
 */
int main() {
  int flags = CLONE_NEWPID | SIGCHLD; /* PID Namespace - CLONE_NEWPID */
  int container_pid = clone(container_main, container_stack + STACK_SIZE, flags, NULL);
  if (container_pid == -1) {
    perror("clone");
    return 1;
  }
  if (waitpid(container_pid, NULL, 0) == -1) perror("waitpid");
  return 0;
}

View Code

可以看到子进程的 PID 为 1。

传统 UNIX 系统中 PID 为 1 的进程是 init，是所有进程的父进程，有很多特权(屏蔽信号等)，另外，其还会检查所有进程的状态，若某个子进程脱离了父进程(父进程没有 wait 它)，那么 init 就会负责回收资源并结束这个子进程。所以，要做到进程空间的隔离，首先要创建出 PID 为 1 的进程。

但是，在子进程 shell 输入 ps、top 等命令，还是可以看得到所有进程。说明没有完全隔离。这是因为 ps、top 这些命令会读 /proc 文件系统。而文件系统并没有隔离。

mount_namespaces

private mount：https://www.man7.org/linux/man-pages/man2/mount.2.html

#define _GNU_SOURCE

#include <stdio.h>
#include <sched.h>
#include <signal.h>
#include <unistd.h>
#include <sys/wait.h>
#include <stdlib.h>

/* Stack handed to clone() for the child: 1 MiB, statically allocated. */
#define STACK_SIZE (1024 * 1024)
static char container_stack[STACK_SIZE];

/* argv of the shell the child execs; the trailing NULL terminates the list. */
char *const container_args[] = {"/bin/bash", NULL};

/*
 * Child entry point for the mount namespace demo: makes the inherited mount
 * tree private, mounts a fresh proc on /proc, then replaces itself with a shell.
 *
 * arg: unused.
 * Returns 1 when the mount tree cannot be made private or when execv() fails.
 */
int container_main(void *arg) {
  (void) arg;
  /* Because of shared subtrees, the whole mount tree's propagation type must
     be switched to private recursively. If that fails, stop: mounting /proc
     afterwards could propagate back to the host. */
  if (system("mount --make-rprivate /") != 0) {
    fprintf(stderr, "Container - mount --make-rprivate / failed\n");
    return 1;
  }
  /* Re-mount the proc filesystem on /proc so ps/top see only this PID namespace. */
  if (system("mount -t proc proc /proc") != 0)
    fprintf(stderr, "Container - mount -t proc proc /proc failed\n");
  printf("Container [%5d] - inside the container!\n", getpid());
  execv(container_args[0], container_args);
  /* Only reached when execv() failed: report why before the generic message. */
  perror("execv");
  printf("Container - Something's wrong!\n");
  return 1;
}

/*
 * Runs container_main in new PID and mount namespaces and waits for it.
 *
 * Returns 0 on success, 1 if clone() fails (e.g. when not run as root).
 */
int main() {
  int flags = CLONE_NEWPID | CLONE_NEWNS | SIGCHLD; /* Mount Namespace - CLONE_NEWNS */
  int container_pid = clone(container_main, container_stack + STACK_SIZE, flags, NULL);
  if (container_pid == -1) {
    perror("clone");
    return 1;
  }
  if (waitpid(container_pid, NULL, 0) == -1) perror("waitpid");
  return 0;
}

View Code

子进程 shell 输入 ps、top 等命令看不到所有进程了，在通过 CLONE_NEWNS 创建 mount namespace 后，父进程会把自己的文件结构复制给子进程。

子进程在 namespace 隔离下的所有 mount 操作都只影响自身的文件系统，不对外界产生任何影响。还有别的一些文件系统也需要这样。

模仿 Docker 的 Mount Namespace

需要一个 rootfs，要添加其他命令可使用 ldd 命令查看命令相关的 so 文件。

# Working directories: rootfs (the container's root), conf (files bind-mounted
# over rootfs/etc/*), mnt (a host directory shared into the container)
mkdir rootfs conf mnt
cp /etc/hosts conf
cp /etc/hostname conf
cp /etc/resolv.conf conf
cd rootfs
# Mount points and directories expected inside the container
mkdir -p tmp proc sys dev bin sbin usr/bin usr/sbin run etc mnt
cp /etc/hosts etc
cp /etc/hostname etc
cp /etc/resolv.conf etc
wget https://busybox.net/downloads/binaries/1.35.0-x86_64-linux-musl/busybox -O bin/busybox
# wget does not set the execute bit; without it exec of /bin/busybox fails with EACCES
chmod +x bin/busybox

View Code

启动后执行 /bin/busybox --install -s 安装 busybox。

#define _GNU_SOURCE

#include <sys/wait.h>
#include <sys/mount.h>
#include <stdio.h>
#include <sched.h>
#include <signal.h>
#include <unistd.h>

/* Stack handed to clone() for the child: 1 MiB, statically allocated. */
#define STACK_SIZE (1024 * 1024)
static char container_stack[STACK_SIZE];
/* argv of the program exec'd after chroot: busybox running its "sh" applet. */
char *const container_args[] = {"/bin/busybox", "sh", NULL};

/*
 * Child entry point for the Docker-like demo. Inside the new namespaces it
 * sets the hostname, populates ./rootfs with the usual pseudo filesystems and
 * bind mounts, chroots into it and execs a busybox shell.
 *
 * All relative paths are resolved against the current working directory,
 * which must contain rootfs/, conf/ and mnt/.
 *
 * arg: unused.
 * Returns 1 on a fatal setup error or when execv() fails.
 */
int container_main(void *arg) {
  (void) arg;
  printf("Container [%5d] - inside the container!\n", getpid());
  static const char name[] = "container";
  /* The length excludes the terminating NUL. */
  if (sethostname(name, sizeof(name) - 1) != 0) perror("sethostname");

  /* The new mount namespace starts as a copy of the parent's, including its
     propagation types. Make everything private first, otherwise the mounts
     below can propagate to the host when "/" is a shared mount. */
  if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) != 0) {
    perror("make-rprivate");
    return 1;
  }

  if (mount("proc", "rootfs/proc", "proc", 0, NULL) != 0) perror("proc");
  if (mount("sysfs", "rootfs/sys", "sysfs", 0, NULL) != 0) perror("sys");
  if (mount("none", "rootfs/tmp", "tmpfs", 0, NULL) != 0) perror("tmp");
  if (mount("udev", "rootfs/dev", "devtmpfs", 0, NULL) != 0) perror("dev");
  if (mount("devpts", "rootfs/dev/pts", "devpts", 0, NULL) != 0) perror("dev/pts");
  if (mount("shm", "rootfs/dev/shm", "tmpfs", 0, NULL) != 0) perror("dev/shm");
  if (mount("tmpfs", "rootfs/run", "tmpfs", 0, NULL) != 0) perror("run");

  /* Per-container configuration files, bind-mounted over the rootfs copies.
     Compare with /var/lib/docker/containers/<container_id>/. */
  if (mount("conf/hosts", "rootfs/etc/hosts", "none", MS_BIND, NULL) != 0 ||
      mount("conf/hostname", "rootfs/etc/hostname", "none", MS_BIND, NULL) != 0 ||
      mount("conf/resolv.conf", "rootfs/etc/resolv.conf", "none", MS_BIND, NULL) != 0) {
    perror("conf");
  }
  /* Mimics the -v option of docker run. */
  if (mount("mnt", "rootfs/mnt", "none", MS_BIND, NULL) != 0) perror("mnt");

  /* chroot confines the directory tree. Without it the exec below would run
     the host's /bin/busybox outside the rootfs, so give up instead. */
  if (chdir("./rootfs") != 0 || chroot("./") != 0) {
    perror("chdir/chroot");
    return 1;
  }

  execv(container_args[0], container_args);
  perror("exec");
  printf("Container - Something's wrong!\n");
  return 1;
}

/*
 * Starts container_main in new UTS, IPC, PID and mount namespaces and waits
 * for it to finish.
 *
 * Returns 0 on success, 1 if clone() fails (e.g. when not run as root).
 */
int main() {
  printf("Parent [%5d] - start a container!\n", getpid());
  /* Flush before clone() so the child does not inherit pending stdio output. */
  fflush(stdout);
  int flags = CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWPID | CLONE_NEWNS | SIGCHLD;
  int container_pid = clone(container_main, container_stack + STACK_SIZE, flags, NULL);
  if (container_pid == -1) {
    perror("clone");
    return 1;
  }
  if (waitpid(container_pid, NULL, 0) == -1) perror("waitpid");
  printf("Parent - container stopped!\n");
  return 0;
}

View Code

user_namespaces

#define _GNU_SOURCE

#include <stdio.h>
#include <sched.h>
#include <signal.h>
#include <unistd.h>
#include <sys/wait.h>
#include <pwd.h>
#include <stdlib.h>

/* Stack handed to clone() for the child: 1 MiB, statically allocated. */
#define STACK_SIZE (1024 * 1024)
static char container_stack[STACK_SIZE];

/*
 * Child entry point for the user namespace demo: prints the effective and
 * real user/group IDs and the user name as seen inside the new namespace.
 *
 * arg: unused.
 * Returns EXIT_SUCCESS.
 */
int container_main(void *arg) {
  (void) arg;
  /* getpwuid() returns NULL when the UID has no passwd entry, which can
     happen for the unmapped overflow UID; do not dereference it blindly. */
  struct passwd *pwd = getpwuid(getuid());
  const char *user_name = (pwd != NULL) ? pwd->pw_name : "(unknown)";
  printf("eUID=%u, eGID=%u, UID=%u, GID=%u, name=%s", geteuid(), getegid(), getuid(), getgid(), user_name);
  /* The clone child exits without running stdio cleanup, so flush explicitly. */
  fflush(stdout);
  return EXIT_SUCCESS;
}

/*
 * Runs container_main in a new user namespace and waits for it.
 *
 * Returns EXIT_SUCCESS on success, EXIT_FAILURE if clone() fails.
 */
int main() {
  int flags = CLONE_NEWUSER | SIGCHLD; /* User Namespace - CLONE_NEWUSER */
  int container_pid = clone(container_main, container_stack + STACK_SIZE, flags, NULL);
  if (container_pid == -1) {
    perror("clone");
    return EXIT_FAILURE;
  }
  if (waitpid(container_pid, NULL, 0) == -1) perror("waitpid");
  return EXIT_SUCCESS;
}

View Code

可以看到输出都是 65534，name 是 nobody。这是因为没有映射父 user namespace 的 user ID 和 group ID 到子 user namespace，这是必须的，这样系统才能控制一个 user namespace 里的用户在其它 user namespace 中的权限。

要把容器中的 uid 和真实系统的 uid 映射在一起，需要修改 /proc/<pid>/uid_map 和 /proc/<pid>/gid_map 这两个文件(这里 PID 是新 user namespace 中的进程 ID，开始时这两个文件都是空的)。这两个文件的格式为：ID-inside-ns ID-outside-ns length。

ID-inside-ns 表示在容器里显示的 UID 或 GID。
ID-outside-ns 表示容器外映射的真实 UID 或 GID。
length 表示映射范围，一般填 1，表示一一对应。

例如：0 1000 1 表示把真实的 uid=1000 映射成容器内的 uid=0；0 0 4294967295 表示把 namespace 内部从 0 开始的 uid 映射到外部从 0 开始的 uid，其最大范围是无符号 32 位整型。

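写这两个文件的进程需要这个 namespace 中的 CAP_SETUID 和 CAP_SETGID 权限，参看 Capabilities、setcap。另外，自 Linux 3.19 起，没有父 user namespace 中 CAP_SETGID 权限的进程在写 gid_map 之前，必须先向 /proc/<pid>/setgroups 写入 deny。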

#define _GNU_SOURCE

#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/mount.h>
#include <sched.h>
#include <signal.h>
#include <unistd.h>

/* Stack handed to clone() for the child: 1 MiB, statically allocated. */
#define STACK_SIZE (1024 * 1024)
static char container_stack[STACK_SIZE];
/* argv of the shell the child execs; the trailing NULL terminates the list. */
char *const container_args[] = {"/bin/bash", NULL};
/* Pipe used by the parent to tell the child that the ID mappings are written:
   [0] is the read end, [1] the write end. */
int pipefd[2];

/*
 * Writes one mapping line "inside_id outside_id len" to the given map file.
 *
 * file:       path of the file to write (e.g. /proc/PID/uid_map)
 * inside_id:  first ID as seen inside the namespace
 * outside_id: first ID as seen outside the namespace
 * len:        number of consecutive IDs mapped
 *
 * Errors are reported on stderr; nothing is returned.
 */
void set_map(const char *file, int inside_id, unsigned int outside_id, int len) {
  FILE *mapfd = fopen(file, "w");
  if (NULL == mapfd) {
    perror("open file error");
    return;
  }
  /* outside_id is unsigned, so it must be printed with %u. */
  if (fprintf(mapfd, "%d %u %d", inside_id, outside_id, len) < 0) perror("write map error");
  /* stdio buffers the line, so a rejected write only shows up when the
     stream is flushed by fclose(). */
  if (fclose(mapfd) != 0) perror("close map error");
}

/*
 * Writes a single UID mapping for process `pid` by building the path
 * /proc/<pid>/uid_map and delegating to set_map().
 *
 * pid:        process whose user namespace is being configured
 * inside_id:  first UID as seen inside the namespace
 * outside_id: first UID as seen outside the namespace
 * len:        number of consecutive UIDs mapped
 */
void set_uid_map(pid_t pid, int inside_id, unsigned int outside_id, int len) {
  char map_path[256];
  snprintf(map_path, sizeof(map_path), "/proc/%d/uid_map", pid);
  set_map(map_path, inside_id, outside_id, len);
}

/*
 * Writes a single GID mapping for process `pid` to /proc/<pid>/gid_map.
 *
 * Since Linux 3.19 a writer without CAP_SETGID in the parent user namespace
 * may only write gid_map after "deny" has been written to
 * /proc/<pid>/setgroups, so that is done first for non-root callers.
 * Root callers skip it, because "deny" permanently disables setgroups(2) in
 * the namespace.
 *
 * pid:        process whose user namespace is being configured
 * inside_id:  first GID as seen inside the namespace
 * outside_id: first GID as seen outside the namespace
 * len:        number of consecutive GIDs mapped
 */
void set_gid_map(pid_t pid, int inside_id, unsigned int outside_id, int len) {
  char file[256];
  if (geteuid() != 0) {
    snprintf(file, sizeof(file), "/proc/%d/setgroups", pid);
    FILE *fp = fopen(file, "w");
    if (fp == NULL) {
      /* Kernels before 3.19 have no such file and do not need it. */
      perror("open setgroups");
    } else {
      if (fputs("deny", fp) == EOF) perror("write setgroups");
      if (fclose(fp) != 0) perror("close setgroups");
    }
  }
  snprintf(file, sizeof(file), "/proc/%d/gid_map", pid);
  set_map(file, inside_id, outside_id, len);
}

/*
 * Child entry point for the combined user/UTS/PID/mount namespace demo.
 * Prints its IDs before and after the parent has written the UID/GID maps,
 * sets the hostname, remounts /proc and execs a shell.
 *
 * arg: unused.
 * Returns 1 only when execv() fails.
 */
int container_main(void *arg) {
  (void) arg;
  printf("Container[%5d] - eUID=%u, eGID=%u, UID=%u, GID=%u\n", getpid(), geteuid(), getegid(), getuid(), getgid());
  char ch;
  /* Close our copy of the write end, otherwise read() below never sees EOF. */
  close(pipefd[1]);
  /* Block until the parent signals (by closing its write end) that the maps are written. */
  if (read(pipefd[0], &ch, 1) < 0) perror("read");
  close(pipefd[0]);
  printf("Container[%5d] - eUID=%u, eGID=%u, UID=%u, GID=%u\n", getpid(), geteuid(), getegid(), getuid(), getgid());
  static const char name[] = "container";
  /* The length excludes the terminating NUL. */
  if (sethostname(name, sizeof(name) - 1) != 0) perror("sethostname");
  /* Remount /proc so that top and ps show the container's processes. */
  if (mount("proc", "/proc", "proc", 0, NULL) != 0) perror("mount /proc");
  execv(container_args[0], container_args);
  /* Only reached when execv() failed: report why before the generic message. */
  perror("execv");
  printf("Container[%5d] - Something's wrong!\n", getpid());
  return 1;
}

/*
 * Creates the child in new UTS, PID, mount and user namespaces, maps UID 0 and
 * GID 0 inside the namespace to the caller's real UID/GID, then releases the
 * child through the pipe and waits for it.
 *
 * Returns 0 on success, 1 if the pipe or the child cannot be created.
 */
int main() {
  /* Pipe used to hold the child back until the mappings are in place. */
  if (pipe(pipefd) != 0) {
    perror("pipe");
    return 1;
  }
  printf("Parent[%5d] - eUID=%u, eGID=%u, UID=%u, GID=%u\n", getpid(), geteuid(), getegid(), getuid(), getgid());
  /* Flush before clone() so the child does not inherit pending stdio output. */
  fflush(stdout);
  int flags = CLONE_NEWUTS | CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWUSER | SIGCHLD;
  int container_pid = clone(container_main, container_stack + STACK_SIZE, flags, NULL);
  if (container_pid == -1) {
    /* Without this check the code below would try to write /proc/-1/uid_map. */
    perror("clone");
    return 1;
  }
  /* The parent never reads from the pipe. */
  close(pipefd[0]);

  // To map the uid/gid,
  //   we need edit the /proc/PID/uid_map (or /proc/PID/gid_map) in parent
  // The file format is
  //   ID-inside-ns   ID-outside-ns   length
  // if no mapping,
  //   the uid will be taken from /proc/sys/kernel/overflowuid
  //   the gid will be taken from /proc/sys/kernel/overflowgid
  set_uid_map(container_pid, 0, getuid(), 1);
  set_gid_map(container_pid, 0, getgid(), 1);
  printf("Parent[%5d] - user/group mapping container[%5d] done!\n", getpid(), container_pid);

  /* Closing the write end makes the child's read() return, which releases it. */
  close(pipefd[1]);
  if (waitpid(container_pid, NULL, 0) == -1) perror("waitpid");
  printf("Parent[%5d] - container[%5d] stopped!\n", getpid(), container_pid);
  return 0;
}

View Code

内核会保证 CLONE_NEWUSER 先被执行，然后再执行其余的 CLONE_NEW* 标志，这样其余的 namespace 都在新的 user namespace 中创建，就可以不使用 root 权限。

每个 namespace 都有一个 user namespace 与之关联，这个 user namespace 就是创建相应 namespace 时进程所属的 user namespace，保证对任何 namespace 的操作都受 user namespace 权限的控制。

例如 uts_namespace 结构体中的 struct user_namespace *user_ns。

命令方式也可以测试：

# Start a shell in a new user namespace and print its PID (needed for the map files below)
unshare -U
echo $$
exec bash

# Show the inheritable/permitted/effective capability sets of the current shell
grep -E 'Cap(Inh|Prm|Eff)' /proc/$$/status
# NOTE(review): the remaining commands presumably run from a second shell outside
# the namespace, with xxxx replaced by the PID printed above - confirm.
# Temporarily grant bash the capabilities required to write the map files
sudo setcap cap_setgid,cap_setuid+ep /bin/bash
echo "0 $(id -u) 1" > /proc/xxxx/uid_map
echo "0 $(id -g) 1" > /proc/xxxx/gid_map
# Remove the capabilities again
sudo setcap cap_setgid,cap_setuid-ep /bin/bash

View Code

network_namespaces

这个图基本上就是 Docker 在宿主机上的网络示意图。其中的“物理网卡”并不准确，因为 Docker 可能运行在 VM 中，所以这里所谓的物理网卡其实就是一个拥有可路由 IP 的网卡。

可以使用一组命令做成图中的样子：

# Add a bridge named lxcbr0, mimicking docker0
sudo ip link add name lxcbr0 type bridge
# "brd +" uses the default broadcast address, derived from the given IP and netmask
sudo ip addr add 192.168.10.1/24 brd + dev lxcbr0
sudo ip link set lxcbr0 up
# Add a network namespace named ns1 (list with: ip netns show)
sudo ip netns add ns1
# Bring up loopback (127.0.0.1) inside ns1; "ip netns exec ns1" runs a command inside ns1
sudo ip netns exec ns1 ip link set dev lo up

# Add a veth pair of virtual NICs; one end goes into the container
sudo ip link add veth-ns1 type veth peer name lxcbr0.1
# Move veth-ns1 into namespace ns1 so the container gets a new NIC (check with: sudo ip netns exec ns1 ip a)
sudo ip link set veth-ns1 netns ns1
# Rename veth-ns1 to eth0 inside the container (the name would clash outside, not inside)
sudo ip netns exec ns1 ip link set dev veth-ns1 name eth0
# Assign an IP address to the container's NIC and bring it up
sudo ip netns exec ns1 ip addr add 192.168.10.11/24 dev eth0
sudo ip netns exec ns1 ip link set dev eth0 up

# veth-ns1 went into the container above; attach the other end, lxcbr0.1, to the bridge
sudo ip link set lxcbr0.1 master lxcbr0
# Bring the host-side end up too; a veth link only carries traffic when both ends are up
sudo ip link set lxcbr0.1 up
# Add a default route inside the container so it can reach outside networks
sudo ip netns exec ns1 ip route add default via 192.168.10.1
# Create a directory named after the network namespace under /etc/netns
sudo mkdir -p /etc/netns/ns1
# Give the namespace its own resolv.conf so that name resolution works inside it
sudo sh -c 'echo "nameserver 8.8.8.8" > /etc/netns/ns1/resolv.conf'
# Docker does not handle resolv.conf this way; it uses the mount namespace instead
# Docker uses the process PID as the name of the network namespace

View Code

甚至可以为正在运行的 docker 容器增加一个新的网卡，例如增加一个 eth1 的网卡，并给一个静态的可被外部访问到的 IP 地址：

# CONTAINER_PID must hold the PID of the container's main process,
# e.g. from: docker inspect -f '{{.State.Pid}}' <container>
# (the original "${container-pid}" is a default-value expansion in bash, not a variable named container-pid)
# "ip netns exec NAME" looks NAME up under /var/run/netns, so expose the container's netns there
sudo mkdir -p /var/run/netns
sudo ln -sfT "/proc/${CONTAINER_PID}/ns/net" "/var/run/netns/${CONTAINER_PID}"
sudo ip link add peerA type veth peer name peerB
sudo ip link set peerA master docker0
sudo ip link set peerA up
sudo ip link set peerB netns "${CONTAINER_PID}"
sudo ip netns exec "${CONTAINER_PID}" ip link set dev peerB name eth1
sudo ip netns exec "${CONTAINER_PID}" ip link set eth1 up
sudo ip netns exec "${CONTAINER_PID}" ip addr add "${ROUTEABLE_IP}" dev eth1

View Code

要被外部访问，需要把外部的“物理网卡”配置成混杂模式，这样 eth1 网卡就会通过 ARP 协议向外通告自己的 MAC 地址，然后外部的交换机就会把发往这个 IP 地址的包转到“物理网卡”上；因为是混杂模式，所以 eth1 就能收到相关的数据。这样，Docker 容器的网络就和外部通了。

当然，无论是 Docker 的 NAT 方式，还是混杂模式都会有性能上的问题。NAT 存在转发的开销，混杂模式下物理网卡收到的负载都会完全交给所有的虚拟网卡，于是就算一个网卡上没有数据，也会被其它网卡上的数据所影响。

这两种方式都不够完美，真正解决这种网络问题需要使用 VLAN，Google 为 Linux 内核实现了一个 IPVLAN 的驱动，这基本上是为 Docker 量身定制的。

time_namespaces

cgroup_namespaces

使用 cgroup namespace 需要内核开启 CONFIG_CGROUPS 选项。

CGROUP

Linux Control Group 是 Linux 内核的一个功能，用来限制、控制、分离一个进程组群的资源：CPU、内存、磁盘输入输出等，主要提供了如下功能：

Resource limitation：限制资源使用，比如内存使用上限以及文件系统的缓存限制
Prioritization：优先级控制，比如：CPU利用和磁盘IO吞吐
Accounting：一些审计或一些统计，主要目的是为了计费
Control：挂起进程，恢复执行进程

在实践中，系统管理员一般会利用 CGroup 做下面这些事(有点像为某个虚拟机分配资源似的)：

隔离一个进程集合(比如：nginx 的所有进程)，并限制他们所消费的资源，比如绑定 CPU 的核
为这组进程分配其足够使用的内存
为这组进程分配相应的网络带宽和磁盘存储限制
限制访问某些设备(通过设置设备的白名单)

Linux 把 CGroup 这个事实现成了一个 file system

# List the mounted cgroup hierarchies (the paths below are the cgroup v1 layout)
mount -t cgroup
# lssubsys is provided by cgroup-tools
sudo apt install -y cgroup-tools
lssubsys -m

# Sample output of lssubsys -m:
# cpuset /sys/fs/cgroup/cpuset
# cpu /sys/fs/cgroup/cpu
# cpuacct /sys/fs/cgroup/cpuacct
# blkio /sys/fs/cgroup/blkio
# memory /sys/fs/cgroup/memory
# devices /sys/fs/cgroup/devices
# freezer /sys/fs/cgroup/freezer
# net_cls /sys/fs/cgroup/net_cls
# perf_event /sys/fs/cgroup/perf_event
# net_prio /sys/fs/cgroup/net_prio
# hugetlb /sys/fs/cgroup/hugetlb
# pids /sys/fs/cgroup/pids
# rdma /sys/fs/cgroup/rdma
# misc /sys/fs/cgroup/misc

# If the directories above do not exist they can be mounted by hand, for example:
mkdir cgroup
mount -t tmpfs cgroup_root ./cgroup
mkdir cgroup/cpuset
mount -t cgroup -ocpuset cpuset ./cgroup/cpuset/
mkdir cgroup/cpu
mount -t cgroup -ocpu cpu ./cgroup/cpu/
mkdir cgroup/memory
mount -t cgroup -omemory memory ./cgroup/memory/

# mkdir inside any subdirectory of /sys/fs/cgroup: the new directory is immediately populated with many files
ls /sys/fs/cgroup/cpu /sys/fs/cgroup/cpuset
# sudo mount -o remount,rw /sys/fs/cgroup
cd /sys/fs/cgroup/cpu && sudo mkdir test
# sudo mount -o remount,ro /sys/fs/cgroup

View Code

cgroup.procs 和 tasks

cgroup.procs 包含的是进程 ID，tasks 里面包含的是线程 ID。进程或线程结束后 ID 会被自动移除

# List the threads (-L) of process xxx; the fourth column is the thread ID
ps -efL | grep xxx
# root 370 367 370 99 6 12:00 pts/2 00:02:31 [xxx] <defunct>
# root 370 367 374 99 6 12:00 pts/2 00:00:30 xxx
# root 370 367 375 99 6 12:00 pts/2 00:00:30 xxx
# root 370 367 376 99 6 12:00 pts/2 00:00:30 xxx
# root 370 367 377 99 6 12:00 pts/2 00:00:30 xxx
# root 370 367 378 99 6 12:00 pts/2 00:00:30 xxx

# Thread: add a single thread ID to tasks
sudo sh -c 'echo 374 >> /sys/fs/cgroup/cpu/test/tasks'
cat /sys/fs/cgroup/cpu/test/tasks
# 374
cat /sys/fs/cgroup/cpu/test/cgroup.procs
# 370

# Thread: add a second thread ID to tasks
sudo sh -c 'echo 375 >> /sys/fs/cgroup/cpu/test/tasks'
cat /sys/fs/cgroup/cpu/test/tasks
# 374
# 375
cat /sys/fs/cgroup/cpu/test/cgroup.procs
# 370

# Process: add the process ID to cgroup.procs; tasks then lists all of its threads
sudo sh -c 'echo 370 >> /sys/fs/cgroup/cpu/test/cgroup.procs'
cat /sys/fs/cgroup/cpu/test/cgroup.procs
# 370
cat /sys/fs/cgroup/cpu/test/tasks
# 374
# 375
# 376
# 377
# 378

View Code

CPU 限制

deadloop，运行后 top 查看 cpu 占用

/*
 * Busy loop that never terminates; used to saturate one CPU so that the
 * effect of a cgroup CPU limit can be observed in top.
 */
int main(void) {
  /* unsigned: wrap-around is well defined (signed overflow is undefined behaviour).
     volatile: keeps the increment from being optimized away. */
  volatile unsigned int i = 0;
  for (;;) i++;
  return 0;
}

View Code

限制 deadloop 的 cpu 占用，设置后再 top 查看 cpu 占用

# Create a cgroup named test
sudo mkdir /sys/fs/cgroup/cpu/test
# Show, then set, the CPU usage limit of the test group
cat /sys/fs/cgroup/cpu/test/cpu.cfs_quota_us
sudo sh -c 'echo 20000 > /sys/fs/cgroup/cpu/test/cpu.cfs_quota_us'
# Look up the PID of the deadloop process (987 here) and add it to the cgroup
sudo sh -c 'echo 987 >> /sys/fs/cgroup/cpu/test/tasks'

View Code

线程示例，root 权限运行

// #define _GNU_SOURCE // See feature_test_macros(7)
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Default number of worker threads when none (or an out-of-range value) is given on the command line. */
const int NUM_THREADS = 5;

/*
 * Appends `tid` to the cgroup tasks file at `tasks_path`.
 * Returns 0 on success, -1 on failure (after reporting it on stderr).
 */
static int add_tid_to_cgroup(const char *tasks_path, long tid) {
  FILE *fp = fopen(tasks_path, "a");
  if (fp == NULL) {
    perror(tasks_path);
    return -1;
  }
  int rc = (fprintf(fp, "%ld\n", tid) < 0) ? -1 : 0;
  /* The kernel accepts or rejects the ID when the buffered line is flushed. */
  if (fclose(fp) != 0) rc = -1;
  if (rc != 0) perror(tasks_path);
  return rc;
}

/*
 * Thread body: adds the calling thread to the "test" cpu and cpuset cgroups,
 * prints its index and kernel thread ID, then spins forever.
 *
 * threadId: the thread's index, passed as an integer cast to void *.
 */
void *thread_main(void *threadId) {
  /* syscall(SYS_gettid) yields the kernel thread ID, which is what the tasks file expects. */
  long tid = syscall(SYS_gettid);
  /* Write the ID directly instead of spawning a shell through system(). */
  add_tid_to_cgroup("/sys/fs/cgroup/cpu/test/tasks", tid);
  add_tid_to_cgroup("/sys/fs/cgroup/cpuset/test/tasks", tid);
  printf("Thread: thread %ld, pid %ld\n", (long) threadId, tid);

  unsigned long long i = 0;
  while (1) i++;
  pthread_exit(NULL);
}

/*
 * Creates the "test" cpu and cpuset cgroups, configures them, and starts the
 * worker threads (which add themselves to those cgroups). Must run as root.
 *
 * argv[1] (optional): number of threads, 1..99; anything else falls back to
 * NUM_THREADS.
 */
int main(int argc, char *argv[]) {
  long num_threads = NUM_THREADS;
  /* strtol() has defined behaviour on out-of-range input, unlike atol(). */
  if (argc > 1) num_threads = strtol(argv[1], NULL, 10);
  if (num_threads <= 0 || num_threads >= 100) num_threads = NUM_THREADS;

  /* The mode must be octal: decimal 755 is 01363 and gives nonsensical permissions.
     The result is ignored because the directory may already exist. */
  mkdir("/sys/fs/cgroup/cpu/test", 0755);
  system("echo 50000 > /sys/fs/cgroup/cpu/test/cpu.cfs_quota_us"); // limit CPU usage to 50%
  mkdir("/sys/fs/cgroup/cpuset/test", 0755);
  system("echo 2,3 > /sys/fs/cgroup/cpuset/test/cpuset.cpus"); // restrict to CPU cores #2 and #3
  /* A cpuset accepts tasks only after both cpuset.cpus and cpuset.mems are set. */
  system("echo 0 > /sys/fs/cgroup/cpuset/test/cpuset.mems");

  pthread_t threads[num_threads];
  for (long t = 0; t < num_threads; t++) {
    printf("Main: creating thread %ld\n", t);
    int rc = pthread_create(&threads[t], NULL, thread_main, (void *) t);
    if (rc) {
      printf("Main: error, return code from pthread_create() is %d\n", rc);
      exit(-1);
    }
  }
  /* Exit only the main thread so the workers keep running. */
  pthread_exit(NULL); // Last thing that main() should do
}

View Code

内存限制

下面的代码是个死循环，不断分配内存：每次多分配 512 个字节，然后等待一秒

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/*
 * Grows a single buffer by 512 bytes per second and touches all of it, so
 * that the process's memory use keeps rising until a limit is hit.
 *
 * Returns 0 after an allocation failure; otherwise it runs until killed.
 */
int main(void) {
  int size = 0, chunk_size = 512;
  void *p = NULL;
  while (1) {
    /* Keep the old pointer until realloc() succeeds, so it can still be freed on failure. */
    void *grown = realloc(p, size += chunk_size);
    if (grown == NULL) {
      printf("out of memory!!\n");
      break;
    }
    p = grown;
    /* Write to every byte so the pages are really backed by memory. */
    memset(p, 1, size);

    // Alternative: allocate a new chunk each time instead of growing one buffer.
    // if ((p = malloc(chunk_size)) == NULL) {
    //   printf("out of memory!!\n");
    //   break;
    // }
    // memset(p, 1, chunk_size);
    // size += chunk_size;

    printf("[%d] - memory is allocated [%8d] bytes \n", getpid(), size);
    sleep(1);
  }
  free(p);
  return 0;
}

View Code

然后在 Shell 中执行下面的命令，过一会儿就会看到进程因为内存超限被 kill

# Create the cgroup
sudo mkdir /sys/fs/cgroup/memory/test
# Set the memory limit
sudo sh -c 'echo 4k > /sys/fs/cgroup/memory/test/memory.limit_in_bytes'
cat /sys/fs/cgroup/memory/test/memory.limit_in_bytes
# 100 means prefer swap, 0 means prefer memory
sudo sh -c 'echo 0 > /sys/fs/cgroup/memory/test/memory.swappiness'
# oom_kill_disable = 0: kill as soon as the limit is exceeded; 1: no kill, the process waits and continues allocating once memory is freed
# sudo sh -c 'echo 0 > /sys/fs/cgroup/memory/test/memory.oom_control'
# Add the process to the cgroup ("pid" stands for the real process ID)
sudo sh -c 'echo pid > /sys/fs/cgroup/memory/test/cgroup.procs'

View Code

磁盘 I/O 限制

sudo apt install -y iotop
# Generate sustained read I/O on /dev/sda
sudo dd if=/dev/sda of=/dev/null
# Watch the I/O rate
sudo iotop
# Create the cgroup
sudo mkdir /sys/fs/cgroup/blkio/test
# Limit reads to 1 MB/s (1048576 bytes); 8:0 is the device number, shown by ls -l /dev/sd*
sudo sh -c "echo '8:0 1048576' > /sys/fs/cgroup/blkio/test/blkio.throttle.read_bps_device"
# PID of the dd process
sudo sh -c 'echo 8128 > /sys/fs/cgroup/blkio/test/tasks'