This article records some hands-on experiments with cgroups and mainly analyses how to calculate CPU usage under CFS, referring to the code of systemd-cgtop.
1 test case
[root@ecs]# mount -t cgroup cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd) cgroup on /sys/fs/cgroup/cpu,cpuacct type cgroup (rw,nosuid,nodev,noexec,relatime,cpuacct,cpu) cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blkio) cgroup on /sys/fs/cgroup/hugetlb type cgroup (rw,nosuid,nodev,noexec,relatime,hugetlb) cgroup on /sys/fs/cgroup/memory type cgroup (rw,nosuid,nodev,noexec,relatime,memory) cgroup on /sys/fs/cgroup/devices type cgroup (rw,nosuid,nodev,noexec,relatime,devices) cgroup on /sys/fs/cgroup/freezer type cgroup (rw,nosuid,nodev,noexec,relatime,freezer) cgroup on /sys/fs/cgroup/net_cls,net_prio type cgroup (rw,nosuid,nodev,noexec,relatime,net_prio,net_cls) cgroup on /sys/fs/cgroup/cpuset type cgroup (rw,nosuid,nodev,noexec,relatime,cpuset) cgroup on /sys/fs/cgroup/pids type cgroup (rw,nosuid,nodev,noexec,relatime,pids) cgroup on /sys/fs/cgroup/perf_event type cgroup (rw,nosuid,nodev,noexec,relatime,perf_event) [root@ecs cpu]# lssubsys -m cpuset /sys/fs/cgroup/cpuset cpu,cpuacct /sys/fs/cgroup/cpu,cpuacct memory /sys/fs/cgroup/memory devices /sys/fs/cgroup/devices freezer /sys/fs/cgroup/freezer net_cls,net_prio /sys/fs/cgroup/net_cls,net_prio blkio /sys/fs/cgroup/blkio perf_event /sys/fs/cgroup/perf_event hugetlb /sys/fs/cgroup/hugetlb pids /sys/fs/cgroup/pids
1.1 cpu restriction (cpu.cfs_quota_us)
Create a cgroup named ruletest:
[root@ecs ~]# rmdir /sys/fs/cgroup/cpu/ruletest [root@ecs ~]# mkdir /sys/fs/cgroup/cpu/ruletest [root@ecs ~]# cd /sys/fs/cgroup/cpu/ruletest [root@ecs /sys/fs/cgroup/cpu/ruletest]# ll total 0 -rw-r--r-- 1 root root 0 Jun 14 14:50 cgroup.clone_children --w--w--w- 1 root root 0 Jun 14 14:50 cgroup.event_control -rw-r--r-- 1 root root 0 Jun 14 14:50 cgroup.procs -rw-r--r-- 1 root root 0 Jun 14 14:50 cpu.cfs_period_us -rw-r--r-- 1 root root 0 Jun 14 14:50 cpu.cfs_quota_us -rw-r--r-- 1 root root 0 Jun 14 14:50 cpu.rt_period_us -rw-r--r-- 1 root root 0 Jun 14 14:50 cpu.rt_runtime_us -rw-r--r-- 1 root root 0 Jun 14 14:50 cpu.shares -r--r--r-- 1 root root 0 Jun 14 14:50 cpu.stat -r--r--r-- 1 root root 0 Jun 14 14:50 cpuacct.stat -rw-r--r-- 1 root root 0 Jun 14 14:50 cpuacct.usage -r--r--r-- 1 root root 0 Jun 14 14:50 cpuacct.usage_percpu -rw-r--r-- 1 root root 0 Jun 14 14:50 notify_on_release -rw-r--r-- 1 root root 0 Jun 14 14:50 tasks
The CPU limitation of this cgroup
[root@ecs /sys/fs/cgroup/cpu/ruletest]# cat /sys/fs/cgroup/cpu/ruletest/cpu.cfs_quota_us -1 [root@ecs /sys/fs/cgroup/cpu/ruletest]# echo 20000 > /sys/fs/cgroup/cpu/ruletest/cpu.cfs_quota_us [root@ecs /sys/fs/cgroup/cpu/ruletest]# cat /sys/fs/cgroup/cpu/ruletest/cpu.cfs_quota_us 20000
Running a program that runs full of CPU and observing the state
/* Endless busy loop: pins one CPU core at 100% so that the effect of
 * cpu.cfs_quota_us throttling can be observed in top. */
int main(void) { int i = 0; for(;;) i++; return 0; }
One core is fully occupied
top - 15:00:14 up 19 days, 1:26, 3 users, load average: 0.44, 0.13, 0.20 Tasks: 1 total, 1 running, 0 sleeping, 0 stopped, 0 zombie %Cpu0 : 0.3 us, 0.0 sy, 0.0 ni, 99.7 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st %Cpu1 :100.0 us, 0.0 sy, 0.0 ni, 0.0 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st %Cpu2 : 5.0 us, 2.3 sy, 0.0 ni, 92.6 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st %Cpu3 : 6.0 us, 1.7 sy, 0.0 ni, 92.3 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st %Cpu4 : 3.7 us, 1.0 sy, 0.0 ni, 95.3 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st %Cpu5 : 0.3 us, 0.3 sy, 0.0 ni, 99.3 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st %Cpu6 : 2.3 us, 1.3 sy, 0.0 ni, 96.3 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st %Cpu7 : 2.0 us, 1.7 sy, 0.0 ni, 96.3 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st KiB Mem : 32779804 total, 3207136 free, 2965740 used, 26606928 buff/cache KiB Swap: 1048572 total, 1048572 free, 0 used. 28961712 avail Mem PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 910 root 20 0 4208 356 280 R 99.7 0.0 0:30.71 ./deadloop
Add process 910 to this cgroup:
[root@ecs ~/tmp]# echo 910 > /sys/fs/cgroup/cpu/ruletest/tasks [root@ecs ~/tmp]# cat /sys/fs/cgroup/cpu/ruletest/tasks 910
Then, in top you can see the CPU utilization drop to 20% immediately. (The 20000 µs quota we set earlier, against the default 100000 µs period, means 20%.)
[root@ecs ~/tmp]# top -p 910 top - 15:06:43 up 19 days, 1:33, 3 users, load average: 0.01, 0.16, 0.22 Tasks: 1 total, 1 running, 0 sleeping, 0 stopped, 0 zombie %Cpu0 : 4.3 us, 1.7 sy, 0.0 ni, 94.0 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st %Cpu1 : 3.3 us, 2.0 sy, 0.0 ni, 94.7 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st %Cpu2 : 2.0 us, 0.3 sy, 0.0 ni, 97.7 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st %Cpu3 : 0.0 us, 0.0 sy, 0.0 ni,100.0 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st %Cpu4 : 20.7 us, 0.3 sy, 0.0 ni, 79.0 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st %Cpu5 : 0.0 us, 0.0 sy, 0.0 ni,100.0 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st %Cpu6 : 6.4 us, 2.0 sy, 0.0 ni, 91.6 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st %Cpu7 : 2.7 us, 1.7 sy, 0.0 ni, 95.7 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st KiB Mem : 32779804 total, 3206684 free, 2966044 used, 26607076 buff/cache KiB Swap: 1048572 total, 1048572 free, 0 used. 28961312 avail Mem PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 910 root 20 0 4208 356 280 R 20.0 0.0 3:09.83 ./deadloop
memory.limit_in_bytes
Create programs that constantly eat memory
/* Allocate 512 bytes every second forever; memset touches each chunk so the
 * pages are actually committed and RSS grows. Prints the cumulative size until
 * malloc fails or the memory cgroup's OOM killer terminates the process. */
#include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/types.h> #include <unistd.h> int main(void) { int size = 0; int chunk_size = 512; void *p = NULL; while(1) { if ((p = malloc(chunk_size)) == NULL) { printf("out of memory!!\n"); break; } memset(p, 1, chunk_size); size += chunk_size; printf("[%d] - memory is allocated [%8d] bytes \n", getpid(), size); sleep(1); } return 0; } [root@ecs ~/tmp]# ./a.out [1236] - memory is allocated [ 512] bytes [1236] - memory is allocated [ 1024] bytes [1236] - memory is allocated [ 1536] bytes [1236] - memory is allocated [ 2048] bytes [1236] - memory is allocated [ 2560] bytes [1236] - memory is allocated [ 3072] bytes ...
View memory usage
[root@ecs ~/tmp]# ps aux | grep a.out root 1236 0.0 0.0 4476 616 pts/3 S+ 15:11 0:00 ./a.out [root@ecs ~]# cat /proc/1236/status | grep RSS VmRSS: 616 kB top - 15:17:30 up 19 days, 1:44, 4 users, load average: 0.00, 0.02, 0.11 Tasks: 1 total, 0 running, 1 sleeping, 0 stopped, 0 zombie %Cpu(s): 2.3 us, 1.0 sy, 0.0 ni, 96.6 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st KiB Mem : 32779804 total, 3203592 free, 2968776 used, 26607436 buff/cache KiB Swap: 1048572 total, 1048572 free, 0 used. 28958612 avail Mem PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 1236 root 20 0 4476 616 424 S 0.0 0.0 0:00.01 ./a.out
Set the memory limit of the ruletest group to 64KB
[root@ecs ~/tmp]# mkdir /sys/fs/cgroup/memory/ruletest [root@ecs ~/tmp]# cat /sys/fs/cgroup/memory/ruletest/memory.limit_in_bytes 9223372036854771712 [root@ecs ~/tmp]# echo 64k > /sys/fs/cgroup/memory/ruletest/memory.limit_in_bytes [root@ecs ~/tmp]# cat /sys/fs/cgroup/memory/ruletest/memory.limit_in_bytes 65536
Add processes to the ruletest group
echo 1236 > /sys/fs/cgroup/memory/ruletest/tasks
Not limited! The process's memory usage had already exceeded 64K before it was added to the group!
[root@ecs ~/tmp]# ps aux | grep a.out root 1236 0.0 0.0 4608 616 pts/3 S+ 15:11 0:00 ./a.out
Restart the program and add it to the group before its memory usage exceeds 64K
[1544] - memory is allocated [ 61440] bytes [1544] - memory is allocated [ 61952] bytes [1544] - memory is allocated [ 62464] bytes [1544] - memory is allocated [ 62976] bytes [1544] - memory is allocated [ 63488] bytes [1544] - memory is allocated [ 64000] bytes [1544] - memory is allocated [ 64512] bytes [1544] - memory is allocated [ 65024] bytes [1544] - memory is allocated [ 65536] bytes [1544] - memory is allocated [ 66048] bytes [1544] - memory is allocated [ 66560] bytes [1544] - memory is allocated [ 67072] bytes [1544] - memory is allocated [ 67584] bytes [1544] - memory is allocated [ 68096] bytes [1544] - memory is allocated [ 68608] bytes [1544] - memory is allocated [ 69120] bytes [1544] - memory is allocated [ 69632] bytes [1544] - memory is allocated [ 70144] bytes [1544] - memory is allocated [ 70656] bytes [1544] - memory is allocated [ 71168] bytes Killed
Look at the results of dmesg
[root@ecs ~/tmp]# cat /var/log/messages ... ... Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: a.out invoked oom-killer: gfp_mask=0xd0, order=0, oom_score_adj=0 Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: a.out cpuset=/ mems_allowed=0 Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: CPU: 7 PID: 1544 Comm: a.out Tainted: G ------------ T 3.10.0-957.5.1.el7.x86_64 #1 Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 99a222b 04/01/2014 Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: Call Trace: Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2b61e41>] dump_stack+0x19/0x1b Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2b5c86a>] dump_header+0x90/0x229 Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb25ba076>] ? find_lock_task_mm+0x56/0xc0 Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb25ba524>] oom_kill_process+0x254/0x3d0 Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2635346>] mem_cgroup_oom_synchronize+0x546/0x570 Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb26347c0>] ? 
mem_cgroup_charge_common+0xc0/0xc0 Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb25badb4>] pagefault_out_of_memory+0x14/0x90 Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2b5ad72>] mm_fault_error+0x6a/0x157 Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2b6f7a8>] __do_page_fault+0x3c8/0x500 Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2b6f9c6>] trace_do_page_fault+0x56/0x150 Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2b6ef42>] do_async_page_fault+0x22/0xf0 Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2b6b788>] async_page_fault+0x28/0x30 Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: Task in /ruletest killed as a result of limit of /ruletest Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: memory: usage 64kB, limit 64kB, failcnt 219 Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: memory+swap: usage 64kB, limit 9007199254740988kB, failcnt 0 Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: kmem: usage 0kB, limit 9007199254740988kB, failcnt 0 Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: Memory cgroup stats for /ruletest: cache:0KB rss:64KB rss_huge:0KB mapped_file:0KB swap:0KB inactive_anon:64KB active_anon:0KB inactive_file:0KB active_file:0KB unevictable:0KB Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [ 1544] 0 1544 1086 89 7 0 0 a.out Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: Memory cgroup out of memory: Kill process 1544 (a.out) score 5875 or sacrifice child Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: Killed process 1544 (a.out) total-vm:4344kB, anon-rss:76kB, file-rss:280kB, shmem-rss:0kB
1.3 IO limitation (blkio.throttle.read_bps_device)
Simulate heavy disk IO
[root@ecs ~/tmp]# dd if=/dev/mapper/vgdata-lvdata of=/dev/null
Observation with iostat and iotop shows an IO read speed of over 60 MB/s
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util vdb 0.00 0.00 144.00 0.00 65536.00 0.00 910.22 0.87 7.19 7.19 0.00 1.16 16.70 dm-0 0.00 0.00 144.00 0.00 65536.00 0.00 910.22 1.03 7.19 7.19 0.00 1.82 26.20 Total DISK READ : 61.11 M/s | Total DISK WRITE : 0.00 B/s Actual DISK READ: 61.11 M/s | Actual DISK WRITE: 0.00 B/s TID PRIO USER DISK READ DISK WRITE SWAPIN IO> COMMAND 1714 be/4 root 61.11 M/s 0.00 B/s 0.00 % 0.98 % dd if=/dev/mapper/vgdata-lvdata of=/dev/null
Use `ls -l /dev/mapper/vgdata-lvdata` to get the block device's major:minor number
[root@ecs ~]# ll /dev/mapper/vgdata-lvdata lrwxrwxrwx 1 root root 7 May 29 19:59 /dev/mapper/vgdata-lvdata -> ../dm-0 [root@ecs ~]# ll /dev/dm-0 brw-rw---- 1 root disk 252, 0 May 29 19:59 /dev/dm-0
Create IO's cgroup and add processes to the group
[root@ecs ~]# echo '252:0 1048576' > /sys/fs/cgroup/blkio/ruletest/blkio.throttle.read_bps_device [root@ecs ~]# cat /sys/fs/cgroup/blkio/ruletest/blkio.throttle.read_bps_device 252:0 1048576 [root@ecs ~]# echo 1714 > /sys/fs/cgroup/blkio/ruletest/tasks [root@ecs ~]# cat /sys/fs/cgroup/blkio/ruletest/tasks 1714
Observation with `iostat -x 1` shows that IO is limited to about 1MB/s.
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util vdb 0.00 0.00 2.00 0.00 1008.00 0.00 1008.00 0.00 1.00 1.00 0.00 1.00 0.20 dm-0 0.00 0.00 2.00 0.00 1008.00 0.00 1008.00 0.00 1.00 1.00 0.00 1.00 0.20
2 CGROUP Internal
2.1 subsystem
Well, with the above perceptual knowledge, let's see what subsystems the control group has:
- blkio - This subsystem sets input/output limits for block devices, such as physical devices (disks, solid-state drives, USB, etc.).
- cpu - This subsystem uses the scheduler to control the access of tasks in a cgroup to the CPU.
- cpuacct - This subsystem automatically generates CPU reports for tasks in cgroup.
- cpuset - This subsystem allocates independent CPUs (in multi-core systems) and memory nodes for tasks in a cgroup.
- Devices - This subsystem allows or denies access to devices for tasks in cgroup.
- freezer - This subsystem suspends or restores tasks in cgroup.
- memory - This subsystem sets memory limits for tasks in a cgroup and automatically generates memory resource usage reports.
- net_cls - This subsystem uses a classid to mark network packets, allowing Linux traffic control programs (tc) to identify packets generated from specific cgroup s.
- net_prio - This subsystem is used to design the priority of network traffic
- HugeTLB - This subsystem is designed to restrict the HugeTLB system, which is a large-page file system.
2.2 Terminology
- Task: a process in the system.
- Control Group: A group of processes that are divided according to certain criteria, such as Professor and Student in official documents, or WWW and System, which represent a process group. Resource control in Cgroups is implemented in control groups. A process can join a control group. The resource constraints are defined on this group, just like the haoel I used in the example above. Simply put, the presentation of cgroup is a directory with a series of configurable files.
Hierarchy: A control group can be organized into a hierarchical form, which is a tree (directory structure) of the control group. The child nodes in the control group tree inherit the attributes of the parent node. In short, hierarchy is the cgroups directory tree on one or more subsystems. - Subsystem: A subsystem is a resource controller. For example, a CPU subsystem is a controller that controls CPU time allocation. A subsystem must be attached to a hierarchy in order to function. After a subsystem is attached to a certain hierarchy, all control groups at this hierarchy are controlled by this subsystem. Cgroup's subsystems can be numerous and growing.
3 Best Practices
Organize and supplement
4 CFS
4.1 cpu usage formula (incorrect)
cat cpuacct.usage;cat cpu.stat| grep nr_periods;sleep 60;cat cpuacct.usage;cat cpu.stat| grep nr_periods;cat cpu.cfs_period_us 36161906134505 nr_periods 1201535 36162489738476 nr_periods 1201613 50000
$\text{CPU\%} = \frac{36162489738476 - 36161906134505}{(1201613 - 1201535) \times 50000 \times 1000} \times 100$

(cpuacct.usage difference in nanoseconds, divided by the number of elapsed periods times the period length of 50000 µs converted to nanoseconds)
4.2 cpu usage formula (summarized from the systemd-cgtop code)
CPU usage = cpuacct.usage difference / wall-clock time difference (both in nanoseconds)
Related code Cgtop.c
} else { if (!streq(controller, "cpuacct")) return 0; r = cg_get_path(controller, path, "cpuacct.usage", &p); if (r < 0) return r; r = read_one_line_file(p, &v); if (r == -ENOENT) return 0; if (r < 0) return r; r = safe_atou64(v, &new_usage); if (r < 0) return r; } timestamp = now_nsec(CLOCK_MONOTONIC); if (g->cpu_iteration == iteration - 1 && (nsec_t) new_usage > g->cpu_usage) { nsec_t x, y; x = timestamp - g->cpu_timestamp; if (x < 1) x = 1; y = (nsec_t) new_usage - g->cpu_usage; // y = cpuacct.usage difference // x = now_nsec's true time difference g->cpu_fraction = (double) y / (double) x; g->cpu_valid = true; } g->cpu_usage = (nsec_t) new_usage; g->cpu_timestamp = timestamp; g->cpu_iteration = iteration; }
4.3 Allocating multiple cores with CFS
cpu.cfs_period_us: Defined period length
cpu.cfs_quota_us: the CPU time that the group may consume within one period
for example
cpu.cfs_period_us = 50000
cpu.cfs_quota_us = 200000
Represents a period of 50 ms in which up to 200 ms of CPU time may be consumed; that is, the group can use up to four cores' worth of CPU.