Cgroup Practice: Computing CPU Utilization under CFS

Keywords: Linux network

This article records some hands-on practice and analyses how to compute CPU usage under CFS, with reference to the systemd-cgtop source code.

1 Test Cases

[root@ecs]# mount -t cgroup
cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd)
cgroup on /sys/fs/cgroup/cpu,cpuacct type cgroup (rw,nosuid,nodev,noexec,relatime,cpuacct,cpu)
cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blkio)
cgroup on /sys/fs/cgroup/hugetlb type cgroup (rw,nosuid,nodev,noexec,relatime,hugetlb)
cgroup on /sys/fs/cgroup/memory type cgroup (rw,nosuid,nodev,noexec,relatime,memory)
cgroup on /sys/fs/cgroup/devices type cgroup (rw,nosuid,nodev,noexec,relatime,devices)
cgroup on /sys/fs/cgroup/freezer type cgroup (rw,nosuid,nodev,noexec,relatime,freezer)
cgroup on /sys/fs/cgroup/net_cls,net_prio type cgroup (rw,nosuid,nodev,noexec,relatime,net_prio,net_cls)
cgroup on /sys/fs/cgroup/cpuset type cgroup (rw,nosuid,nodev,noexec,relatime,cpuset)
cgroup on /sys/fs/cgroup/pids type cgroup (rw,nosuid,nodev,noexec,relatime,pids)
cgroup on /sys/fs/cgroup/perf_event type cgroup (rw,nosuid,nodev,noexec,relatime,perf_event)

[root@ecs cpu]# lssubsys -m
cpuset /sys/fs/cgroup/cpuset
cpu,cpuacct /sys/fs/cgroup/cpu,cpuacct
memory /sys/fs/cgroup/memory
devices /sys/fs/cgroup/devices
freezer /sys/fs/cgroup/freezer
net_cls,net_prio /sys/fs/cgroup/net_cls,net_prio
blkio /sys/fs/cgroup/blkio
perf_event /sys/fs/cgroup/perf_event
hugetlb /sys/fs/cgroup/hugetlb
pids /sys/fs/cgroup/pids

1.1 cpu restriction (cpu.cfs_quota_us)

Create the cgroup ruletest:

[root@ecs ~]# rmdir /sys/fs/cgroup/cpu/ruletest
[root@ecs ~]# mkdir /sys/fs/cgroup/cpu/ruletest
[root@ecs ~]# cd /sys/fs/cgroup/cpu/ruletest
[root@ecs /sys/fs/cgroup/cpu/ruletest]# ll
total 0
-rw-r--r-- 1 root root 0 Jun 14 14:50 cgroup.clone_children
--w--w--w- 1 root root 0 Jun 14 14:50 cgroup.event_control
-rw-r--r-- 1 root root 0 Jun 14 14:50 cgroup.procs
-rw-r--r-- 1 root root 0 Jun 14 14:50 cpu.cfs_period_us
-rw-r--r-- 1 root root 0 Jun 14 14:50 cpu.cfs_quota_us
-rw-r--r-- 1 root root 0 Jun 14 14:50 cpu.rt_period_us
-rw-r--r-- 1 root root 0 Jun 14 14:50 cpu.rt_runtime_us
-rw-r--r-- 1 root root 0 Jun 14 14:50 cpu.shares
-r--r--r-- 1 root root 0 Jun 14 14:50 cpu.stat
-r--r--r-- 1 root root 0 Jun 14 14:50 cpuacct.stat
-rw-r--r-- 1 root root 0 Jun 14 14:50 cpuacct.usage
-r--r--r-- 1 root root 0 Jun 14 14:50 cpuacct.usage_percpu
-rw-r--r-- 1 root root 0 Jun 14 14:50 notify_on_release
-rw-r--r-- 1 root root 0 Jun 14 14:50 tasks

Check and set the CPU limit of this cgroup:

[root@ecs /sys/fs/cgroup/cpu/ruletest]# cat /sys/fs/cgroup/cpu/ruletest/cpu.cfs_quota_us
-1
[root@ecs /sys/fs/cgroup/cpu/ruletest]# echo 20000 > /sys/fs/cgroup/cpu/ruletest/cpu.cfs_quota_us

[root@ecs /sys/fs/cgroup/cpu/ruletest]# cat /sys/fs/cgroup/cpu/ruletest/cpu.cfs_quota_us
20000
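
The quota is measured against cpu.cfs_period_us, which defaults to 100000 (100 ms), so 20000 here means 20% of one core. A quick check (the output below assumes the default period has not been changed):

[root@ecs /sys/fs/cgroup/cpu/ruletest]# cat cpu.cfs_period_us
100000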

Run a program that fully occupies a CPU and observe the state:

int main(void)
{
    int i = 0;
    for(;;) i++;
    return 0;
}
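
To reproduce, compile the loop and start it in the background (assuming the source is saved as deadloop.c):

gcc -o deadloop deadloop.c
./deadloop &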

One core is fully occupied:

top - 15:00:14 up 19 days,  1:26,  3 users,  load average: 0.44, 0.13, 0.20
Tasks:   1 total,   1 running,   0 sleeping,   0 stopped,   0 zombie
%Cpu0  :  0.3 us,  0.0 sy,  0.0 ni, 99.7 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
%Cpu1  :100.0 us,  0.0 sy,  0.0 ni,  0.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
%Cpu2  :  5.0 us,  2.3 sy,  0.0 ni, 92.6 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
%Cpu3  :  6.0 us,  1.7 sy,  0.0 ni, 92.3 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
%Cpu4  :  3.7 us,  1.0 sy,  0.0 ni, 95.3 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
%Cpu5  :  0.3 us,  0.3 sy,  0.0 ni, 99.3 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
%Cpu6  :  2.3 us,  1.3 sy,  0.0 ni, 96.3 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
%Cpu7  :  2.0 us,  1.7 sy,  0.0 ni, 96.3 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
KiB Mem : 32779804 total,  3207136 free,  2965740 used, 26606928 buff/cache
KiB Swap:  1048572 total,  1048572 free,        0 used. 28961712 avail Mem

  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+ COMMAND
  910 root      20   0    4208    356    280 R  99.7  0.0   0:30.71 ./deadloop

Add process 910 to this cgroup:

[root@ecs ~/tmp]# echo 910 > /sys/fs/cgroup/cpu/ruletest/tasks
[root@ecs ~/tmp]# cat /sys/fs/cgroup/cpu/ruletest/tasks
910

Then top shows the CPU utilization dropping to 20% immediately. (The 20000 we set earlier means 20% of one core.)

[root@ecs ~/tmp]# top -p 910
top - 15:06:43 up 19 days,  1:33,  3 users,  load average: 0.01, 0.16, 0.22
Tasks:   1 total,   1 running,   0 sleeping,   0 stopped,   0 zombie
%Cpu0  :  4.3 us,  1.7 sy,  0.0 ni, 94.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
%Cpu1  :  3.3 us,  2.0 sy,  0.0 ni, 94.7 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
%Cpu2  :  2.0 us,  0.3 sy,  0.0 ni, 97.7 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
%Cpu3  :  0.0 us,  0.0 sy,  0.0 ni,100.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
%Cpu4  : 20.7 us,  0.3 sy,  0.0 ni, 79.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
%Cpu5  :  0.0 us,  0.0 sy,  0.0 ni,100.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
%Cpu6  :  6.4 us,  2.0 sy,  0.0 ni, 91.6 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
%Cpu7  :  2.7 us,  1.7 sy,  0.0 ni, 95.7 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
KiB Mem : 32779804 total,  3206684 free,  2966044 used, 26607076 buff/cache
KiB Swap:  1048572 total,  1048572 free,        0 used. 28961312 avail Mem

  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+ COMMAND
  910 root      20   0    4208    356    280 R  20.0  0.0   3:09.83 ./deadloop

1.2 memory restriction (memory.limit_in_bytes)

Create a program that keeps eating memory:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
    int size = 0;
    int chunk_size = 512;
    void *p = NULL;

    while(1) {

        if ((p = malloc(chunk_size)) == NULL) {
            printf("out of memory!!\n");
            break;
        }
        memset(p, 1, chunk_size);
        size += chunk_size;
        printf("[%d] - memory is allocated [%8d] bytes \n", getpid(), size);
        sleep(1);
    }
    return 0;
}

[root@ecs ~/tmp]# ./a.out
[1236] - memory is allocated [     512] bytes
[1236] - memory is allocated [    1024] bytes
[1236] - memory is allocated [    1536] bytes
[1236] - memory is allocated [    2048] bytes
[1236] - memory is allocated [    2560] bytes
[1236] - memory is allocated [    3072] bytes
...

View memory usage

[root@ecs ~/tmp]# ps aux | grep a.out
root      1236  0.0  0.0   4476   616 pts/3    S+   15:11   0:00 ./a.out

[root@ecs ~]# cat /proc/1236/status | grep RSS
VmRSS:         616 kB


top - 15:17:30 up 19 days,  1:44,  4 users,  load average: 0.00, 0.02, 0.11
Tasks:   1 total,   0 running,   1 sleeping,   0 stopped,   0 zombie
%Cpu(s):  2.3 us,  1.0 sy,  0.0 ni, 96.6 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
KiB Mem : 32779804 total,  3203592 free,  2968776 used, 26607436 buff/cache
KiB Swap:  1048572 total,  1048572 free,        0 used. 28958612 avail Mem

  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+ COMMAND
 1236 root      20   0    4476    616    424 S   0.0  0.0   0:00.01 ./a.out

Set the memory limit of the ruletest group to 64 KB:

[root@ecs ~/tmp]# mkdir /sys/fs/cgroup/memory/ruletest

[root@ecs ~/tmp]# cat /sys/fs/cgroup/memory/ruletest/memory.limit_in_bytes
9223372036854771712

[root@ecs ~/tmp]# echo 64k > /sys/fs/cgroup/memory/ruletest/memory.limit_in_bytes

[root@ecs ~/tmp]# cat /sys/fs/cgroup/memory/ruletest/memory.limit_in_bytes
65536

Add the process to the ruletest group:

echo 1236 > /sys/fs/cgroup/memory/ruletest/tasks

No effect! The process's memory usage already exceeded 64 KB before it was added to the group, and the limit is not applied retroactively (an explanation and a workaround follow the ps output below):

[root@ecs ~/tmp]# ps aux | grep a.out
root      1236  0.0  0.0   4608   616 pts/3    S+   15:11   0:00 ./a.out
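
This is because memory charges are recorded when pages are first allocated; by default they are not transferred when a task moves into a group. Migration can be opted into via memory.move_charge_at_immigrate (bit 0 moves the task's anonymous pages and swap along with it), as in this sketch:

echo 1 > /sys/fs/cgroup/memory/ruletest/memory.move_charge_at_immigrate
echo 1236 > /sys/fs/cgroup/memory/ruletest/tasks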

Restart the program, and this time add it to the group before it has allocated more than 64 KB:

[1544] - memory is allocated [   61440] bytes
[1544] - memory is allocated [   61952] bytes
[1544] - memory is allocated [   62464] bytes
[1544] - memory is allocated [   62976] bytes
[1544] - memory is allocated [   63488] bytes
[1544] - memory is allocated [   64000] bytes
[1544] - memory is allocated [   64512] bytes
[1544] - memory is allocated [   65024] bytes
[1544] - memory is allocated [   65536] bytes
[1544] - memory is allocated [   66048] bytes
[1544] - memory is allocated [   66560] bytes
[1544] - memory is allocated [   67072] bytes
[1544] - memory is allocated [   67584] bytes
[1544] - memory is allocated [   68096] bytes
[1544] - memory is allocated [   68608] bytes
[1544] - memory is allocated [   69120] bytes
[1544] - memory is allocated [   69632] bytes
[1544] - memory is allocated [   70144] bytes
[1544] - memory is allocated [   70656] bytes
[1544] - memory is allocated [   71168] bytes
Killed

Look at the kernel log:

[root@ecs ~/tmp]# cat /var/log/messages
...
...
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: a.out invoked oom-killer: gfp_mask=0xd0, order=0, oom_score_adj=0
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: a.out cpuset=/ mems_allowed=0
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: CPU: 7 PID: 1544 Comm: a.out Tainted: G               ------------ T 3.10.0-957.5.1.el7.x86_64 #1
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 99a222b 04/01/2014
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: Call Trace:
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2b61e41>] dump_stack+0x19/0x1b
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2b5c86a>] dump_header+0x90/0x229
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb25ba076>] ? find_lock_task_mm+0x56/0xc0
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb25ba524>] oom_kill_process+0x254/0x3d0
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2635346>] mem_cgroup_oom_synchronize+0x546/0x570
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb26347c0>] ? mem_cgroup_charge_common+0xc0/0xc0
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb25badb4>] pagefault_out_of_memory+0x14/0x90
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2b5ad72>] mm_fault_error+0x6a/0x157
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2b6f7a8>] __do_page_fault+0x3c8/0x500
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2b6f9c6>] trace_do_page_fault+0x56/0x150
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2b6ef42>] do_async_page_fault+0x22/0xf0
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [<ffffffffb2b6b788>] async_page_fault+0x28/0x30
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: Task in /ruletest killed as a result of limit of /ruletest
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: memory: usage 64kB, limit 64kB, failcnt 219
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: memory+swap: usage 64kB, limit 9007199254740988kB, failcnt 0
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: kmem: usage 0kB, limit 9007199254740988kB, failcnt 0
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: Memory cgroup stats for /ruletest: cache:0KB rss:64KB rss_huge:0KB mapped_file:0KB swap:0KB inactive_anon:64KB active_anon:0KB inactive_file:0KB active_file:0KB unevictable:0KB
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [ pid ]   uid  tgid total_vm      rss nr_ptes swapents oom_score_adj name
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: [ 1544]     0  1544     1086       89       7        0             0 a.out
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: Memory cgroup out of memory: Kill process 1544 (a.out) score 5875 or sacrifice child
Jun 14 15:25:26 iZbp1d4tisi44j6vxze02fZ kernel: Killed process 1544 (a.out) total-vm:4344kB, anon-rss:76kB, file-rss:280kB, shmem-rss:0kB
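
As an aside, the in-group OOM killer can be disabled, in which case allocating tasks block until memory is freed instead of being killed; a sketch using the cgroup v1 memory.oom_control knob:

echo 1 > /sys/fs/cgroup/memory/ruletest/memory.oom_control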

1.3 IO limitation (blkio.throttle.read_bps_device)

Generate a large amount of disk IO:

[root@ecs ~/tmp]# dd if=/dev/mapper/vgdata-lvdata of=/dev/null

iostat and iotop show a read rate of 60+ MB/s:

Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s avgrq-sz avgqu-sz   await r_await w_await  svctm  %util
vdb               0.00     0.00  144.00    0.00 65536.00     0.00   910.22     0.87    7.19    7.19    0.00   1.16  16.70
dm-0              0.00     0.00  144.00    0.00 65536.00     0.00   910.22     1.03    7.19    7.19    0.00   1.82  26.20


Total DISK READ :      61.11 M/s | Total DISK WRITE :       0.00 B/s
Actual DISK READ:      61.11 M/s | Actual DISK WRITE:       0.00 B/s
  TID  PRIO  USER     DISK READ  DISK WRITE  SWAPIN     IO>    COMMAND
 1714 be/4 root       61.11 M/s    0.00 B/s  0.00 %  0.98 % dd if=/dev/mapper/vgdata-lvdata of=/dev/null

Use ls -l /dev/mapper/vgdata-lvdata to get the block device numbers:

[root@ecs ~]# ll /dev/mapper/vgdata-lvdata
lrwxrwxrwx 1 root root 7 May 29 19:59 /dev/mapper/vgdata-lvdata -> ../dm-0
[root@ecs ~]# ll /dev/dm-0
brw-rw---- 1 root disk 252, 0 May 29 19:59 /dev/dm-0

Create the blkio cgroup, write the throttle rule (format: major:minor bytes_per_second, so 1048576 means 1 MB/s), and add the process to the group:

[root@ecs ~]# echo '252:0 1048576'  > /sys/fs/cgroup/blkio/ruletest/blkio.throttle.read_bps_device

[root@ecs ~]# cat /sys/fs/cgroup/blkio/ruletest/blkio.throttle.read_bps_device
252:0 1048576

[root@ecs ~]# echo 1714 > /sys/fs/cgroup/blkio/ruletest/tasks
[root@ecs ~]# cat /sys/fs/cgroup/blkio/ruletest/tasks
1714
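
As an alternative to moving a running PID, the reader can be started inside the group from the outset; a sketch assuming the cgexec utility from libcgroup-tools is installed:

cgexec -g blkio:ruletest dd if=/dev/mapper/vgdata-lvdata of=/dev/null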

iostat -x 1 shows that the read rate is throttled to about 1 MB/s:

Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s avgrq-sz avgqu-sz   await r_await w_await  svctm  %util
vdb               0.00     0.00    2.00    0.00  1008.00     0.00  1008.00     0.00    1.00    1.00    0.00   1.00   0.20
dm-0              0.00     0.00    2.00    0.00  1008.00     0.00  1008.00     0.00    1.00    1.00    0.00   1.00   0.20

2 Cgroup Internals

2.1 subsystem

With the hands-on experience above, let's look at the subsystems cgroups provide:

  • blkio - This subsystem sets input/output limits for block devices such as physical drives (disks, SSDs, USB, etc.).
  • cpu - This subsystem uses the scheduler to control CPU time for tasks in a cgroup.
  • cpuacct - This subsystem automatically generates CPU usage reports for tasks in a cgroup.
  • cpuset - This subsystem assigns individual CPUs (on multi-core systems) and memory nodes to tasks in a cgroup.
  • devices - This subsystem allows or denies access to devices for tasks in a cgroup.
  • freezer - This subsystem suspends or resumes tasks in a cgroup.
  • memory - This subsystem sets memory limits for tasks in a cgroup and automatically generates memory usage reports.
  • net_cls - This subsystem tags network packets with a classid so that the Linux traffic controller (tc) can identify packets generated by a specific cgroup.
  • net_prio - This subsystem sets the priority of network traffic per network interface.
  • hugetlb - This subsystem limits the use of HugeTLB (huge page) memory.

2.2 Terminology

  • Task: a process in the system.
  • Control group: a group of processes partitioned by some criterion (for example, Professor and Student in the official documentation, or WWW and System). Resource control in cgroups is implemented at the control-group level; a process can join a control group, and the resource limits are defined on that group, like the ruletest group used in the examples above. Simply put, a cgroup materializes as a directory containing a set of configurable files.
  • Hierarchy: control groups can be organized hierarchically into a tree (directory structure) of control groups. Child nodes in the tree inherit the attributes of their parent (see the sketch after this list). In short, a hierarchy is the cgroups directory tree of one or more subsystems.
  • Subsystem: a subsystem is a resource controller; for example, the cpu subsystem controls CPU time allocation. A subsystem must be attached to a hierarchy to take effect, and once attached, all control groups in that hierarchy are controlled by it. Subsystems are numerous and their number keeps growing.
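
The inheritance mentioned under Hierarchy can be seen directly in the cpu hierarchy used earlier; a sketch (the child group name is arbitrary):

mkdir /sys/fs/cgroup/cpu/ruletest/child
cat /sys/fs/cgroup/cpu/ruletest/child/cpu.cfs_quota_us   # prints -1: no quota of its own
# tasks in the child are nevertheless still capped by the parent's quota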

3 Best Practices

To be organized and supplemented.

4 CFS

4.1 cpu usage formula (incorrect)

cat cpuacct.usage; cat cpu.stat | grep nr_periods; sleep 60; cat cpuacct.usage; cat cpu.stat | grep nr_periods; cat cpu.cfs_period_us

36161906134505
nr_periods 1201535
36162489738476
nr_periods 1201613
50000

$\frac{36162489738476 - 36161906134505}{(1201613 - 1201535) \times 50000 \times 1000} \times 100 \approx 14.96\%$

This takes nr_periods × cfs_period_us (converted to nanoseconds) as the elapsed-time base. The result is unreliable, presumably because nr_periods only advances while the group has runnable tasks under bandwidth control, so the denominator can undercount the real elapsed time.

4.2 cpu usage formula (as summarized from the systemd-cgtop code)

CPU usage = difference in cpuacct.usage / difference in real (wall-clock) time, both in nanoseconds.

The relevant code, excerpted from systemd's cgtop.c:

                } else {
                        if (!streq(controller, "cpuacct"))
                                return 0;

                        r = cg_get_path(controller, path, "cpuacct.usage", &p);
                        if (r < 0)
                                return r;

                        r = read_one_line_file(p, &v);
                        if (r == -ENOENT)
                                return 0;
                        if (r < 0)
                                return r;

                        r = safe_atou64(v, &new_usage);
                        if (r < 0)
                                return r;
                }

                timestamp = now_nsec(CLOCK_MONOTONIC);

                if (g->cpu_iteration == iteration - 1 &&
                    (nsec_t) new_usage > g->cpu_usage) {

                        nsec_t x, y;

                        x = timestamp - g->cpu_timestamp;
                        if (x < 1)
                                x = 1;

                        y = (nsec_t) new_usage - g->cpu_usage;
                        // y = cpuacct.usage delta (nanoseconds)
                        // x = wall-clock delta from now_nsec() (nanoseconds)
                        g->cpu_fraction = (double) y / (double) x;
                        g->cpu_valid = true;
                }

                g->cpu_usage = (nsec_t) new_usage;
                g->cpu_timestamp = timestamp;
                g->cpu_iteration = iteration;

        }
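
The same computation can be done by hand; a minimal shell sketch sampling the ruletest group (assumes bc is installed; date +%s%N yields wall-clock nanoseconds rather than CLOCK_MONOTONIC, which is close enough for a sketch):

cd /sys/fs/cgroup/cpu/ruletest
u0=$(cat cpuacct.usage); t0=$(date +%s%N)
sleep 5
u1=$(cat cpuacct.usage); t1=$(date +%s%N)
# percentage of one CPU; values above 100 mean more than one core
echo "scale=2; ($u1 - $u0) * 100 / ($t1 - $t0)" | bc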

4.3 Core count allocation under CFS

cpu.cfs_period_us: the length of one scheduling period, in microseconds

cpu.cfs_quota_us: the CPU time the group may consume within each period (it may exceed the period length)

For example:

cpu.cfs_period_us = 50000
cpu.cfs_quota_us = 200000

This represents a 50 ms period with a 200 ms per-period budget: the group may consume up to 200 ms of CPU time in every 50 ms period, i.e. the equivalent of four cores.
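
A sketch applying these values to the ruletest group from section 1:

echo 50000 > /sys/fs/cgroup/cpu/ruletest/cpu.cfs_period_us
echo 200000 > /sys/fs/cgroup/cpu/ruletest/cpu.cfs_quota_us
# effective cores = cfs_quota_us / cfs_period_us = 200000 / 50000 = 4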

Posted by JD-AM on Mon, 22 Jul 2019 08:23:25 -0700