From 2cff8f3d379bafeb8e632f422008f4cb781b3068 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Tue, 2 Jan 2024 01:56:14 +0000 Subject: [PATCH 01/17] alinux: psi: Support PSI under cgroup v1 maillist inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW Reference: https://github.com/alibaba/cloud-kernel/commit/1f49a73850325bb93601cc2240236114a0a7ca9a ---------------------------------------------- Export "cpu|io|memory.pressure" under cgroup v1 "cpuacct" subsystem. Reviewed-by: Joseph Qi Signed-off-by: Xunlei Pang Conflict: kernel/cgroup/cgroup.c kernel/sched/sched.h kernel/sched/psi.c Merge OLK-5.10 fix patch 614135168b0475d54567b3a5d27afe315b09d0a8 Merge OLK-5.10 fix patch 834a1594da34ba6e7b21fdba1b2564240ce4cd05 Signed-off-by: Lu Jialin --- init/Kconfig | 10 ++++++++++ kernel/cgroup/cgroup.c | 27 +++++++++++++++++++++++++++ kernel/sched/cpuacct.c | 12 ++++++++++++ kernel/sched/psi.c | 17 ++++++++++++++++- 4 files changed, 65 insertions(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index c8909ca8bb48..78b13b1ee210 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -667,6 +667,16 @@ config PSI_DEFAULT_DISABLED Say N if unsure. +config PSI_CGROUP_V1 + bool "Support PSI under cgroup v1" + default Y + depends on PSI + help + If set, pressure stall information tracking will be used + for cgroup v1 other than v2. + + Say N if unsure. 
+ endmenu # "CPU/Task time and stats accounting" config CPU_ISOLATION diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index cfae217e6e7f..7e4511bfbb5e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3919,6 +3919,33 @@ bool cgroup_psi_enabled(void) return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0; } +struct cftype cgroup_v1_psi_files[] = { + { + .name = "io.pressure", + .flags = CFTYPE_NO_PREFIX, + .seq_show = cgroup_io_pressure_show, + .write = cgroup_io_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, + }, + { + .name = "memory.pressure", + .flags = CFTYPE_NO_PREFIX, + .seq_show = cgroup_memory_pressure_show, + .write = cgroup_memory_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, + }, + { + .name = "cpu.pressure", + .flags = CFTYPE_NO_PREFIX, + .seq_show = cgroup_cpu_pressure_show, + .write = cgroup_cpu_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, + }, + { } /* terminate */ +}; #else /* CONFIG_PSI */ bool cgroup_psi_enabled(void) { diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 0de9dda09949..91039b9260a4 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -361,3 +361,15 @@ struct cgroup_subsys cpuacct_cgrp_subsys = { .legacy_cftypes = files, .early_init = true, }; + +#ifdef CONFIG_PSI +extern struct cftype cgroup_v1_psi_files[]; + +static int __init cgroup_v1_psi_init(void) +{ + cgroup_add_legacy_cftypes(&cpuacct_cgrp_subsys, cgroup_v1_psi_files); + return 0; +} + +late_initcall_sync(cgroup_v1_psi_init); +#endif diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 1d0f634725a6..61a12286030a 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -881,11 +881,26 @@ static void psi_group_change(struct psi_group *group, int cpu, schedule_delayed_work(&group->avgs_work, PSI_FREQ); } +#ifdef CONFIG_PSI_CGROUP_V1 +static bool task_is_in_psi_v1(void) +{ + 
return !cgroup_subsys_on_dfl(cpuacct_cgrp_subsys); +} +#else +static bool task_is_in_psi_v1(void) +{ + return false; +} +#endif + static inline struct psi_group *task_psi_group(struct task_struct *task) { #ifdef CONFIG_CGROUPS - if (static_branch_likely(&psi_cgroups_enabled)) + if (static_branch_likely(&psi_cgroups_enabled)) { + if (task_is_in_psi_v1()) + return cgroup_psi(task_cgroup(task, cpuacct_cgrp_id)); return cgroup_psi(task_dfl_cgroup(task)); + } #endif return &psi_system; } -- Gitee From 116a499ff1b97f45655ce077de37d52d7d44b696 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 2 Jan 2024 01:56:15 +0000 Subject: [PATCH 02/17] alinux: introduce psi_v1 boot parameter maillist inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW Reference: https://github.com/alibaba/cloud-kernel/commit/dc159a61c043489e5527aa0c52e7b35929a30f0d ---------------------------------------------- Instead using static kconfig CONFIG_PSI_CGROUP_V1, we introduce a boot parameter psi_v1 to enable psi cgroup v1 support. Default it is disabled, which means when passing psi=1 boot parameter, we only support cgroup v2. This is to keep consistent with other cgroup v1 features such as cgroup writeback v1 (cgwb_v1). Signed-off-by: Joseph Qi Acked-by: Xunlei Pang Conflicts: kernel/sched/cpuacct.c kernel/sched/psi.c include/linux/psi.h Merge OLK-5.10 fix patch d767c63390e0c1ecf45f1a06d55eb23f613225ab Signed-off-by: Lu Jialin --- Documentation/admin-guide/kernel-parameters.txt | 4 ++++ include/linux/psi.h | 1 + init/Kconfig | 10 ---------- kernel/sched/cpuacct.c | 16 ++++++++++++++++ kernel/sched/psi.c | 11 ++++------- 5 files changed, 25 insertions(+), 17 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index fe2dc03538fe..dafef771b07a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4654,6 +4654,10 @@ tracking. 
Format: + psi_v1= [KNL] Enable or disable pressure stall information + tracking on cgroup v1. + Format: + psmouse.proto= [HW,MOUSE] Highest PS2 mouse protocol extension to probe for; one of (bare|imps|exps|lifebook|any). psmouse.rate= [HW,MOUSE] Set desired mouse report rate, in reports diff --git a/include/linux/psi.h b/include/linux/psi.h index e0745873e3f2..44cb1358737b 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -15,6 +15,7 @@ struct css_set; #ifdef CONFIG_PSI extern struct static_key_false psi_disabled; +extern struct static_key_false psi_v1_disabled; extern struct psi_group psi_system; void psi_init(void); diff --git a/init/Kconfig b/init/Kconfig index 78b13b1ee210..c8909ca8bb48 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -667,16 +667,6 @@ config PSI_DEFAULT_DISABLED Say N if unsure. -config PSI_CGROUP_V1 - bool "Support PSI under cgroup v1" - default Y - depends on PSI - help - If set, pressure stall information tracking will be used - for cgroup v1 other than v2. - - Say N if unsure. 
- endmenu # "CPU/Task time and stats accounting" config CPU_ISOLATION diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 91039b9260a4..4e71caf232d3 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -363,10 +363,26 @@ struct cgroup_subsys cpuacct_cgrp_subsys = { }; #ifdef CONFIG_PSI +static bool psi_v1_enable; extern struct cftype cgroup_v1_psi_files[]; +static int __init setup_psi_v1(char *str) +{ + int ret; + + ret = kstrtobool(str, &psi_v1_enable); + if (!psi_v1_enable) + static_branch_enable(&psi_v1_disabled); + + return ret == 0; +} +__setup("psi_v1=", setup_psi_v1); + static int __init cgroup_v1_psi_init(void) { + if (!psi_v1_enable) + return 0; + cgroup_add_legacy_cftypes(&cpuacct_cgrp_subsys, cgroup_v1_psi_files); return 0; } diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 61a12286030a..00a0242eea50 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -140,6 +140,7 @@ static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); +DEFINE_STATIC_KEY_FALSE(psi_v1_disabled); static DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled); #ifdef CONFIG_PSI_DEFAULT_DISABLED @@ -881,17 +882,13 @@ static void psi_group_change(struct psi_group *group, int cpu, schedule_delayed_work(&group->avgs_work, PSI_FREQ); } -#ifdef CONFIG_PSI_CGROUP_V1 static bool task_is_in_psi_v1(void) { + if (static_branch_likely(&psi_v1_disabled)) + return false; + return !cgroup_subsys_on_dfl(cpuacct_cgrp_subsys); } -#else -static bool task_is_in_psi_v1(void) -{ - return false; -} -#endif static inline struct psi_group *task_psi_group(struct task_struct *task) { -- Gitee From ee5f49d93a4cca71e8ca4453724259a820edd606 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 2 Jan 2024 01:56:16 +0000 Subject: [PATCH 03/17] alinux: psi: using cpuacct_cgrp_id under CONFIG_CGROUP_CPUACCT maillist inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW Reference: 
https://github.com/alibaba/cloud-kernel/commit/1bd8a72b11818535b18984227ec1d707e6605e71 ---------------------------------------------- This is to fix the build error if CONFIG_CGROUP_CPUACCT is not enabled. kernel/sched/psi.c: In function 'iterate_groups': kernel/sched/psi.c:732:31: error: 'cpuacct_cgrp_id' undeclared (first use in this function); did you mean 'cpuacct_charge'? Reported-by: kbuild test robot Fixes: 1f49a7385032 ("alinux: psi: Support PSI under cgroup v1") Signed-off-by: Joseph Qi Reviewed-by: Xunlei Pang Conflict: kernel/sched/psi.c Signed-off-by: Lu Jialin --- kernel/sched/psi.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 00a0242eea50..328ae3bc82bd 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -882,6 +882,7 @@ static void psi_group_change(struct psi_group *group, int cpu, schedule_delayed_work(&group->avgs_work, PSI_FREQ); } +#ifdef CONFIG_CGROUP_CPUACCT static bool task_is_in_psi_v1(void) { if (static_branch_likely(&psi_v1_disabled)) @@ -889,6 +890,12 @@ static bool task_is_in_psi_v1(void) return !cgroup_subsys_on_dfl(cpuacct_cgrp_subsys); } +#else +static bool task_is_in_psi_v1(void) +{ + return false; +} +#endif static inline struct psi_group *task_psi_group(struct task_struct *task) { -- Gitee From 0ce7ca0095cd8f8496666bc166cc4237929db7e7 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Tue, 2 Jan 2024 01:56:17 +0000 Subject: [PATCH 04/17] alinux: cgroup: Fix task_css_check rcu warnings maillist inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW Reference: https://github.com/alibaba/cloud-kernel/commit/798cfa768c743e03aac3336a43bd233d2ec824ce ---------------------------------------------- to #26424323 task_css() should be protected by rcu, fix several callers. 
Fixes: 1f49a7385032 ("alinux: psi: Support PSI under cgroup v1") Acked-by: Michael Wang Signed-off-by: Xunlei Pang Signed-off-by: Yihao Wu Acked-by: Yang Shi Conflicts: kernel/sched/psi.c mm/oom_kill.c Signed-off-by: Lu Jialin --- kernel/sched/psi.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 328ae3bc82bd..583347b67ce8 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -901,8 +901,14 @@ static inline struct psi_group *task_psi_group(struct task_struct *task) { #ifdef CONFIG_CGROUPS if (static_branch_likely(&psi_cgroups_enabled)) { - if (task_is_in_psi_v1()) - return cgroup_psi(task_cgroup(task, cpuacct_cgrp_id)); + if (task_is_in_psi_v1()) { + struct cgroup *cgroup; + + rcu_read_lock(); + cgroup = task_cgroup(task, cpuacct_cgrp_id); + rcu_read_unlock(); + return cgroup_psi(cgroup); + } return cgroup_psi(task_dfl_cgroup(task)); } #endif -- Gitee From 27a86faa0abb47f556e24e133f41e13dca927c5b Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Tue, 2 Jan 2024 01:56:18 +0000 Subject: [PATCH 05/17] sched/psi: Bail out early from irq time accounting mainline inclusion from mainline-v6.7-rc1 commit 0c2924079f5a83ed715630680e338b3685a0bf7d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0c2924079f5a83ed715630680e338b3685a0bf7d -------------------------------- We could bail out early when psi was disabled. 
Signed-off-by: Haifeng Xu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chengming Zhou Link: https://lore.kernel.org/r/20230926115722.467833-1-haifeng.xu@shopee.com Signed-off-by: Lu Jialin --- kernel/sched/psi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 583347b67ce8..b1aa1b719f1d 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1034,6 +1034,9 @@ void psi_account_irqtime(struct task_struct *task, u32 delta) struct psi_group_cpu *groupc; u64 now; + if (static_branch_likely(&psi_disabled)) + return; + if (!task->pid) return; -- Gitee From c80c50f7e33a336751d3424faec113f279f91909 Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Tue, 2 Jan 2024 01:56:19 +0000 Subject: [PATCH 06/17] psi: update psi irqtime when the irq delta is nozero hulk inclusion category: performance bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW ------------------------------- If update psi irqtime whether the irq delta is zero or not, the performance will be degradation when update_rq_clock_task works frequently. Therefore, just update psi irqtime whether the irq delta is nozero. 
performace test of times: 1) without psi_account_irqtime in update_rq_clock_task [root@arm64_perf bin]# ./times -E -C 200 -L -S -W -N "times" -I 200 Running: times# ./../bin-arm64/times -E -C 200 -L -S -W -N times -I 200 prc thr usecs/call samples errors cnt/samp times 1 1 0.45210 188 0 500 2) psi_account_irqtime in update_rq_clock_task [root@arm64_perf bin]# ./times -E -C 200 -L -S -W -N "times" -I 200 Running: times# ./../bin-arm64/times -E -C 200 -L -S -W -N times -I 200 prc thr usecs/call samples errors cnt/samp times 1 1 0.49408 196 0 500 3) psi_account_irqtime in update_rq_clock_task when irq delta is nozero [root@arm64_perf bin]# ./times -E -C 200 -L -S -W -N "times" -I 200 Running: times# ./../bin-arm64/times -E -C 200 -L -S -W -N times -I 200 prc thr usecs/call samples errors cnt/samp times 1 1 0.45158 195 0 500 Signed-off-by: Lu Jialin --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 58c274b655ab..16858d1b98f6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -722,7 +722,8 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->prev_irq_time += irq_delta; delta -= irq_delta; - psi_account_irqtime(rq->curr, irq_delta); + if (irq_delta) + psi_account_irqtime(rq->curr, irq_delta); delayacct_irq(rq->curr, irq_delta); #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -- Gitee From 4d3528678df30c9f9dc2deba3b8ed1f9517ae77f Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Tue, 2 Jan 2024 01:56:20 +0000 Subject: [PATCH 07/17] psi: support irq.pressure under cgroup v1 hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW ------------------------------- Export "irq.pressure" to cgroup v1 "cpuacct" subsystem. 
Signed-off-by: Lu Jialin --- kernel/cgroup/cgroup.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 7e4511bfbb5e..bb6b9b8a57a6 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3944,6 +3944,16 @@ struct cftype cgroup_v1_psi_files[] = { .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + { + .name = "irq.pressure", + .flags = CFTYPE_NO_PREFIX, + .seq_show = cgroup_irq_pressure_show, + .write = cgroup_irq_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, + }, +#endif { } /* terminate */ }; #else /* CONFIG_PSI */ -- Gitee From 4f637168ddd68048a869be2dfcd4cdefbc91c690 Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Tue, 2 Jan 2024 01:56:21 +0000 Subject: [PATCH 08/17] psi, tracepoint: introduce tracepoints for psi_memstall_{enter, leave} hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW -------------------------------- Two tracepoints are added we can easily use other tools such as ebpf, ftrace, perf to monitor the memstall data and do some analysis. The output of these tracepoints is, kcompactd0-58 [001] .... 902.709565: psi_memstall_enter: kcompactd kswapd0-132 [003] .... 902.709569: psi_memstall_leave: balance_pgdat kcompactd0-58 [001] .... 902.775230: psi_memstall_leave: kcompactd kswapd0-132 [003] .... 1337.754598: psi_memstall_enter: balance_pgdat kswapd0-132 [003] .... 1337.756076: psi_memstall_leave: balance_pgdat kcompactd0-58 [003] .... 1337.756213: psi_memstall_enter: kcompactd kcompactd0-58 [003] .... 
1337.893188: psi_memstall_leave: kcompactd Signed-off-by: Chen Wandun Conflict: kernel/sched/psi.c Signed-off-by: Lu Jialin --- include/trace/events/sched.h | 27 +++++++++++++++++++++++++++ kernel/sched/psi.c | 6 ++++++ 2 files changed, 33 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index fbb99a61f714..0e8e7bd5cb9f 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -735,6 +735,33 @@ DECLARE_TRACE(sched_update_nr_running_tp, TP_PROTO(struct rq *rq, int change), TP_ARGS(rq, change)); +DECLARE_EVENT_CLASS(psi_memstall_template, + + TP_PROTO(unsigned long function), + + TP_ARGS(function), + + TP_STRUCT__entry( + __field(unsigned long, function) + ), + + TP_fast_assign( + __entry->function = function; + ), + + TP_printk("%ps", (void *)__entry->function) +); + +DEFINE_EVENT(psi_memstall_template, psi_memstall_enter, + TP_PROTO(unsigned long function), + TP_ARGS(function) +); + +DEFINE_EVENT(psi_memstall_template, psi_memstall_leave, + TP_PROTO(unsigned long function), + TP_ARGS(function) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index b1aa1b719f1d..b81293175b72 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -137,6 +137,8 @@ * sampling of the aggregate task states would be. 
*/ +#include + static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); @@ -1080,6 +1082,8 @@ void psi_memstall_enter(unsigned long *flags) *flags = current->in_memstall; if (*flags) return; + + trace_psi_memstall_enter(_RET_IP_); /* * in_memstall setting & accounting needs to be atomic wrt * changes to the task's scheduling state, otherwise we can @@ -1110,6 +1114,8 @@ void psi_memstall_leave(unsigned long *flags) if (*flags) return; + + trace_psi_memstall_leave(_RET_IP_); /* * in_memstall clearing & accounting needs to be atomic wrt * changes to the task's scheduling state, otherwise we could -- Gitee From f5332caa921f9124060384c4cd6ffa07c7b8da74 Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Tue, 2 Jan 2024 01:56:22 +0000 Subject: [PATCH 09/17] mm: disable psi cgroup v1 by default hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW ------------------------------- Feature of psi cgroup v1 should only enable when open CONFIG_PSI_CGROUP_V1. 
Signed-off-by: Chen Wandun Conflict: include/linux/psi.h kernel/sched/psi.c Signed-off-by: Lu Jialin --- include/linux/psi.h | 2 +- kernel/sched/cpuacct.c | 4 ++-- kernel/sched/psi.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/psi.h b/include/linux/psi.h index 44cb1358737b..49afe8d1cde6 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -15,7 +15,7 @@ struct css_set; #ifdef CONFIG_PSI extern struct static_key_false psi_disabled; -extern struct static_key_false psi_v1_disabled; +extern struct static_key_true psi_v1_disabled; extern struct psi_group psi_system; void psi_init(void); diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 4e71caf232d3..0fd1b207f133 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -371,8 +371,8 @@ static int __init setup_psi_v1(char *str) int ret; ret = kstrtobool(str, &psi_v1_enable); - if (!psi_v1_enable) - static_branch_enable(&psi_v1_disabled); + if (psi_v1_enable) + static_branch_disable(&psi_v1_disabled); return ret == 0; } diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index b81293175b72..30535bca3a16 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -142,7 +142,7 @@ static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); -DEFINE_STATIC_KEY_FALSE(psi_v1_disabled); +DEFINE_STATIC_KEY_TRUE(psi_v1_disabled); static DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled); #ifdef CONFIG_PSI_DEFAULT_DISABLED -- Gitee From 2533d57b5fc13b270f382b1610bfe76f32f06c4d Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Tue, 2 Jan 2024 01:56:23 +0000 Subject: [PATCH 10/17] mm: add config isolation for psi under cgroup v1 hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW ------------------------------- Add CONFIG_PSI_CGROUP_V1 to separate feature of psi under cgroup v1 from baseline. 
Signed-off-by: Chen Wandun Conflicts: kernel/cgroup/cgroup.c kernel/sched/cpuacct.c kernel/sched/psi.c include/linux/psi.h Signed-off-by: Lu Jialin --- include/linux/psi.h | 2 ++ init/Kconfig | 10 ++++++++++ kernel/cgroup/cgroup.c | 2 ++ kernel/sched/cpuacct.c | 2 +- kernel/sched/psi.c | 2 +- 5 files changed, 16 insertions(+), 2 deletions(-) diff --git a/include/linux/psi.h b/include/linux/psi.h index 49afe8d1cde6..9e5d49cf62d5 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -15,7 +15,9 @@ struct css_set; #ifdef CONFIG_PSI extern struct static_key_false psi_disabled; +#ifdef CONFIG_PSI_CGROUP_V1 extern struct static_key_true psi_v1_disabled; +#endif extern struct psi_group psi_system; void psi_init(void); diff --git a/init/Kconfig b/init/Kconfig index c8909ca8bb48..7592b0fad547 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -667,6 +667,16 @@ config PSI_DEFAULT_DISABLED Say N if unsure. +config PSI_CGROUP_V1 + bool "Support PSI under cgroup v1" + default n + depends on PSI + help + If set, pressure stall information tracking will be used + for cgroup v1 other than v2. + + Say N if unsure. 
+ endmenu # "CPU/Task time and stats accounting" config CPU_ISOLATION diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index bb6b9b8a57a6..24a657d130c1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3919,6 +3919,7 @@ bool cgroup_psi_enabled(void) return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0; } +#ifdef CONFIG_PSI_CGROUP_V1 struct cftype cgroup_v1_psi_files[] = { { .name = "io.pressure", @@ -3956,6 +3957,7 @@ struct cftype cgroup_v1_psi_files[] = { #endif { } /* terminate */ }; +#endif #else /* CONFIG_PSI */ bool cgroup_psi_enabled(void) { diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 0fd1b207f133..758afa949ea0 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -362,7 +362,7 @@ struct cgroup_subsys cpuacct_cgrp_subsys = { .early_init = true, }; -#ifdef CONFIG_PSI +#ifdef CONFIG_PSI_CGROUP_V1 static bool psi_v1_enable; extern struct cftype cgroup_v1_psi_files[]; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 30535bca3a16..005813dbf45d 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -884,7 +884,7 @@ static void psi_group_change(struct psi_group *group, int cpu, schedule_delayed_work(&group->avgs_work, PSI_FREQ); } -#ifdef CONFIG_CGROUP_CPUACCT +#if defined(CONFIG_CGROUP_CPUACCT) && defined(CONFIG_PSI_CGROUP_V1) static bool task_is_in_psi_v1(void) { if (static_branch_likely(&psi_v1_disabled)) -- Gitee From 79475df95eaacadc41a3f63bb32b9d38a1859e28 Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Tue, 2 Jan 2024 01:56:24 +0000 Subject: [PATCH 11/17] psi: add struct psi_group_ext hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW ------------------------------- Change struct psi_group directly will causes the kabi broken. Therefore, add a new struct psi_group_ext for new variables, which will be added in the next patch of pressure.stat. 
Signed-off-by: Lu Jialin --- include/linux/psi_types.h | 8 ++++++++ init/Kconfig | 10 ++++++++++ kernel/sched/psi.c | 38 +++++++++++++++++++++++++++++++++++++- 3 files changed, 55 insertions(+), 1 deletion(-) diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index f1fd3a8044e0..e8058b9ae609 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -207,6 +207,14 @@ struct psi_group { u64 rtpoll_until; }; +#ifdef CONFIG_PSI_FINE_GRAINED +struct psi_group_ext { + struct psi_group psi; +}; +#else +struct psi_group_ext {}; +#endif /* CONFIG_PSI_FINE_GRAINED */ + #else /* CONFIG_PSI */ #define NR_PSI_RESOURCES 0 diff --git a/init/Kconfig b/init/Kconfig index 7592b0fad547..5da1fddb96d2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -677,6 +677,16 @@ config PSI_CGROUP_V1 Say N if unsure. +config PSI_FINE_GRAINED + bool "Support fine grained psi under cgroup v1 and system" + default n + depends on PSI + help + If set, fine grained pressure stall information tracking will + be used for cgroup v1 and system, such as memory reclaim, + memory compact and so on. + Say N if unsure. 
+ endmenu # "CPU/Task time and stats accounting" config CPU_ISOLATION diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 005813dbf45d..39c31c82175e 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -175,6 +175,24 @@ struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; +#ifdef CONFIG_PSI_FINE_GRAINED +/* System-level fine grained pressure and stall tracking */ +struct psi_group_ext psi_stat_system = { }; + +struct psi_group_ext *to_psi_group_ext(struct psi_group *psi) +{ + if (psi == &psi_system) + return &psi_stat_system; + else + return container_of(psi, struct psi_group_ext, psi); +} +#else +static inline struct psi_group_ext *to_psi_group_ext(struct psi_group *psi) +{ + return NULL; +} +#endif + static void psi_avgs_work(struct work_struct *work); static void poll_timer_fn(struct timer_list *t); @@ -1133,16 +1151,30 @@ EXPORT_SYMBOL_GPL(psi_memstall_leave); #ifdef CONFIG_CGROUPS int psi_cgroup_alloc(struct cgroup *cgroup) { +#ifdef CONFIG_PSI_FINE_GRAINED + struct psi_group_ext *psi_ext; +#endif + if (!static_branch_likely(&psi_cgroups_enabled)) return 0; +#ifdef CONFIG_PSI_FINE_GRAINED + psi_ext = kzalloc(sizeof(struct psi_group_ext), GFP_KERNEL); + if (!psi_ext) + return -ENOMEM; + cgroup->psi = &psi_ext->psi; +#else cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL); if (!cgroup->psi) return -ENOMEM; - +#endif cgroup->psi->pcpu = alloc_percpu(struct psi_group_cpu); if (!cgroup->psi->pcpu) { +#ifdef CONFIG_PSI_FINE_GRAINED + kfree(psi_ext); +#else kfree(cgroup->psi); +#endif return -ENOMEM; } group_init(cgroup->psi); @@ -1159,7 +1191,11 @@ void psi_cgroup_free(struct cgroup *cgroup) free_percpu(cgroup->psi->pcpu); /* All triggers must be removed by now */ WARN_ONCE(cgroup->psi->rtpoll_states, "psi: trigger leak\n"); +#ifdef CONFIG_PSI_FINE_GRAINED + kfree(to_psi_group_ext(cgroup->psi)); +#else kfree(cgroup->psi); +#endif } /** -- Gitee From 5220ff3a1cfe9d53aafafd052a58f304fc6182c6 Mon Sep 17 00:00:00 2001 From: Lu Jialin 
Date: Tue, 2 Jan 2024 01:56:25 +0000 Subject: [PATCH 12/17] PSI: Introduce fine grained stall time collect for cgroup reclaim hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW ------------------------------- PSI will tracking pressure stall for memory, cpu, io and irq. But, there are differrnt pressure types which will cause memory pressure, memory.pressure could not show the type of pressure effectively. The same situation for cpu.pressure. Introduce pressure.stat in psi, which will monitor specific reasons for the memory.pressure and cpu.pressure, such as global/cgroup memory reclaim, memory compact, cpu cfs bandwidth and so on. Therefore, userland could make the right solution to reduce the pressure depends on the specific pressure reasons. This patch will introduce memory fine grained stall time collect for cgroup reclaim. Signed-off-by: Lu Jialin --- include/linux/psi_types.h | 34 +++++++++ include/linux/sched.h | 4 + kernel/sched/psi.c | 150 +++++++++++++++++++++++++++++++++++++- mm/memcontrol.c | 9 +++ 4 files changed, 194 insertions(+), 3 deletions(-) diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index e8058b9ae609..5994c545d250 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -208,8 +208,29 @@ struct psi_group { }; #ifdef CONFIG_PSI_FINE_GRAINED + +enum psi_stat_states { + PSI_MEMCG_RECLAIM_SOME, + PSI_MEMCG_RECLAIM_FULL, + NR_PSI_STAT_STATES, +}; + +enum psi_stat_task_count { + NR_MEMCG_RECLAIM, + NR_MEMCG_RECLAIM_RUNNING, + NR_PSI_STAT_TASK_COUNTS, +}; + +struct psi_group_stat_cpu { + u32 state_mask; + u32 times[NR_PSI_STAT_STATES]; + u32 psi_delta; + unsigned int tasks[NR_PSI_STAT_TASK_COUNTS]; +}; + struct psi_group_ext { struct psi_group psi; + struct psi_group_stat_cpu __percpu *pcpu; }; #else struct psi_group_ext {}; @@ -223,4 +244,17 @@ struct psi_group { }; #endif /* CONFIG_PSI */ +/* + * one type should have two task stats: regular running and memstall + * 
threads. The reason is the same as NR_MEMSTALL_RUNNING. + * Because of the psi_memstall_type is start with 1, the correspondence + * between psi_memstall_type and psi_stat_task_count should be as below: + * + * memstall : psi_memstall_type * 2 - 2; + * running : psi_memstall_type * 2 - 1; + */ +enum psi_memstall_type { + PSI_MEMCG_RECLAIM = 1, +}; + #endif /* _LINUX_PSI_TYPES_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index fe8556ff7fb3..557bfd4c6eac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1547,6 +1547,10 @@ struct task_struct { const cpumask_t *select_cpus; #endif +#ifdef CONFIG_PSI_FINE_GRAINED + int memstall_type; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 39c31c82175e..80783ac9850c 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -177,7 +177,10 @@ struct psi_group psi_system = { #ifdef CONFIG_PSI_FINE_GRAINED /* System-level fine grained pressure and stall tracking */ -struct psi_group_ext psi_stat_system = { }; +static DEFINE_PER_CPU(struct psi_group_stat_cpu, system_stat_group_pcpu); +struct psi_group_ext psi_stat_system = { + .pcpu = &system_stat_group_pcpu, +}; struct psi_group_ext *to_psi_group_ext(struct psi_group *psi) { @@ -354,6 +357,109 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); } +#ifdef CONFIG_PSI_FINE_GRAINED + +static void record_stat_times(struct psi_group_ext *psi_ext, int cpu) +{ + struct psi_group_stat_cpu *ext_grpc = per_cpu_ptr(psi_ext->pcpu, cpu); + + u32 delta = ext_grpc->psi_delta; + + if (ext_grpc->state_mask & (1 << PSI_MEMCG_RECLAIM_SOME)) { + ext_grpc->times[PSI_MEMCG_RECLAIM_SOME] += delta; + if (ext_grpc->state_mask & (1 << PSI_MEMCG_RECLAIM_FULL)) + ext_grpc->times[PSI_MEMCG_RECLAIM_FULL] += delta; + } +} + +static bool test_fine_grained_stat(unsigned int 
*stat_tasks, + unsigned int nr_running, + enum psi_stat_states state) +{ + switch (state) { + case PSI_MEMCG_RECLAIM_SOME: + return unlikely(stat_tasks[NR_MEMCG_RECLAIM]); + case PSI_MEMCG_RECLAIM_FULL: + return unlikely(stat_tasks[NR_MEMCG_RECLAIM] && + nr_running == stat_tasks[NR_MEMCG_RECLAIM_RUNNING]); + default: + return false; + } +} + +static void psi_group_stat_change(struct psi_group *group, int cpu, + int clear, int set) +{ + int t; + u32 state_mask = 0; + enum psi_stat_states s; + struct psi_group_ext *psi_ext = to_psi_group_ext(group); + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + struct psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu); + + write_seqcount_begin(&groupc->seq); + + for (t = 0; clear; clear &= ~(1 << t), t++) + if (clear & (1 << t)) + ext_groupc->tasks[t]--; + for (t = 0; set; set &= ~(1 << t), t++) + if (set & (1 << t)) + ext_groupc->tasks[t]++; + for (s = 0; s < NR_PSI_STAT_STATES; s++) + if (test_fine_grained_stat(ext_groupc->tasks, + groupc->tasks[NR_RUNNING], s)) + state_mask |= (1 << s); + if (unlikely(groupc->state_mask & PSI_ONCPU) && + cpu_curr(cpu)->memstall_type) + state_mask |= (1 << (cpu_curr(cpu)->memstall_type * 2 - 1)); + + record_stat_times(psi_ext, cpu); + ext_groupc->state_mask = state_mask; + write_seqcount_end(&groupc->seq); +} + +static void update_psi_stat_delta(struct psi_group *group, int cpu, u64 now) +{ + struct psi_group_ext *psi_ext = to_psi_group_ext(group); + struct psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu); + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + + ext_groupc->psi_delta = now - groupc->state_start; +} + +static void psi_stat_flags_change(struct task_struct *task, int *stat_set, + int *stat_clear, int set, int clear) +{ + if (!task->memstall_type) + return; + + if (clear) { + if (clear & TSK_MEMSTALL) + *stat_clear |= 1 << (2 * task->memstall_type - 2); + if (clear & TSK_MEMSTALL_RUNNING) + *stat_clear |= 1 << (2 * 
task->memstall_type - 1); + } + if (set) { + if (set & TSK_MEMSTALL) + *stat_set |= 1 << (2 * task->memstall_type - 2); + if (set & TSK_MEMSTALL_RUNNING) + *stat_set |= 1 << (2 * task->memstall_type - 1); + } + if (!task->in_memstall) + task->memstall_type = 0; +} + +#else +static inline void psi_group_stat_change(struct psi_group *group, int cpu, + int clear, int set) {} +static inline void update_psi_stat_delta(struct psi_group *group, int cpu, + u64 now) {} +static inline void psi_stat_flags_change(struct task_struct *task, + int *stat_set, int *stat_clear, + int set, int clear) {} +static inline void record_stat_times(struct psi_group_ext *psi_ext, int cpu) {} +#endif + static void collect_percpu_times(struct psi_group *group, enum psi_aggregators aggregator, u32 *pchanged_states) @@ -955,17 +1061,22 @@ void psi_task_change(struct task_struct *task, int clear, int set) int cpu = task_cpu(task); struct psi_group *group; u64 now; + int stat_set = 0; + int stat_clear = 0; if (!task->pid) return; psi_flags_change(task, clear, set); + psi_stat_flags_change(task, &stat_set, &stat_clear, set, clear); now = cpu_clock(cpu); group = task_psi_group(task); do { + update_psi_stat_delta(group, cpu, now); psi_group_change(group, cpu, clear, set, now, true); + psi_group_stat_change(group, cpu, stat_clear, stat_set); } while ((group = group->parent)); } @@ -991,13 +1102,18 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, break; } + update_psi_stat_delta(group, cpu, now); psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); + psi_group_stat_change(group, cpu, 0, 0); } while ((group = group->parent)); } if (prev->pid) { int clear = TSK_ONCPU, set = 0; bool wake_clock = true; + int stat_set = 0; + int stat_clear = 0; + bool memstall_type_change = false; /* * When we're going to sleep, psi_dequeue() lets us @@ -1024,24 +1140,36 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, } psi_flags_change(prev, clear, set); + 
psi_stat_flags_change(prev, &stat_set, &stat_clear, set, clear); group = task_psi_group(prev); do { if (group == common) break; + update_psi_stat_delta(group, cpu, now); psi_group_change(group, cpu, clear, set, now, wake_clock); + psi_group_stat_change(group, cpu, stat_clear, stat_set); } while ((group = group->parent)); +#ifdef CONFIG_PSI_FINE_GRAINED + if (next->memstall_type != prev->memstall_type) + memstall_type_change = true; +#endif + /* * TSK_ONCPU is handled up to the common ancestor. If there are * any other differences between the two tasks (e.g. prev goes * to sleep, or only one task is memstall), finish propagating * those differences all the way up to the root. */ - if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { + if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU || + memstall_type_change) { clear &= ~TSK_ONCPU; - for (; group; group = group->parent) + for (; group; group = group->parent) { + update_psi_stat_delta(group, cpu, now); psi_group_change(group, cpu, clear, set, now, wake_clock); + psi_group_stat_change(group, cpu, stat_clear, stat_set); + } } } } @@ -1071,6 +1199,8 @@ void psi_account_irqtime(struct task_struct *task, u32 delta) write_seqcount_begin(&groupc->seq); + update_psi_stat_delta(group, cpu, now); + record_stat_times(to_psi_group_ext(group), cpu); record_times(groupc, now); groupc->times[PSI_IRQ_FULL] += delta; @@ -1093,6 +1223,9 @@ void psi_memstall_enter(unsigned long *flags) { struct rq_flags rf; struct rq *rq; +#ifdef CONFIG_PSI_FINE_GRAINED + unsigned long stat_flags = *flags; +#endif if (static_branch_likely(&psi_disabled)) return; @@ -1110,6 +1243,10 @@ void psi_memstall_enter(unsigned long *flags) rq = this_rq_lock_irq(&rf); current->in_memstall = 1; +#ifdef CONFIG_PSI_FINE_GRAINED + if (stat_flags) + current->memstall_type = stat_flags; +#endif psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING); rq_unlock_irq(rq, &rf); @@ -1162,6 +1299,11 @@ int psi_cgroup_alloc(struct cgroup *cgroup) psi_ext = 
kzalloc(sizeof(struct psi_group_ext), GFP_KERNEL); if (!psi_ext) return -ENOMEM; + psi_ext->pcpu = alloc_percpu(struct psi_group_stat_cpu); + if (!psi_ext->pcpu) { + kfree(psi_ext); + return -ENOMEM; + } cgroup->psi = &psi_ext->psi; #else cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL); @@ -1171,6 +1313,7 @@ int psi_cgroup_alloc(struct cgroup *cgroup) cgroup->psi->pcpu = alloc_percpu(struct psi_group_cpu); if (!cgroup->psi->pcpu) { #ifdef CONFIG_PSI_FINE_GRAINED + free_percpu(psi_ext->pcpu); kfree(psi_ext); #else kfree(cgroup->psi); @@ -1192,6 +1335,7 @@ void psi_cgroup_free(struct cgroup *cgroup) /* All triggers must be removed by now */ WARN_ONCE(cgroup->psi->rtpoll_states, "psi: trigger leak\n"); #ifdef CONFIG_PSI_FINE_GRAINED + free_percpu(to_psi_group_ext(cgroup->psi)->pcpu); kfree(to_psi_group_ext(cgroup->psi)); #else kfree(cgroup->psi); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2e80504a49c0..5983f461f91d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2421,6 +2421,9 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, memcg_memory_event(memcg, MEMCG_HIGH); +#ifdef CONFIG_PSI_FINE_GRAINED + pflags = PSI_MEMCG_RECLAIM; +#endif psi_memstall_enter(&pflags); nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, @@ -2692,6 +2695,9 @@ void mem_cgroup_handle_over_high(gfp_t gfp_mask) * schedule_timeout_killable sets TASK_KILLABLE). This means we don't * need to account for any ill-begotten jiffies to pay them off later. 
*/ +#ifdef CONFIG_PSI_FINE_GRAINED + pflags = PSI_MEMCG_RECLAIM; +#endif psi_memstall_enter(&pflags); schedule_timeout_killable(penalty_jiffies); psi_memstall_leave(&pflags); @@ -2753,6 +2759,9 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, memcg_memory_event(mem_over_limit, MEMCG_MAX); raised_max_event = true; +#ifdef CONFIG_PSI_FINE_GRAINED + pflags = PSI_MEMCG_RECLAIM; +#endif psi_memstall_enter(&pflags); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, gfp_mask, reclaim_options); -- Gitee From bc642e71d147d14a799d60eee9499c8690a1e20b Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Tue, 2 Jan 2024 01:56:26 +0000 Subject: [PATCH 13/17] PSI: Introduce avgs and total calculation for cgroup reclaim hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW ------------------------------- Introduce avgs and total calculation depend on the fine grained time collect in psi_avgs_works() for cgroup_reclaim. The results will be shown in pressure.stat, which will be done in the next patch. 
Signed-off-by: Lu Jialin --- include/linux/psi_types.h | 7 +++++ kernel/sched/psi.c | 62 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 5994c545d250..984aabee2c35 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -226,11 +226,18 @@ struct psi_group_stat_cpu { u32 times[NR_PSI_STAT_STATES]; u32 psi_delta; unsigned int tasks[NR_PSI_STAT_TASK_COUNTS]; + u32 times_delta; + u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STAT_STATES]; }; struct psi_group_ext { struct psi_group psi; struct psi_group_stat_cpu __percpu *pcpu; + /* Running fine grained pressure averages */ + u64 avg_total[NR_PSI_STAT_STATES]; + /* Total fine grained stall times and sampled pressure averages */ + u64 total[NR_PSI_AGGREGATORS][NR_PSI_STAT_STATES]; + unsigned long avg[NR_PSI_STAT_STATES][3]; }; #else struct psi_group_ext {}; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 80783ac9850c..43b0331663b2 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -270,6 +270,10 @@ static void get_recent_times(struct psi_group *group, int cpu, enum psi_aggregators aggregator, u32 *times, u32 *pchanged_states) { +#ifdef CONFIG_PSI_FINE_GRAINED + struct psi_group_ext *psi_ext = to_psi_group_ext(group); + struct psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu); +#endif struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); int current_cpu = raw_smp_processor_id(); unsigned int tasks[NR_PSI_TASK_COUNTS]; @@ -314,6 +318,10 @@ static void get_recent_times(struct psi_group *group, int cpu, *pchanged_states |= (1 << s); } +#ifdef CONFIG_PSI_FINE_GRAINED + ext_groupc->times_delta = now - state_start; +#endif + /* * When collect_percpu_times() from the avgs_work, we don't want to * re-arm avgs_work when all CPUs are IDLE. 
But the current CPU running @@ -449,6 +457,39 @@ static void psi_stat_flags_change(struct task_struct *task, int *stat_set, task->memstall_type = 0; } +static void get_recent_stat_times(struct psi_group *group, int cpu, + enum psi_aggregators aggregator, u32 *times) +{ + struct psi_group_ext *psi_ext = to_psi_group_ext(group); + struct psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu); + enum psi_stat_states s; + u32 delta; + + memcpy(times, ext_groupc->times, sizeof(ext_groupc->times)); + for (s = 0; s < NR_PSI_STAT_STATES; s++) { + if (ext_groupc->state_mask & (1 << s)) + times[s] += ext_groupc->times_delta; + delta = times[s] - ext_groupc->times_prev[aggregator][s]; + ext_groupc->times_prev[aggregator][s] = times[s]; + times[s] = delta; + } +} + +static void update_stat_averages(struct psi_group_ext *psi_ext, + unsigned long missed_periods, u64 period) +{ + int s; + + for (s = 0; s < NR_PSI_STAT_STATES; s++) { + u32 sample; + + sample = psi_ext->total[PSI_AVGS][s] - psi_ext->avg_total[s]; + if (sample > period) + sample = period; + psi_ext->avg_total[s] += sample; + calc_avgs(psi_ext->avg[s], missed_periods, sample, period); + } +} #else static inline void psi_group_stat_change(struct psi_group *group, int cpu, int clear, int set) {} @@ -458,12 +499,20 @@ static inline void psi_stat_flags_change(struct task_struct *task, int *stat_set, int *stat_clear, int set, int clear) {} static inline void record_stat_times(struct psi_group_ext *psi_ext, int cpu) {} +static inline void update_stat_averages(struct psi_group_ext *psi_ext, + unsigned long missed_periods, + u64 period) {} #endif static void collect_percpu_times(struct psi_group *group, enum psi_aggregators aggregator, u32 *pchanged_states) { +#ifdef CONFIG_PSI_FINE_GRAINED + u64 stat_delta[NR_PSI_STAT_STATES] = { 0 }; + u32 stat_times[NR_PSI_STAT_STATES] = { 0 }; + struct psi_group_ext *psi_ext = to_psi_group_ext(group); +#endif u64 deltas[NR_PSI_STATES - 1] = { 0, }; unsigned long nonidle_total = 
0; u32 changed_states = 0; @@ -492,6 +541,11 @@ static void collect_percpu_times(struct psi_group *group, for (s = 0; s < PSI_NONIDLE; s++) deltas[s] += (u64)times[s] * nonidle; +#ifdef CONFIG_PSI_FINE_GRAINED + get_recent_stat_times(group, cpu, aggregator, stat_times); + for (s = 0; s < NR_PSI_STAT_STATES; s++) + stat_delta[s] += (u64)stat_times[s] * nonidle; +#endif } /* @@ -511,6 +565,12 @@ static void collect_percpu_times(struct psi_group *group, group->total[aggregator][s] += div_u64(deltas[s], max(nonidle_total, 1UL)); +#ifdef CONFIG_PSI_FINE_GRAINED + for (s = 0; s < NR_PSI_STAT_STATES; s++) + psi_ext->total[aggregator][s] += + div_u64(stat_delta[s], max(nonidle_total, 1UL)); +#endif + if (pchanged_states) *pchanged_states = changed_states; } @@ -636,6 +696,7 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total, static u64 update_averages(struct psi_group *group, u64 now) { + struct psi_group_ext *psi_ext = to_psi_group_ext(group); unsigned long missed_periods = 0; u64 expires, period; u64 avg_next_update; @@ -684,6 +745,7 @@ static u64 update_averages(struct psi_group *group, u64 now) calc_avgs(group->avg[s], missed_periods, sample, period); } + update_stat_averages(psi_ext, missed_periods, period); return avg_next_update; } -- Gitee From 14badc7db9b2af2867dd9cc5f4a23be540ef9b0c Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Tue, 2 Jan 2024 01:56:27 +0000 Subject: [PATCH 14/17] PSI: Introduce pressure.stat in psi hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW ------------------------------- Introduce pressure.stat in psi for cgroupv1 and system, which will show the fine grained time tracking for cgroup memory reclaim. 
for example: /test # cat /tmp/cpuacct/test/pressure.stat cgroup_memory_reclaim some avg10=45.78 avg60=10.40 avg300=2.26 total=13491160 full avg10=0.00 avg60=0.00 avg300=0.00 total=0 Signed-off-by: Lu Jialin --- include/linux/psi.h | 4 +++ kernel/cgroup/cgroup.c | 12 +++++++++ kernel/sched/psi.c | 60 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+) diff --git a/include/linux/psi.h b/include/linux/psi.h index 9e5d49cf62d5..a01e5b857ba5 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -34,6 +34,10 @@ void psi_trigger_destroy(struct psi_trigger *t); __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait); +#ifdef CONFIG_PSI_FINE_GRAINED + int psi_stat_show(struct seq_file *s, struct psi_group *group); +#endif + #ifdef CONFIG_CGROUPS static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 24a657d130c1..7ee2260c1b9c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3920,6 +3920,13 @@ bool cgroup_psi_enabled(void) } #ifdef CONFIG_PSI_CGROUP_V1 +#ifdef CONFIG_PSI_FINE_GRAINED +static int cgroup_psi_stat_show(struct seq_file *seq, void *v) +{ + return psi_stat_show(seq, cgroup_psi(seq_css(seq)->cgroup)); +} +#endif + struct cftype cgroup_v1_psi_files[] = { { .name = "io.pressure", @@ -3955,6 +3962,11 @@ struct cftype cgroup_v1_psi_files[] = { .release = cgroup_pressure_release, }, #endif + { + .name = "pressure.stat", + .flags = CFTYPE_NO_PREFIX, + .seq_show = cgroup_psi_stat_show, + }, { } /* terminate */ }; #endif diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 43b0331663b2..40d7e2a11999 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1896,6 +1896,63 @@ static const struct proc_ops psi_cpu_proc_ops = { .proc_release = psi_fop_release, }; +#ifdef CONFIG_PSI_FINE_GRAINED +static const char *const psi_stat_names[] = { + "cgroup_memory_reclaim", +}; + +int psi_stat_show(struct seq_file *m, 
struct psi_group *group) +{ + struct psi_group_ext *psi_ext; + unsigned long avg[3] = {0, }; + int i, w; + bool is_full; + u64 now, total; + + if (static_branch_likely(&psi_disabled)) + return -EOPNOTSUPP; + + psi_ext = to_psi_group_ext(group); + mutex_lock(&group->avgs_lock); + now = sched_clock(); + collect_percpu_times(group, PSI_AVGS, NULL); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); + mutex_unlock(&group->avgs_lock); + for (i = 0; i < NR_PSI_STAT_STATES; i++) { + is_full = i % 2; + for (w = 0; w < 3; w++) + avg[w] = psi_ext->avg[i][w]; + total = div_u64(psi_ext->total[PSI_AVGS][i], NSEC_PER_USEC); + if (!is_full) + seq_printf(m, "%s\n", psi_stat_names[i / 2]); + seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", + is_full ? "full" : "some", + LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), + LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), + LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), + total); + } + return 0; +} +static int system_psi_stat_show(struct seq_file *m, void *v) +{ + return psi_stat_show(m, &psi_system); +} + +static int psi_stat_open(struct inode *inode, struct file *file) +{ + return single_open(file, system_psi_stat_show, NULL); +} + +static const struct proc_ops psi_stat_proc_ops = { + .proc_open = psi_stat_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = psi_fop_release, +}; +#endif + #ifdef CONFIG_IRQ_TIME_ACCOUNTING static int psi_irq_show(struct seq_file *m, void *v) { @@ -1932,6 +1989,9 @@ static int __init psi_proc_init(void) proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops); #ifdef CONFIG_IRQ_TIME_ACCOUNTING proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops); +#endif +#ifdef CONFIG_PSI_FINE_GRAINED + proc_create("pressure/stat", 0666, NULL, &psi_stat_proc_ops); #endif } return 0; -- Gitee From 7fcc6850f768cf4f98087de2506b9f568e9bb9da Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Tue, 2 Jan 2024 01:56:28 +0000 Subject: [PATCH 15/17] PSI: add 
more memory fine grained stall tracking in pressure.stat hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW ------------------------------- Introduce more memory fine grained stall tracking in pressure.stat, such as global memory reclaim, memory compact, memory async cgroup reclaim and swap. Signed-off-by: Lu Jialin --- block/blk-cgroup.c | 2 +- fs/btrfs/compression.c | 2 +- fs/erofs/zdata.c | 2 +- include/linux/psi_types.h | 20 ++++++++++++++++++ kernel/sched/psi.c | 44 +++++++++++++++++++++++++++++++++++++++ mm/compaction.c | 2 +- mm/filemap.c | 6 +++--- mm/memcontrol.c | 3 +++ mm/page_alloc.c | 6 ++++++ mm/page_io.c | 3 +++ mm/readahead.c | 12 ++++++++++- mm/vmscan.c | 5 ++++- 12 files changed, 98 insertions(+), 9 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index a1460948f663..a1f2f316e88d 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1834,7 +1834,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) */ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { - unsigned long pflags; + unsigned long pflags = 0; bool clamp; u64 now = ktime_to_ns(ktime_get()); u64 exp; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 8818ed5c390f..9b5cea00238c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -475,7 +475,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) u64 em_len; u64 em_start; struct extent_map *em; - unsigned long pflags; + unsigned long pflags = 0; int memstall = 0; blk_status_t ret; int ret2; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index a7e6847f6f8f..7b0cbe37e462 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1636,7 +1636,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct block_device *last_bdev; unsigned int nr_bios = 0; struct bio *bio = NULL; - unsigned long pflags; + unsigned long pflags = 0; int memstall = 0; /* diff --git 
a/include/linux/psi_types.h b/include/linux/psi_types.h index 984aabee2c35..d20a83184fd0 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -212,12 +212,28 @@ struct psi_group { enum psi_stat_states { PSI_MEMCG_RECLAIM_SOME, PSI_MEMCG_RECLAIM_FULL, + PSI_GLOBAL_RECLAIM_SOME, + PSI_GLOBAL_RECLAIM_FULL, + PSI_COMPACT_SOME, + PSI_COMPACT_FULL, + PSI_ASYNC_MEMCG_RECLAIM_SOME, + PSI_ASYNC_MEMCG_RECLAIM_FULL, + PSI_SWAP_SOME, + PSI_SWAP_FULL, NR_PSI_STAT_STATES, }; enum psi_stat_task_count { NR_MEMCG_RECLAIM, NR_MEMCG_RECLAIM_RUNNING, + NR_GLOBAL_RECLAIM, + NR_GLOBAL_RECLAIM_RUNNING, + NR_COMPACT, + NR_COMPACT_RUNNING, + NR_ASYNC_MEMCG_RECLAIM, + NR_ASYNC_MEMCG_RECLAIM_RUNNING, + NR_SWAP, + NR_SWAP_RUNNING, NR_PSI_STAT_TASK_COUNTS, }; @@ -262,6 +278,10 @@ struct psi_group { }; */ enum psi_memstall_type { PSI_MEMCG_RECLAIM = 1, + PSI_GLOBAL_RECLAIM, + PSI_COMPACT, + PSI_ASYNC_MEMCG_RECLAIM, + PSI_SWAP, }; #endif /* _LINUX_PSI_TYPES_H */ diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 40d7e2a11999..fd96b882604e 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -378,6 +378,26 @@ static void record_stat_times(struct psi_group_ext *psi_ext, int cpu) if (ext_grpc->state_mask & (1 << PSI_MEMCG_RECLAIM_FULL)) ext_grpc->times[PSI_MEMCG_RECLAIM_FULL] += delta; } + if (ext_grpc->state_mask & (1 << PSI_GLOBAL_RECLAIM_SOME)) { + ext_grpc->times[PSI_GLOBAL_RECLAIM_SOME] += delta; + if (ext_grpc->state_mask & (1 << PSI_GLOBAL_RECLAIM_FULL)) + ext_grpc->times[PSI_GLOBAL_RECLAIM_FULL] += delta; + } + if (ext_grpc->state_mask & (1 << PSI_COMPACT_SOME)) { + ext_grpc->times[PSI_COMPACT_SOME] += delta; + if (ext_grpc->state_mask & (1 << PSI_COMPACT_FULL)) + ext_grpc->times[PSI_COMPACT_FULL] += delta; + } + if (ext_grpc->state_mask & (1 << PSI_ASYNC_MEMCG_RECLAIM_SOME)) { + ext_grpc->times[PSI_ASYNC_MEMCG_RECLAIM_SOME] += delta; + if (ext_grpc->state_mask & (1 << PSI_ASYNC_MEMCG_RECLAIM_FULL)) + ext_grpc->times[PSI_ASYNC_MEMCG_RECLAIM_FULL] += 
delta; + } + if (ext_grpc->state_mask & (1 << PSI_SWAP_SOME)) { + ext_grpc->times[PSI_SWAP_SOME] += delta; + if (ext_grpc->state_mask & (1 << PSI_SWAP_FULL)) + ext_grpc->times[PSI_SWAP_FULL] += delta; + } } static bool test_fine_grained_stat(unsigned int *stat_tasks, @@ -390,6 +410,26 @@ static bool test_fine_grained_stat(unsigned int *stat_tasks, case PSI_MEMCG_RECLAIM_FULL: return unlikely(stat_tasks[NR_MEMCG_RECLAIM] && nr_running == stat_tasks[NR_MEMCG_RECLAIM_RUNNING]); + case PSI_GLOBAL_RECLAIM_SOME: + return unlikely(stat_tasks[NR_GLOBAL_RECLAIM]); + case PSI_GLOBAL_RECLAIM_FULL: + return unlikely(stat_tasks[NR_GLOBAL_RECLAIM] && + nr_running == stat_tasks[NR_GLOBAL_RECLAIM_RUNNING]); + case PSI_COMPACT_SOME: + return unlikely(stat_tasks[NR_COMPACT]); + case PSI_COMPACT_FULL: + return unlikely(stat_tasks[NR_COMPACT] && + nr_running == stat_tasks[NR_COMPACT_RUNNING]); + case PSI_ASYNC_MEMCG_RECLAIM_SOME: + return unlikely(stat_tasks[NR_ASYNC_MEMCG_RECLAIM]); + case PSI_ASYNC_MEMCG_RECLAIM_FULL: + return unlikely(stat_tasks[NR_ASYNC_MEMCG_RECLAIM] && + nr_running == stat_tasks[NR_ASYNC_MEMCG_RECLAIM_RUNNING]); + case PSI_SWAP_SOME: + return unlikely(stat_tasks[NR_SWAP]); + case PSI_SWAP_FULL: + return unlikely(stat_tasks[NR_SWAP] && + nr_running == stat_tasks[NR_SWAP_RUNNING]); default: return false; } @@ -1899,6 +1939,10 @@ static const struct proc_ops psi_cpu_proc_ops = { #ifdef CONFIG_PSI_FINE_GRAINED static const char *const psi_stat_names[] = { "cgroup_memory_reclaim", + "global_memory_reclaim", + "compact", + "cgroup_async_memory_reclaim", + "swap", }; int psi_stat_show(struct seq_file *m, struct psi_group *group) diff --git a/mm/compaction.c b/mm/compaction.c index 38c8d216c6a3..771e9629b95c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -3061,7 +3061,7 @@ static int kcompactd(void *p) pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1; while (!kthread_should_stop()) { - unsigned long pflags; + unsigned long pflags = 0; /* * Avoid the 
unnecessary wakeup for proactive compaction diff --git a/mm/filemap.c b/mm/filemap.c index 1c398edcfcaf..d0a2beabc68a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1227,7 +1227,7 @@ static inline int folio_wait_bit_common(struct folio *folio, int bit_nr, struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; - unsigned long pflags; + unsigned long pflags = 0; bool in_thrashing; if (bit_nr == PG_locked && @@ -1378,7 +1378,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; - unsigned long pflags; + unsigned long pflags = 0; bool in_thrashing; wait_queue_head_t *q; struct folio *folio = page_folio(pfn_swap_entry_to_page(entry)); @@ -2366,7 +2366,7 @@ static int filemap_read_folio(struct file *file, filler_t filler, struct folio *folio) { bool workingset = folio_test_workingset(folio); - unsigned long pflags; + unsigned long pflags = 0; int error; /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5983f461f91d..744c7726a544 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2461,6 +2461,9 @@ static void async_reclaim_high(struct mem_cgroup *memcg) return; } +#ifdef CONFIG_PSI_FINE_GRAINED + pflags = PSI_ASYNC_MEMCG_RECLAIM; +#endif psi_memstall_enter(&pflags); nr_pages = memcg_usage > safe_pages ? 
memcg_usage - safe_pages : MEMCG_CHARGE_BATCH; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f5b61c1060d1..798a9ec645c0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3518,6 +3518,9 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, if (!order) return NULL; +#ifdef CONFIG_PSI_FINE_GRAINED + pflags = PSI_COMPACT; +#endif psi_memstall_enter(&pflags); delayacct_compact_start(); noreclaim_flag = memalloc_noreclaim_save(); @@ -3787,6 +3790,9 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, unsigned long pflags; bool drained = false; +#ifdef CONFIG_PSI_FINE_GRAINED + pflags = PSI_GLOBAL_RECLAIM; +#endif psi_memstall_enter(&pflags); *did_some_progress = __perform_reclaim(gfp_mask, order, ac); if (unlikely(!(*did_some_progress))) diff --git a/mm/page_io.c b/mm/page_io.c index fe4c21af23f2..95c3616b5db3 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -509,6 +509,9 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) */ if (workingset) { delayacct_thrashing_start(&in_thrashing); +#ifdef CONFIG_PSI_FINE_GRAINED + pflags = PSI_SWAP; +#endif psi_memstall_enter(&pflags); } delayacct_swapin_start(); diff --git a/mm/readahead.c b/mm/readahead.c index 6925e6959fd3..e09919547c3b 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -152,8 +152,12 @@ static void read_pages(struct readahead_control *rac) if (!readahead_count(rac)) return; - if (unlikely(rac->_workingset)) + if (unlikely(rac->_workingset)) { +#ifdef CONFIG_PSI_FINE_GRAINED + rac->_pflags = 0; +#endif psi_memstall_enter(&rac->_pflags); + } blk_start_plug(&plug); if (aops->readahead) { @@ -803,6 +807,9 @@ void readahead_expand(struct readahead_control *ractl, if (unlikely(folio_test_workingset(folio)) && !ractl->_workingset) { ractl->_workingset = true; +#ifdef CONFIG_PSI_FINE_GRAINED + ractl->_pflags = 0; +#endif psi_memstall_enter(&ractl->_pflags); } ractl->_nr_pages++; @@ -830,6 +837,9 @@ void readahead_expand(struct readahead_control 
*ractl, if (unlikely(folio_test_workingset(folio)) && !ractl->_workingset) { ractl->_workingset = true; +#ifdef CONFIG_PSI_FINE_GRAINED + ractl->_pflags = 0; +#endif psi_memstall_enter(&ractl->_pflags); } ractl->_nr_pages++; diff --git a/mm/vmscan.c b/mm/vmscan.c index 7a676296af30..e98f3a25db40 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -7393,7 +7393,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) int i; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; - unsigned long pflags; + unsigned long pflags = 0; unsigned long nr_boost_reclaim; unsigned long zone_boosts[MAX_NR_ZONES] = { 0, }; bool boosted; @@ -8064,6 +8064,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in sc.gfp_mask); cond_resched(); +#ifdef CONFIG_PSI_FINE_GRAINED + pflags = PSI_GLOBAL_RECLAIM; +#endif psi_memstall_enter(&pflags); fs_reclaim_acquire(sc.gfp_mask); /* -- Gitee From b4ca7a1ce3a8178669fa4b619e18e7bb6a63cd6e Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Tue, 2 Jan 2024 01:56:29 +0000 Subject: [PATCH 16/17] add cpu fine grained stall tracking in pressure.stat hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW ------------------------------- Introduce cpu fine grained stall tracking(cpu cfs bandwidth or cpu qos) in pressure.stat. For cpu fine grained stall tracking, only "full" information in pressure.stat. 
for example: /test # cat /tmp/cpuacct/test/pressure.stat cgroup_memory_reclaim some avg10=0.00 avg60=0.00 avg300=0.00 total=0 full avg10=0.00 avg60=0.00 avg300=0.00 total=0 global_memory_reclaim some avg10=0.00 avg60=0.00 avg300=0.00 total=0 full avg10=0.00 avg60=0.00 avg300=0.00 total=0 compact some avg10=0.00 avg60=0.00 avg300=0.00 total=0 full avg10=0.00 avg60=0.00 avg300=0.00 total=0 cgroup_async_memory_reclaim some avg10=0.00 avg60=0.00 avg300=0.00 total=0 full avg10=0.00 avg60=0.00 avg300=0.00 total=0 swap some avg10=0.00 avg60=0.00 avg300=0.00 total=0 full avg10=0.00 avg60=0.00 avg300=0.00 total=0 cpu_cfs_bandwidth full avg10=21.76 avg60=4.58 avg300=0.98 total=3893827 cpu_qos full avg10=0.00 avg60=0.00 avg300=0.00 total=0 Signed-off-by: Lu Jialin --- include/linux/psi_types.h | 8 +++++ kernel/sched/fair.c | 6 ---- kernel/sched/psi.c | 75 ++++++++++++++++++++++++++++++++++++--- kernel/sched/stats.h | 8 +++++ 4 files changed, 86 insertions(+), 11 deletions(-) diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index d20a83184fd0..bd2a28224910 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -220,6 +220,10 @@ enum psi_stat_states { PSI_ASYNC_MEMCG_RECLAIM_FULL, PSI_SWAP_SOME, PSI_SWAP_FULL, + PSI_CPU_CFS_BANDWIDTH_FULL, +#ifdef CONFIG_QOS_SCHED + PSI_CPU_QOS_FULL, +#endif NR_PSI_STAT_STATES, }; @@ -237,6 +241,8 @@ enum psi_stat_task_count { NR_PSI_STAT_TASK_COUNTS, }; +#define CPU_CFS_BANDWIDTH 1 + struct psi_group_stat_cpu { u32 state_mask; u32 times[NR_PSI_STAT_STATES]; @@ -244,6 +250,8 @@ struct psi_group_stat_cpu { unsigned int tasks[NR_PSI_STAT_TASK_COUNTS]; u32 times_delta; u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STAT_STATES]; + int prev_throttle; + int cur_throttle; }; struct psi_group_ext { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 640c0a73e73a..d9d0ad0d8405 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -131,12 +131,6 @@ int __weak arch_asym_cpu_priority(int cpu) #ifdef 
CONFIG_QOS_SCHED -/* - * To distinguish cfs bw, use QOS_THROTTLED mark cfs_rq->throttled - * when qos throttled(and cfs bw throttle mark cfs_rq->throttled as 1). - */ -#define QOS_THROTTLED 2 - static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq); static DEFINE_PER_CPU_SHARED_ALIGNED(struct hrtimer, qos_overload_timer); static DEFINE_PER_CPU(int, qos_cpu_overload); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index fd96b882604e..246b6182a9da 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -453,7 +453,7 @@ static void psi_group_stat_change(struct psi_group *group, int cpu, for (t = 0; set; set &= ~(1 << t), t++) if (set & (1 << t)) ext_groupc->tasks[t]++; - for (s = 0; s < NR_PSI_STAT_STATES; s++) + for (s = 0; s < PSI_CPU_CFS_BANDWIDTH_FULL; s++) if (test_fine_grained_stat(ext_groupc->tasks, groupc->tasks[NR_RUNNING], s)) state_mask |= (1 << s); @@ -544,6 +544,52 @@ static inline void update_stat_averages(struct psi_group_ext *psi_ext, u64 period) {} #endif +#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_CGROUP_CPUACCT) && \ + defined(CONFIG_PSI_FINE_GRAINED) +static void record_cpu_stat_times(struct psi_group *group, int cpu) +{ + struct psi_group_ext *psi_ext = to_psi_group_ext(group); + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + struct psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu); + u32 delta = ext_groupc->psi_delta; + + if (groupc->state_mask & (1 << PSI_CPU_FULL)) { + if (ext_groupc->prev_throttle == CPU_CFS_BANDWIDTH) + ext_groupc->times[PSI_CPU_CFS_BANDWIDTH_FULL] += delta; +#ifdef CONFIG_QOS_SCHED + else if (ext_groupc->prev_throttle == QOS_THROTTLED) + ext_groupc->times[PSI_CPU_QOS_FULL] += delta; +#endif + } +} + +static void update_throttle_type(struct task_struct *task, int cpu, bool next) +{ + struct cgroup *cpuacct_cgrp; + struct psi_group_ext *psi_ext; + struct psi_group_stat_cpu *groupc; + struct task_group *tsk_grp; + + if 
(!cgroup_subsys_on_dfl(cpuacct_cgrp_subsys)) { + rcu_read_lock(); + cpuacct_cgrp = task_cgroup(task, cpuacct_cgrp_id); + if (cgroup_parent(cpuacct_cgrp)) { + psi_ext = to_psi_group_ext(cgroup_psi(cpuacct_cgrp)); + groupc = per_cpu_ptr(psi_ext->pcpu, cpu); + tsk_grp = task_group(task); + if (next) + groupc->prev_throttle = groupc->cur_throttle; + groupc->cur_throttle = tsk_grp->cfs_rq[cpu]->throttled; + } + rcu_read_unlock(); + } +} +#else +static inline void record_cpu_stat_times(struct psi_group *group, int cpu) {} +static inline void update_throttle_type(struct task_struct *task, int cpu, + bool next) {} +#endif + static void collect_percpu_times(struct psi_group *group, enum psi_aggregators aggregator, u32 *pchanged_states) @@ -1072,8 +1118,10 @@ static void psi_group_change(struct psi_group *group, int cpu, * may have already incorporated the live state into times_prev; * avoid a delta sample underflow when PSI is later re-enabled. */ - if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE))) + if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE))) { record_times(groupc, now); + record_cpu_stat_times(group, cpu); + } groupc->state_mask = state_mask; @@ -1098,6 +1146,7 @@ static void psi_group_change(struct psi_group *group, int cpu, state_mask |= (1 << PSI_MEM_FULL); record_times(groupc, now); + record_cpu_stat_times(group, cpu); groupc->state_mask = state_mask; @@ -1190,6 +1239,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, u64 now = cpu_clock(cpu); if (next->pid) { + update_throttle_type(next, cpu, true); psi_flags_change(next, 0, TSK_ONCPU); /* * Set TSK_ONCPU on @next's cgroups. 
If @next shares any @@ -1217,6 +1267,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, int stat_clear = 0; bool memstall_type_change = false; + update_throttle_type(prev, cpu, false); /* * When we're going to sleep, psi_dequeue() lets us * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and @@ -1304,6 +1355,7 @@ void psi_account_irqtime(struct task_struct *task, u32 delta) update_psi_stat_delta(group, cpu, now); record_stat_times(to_psi_group_ext(group), cpu); record_times(groupc, now); + record_cpu_stat_times(group, cpu); groupc->times[PSI_IRQ_FULL] += delta; write_seqcount_end(&groupc->seq); @@ -1943,8 +1995,22 @@ static const char *const psi_stat_names[] = { "compact", "cgroup_async_memory_reclaim", "swap", + "cpu_cfs_bandwidth", + "cpu_qos", }; +static void get_stat_names(struct seq_file *m, int i, bool is_full) +{ + if (i <= PSI_SWAP_FULL && !is_full) + return seq_printf(m, "%s\n", psi_stat_names[i / 2]); + else if (i == PSI_CPU_CFS_BANDWIDTH_FULL) + return seq_printf(m, "%s\n", "cpu_cfs_bandwidth"); +#ifdef CONFIG_QOS_SCHED + else if (i == PSI_CPU_QOS_FULL) + return seq_printf(m, "%s\n", "cpu_qos"); +#endif +} + int psi_stat_show(struct seq_file *m, struct psi_group *group) { struct psi_group_ext *psi_ext; @@ -1964,12 +2030,11 @@ int psi_stat_show(struct seq_file *m, struct psi_group *group) group->avg_next_update = update_averages(group, now); mutex_unlock(&group->avgs_lock); for (i = 0; i < NR_PSI_STAT_STATES; i++) { - is_full = i % 2; + is_full = i % 2 || i > PSI_SWAP_FULL; for (w = 0; w < 3; w++) avg[w] = psi_ext->avg[i][w]; total = div_u64(psi_ext->total[PSI_AVGS][i], NSEC_PER_USEC); - if (!is_full) - seq_printf(m, "%s\n", psi_stat_names[i / 2]); + get_stat_names(m, i, is_full); seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", is_full ? 
"full" : "some", LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 38f3698f5e5b..9546cbf02d55 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -106,6 +106,14 @@ __schedstats_from_se(struct sched_entity *se) return &task_of(se)->stats; } +#ifdef CONFIG_QOS_SCHED +/* + * To distinguish cfs bw, use QOS_THROTTLED mark cfs_rq->throttled + * when qos throttled(and cfs bw throttle mark cfs_rq->throttled as 1). + */ +#define QOS_THROTTLED 2 +#endif + #ifdef CONFIG_PSI void psi_task_change(struct task_struct *task, int clear, int set); void psi_task_switch(struct task_struct *prev, struct task_struct *next, -- Gitee From 8d58fd1c3362dc4d0e573225c51c03774a5e6a1a Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Tue, 2 Jan 2024 01:56:30 +0000 Subject: [PATCH 17/17] psi: enable CONFIG_PSI_CGROUP_V1 in openeuler_defconfig hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW ------------------------------- enable CONFIG_PSI_CGROUP_V1 and CONFIG_PSI_FINE_GRAINED in openeuler_defconfig Signed-off-by: Lu Jialin --- arch/arm64/configs/openeuler_defconfig | 2 ++ arch/x86/configs/openeuler_defconfig | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index c5bb420feb86..36508b8dcefb 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -112,6 +112,8 @@ CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y CONFIG_PSI=y CONFIG_PSI_DEFAULT_DISABLED=y +CONFIG_PSI_CGROUP_V1=y +CONFIG_PSI_FINE_GRAINED=y # end of CPU/Task time and stats accounting CONFIG_CPU_ISOLATION=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 15479398d9a8..42fafd3bc8be 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -132,6 +132,8 @@ CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y CONFIG_PSI=y 
CONFIG_PSI_DEFAULT_DISABLED=y +CONFIG_PSI_CGROUP_V1=y +CONFIG_PSI_FINE_GRAINED=y # end of CPU/Task time and stats accounting CONFIG_CPU_ISOLATION=y -- Gitee