
Commit 5c4b240

Author: CKI KWF Bot (committed)

Merge: Sched: Topology: Speed improvements for online/offline of CPUs.

MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/7515
JIRA: https://issues.redhat.com/browse/RHEL-118964

Tested: HPE will do additional QA on large CPU-count systems, and perf QE will do performance regression testing, mostly on x86 and powerpc.

This is the rest of the commits needed for speeding up offline and online of CPUs, especially with very high CPU counts. The first part of this is in RHEL-1130303. There are also a handful of mostly architecture-specific commits needed to make this work.

Signed-off-by: Phil Auld <pauld@redhat.com>
Approved-by: Steve Best <sbest@redhat.com>
Approved-by: Rafael Aquini <raquini@redhat.com>
Approved-by: CKI KWF Bot <cki-ci-bot+kwf-gitlab-com@redhat.com>
Merged-by: CKI GitLab Kmaint Pipeline Bot <26919896-cki-kmaint-pipeline-bot@users.noreply.gitlab.com>

2 parents: b853040 + 5fa1290

File tree: 13 files changed (+224 additions, -216 deletions)

arch/Kconfig

Lines changed: 38 additions & 0 deletions

@@ -33,6 +33,44 @@ config HOTPLUG_SMT
 config SMT_NUM_THREADS_DYNAMIC
 	bool
 
+config ARCH_SUPPORTS_SCHED_SMT
+	bool
+
+config ARCH_SUPPORTS_SCHED_CLUSTER
+	bool
+
+config ARCH_SUPPORTS_SCHED_MC
+	bool
+
+config SCHED_SMT
+	bool "SMT (Hyperthreading) scheduler support"
+	depends on ARCH_SUPPORTS_SCHED_SMT
+	default y
+	help
+	  Improves the CPU scheduler's decision making when dealing with
+	  MultiThreading at a cost of slightly increased overhead in some
+	  places. If unsure say N here.
+
+config SCHED_CLUSTER
+	bool "Cluster scheduler support"
+	depends on ARCH_SUPPORTS_SCHED_CLUSTER
+	default y
+	help
+	  Cluster scheduler support improves the CPU scheduler's decision
+	  making when dealing with machines that have clusters of CPUs.
+	  Cluster usually means a couple of CPUs which are placed closely
+	  by sharing mid-level caches, last-level cache tags or internal
+	  busses.
+
+config SCHED_MC
+	bool "Multi-Core Cache (MC) scheduler support"
+	depends on ARCH_SUPPORTS_SCHED_MC
+	default y
+	help
+	  Multi-core scheduler support improves the CPU scheduler's decision
+	  making when dealing with multi-core CPU chips at a cost of slightly
+	  increased overhead in some places. If unsure say N here.
+
 # Selected by HOTPLUG_CORE_SYNC_DEAD or HOTPLUG_CORE_SYNC_FULL
 config HOTPLUG_CORE_SYNC
 	bool
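Note on the new gating: SCHED_SMT, SCHED_CLUSTER and SCHED_MC now live in the generic arch/Kconfig, only become visible when an architecture selects the matching ARCH_SUPPORTS_SCHED_* capability, and then default to y. For orientation, here is a minimal sketch of how generic scheduler code keys off the resulting option; it mirrors cpu_smt_flags() from include/linux/sched/topology.h, but treat the exact flag set as illustrative since it varies across kernel versions:

	#ifdef CONFIG_SCHED_SMT
	/* Sketch: SMT siblings share core capacity (and, in recent kernels, the LLC). */
	static inline int cpu_smt_flags(void)
	{
		return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC;
	}
	#endif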

arch/arm64/Kconfig

Lines changed: 3 additions & 23 deletions

@@ -99,6 +99,9 @@ config ARM64
 	select ARCH_SUPPORTS_PER_VMA_LOCK
 	select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE
 	select ARCH_SUPPORTS_RT
+	select ARCH_SUPPORTS_SCHED_SMT
+	select ARCH_SUPPORTS_SCHED_CLUSTER
+	select ARCH_SUPPORTS_SCHED_MC
 	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 	select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
 	select ARCH_WANT_DEFAULT_BPF_JIT
@@ -1448,29 +1451,6 @@ config CPU_LITTLE_ENDIAN
 
 endchoice
 
-config SCHED_MC
-	bool "Multi-core scheduler support"
-	help
-	  Multi-core scheduler support improves the CPU scheduler's decision
-	  making when dealing with multi-core CPU chips at a cost of slightly
-	  increased overhead in some places. If unsure say N here.
-
-config SCHED_CLUSTER
-	bool "Cluster scheduler support"
-	help
-	  Cluster scheduler support improves the CPU scheduler's decision
-	  making when dealing with machines that have clusters of CPUs.
-	  Cluster usually means a couple of CPUs which are placed closely
-	  by sharing mid-level caches, last-level cache tags or internal
-	  busses.
-
-config SCHED_SMT
-	bool "SMT scheduler support"
-	help
-	  Improves the CPU scheduler's decision making when dealing with
-	  MultiThreading at a cost of slightly increased overhead in some
-	  places. If unsure say N here.
-
 config NR_CPUS
 	int "Maximum number of CPUs (2-4096)"
 	range 2 4096

arch/powerpc/Kconfig

Lines changed: 3 additions & 8 deletions

@@ -146,6 +146,9 @@ config PPC
 	select ARCH_STACKWALK
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select ARCH_SUPPORTS_DEBUG_PAGEALLOC	if PPC_BOOK3S || PPC_8xx || 40x
+	select ARCH_SUPPORTS_SCHED_MC		if SMP
+	select ARCH_SUPPORTS_SCHED_SMT		if PPC64 && SMP
+	select SCHED_MC				if ARCH_SUPPORTS_SCHED_MC
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF		if PPC64
 	select ARCH_USE_MEMTEST
@@ -853,14 +856,6 @@ config PPC_PROT_SAO_LPAR
 config PPC_COPRO_BASE
 	bool
 
-config SCHED_SMT
-	bool "SMT (Hyperthreading) scheduler support"
-	depends on PPC64 && SMP
-	help
-	  SMT scheduler support improves the CPU scheduler's decision making
-	  when dealing with POWER5 cpus at a cost of slightly increased
-	  overhead in some places. If unsure say N here.
-
 config PPC_DENORMALISATION
 	bool "PowerPC denormalisation exception handling"
 	depends on PPC_BOOK3S_64

arch/powerpc/include/asm/topology.h

Lines changed: 2 additions & 0 deletions

@@ -131,6 +131,8 @@ static inline int cpu_to_coregroup_id(int cpu)
 #ifdef CONFIG_SMP
 #include <asm/cputable.h>
 
+struct cpumask *cpu_coregroup_mask(int cpu);
+
 #ifdef CONFIG_PPC64
 #include <asm/smp.h>

arch/powerpc/kernel/smp.c

Lines changed: 69 additions & 63 deletions

@@ -79,10 +79,10 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 };
 #endif
 
 struct task_struct *secondary_current;
-bool has_big_cores;
-bool coregroup_enabled;
-bool thread_group_shares_l2;
-bool thread_group_shares_l3;
+bool has_big_cores __ro_after_init;
+bool coregroup_enabled __ro_after_init;
+bool thread_group_shares_l2 __ro_after_init;
+bool thread_group_shares_l3 __ro_after_init;
 
 DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
@@ -95,15 +95,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
 EXPORT_SYMBOL_GPL(has_big_cores);
 
-enum {
-#ifdef CONFIG_SCHED_SMT
-	smt_idx,
-#endif
-	cache_idx,
-	mc_idx,
-	die_idx,
-};
-
 #define MAX_THREAD_LIST_SIZE	8
 #define THREAD_GROUP_SHARE_L1	1
 #define THREAD_GROUP_SHARE_L2_L3 2
@@ -1000,7 +991,7 @@ static int __init init_thread_group_cache_map(int cpu, int cache_property)
 	return 0;
 }
 
-static bool shared_caches;
+static bool shared_caches __ro_after_init;
 
 #ifdef CONFIG_SCHED_SMT
 /* cpumask of CPUs with asymmetric SMT dependency */
@@ -1016,6 +1007,13 @@ static int powerpc_smt_flags(void)
 }
 #endif
 
+/*
+ * On shared processor LPARs scheduled on a big core (which has two or more
+ * independent thread groups per core), prefer lower numbered CPUs, so
+ * that workload consolidates to lesser number of cores.
+ */
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(splpar_asym_pack);
+
 /*
  * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
  * This topology makes it *much* cheaper to migrate tasks between adjacent cores
@@ -1024,50 +1022,50 @@ static int powerpc_smt_flags(void)
  */
 static int powerpc_shared_cache_flags(void)
 {
+	if (static_branch_unlikely(&splpar_asym_pack))
+		return SD_SHARE_LLC | SD_ASYM_PACKING;
+
 	return SD_SHARE_LLC;
 }
 
+static int powerpc_shared_proc_flags(void)
+{
+	if (static_branch_unlikely(&splpar_asym_pack))
+		return SD_ASYM_PACKING;
+
+	return 0;
+}
+
 /*
  * We can't just pass cpu_l2_cache_mask() directly because
  * returns a non-const pointer and the compiler barfs on that.
 */
-static const struct cpumask *shared_cache_mask(int cpu)
+static const struct cpumask *tl_cache_mask(struct sched_domain_topology_level *tl, int cpu)
 {
 	return per_cpu(cpu_l2_cache_map, cpu);
 }
 
 #ifdef CONFIG_SCHED_SMT
-static const struct cpumask *smallcore_smt_mask(int cpu)
+static const struct cpumask *tl_smallcore_smt_mask(struct sched_domain_topology_level *tl, int cpu)
 {
 	return cpu_smallcore_mask(cpu);
 }
 #endif
 
-static struct cpumask *cpu_coregroup_mask(int cpu)
+struct cpumask *cpu_coregroup_mask(int cpu)
 {
 	return per_cpu(cpu_coregroup_map, cpu);
 }
 
 static bool has_coregroup_support(void)
 {
-	return coregroup_enabled;
-}
+	/* Coregroup identification not available on shared systems */
+	if (is_shared_processor())
+		return 0;
 
-static const struct cpumask *cpu_mc_mask(int cpu)
-{
-	return cpu_coregroup_mask(cpu);
+	return coregroup_enabled;
 }
 
-static struct sched_domain_topology_level powerpc_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
-	{ cpu_mc_mask, SD_INIT_NAME(MC) },
-	{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
-	{ NULL, },
-};
-
 static int __init init_big_cores(void)
 {
 	int cpu;
@@ -1456,7 +1454,7 @@ static bool update_mask_by_l2(int cpu, cpumask_var_t *mask)
 		return false;
 	}
 
-	cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu));
+	cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu));
 
 	/* Update l2-cache mask with all the CPUs that are part of submask */
 	or_cpumasks_related(cpu, cpu, submask_fn, cpu_l2_cache_mask);
@@ -1546,7 +1544,7 @@ static void update_coregroup_mask(int cpu, cpumask_var_t *mask)
 		return;
 	}
 
-	cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu));
+	cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu));
 
 	/* Update coregroup mask with all the CPUs that are part of submask */
 	or_cpumasks_related(cpu, cpu, submask_fn, cpu_coregroup_mask);
@@ -1609,7 +1607,7 @@ static void add_cpu_to_masks(int cpu)
 
 	/* If chip_id is -1; limit the cpu_core_mask to within PKG */
 	if (chip_id == -1)
-		cpumask_and(mask, mask, cpu_cpu_mask(cpu));
+		cpumask_and(mask, mask, cpu_node_mask(cpu));
 
 	for_each_cpu(i, mask) {
 		if (chip_id == cpu_to_chip_id(i)) {
@@ -1698,43 +1696,40 @@ int setup_profiling_timer(unsigned int multiplier)
 	return 0;
 }
 
-static void __init fixup_topology(void)
+static struct sched_domain_topology_level powerpc_topology[6];
+
+static void __init build_sched_topology(void)
 {
-	int i;
+	int i = 0;
+
+	if (is_shared_processor() && has_big_cores)
+		static_branch_enable(&splpar_asym_pack);
 
 #ifdef CONFIG_SCHED_SMT
 	if (has_big_cores) {
 		pr_info("Big cores detected but using small core scheduling\n");
-		powerpc_topology[smt_idx].mask = smallcore_smt_mask;
+		powerpc_topology[i++] =
+			SDTL_INIT(tl_smallcore_smt_mask, powerpc_smt_flags, SMT);
+	} else {
+		powerpc_topology[i++] = SDTL_INIT(tl_smt_mask, powerpc_smt_flags, SMT);
 	}
 #endif
+	if (shared_caches) {
+		powerpc_topology[i++] =
+			SDTL_INIT(tl_cache_mask, powerpc_shared_cache_flags, CACHE);
+	}
 
-	if (!has_coregroup_support())
-		powerpc_topology[mc_idx].mask = powerpc_topology[cache_idx].mask;
-
-	/*
-	 * Try to consolidate topology levels here instead of
-	 * allowing scheduler to degenerate.
-	 * - Dont consolidate if masks are different.
-	 * - Dont consolidate if sd_flags exists and are different.
-	 */
-	for (i = 1; i <= die_idx; i++) {
-		if (powerpc_topology[i].mask != powerpc_topology[i - 1].mask)
-			continue;
+	if (has_coregroup_support()) {
+		powerpc_topology[i++] =
+			SDTL_INIT(tl_mc_mask, powerpc_shared_proc_flags, MC);
+	}
 
-		if (powerpc_topology[i].sd_flags && powerpc_topology[i - 1].sd_flags &&
-		    powerpc_topology[i].sd_flags != powerpc_topology[i - 1].sd_flags)
-			continue;
+	powerpc_topology[i++] = SDTL_INIT(tl_pkg_mask, powerpc_shared_proc_flags, PKG);
 
-		if (!powerpc_topology[i - 1].sd_flags)
-			powerpc_topology[i - 1].sd_flags = powerpc_topology[i].sd_flags;
+	/* There must be one trailing NULL entry left. */
+	BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1);
 
-		powerpc_topology[i].mask = powerpc_topology[i + 1].mask;
-		powerpc_topology[i].sd_flags = powerpc_topology[i + 1].sd_flags;
-#ifdef CONFIG_SCHED_DEBUG
-		powerpc_topology[i].name = powerpc_topology[i + 1].name;
-#endif
-	}
+	set_sched_topology(powerpc_topology);
 }
 
 void __init smp_cpus_done(unsigned int max_cpus)
@@ -1749,9 +1744,20 @@ void __init smp_cpus_done(unsigned int max_cpus)
 		smp_ops->bringup_done();
 
 	dump_numa_cpu_topology();
+	build_sched_topology();
+}
 
-	fixup_topology();
-	set_sched_topology(powerpc_topology);
+/*
+ * For asym packing, by default lower numbered CPU has higher priority.
+ * On shared processors, pack to lower numbered core. However avoid moving
+ * between thread_groups within the same core.
+ */
+int arch_asym_cpu_priority(int cpu)
+{
+	if (static_branch_unlikely(&splpar_asym_pack))
+		return -cpu / threads_per_core;
+
+	return -cpu;
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
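The new arch_asym_cpu_priority() above is the key to the SPLPAR packing behavior: dividing the CPU number by threads_per_core gives every thread of a core the same priority, so asym packing consolidates work onto lower numbered cores without bouncing tasks between thread groups inside one core. A standalone C demo of that arithmetic (threads_per_core = 8 is an assumed example value here; on real hardware it comes from the device tree):

	#include <stdio.h>

	/* Mirrors the arch_asym_cpu_priority() math above; demo only. */
	static int asym_cpu_priority(int cpu, int threads_per_core)
	{
		/* C division truncates toward zero: CPUs 0..7 map to 0, 8..15 to -1, ... */
		return -cpu / threads_per_core;
	}

	int main(void)
	{
		for (int cpu = 0; cpu <= 20; cpu += 4)
			printf("cpu %2d -> priority %d\n", cpu, asym_cpu_priority(cpu, 8));
		return 0;
	}

This prints priority 0 for CPUs 0 and 4, -1 for 8 and 12, and -2 for 16 and 20; since a higher value means higher priority, whole lower numbered cores are preferred while siblings within a core tie.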

arch/s390/Kconfig

Lines changed: 2 additions & 6 deletions

@@ -501,15 +501,11 @@ config NODES_SHIFT
 	depends on NUMA
 	default "1"
 
-config SCHED_SMT
-	def_bool n
-
-config SCHED_MC
-	def_bool n
-
 config SCHED_TOPOLOGY
 	def_bool y
 	prompt "Topology scheduler support"
+	select ARCH_SUPPORTS_SCHED_SMT
+	select ARCH_SUPPORTS_SCHED_MC
 	select SCHED_SMT
 	select SCHED_MC
 	help
