Skip to content

Commit 2902893

Browse files
committed
Add support for ARM hybrid CPUs.
Specifically: - Add support for detecting ARM hybrid CPUs, via a heuristic on CPU "capacity". - Adjust ARM-specific event names as necessary, e.g. `armv8_pmuv3_0/instructions:u/` -> `instructions:u`. There is also some refactoring of the existing code for handling Intel hybrid architectures, e.g. merging `run_on_p_cores` into `performance_cores`, to avoid code duplication.
1 parent 1c128cc commit 2902893

File tree

5 files changed

+171
-87
lines changed

5 files changed

+171
-87
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

collector/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ serde = { workspace = true, features = ["derive"] }
1818
serde_json = { workspace = true }
1919
tokio = { workspace = true, features = ["rt", "process"] }
2020

21+
cfg-if = "1"
2122
thiserror = "2"
2223
tempfile = "3"
2324
libc = "0.2"

collector/src/compile/benchmark/target.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ use std::{fmt, str::FromStr};
55
/// https://doc.rust-lang.org/nightly/rustc/platform-support.html
66
///
77
/// Presently we only support x86_64
8+
/// FIXME: we actually support Windows and aarch64, but that isn't captured here.
89
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, serde::Deserialize)]
910
pub enum Target {
1011
/// `x86_64-unknown-linux-gnu`

collector/src/compile/execute/mod.rs

Lines changed: 168 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -135,81 +135,136 @@ pub struct CargoProcess<'a> {
135135
pub workspace_package: Option<String>,
136136
}
137137

138-
/// Returns an optional list of P-cores, if the system has P-cores and E-cores.
139-
/// This list *should* be in a format suitable for the `taskset` command.
140-
#[cfg(target_os = "linux")]
141-
fn performance_cores() -> Option<&'static String> {
142-
use std::sync::LazyLock;
143-
static PERFORMANCE_CORES: LazyLock<Option<String>> = LazyLock::new(|| {
144-
if std::fs::exists("/sys/devices/cpu")
145-
.expect("Could not check if `/sys/devices/cpu` exists")
146-
{
147-
// If /sys/devices/cpu exists, then this is not a hybrid CPU.
148-
None
149-
} else if std::fs::exists("/sys/devices/cpu_core")
150-
.expect("Could not check if `/sys/devices/cpu_core` exists!")
151-
{
152-
// If /sys/devices/cpu_core exists, then this is a hybrid CPU.
153-
eprintln!("WARNING: hybrid Intel CPU detected.");
154-
eprintln!("WARNING: test suite will only use P-cores, not E-cores");
155-
Some(
156-
std::fs::read_to_string("/sys/devices/cpu_core/cpus")
157-
.unwrap()
158-
.trim()
159-
.to_string(),
160-
)
161-
} else {
162-
// If neither dir exists, then something is wrong, because `/sys/devices/cpu` has been
163-
// in Linux for over a decade.
164-
eprintln!("WARNING: neither `/sys/devices/cpu` nor `/sys/devices/cpu_core` present");
165-
eprintln!("WARNING: unable to determine if CPU has a hybrid architecture");
166-
None
167-
}
168-
});
169-
(*PERFORMANCE_CORES).as_ref()
138+
// Some CPUs have a hybrid architecture with a mixture of P-cores (performance) and E-cores (efficiency).
139+
// When benchmarking we use `taskset` to restrict execution to P-cores. Why?
140+
// 1. The instruction count info for E-cores is often incomplete, and a substantial chunk of events
141+
// is lost.
142+
// 2. The performance characteristics of E-cores are less reliable, so excluding them from the
143+
// benchmark makes things easier.
144+
// 3. An unpredictable mix of P-core and E-core execution can give inconsistent results.
145+
//
146+
// If a hybrid architecture is detected, this type is used to hold information about the P-cores.
147+
// The detection method used varies across platforms.
148+
#[derive(Debug)]
149+
struct PCores {
150+
/// The number of P-cores.
151+
len: usize,
152+
/// The list of P-cores, in a form suitable for passing to `taskset`.
153+
list: String,
170154
}
171155

172-
#[cfg(not(target_os = "linux"))]
173-
// Modify this stub if you want to add support for P-/E-cores on more OSs
174-
fn performance_cores() -> Option<&'static String> {
175-
None
176-
}
156+
static P_CORES: LazyLock<Option<PCores>> = LazyLock::new(p_cores);
157+
158+
cfg_if::cfg_if! {
159+
if #[cfg(all(target_os = "linux", target_arch = "x86_64"))] {
160+
// On x86-64/Linux we look for the presence of `/sys/devices/cpu_core/` which indicates a
161+
// hybrid architecture.
162+
fn p_cores() -> Option<PCores> {
163+
if std::fs::exists("/sys/devices/cpu").unwrap() {
164+
// `/sys/devices/cpu` exists: this is not a hybrid CPU.
165+
None
166+
} else if std::fs::exists("/sys/devices/cpu_core").unwrap() {
167+
// `/sys/devices/cpu_core/` exists: this is a hybrid CPU, and the `cpus` file
168+
// within contains the list of P-cores. (`/sys/devices/cpu_atom/cpus` contains
169+
// the list of E-cores).
170+
let list =
171+
std::fs::read_to_string("/sys/devices/cpu_core/cpus")
172+
.unwrap()
173+
.trim()
174+
.to_string();
175+
eprintln!(
176+
"WARNING: hybrid Intel CPU detected; test suite will only use P-cores: {list}"
177+
);
178+
// Parse CPU list to extract the number of P-cores. This assumes the P-core ids are
179+
// contiguous, in format `m-n`.
180+
let (first, last) = list
181+
.split_once("-")
182+
.unwrap_or_else(|| panic!("unsupported P-core list format: {list:?}."));
183+
let first = first
184+
.parse::<usize>()
185+
.expect("expected a number at the start of the P-core list");
186+
let last = last
187+
.parse::<usize>()
188+
.expect("expected a number at the end of the P-core list");
189+
let len = last - first + 1; // e.g. "0-3" is four cores: [0, 1, 2, 3]
190+
Some(PCores { len, list })
191+
} else {
192+
// Neither dir exists: something is wrong, because `/sys/devices/cpu` has been
193+
// in Linux (on x86-64, at least) for over a decade.
194+
eprintln!(
195+
"WARNING: `/sys/devices/{{cpu,cpu_core}}` not found; \
196+
unable to determine if CPU has a hybrid architecture"
197+
);
198+
None
199+
}
200+
}
201+
} else if #[cfg(all(target_os = "linux", target_arch = "aarch64"))] {
202+
// On ARM64/Linux there is no definitive way to distinguish P-cores from E-cores, so we
203+
// must use a heuristic.
204+
//
205+
// Each core has a listed "capacity", a performance estimate relative to the most powerful
206+
// core in the system (scaled 0-1024). For example, an ASUS GX10 Ascent has a Cortex-X925
207+
// with 10 P-cores and a Cortex-A725 with 10 E-cores. The reported capacities are:
208+
// * Cores 0- 4: 718 (E-cores in cluster 1 with 8MiB L3 cache)
209+
// * Cores 5- 9: 997 (P-cores in cluster 1 with 8MiB L3 cache)
210+
// * Cores 10-14: 731 (E-cores in cluster 2 with 16MiB L3 cache)
211+
// * Cores 15-18: 1017 (P-cores in cluster 2 with 16MiB L3 cache)
212+
// * Core 19: 1024 (P-core in cluster 2 with 16MiB L3 cache)
213+
//
214+
// The heuristic is that any core with a capacity at least 90% of the maximum capacity is
215+
// considered a P-core, and any other core is considered an E-core. (The 718/731 and
216+
// 997/1017 differences are presumably due to the L3 cache size. The reason for the
217+
// 1017/1024 difference is unclear. Even though the P-cores are not all identical, they are
218+
// close enough for our purposes.)
219+
fn p_cores() -> Option<PCores> {
220+
let mut caps = vec![];
221+
for i in 0.. {
222+
let path = format!("/sys/devices/system/cpu/cpu{i}/cpu_capacity");
223+
if !std::fs::exists(&path).unwrap() {
224+
break;
225+
}
226+
let cap = std::fs::read_to_string(&path).unwrap().trim().parse::<usize>().unwrap();
227+
caps.push((i, cap));
228+
}
177229

178-
#[cfg(target_os = "linux")]
179-
/// Makes the benchmark run only on Performance cores.
180-
fn run_on_p_cores(path: &Path, cpu_list: &str) -> Command {
181-
// Parse CPU list to extract the number of P-cores!
182-
// This assumes the P-core id's are continuous, in format `first_id-last_id`
183-
let (core_start, core_end) = cpu_list
184-
.split_once("-")
185-
.unwrap_or_else(|| panic!("Unsupported P-core list format: {cpu_list:?}."));
186-
let core_start: u32 = core_start
187-
.parse()
188-
.expect("Expected a number when parsing the start of the P-core list!");
189-
let core_end: u32 = core_end
190-
.parse()
191-
.expect("Expected a number when parsing the end of the P-core list!");
192-
let core_count = core_end - core_start + 1; // e.g. "0-3" is four cores: [0, 1, 2, 3]
193-
194-
let mut cmd = Command::new("taskset");
195-
// Set job count to P-core count. This is done for 3 reasons:
196-
// 1. The instruction count info for E-cores is often incomplete, and a substantial chunk of
197-
// events is lost.
198-
// 2. The performance characteristics of E-cores are less reliable, so excluding them from the
199-
// benchmark makes things easier.
200-
// 3. An unpredictable mix of P-core and E-core execution will give inconsistent results.
201-
cmd.env("CARGO_BUILD_JOBS", format!("{core_count}"));
202-
// Pass the P-core list to taskset to pin task to the P-core.
203-
cmd.arg("--cpu-list");
204-
cmd.arg(cpu_list);
205-
cmd.arg(path);
206-
cmd
207-
}
230+
if let Some(max_cap) = caps.iter().map(|(_, cap)| cap).max() {
231+
// Filter out cores that fail the 90% capacity check.
232+
let cap_threshold = *max_cap as f64 * 0.9;
233+
let p_cores: Vec<_> = caps.iter().filter_map(|(i, cap)| {
234+
if *cap as f64 >= cap_threshold {
235+
Some(i.to_string())
236+
} else {
237+
None
238+
}
239+
}).collect();
208240

209-
#[cfg(not(target_os = "linux"))]
210-
// Modify this stub if you want to add support for P-cores/E-cores on more OSs.
211-
fn run_on_p_cores(_path: &Path, _cpu_list: &str) -> Command {
212-
todo!("Can't run commands on the P-cores on this platform");
241+
if p_cores.len() == caps.len() {
242+
// All cores have roughly the same capacity; this is not a hybrid CPU.
243+
None
244+
} else {
245+
let list = p_cores.join(",");
246+
eprintln!(
247+
"WARNING: hybrid ARM CPU detected; test suite will only use P-cores: {list}"
248+
);
249+
Some(PCores {
250+
len: p_cores.len(),
251+
list,
252+
})
253+
}
254+
} else {
255+
eprintln!(
256+
"WARNING: `/sys/devices/system/cpu/cpu*/cpu_capacity` not found; \
257+
unable to determine if CPU has a hybrid architecture"
258+
);
259+
None
260+
}
261+
}
262+
} else {
263+
// Modify this stub if you want to add support for hybrid architectures on more platforms.
264+
fn p_cores() -> Option<PCores> {
265+
None
266+
}
267+
}
213268
}
214269

215270
impl<'a> CargoProcess<'a> {
@@ -230,11 +285,17 @@ impl<'a> CargoProcess<'a> {
230285
}
231286

232287
fn base_command(&self, cwd: &Path, subcommand: &str) -> Command {
233-
// Processors with P-core and E-cores require special handling.
234-
let mut cmd = if let Some(p_cores) = performance_cores() {
235-
run_on_p_cores(Path::new(&self.toolchain.components.cargo), p_cores)
288+
let cargo_path = Path::new(&self.toolchain.components.cargo);
289+
let mut cmd = if let Some(p_cores) = (*P_CORES).as_ref() {
290+
// Processors with P-cores and E-cores require special handling.
291+
let mut cmd = Command::new("taskset");
292+
cmd.env("CARGO_BUILD_JOBS", p_cores.len.to_string());
293+
cmd.arg("--cpu-list");
294+
cmd.arg(&p_cores.list);
295+
cmd.arg(cargo_path);
296+
cmd
236297
} else {
237-
Command::new(Path::new(&self.toolchain.components.cargo))
298+
Command::new(cargo_path)
238299
};
239300
cmd
240301
// Not all cargo invocations (e.g. `cargo clean`) need all of these
@@ -620,6 +681,11 @@ fn process_stat_output(
620681
let stdout = String::from_utf8(output.stdout.clone()).expect("utf8 output");
621682
let mut stats = Stats::new();
622683

684+
// ARM P-core events have names like `armv8_pmuv3_0/instructions:u/` and
685+
// `armv8_pmuv3_1/branch-misses/`.
686+
#[cfg(all(target_os = "linux", target_arch = "aarch64"))]
687+
let arm_p_core_events_re = regex::Regex::new(r"armv[0-9]_pmuv[0-9]_[0-9]/([^/]*)/").unwrap();
688+
623689
let mut self_profile_dir: Option<PathBuf> = None;
624690
let mut self_profile_crate: Option<String> = None;
625691
for line in stdout.lines() {
@@ -670,24 +736,43 @@ fn process_stat_output(
670736
}
671737
};
672738
}
739+
673740
let mut parts = line.split(';').map(|s| s.trim());
674741
let cnt = get!(parts.next());
742+
if cnt == "<not supported>" || cnt == "<not counted>" || cnt.is_empty() {
743+
continue;
744+
}
745+
675746
let _unit = get!(parts.next());
676-
let mut name = get!(parts.next());
677-
// Map P-core events to normal events
678-
if name == "cpu_core/instructions:u/" {
679-
name = "instructions:u";
747+
748+
#[allow(unused_mut)]
749+
let mut name = get!(parts.next()).to_string();
750+
// Map P-core event names to normal event names.
751+
cfg_if::cfg_if! {
752+
if #[cfg(all(target_os = "linux", target_arch = "x86_64"))] {
753+
if name == "cpu_core/instructions:u/" {
754+
name = "instructions:u".to_string();
755+
}
756+
} else if #[cfg(all(target_os = "linux", target_arch = "aarch64"))] {
757+
// ARM P-core events have names like `armv8_pmuv3_0/instructions:u/` and
758+
// `armv8_pmuv3_1/branch-misses/`.
759+
if let Some(event) = arm_p_core_events_re.captures(&name) {
760+
name = event[1].to_string();
761+
}
762+
}
680763
}
764+
681765
let _time = get!(parts.next());
766+
682767
let pct = get!(parts.next());
683-
if cnt == "<not supported>" || cnt == "<not counted>" || cnt.is_empty() {
684-
continue;
685-
}
686768
if !pct.starts_with("100.") {
769+
// If this fails, it's probably because the CPU has a hybrid architecture and the
770+
// metric is split across P-cores and E-cores. See `PCores`.
687771
panic!("measurement of `{name}` only active for {pct}% of the time");
688772
}
773+
689774
stats.insert(
690-
name.to_owned(),
775+
name,
691776
cnt.parse()
692777
.map_err(|e| DeserializeStatError::ParseError(cnt.to_string(), e))?,
693778
);

site/frontend/package-lock.json

Lines changed: 0 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)