Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ cpuid_profile_config = { path = "crates/cpuid-profile-config" }
dladm = { path = "crates/dladm" }
nvpair = { path = "crates/nvpair" }
nvpair_sys = { path = "crates/nvpair/sys" }
pbind = { path = "crates/pbind" }
propolis-config-toml = { path = "crates/propolis-config-toml" }
propolis_api_types = { path = "crates/propolis-api-types" }
propolis-server-api = { path = "crates/propolis-server-api" }
Expand Down
1 change: 1 addition & 0 deletions bin/propolis-server/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ oxide-tokio-rt.workspace = true
oximeter-instruments.workspace = true
oximeter-producer.workspace = true
oximeter.workspace = true
pbind.workspace = true
ron.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["full"] }
Expand Down
27 changes: 26 additions & 1 deletion bin/propolis-server/src/lib/vcpu_tasks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ use thiserror::Error;
pub enum VcpuTaskError {
#[error("Failed to spawn a vCPU backing thread: {0}")]
BackingThreadSpawnFailed(std::io::Error),
#[error("CPU bindings did not match vCPUs: {bindings} bindings for {vcpus} vCPUs")]
CpuBindingMismatch { bindings: usize, vcpus: usize },
}

pub struct VcpuTasks {
Expand All @@ -41,11 +43,30 @@ impl VcpuTasks {
pub(crate) fn new(
machine: &propolis::Machine,
event_handler: Arc<dyn super::vm::guest_event::VcpuEventHandler>,
bind_cpus: Option<Vec<pbind::processorid_t>>,
log: slog::Logger,
) -> Result<Self, VcpuTaskError> {
let generation = Arc::new(AtomicUsize::new(0));

// We take in an `Option<Vec<..>>` but a `Vec<Option<..>>` is more
// convenient for spawning below, so we have to shuffle values a bit..
let mut bindings = vec![None; machine.vcpus.len()];
if let Some(bind_cpus) = bind_cpus {
if bind_cpus.len() != machine.vcpus.len() {
return Err(VcpuTaskError::CpuBindingMismatch {
bindings: bind_cpus.len(),
vcpus: machine.vcpus.len(),
});
}
for i in 0..machine.vcpus.len() {
bindings[i] = Some(bind_cpus[i]);
}
}

let mut tasks = Vec::new();
for vcpu in machine.vcpus.iter().map(Arc::clone) {
for (vcpu, bind_cpu) in
machine.vcpus.iter().map(Arc::clone).zip(bindings.into_iter())
{
let (task, ctrl) =
propolis::tasks::TaskHdl::new_held(Some(vcpu.barrier_fn()));
let task_log = log.new(slog::o!("vcpu" => vcpu.id));
Expand All @@ -54,6 +75,10 @@ impl VcpuTasks {
let thread = std::thread::Builder::new()
.name(format!("vcpu-{}", vcpu.id))
.spawn(move || {
if let Some(bind_cpu) = bind_cpu {
pbind::bind_lwp(bind_cpu)
.expect("can bind to specified CPU");
}
Self::vcpu_loop(
vcpu.as_ref(),
task,
Expand Down
39 changes: 39 additions & 0 deletions bin/propolis-server/src/lib/vm/ensure.rs
Original file line number Diff line number Diff line change
Expand Up @@ -581,9 +581,48 @@ async fn initialize_vm_objects(

init.register_guest_hv_interface(guest_hv_lifecycle);
init.initialize_cpus().await?;

let total_cpus = pbind::online_cpus()?;
let vcpu_count: i32 = machine
.vcpus
.len()
.try_into()
.map_err(|_| anyhow::anyhow!("more than 2^31 vCPUs"))?;

// When a VM has I/O-heavy workloads across many cores, vCPUs can end up
// moving across the host and come with wasted time in juggling cyclics.
//
// Nexus can't yet determine CPU binding assignments in a central manner. In
// lieu of it, knowing that Nexus won't oversubscribe a host we can
// autonomously follow a CPU pinning strategy as: "if the VM is larger than
// half of the sled, there can only be one, so bind it to specific CPUs and
// rely on the OS to sort out the rest". This gets the largest VMs to fixed
// vCPU->CPU assignments, which also are most likely to benefit.
let cpu_threshold = total_cpus / 2;
let bind_cpus = if vcpu_count > cpu_threshold {
if vcpu_count > total_cpus {
anyhow::bail!("spec requested more CPUs than are online!");
}

// Bind to the upper range of CPUs, fairly arbitrary.
let first_bind_cpu = total_cpus - vcpu_count;
let bind_cpus = (first_bind_cpu..total_cpus).collect();

info!(log, "applying automatic vCPU->CPU binding";
"vcpu_count" => vcpu_count,
"total_cpus" => total_cpus,
"threshold" => cpu_threshold,
"vcpu_cpus" => #?bind_cpus);

Some(bind_cpus)
} else {
None
};

let vcpu_tasks = Box::new(crate::vcpu_tasks::VcpuTasks::new(
&machine,
event_queue.clone() as Arc<dyn super::guest_event::VcpuEventHandler>,
bind_cpus,
log.new(slog::o!("component" => "vcpu_tasks")),
)?);

Expand Down
1 change: 1 addition & 0 deletions bin/propolis-standalone/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ slog-term.workspace = true
strum = { workspace = true, features = ["derive"] }
tar.workspace = true
uuid.workspace = true
pbind.workspace = true

[features]
default = []
Expand Down
20 changes: 20 additions & 0 deletions bin/propolis-standalone/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ pub struct Main {
pub memory: usize,
pub use_reservoir: Option<bool>,
pub cpuid_profile: Option<String>,
/// How vCPUs should be bound to physical processors, if at all. If not
/// provided, vCPUs are not bound (equivalent to setting `any`).
pub cpu_binding: Option<BindingStrategy>,
/// Process exitcode to emit if/when instance halts
///
/// Default: 0
Expand All @@ -69,6 +72,23 @@ pub struct Main {
pub boot_order: Option<Vec<String>>,
}

#[derive(Copy, Clone, Debug, Deserialize, Serialize)]
#[serde(rename_all = "kebab-case")]
pub enum BindingStrategy {
/// vCPUs are not bound to any particular physical processor.
Any,
/// vCPUs are bound to the highest-numbered processors in the system, with
/// the first vCPU bound to CPU `last - N_vCPU` and the last vCPU bound to
/// the last CPU.
///
/// An example given a system with 10 CPUs running a VM with 6 vCPUs:
/// ```text
/// host CPU number: | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
/// guest vCPU number: | | | | | 0 | 1 | 2 | 3 | 4 | 5 |
/// ```
UpperHalf,
}

/// A hard-coded device, either enabled by default or accessible locally
/// on a machine.
#[derive(Clone, Debug, Deserialize, Serialize)]
Expand Down
34 changes: 33 additions & 1 deletion bin/propolis-standalone/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,36 @@ impl Instance {
let state = &mut *state_guard;
let machine = state.machine.as_ref().unwrap();

for vcpu in machine.vcpus.iter().map(Arc::clone) {
let bind_cpus = match this.0.config.main.cpu_binding {
Some(config::BindingStrategy::UpperHalf) => {
let total_cpus =
pbind::online_cpus().expect("can get processor count");
let vcpu_count: i32 = machine
.vcpus
.len()
.try_into()
.expect("vCPU count <= MAXCPU < i32::MAX");

let first_bound_cpu = total_cpus - vcpu_count;
let bind_cpus =
(first_bound_cpu..total_cpus).map(Some).collect();
slog::info!(
&log,
"Explicit CPU binding requested";
"last_cpu" => total_cpus,
"first_cpu" => first_bound_cpu,
"vcpu_count" => vcpu_count,
);
bind_cpus
}
Some(config::BindingStrategy::Any) | None => {
vec![None; machine.vcpus.len()]
}
};

for (vcpu, bind_cpu) in
machine.vcpus.iter().map(Arc::clone).zip(bind_cpus.into_iter())
{
let (task, ctrl) =
propolis::tasks::TaskHdl::new_held(Some(vcpu.barrier_fn()));

Expand All @@ -295,6 +324,9 @@ impl Instance {
let _ = std::thread::Builder::new()
.name(format!("vcpu-{}", vcpu.id))
.spawn(move || {
if let Some(bind_cpu) = bind_cpu {
pbind::bind_lwp(bind_cpu).expect("can bind vcpu");
}
Instance::vcpu_loop(inner, vcpu.as_ref(), &task, task_log)
})
.unwrap();
Expand Down
8 changes: 8 additions & 0 deletions crates/pbind/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
[package]
name = "pbind"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hm, since this is a low-level binding to the C headers, should this perhaps be pbind-sys?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i was on the fence tbh, since this includes a bit more than just definitions from the headers :( idk

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, iunno.

version = "0.0.0"
license = "MPL-2.0"
edition = "2021"

[dependencies]
libc.workspace = true
102 changes: 102 additions & 0 deletions crates/pbind/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

// C-style type names follow, opt out of warnings for using names from headers.
#![allow(non_camel_case_types)]

//! Utility functions for binding LWPs to specific CPUs.
//!
//! This is generally a very light wrapper for illumos' `sysconf(3c)` and
//! `processor_bind(2)`, plus a few constants out of related headers.

use std::io::Error;

// From `<sys/types.h>`
pub type id_t = i32;

// From `<sys/processor.h>`
pub type processorid_t = i32;

// From `<sys/procset.h>`
pub type idtype_t = i32;

/// The enum values `idtype_t` can be. This is separate to be more explicit that
/// idtype_t is the ABI type, but is `repr(i32)` to make casting to `idtype_t`
/// trivial.
#[allow(non_camel_case_types)]
#[repr(i32)]
pub enum IdType {
P_PID,
P_PPID,
P_PGID,
P_SID,
P_CID,
P_UID,
P_GID,
P_ALL,
P_LWPID,
P_TASKID,
P_PROJID,
P_POOLID,
P_ZONEID,
P_CTID,
P_CPUID,
P_PSETID,
}

/// Returns an `i32` to match `processorid_t`, so that `0..online_cpus()`
/// produces a range of processor IDs without additional translation needed.
///
/// This is really just a wrapper for `sysconf(_SC_NPROCESSORS_ONLN)`.
pub fn online_cpus() -> Result<i32, Error> {
let res = unsafe { libc::sysconf(libc::_SC_NPROCESSORS_ONLN) };

if res == -1 {
return Err(Error::last_os_error());
}

res.try_into().map_err(|_| {
// sysconf() reports more than 2^31 processors?!
Error::other(format!("too many processors: {res}"))
})
}

#[cfg(target_os = "illumos")]
/// Bind the current LWP to the specified processor.
pub fn bind_lwp(bind_cpu: processorid_t) -> Result<(), Error> {
extern "C" {
fn processor_bind(
idtype: idtype_t,
id: id_t,
processorid: processorid_t,
obind: *mut processorid_t,
) -> i32;
}

// From `<sys/types.h>`.
const P_MYID: id_t = -1;

let res = unsafe {
processor_bind(
IdType::P_LWPID as i32,
P_MYID,
bind_cpu,
std::ptr::null_mut(),
)
};

if res != 0 {
return Err(Error::last_os_error());
}

Ok(())
}

#[cfg(not(target_os = "illumos"))]
/// On non-illumos targets, we're not actually running a VM. We do need the
/// crate to compile to be nicer for blanket `cargo test` invocations on other
/// platforms. So a no-op function will do.
pub fn bind_lwp(_bind_cpu: processorid_t) -> Result<(), Error> {
Ok(())
}
Loading