Skip to content

Commit fccb897

Browse files
committed
distinguish file backend thread names across backend instances
A `FileBackend` spawns some number of threads to actually handle I/Os once it's eventually attached to a device that supports block operations. Those names were just `worker N`, though, and typically all file backends were created with the same number of workers, so a running Propolis would have `num-file-backend` threads named `worker 0`, `worker 1`, etc. This made finding which threads you care about somewhat more complicated than it needs to be. Instead, include a block backend ID, and plumb that to file backends to use in worker thread names. Now, for example: ``` $ pstack 28651 | grep 'worker 0' ------- thread# 21 / lwp# 21 [file backend 0/worker 0] ------- ------- thread# 29 / lwp# 29 [file backend 1/worker 0] ------- ------- thread# 45 / lwp# 45 [file backend 2/worker 0] ------- ------- thread# 49 / lwp# 49 [file backend 3/worker 0] ------- ------- thread# 53 / lwp# 53 [file backend 4/worker 0] ------- ------- thread# 57 / lwp# 57 [file backend 5/worker 0] ------- ------- thread# 61 / lwp# 61 [file backend 6/worker 0] ------- ------- thread# 65 / lwp# 65 [file backend 7/worker 0] ------- ------- thread# 69 / lwp# 69 [file backend 8/worker 0] ------- ------- thread# 73 / lwp# 73 [file backend 9/worker 0] ------- ------ thread# 77 / lwp# 77 [file backend 10/worker 0] ------- ``` yippee! yay! Having `type DeviceId = u32; type BackendId = u32` made me pretty suspicious we (I) would accidentally pass one where I meant the other and produce confusing probe output, so this comes with a new `define_id` macro to newtype the ID and provide a bit of glue to make it useful. This brings `hw::nvme::DeviceId` along for the change and gives a hopefully-more-obvious home for what these IDs are, are not, and how they relate - in particular, that while a VM consisting only of N NVMe disks, today, will have NVMe controllers 0..N with block devices 0..N and block backends 0..N, where each `i` is related to "the same thing", this is not at all reliable. Today, one could flip an NVMe disk to virtio-block and change NVMe->block mappings, while tomrorow we might make devices concurrently and be totally nondeterministic!
1 parent dfb7f68 commit fccb897

File tree

10 files changed

+144
-28
lines changed

10 files changed

+144
-28
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ newtype_derive = "0.1.6"
138138
newtype-uuid = { version = "1.0.1", features = [ "v4" ] }
139139
owo-colors = "4"
140140
oxide-tokio-rt = "0.1.2"
141+
paste = "1.0.15"
141142
pin-project-lite = "0.2.13"
142143
proc-macro2 = "1.0"
143144
proc-macro-error = "1"

lib/propolis/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ propolis_types.workspace = true
2020
usdt = { workspace = true, features = ["asm"] }
2121
tokio = { workspace = true, features = ["full"] }
2222
futures.workspace = true
23+
paste.workspace = true
2324
pin-project-lite.workspace = true
2425
anyhow.workspace = true
2526
rgb_frame.workspace = true

lib/propolis/src/block/attachment.rs

Lines changed: 24 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,17 @@ use std::future::Future;
2020
use std::marker::PhantomPinned;
2121
use std::num::NonZeroUsize;
2222
use std::pin::Pin;
23-
use std::sync::atomic::{AtomicU32, AtomicUsize, Ordering};
23+
use std::sync::atomic::{AtomicUsize, Ordering};
2424
use std::sync::{Arc, Condvar, Mutex, MutexGuard, Weak};
2525
use std::task::{Context, Poll};
2626

2727
use super::minder::{NoneInFlight, QueueMinder};
2828
use super::{
29-
devq_id, probes, DeviceId, DeviceInfo, DeviceQueue, DeviceRequest,
30-
MetricConsumer, QueueId, WorkerId,
29+
devq_id, probes, BackendId, DeviceId, DeviceInfo, DeviceQueue,
30+
DeviceRequest, MetricConsumer, QueueId, WorkerId,
3131
};
3232
use crate::accessors::MemAccessor;
33+
use crate::block;
3334

3435
use futures::stream::FuturesUnordered;
3536
use futures::Stream;
@@ -39,12 +40,6 @@ use thiserror::Error;
3940
use tokio::sync::futures::Notified;
4041
use tokio::sync::Notify;
4142

42-
/// Static for generating unique block [DeviceId]s within a process
43-
///
44-
/// Numbering across block devices means that a block `DeviceId` and the queue
45-
/// ID in a block attachment are unique across a VM.
46-
static NEXT_DEVICE_ID: AtomicU32 = AtomicU32::new(0);
47-
4843
pub const MAX_WORKERS: NonZeroUsize = NonZeroUsize::new(64).unwrap();
4944

5045
pub type ReqCountHint = Option<NonZeroUsize>;
@@ -381,6 +376,7 @@ impl AttachPair {
381376
return Err(AttachError::BackendAttached);
382377
}
383378

379+
probes::block_attach!(|| (dev.device_id().0, be.backend_id().0));
384380
// TODO: name the accessor child?
385381
let be_acc_mem = dev.0.acc_mem.child(None);
386382
be.0.workers.attach(&be_acc_mem, &dev.0.queues);
@@ -431,6 +427,7 @@ impl AttachPair {
431427
return;
432428
}
433429
}
430+
probes::block_detach!(|| (dev.device_id.0, be.backend_id.0));
434431
*dev_state = None;
435432
*be_state = None;
436433

@@ -458,6 +455,7 @@ struct DeviceAttachInner {
458455
dev_state: Mutex<DeviceState>,
459456
queues: Arc<QueueCollection>,
460457
acc_mem: MemAccessor,
458+
device_id: block::DeviceId,
461459
}
462460

463461
/// Main "attachment point" for a block device.
@@ -467,13 +465,14 @@ impl DeviceAttachment {
467465
/// queues which the device will ever expose is set via `max_queues`. DMA
468466
/// done by attached backend workers will be through the provided `acc_mem`.
469467
pub fn new(max_queues: NonZeroUsize, acc_mem: MemAccessor) -> Self {
470-
let devid = NEXT_DEVICE_ID.fetch_add(1, Ordering::Relaxed);
471-
let queues = QueueCollection::new(max_queues, devid);
468+
let device_id = DeviceId::new();
469+
let queues = QueueCollection::new(max_queues, device_id);
472470
Self(Arc::new(DeviceAttachInner {
473471
att_state: Mutex::new(None),
474472
dev_state: Mutex::new(DeviceState::default()),
475473
queues,
476474
acc_mem,
475+
device_id,
477476
}))
478477
}
479478

@@ -560,8 +559,9 @@ impl DeviceAttachment {
560559
}
561560

562561
pub fn device_id(&self) -> DeviceId {
563-
self.0.queues.devid
562+
self.0.device_id
564563
}
564+
565565
/// Get the maximum queues configured for this device.
566566
pub fn max_queues(&self) -> NonZeroUsize {
567567
NonZeroUsize::new(self.0.queues.queues.len())
@@ -678,9 +678,9 @@ impl WorkerSlot {
678678
};
679679

680680
state.sleeping_on = Some(devid);
681-
probes::block_sleep!(|| { (devid, self.id as u64) });
681+
probes::block_sleep!(|| { (devid.0, self.id as u64) });
682682
state = self.cv.wait(state).unwrap();
683-
probes::block_wake!(|| { (devid, self.id as u64) });
683+
probes::block_wake!(|| { (devid.0, self.id as u64) });
684684
state.sleeping_on = None;
685685
}
686686
}
@@ -720,13 +720,13 @@ impl WorkerSlot {
720720
devid: DeviceId,
721721
) {
722722
state.sleeping_on = Some(devid);
723-
probes::block_sleep!(|| { (devid, self.id as u64) });
723+
probes::block_sleep!(|| { (devid.0, self.id as u64) });
724724
}
725725

726726
fn async_stop_sleep(&self) {
727727
let mut state = self.state.lock().unwrap();
728728
if let Some(devid) = state.sleeping_on.take() {
729-
probes::block_wake!(|| { (devid, self.id as u64) });
729+
probes::block_wake!(|| { (devid.0, self.id as u64) });
730730
}
731731
}
732732

@@ -967,13 +967,13 @@ impl WorkerCollection {
967967
}
968968
fn assignments_refresh(&self, mut state: MutexGuard<WorkerColState>) {
969969
let assign = state.generate_assignments();
970-
let devid = state.device_id.unwrap_or(u32::MAX);
970+
let devid = state.device_id.unwrap_or(block::DeviceId::INVALID);
971971
drop(state);
972972

973973
super::probes::block_strategy!(|| {
974974
let assign_name: &'static str = assign.strategy.get().into();
975975
let generation = assign.strategy.generation() as u64;
976-
(devid, assign_name, generation)
976+
(devid.0, assign_name, generation)
977977
});
978978
for slot in self.workers.iter() {
979979
slot.update_assignment(&assign);
@@ -1155,6 +1155,7 @@ struct BackendAttachInner {
11551155
att_state: Mutex<Option<(AttachPair, MemAccessor)>>,
11561156
workers: Arc<WorkerCollection>,
11571157
info: DeviceInfo,
1158+
backend_id: BackendId,
11581159
}
11591160

11601161
/// Main "attachment point" for a block backend.
@@ -1165,6 +1166,7 @@ impl BackendAttachment {
11651166
att_state: Mutex::new(None),
11661167
workers: WorkerCollection::new(max_workers),
11671168
info,
1169+
backend_id: BackendId::new(),
11681170
}))
11691171
}
11701172
/// Get an (inactive) [context](InactiveWorkerCtx) for a given [WorkerId].
@@ -1182,6 +1184,10 @@ impl BackendAttachment {
11821184
self.0.info
11831185
}
11841186

1187+
pub fn backend_id(&self) -> BackendId {
1188+
self.0.backend_id
1189+
}
1190+
11851191
/// Permit workers to pull requests from the attached device (if any) for
11861192
/// processing.
11871193
pub fn start(&self) {

lib/propolis/src/block/file.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,13 +209,14 @@ impl FileBackend {
209209
}))
210210
}
211211
fn spawn_workers(&self) -> std::io::Result<()> {
212+
let backend_id = self.block_attach.backend_id().0;
212213
let spawn_results = (0..self.worker_count.get())
213214
.map(|n| {
214215
let shared_state = self.state.clone();
215216
let wctx = self.block_attach.worker(n as WorkerId);
216217

217218
std::thread::Builder::new()
218-
.name(format!("file worker {n}"))
219+
.name(format!("file backend {backend_id}/worker {n}"))
219220
.spawn(move || {
220221
let wctx = wctx
221222
.activate_sync()

lib/propolis/src/block/id.rs

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
//! Runtime identifiers for block devices and their backends.
6+
//!
7+
//! Devices that support block backends and the block backends themselves are
8+
//! independent identifiers, and only become related when an item of each type
9+
//! is connected via [`block::attach`].
10+
//!
11+
//! Devices in particular may have multiple identifiers, some from this module
12+
//! and some from others. As one example, [`propolis::hw::nvme::NvmeCtrl`] has a
13+
//! `device_id` distinguishing *instances of the NVMe controller* across a VM,
14+
//! while the `PciNvme` which has an NVMe controller also has `block_attach`
15+
//! with a `device_id` distinguishing *instances of block devices* across a VM.
16+
//!
17+
//! ## Limitations
18+
//!
19+
//! A consumer of `propolis` is free to construct devices supporting block
20+
//! backends in any order, and may happen to construct block backends in any
21+
//! different arbitrary order. Attaching the two kinds of item together is also
22+
//! up to the consumer of `propolis`, and there is no requirement that a
23+
//! particular block backend must be connected to a particular device.
24+
//!
25+
//! Consequently, these identifiers are not stable for use in migration of a VM,
26+
//! and must not be used in a way visible to a VM. They are unsuitable for
27+
//! emulated device serial numbers, model numbers, etc. The destination
28+
//! `propolis` may construct the same set of devices in a different order,
29+
//! resulting in different run-time identifiers for a device at the same
30+
//! location.
31+
32+
use crate::util::id::define_id;
33+
34+
define_id! {
35+
/// Numbering across block devices means that a block `DeviceId` and the
36+
/// queue ID in a block attachment are unique across a VM.
37+
#[derive(Copy, Clone)]
38+
pub struct DeviceId(pub(crate) u32);
39+
}
40+
41+
define_id! {
42+
/// Block backends are numbered distinctly across a VM, but may not
43+
/// be created in the same order as devices. The `block_attach` probe fires
44+
/// when a `DeviceId` and `BackendId` become associated.
45+
#[derive(Copy, Clone)]
46+
pub struct BackendId(pub(crate) u32);
47+
}

lib/propolis/src/block/mod.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ use std::time::Duration;
99
use crate::common::*;
1010
use crate::vmm::{MemCtx, SubMapping};
1111

12+
mod id;
13+
pub use id::{BackendId, DeviceId};
14+
1215
mod file;
1316
pub use file::FileBackend;
1417

@@ -41,6 +44,9 @@ pub const DEFAULT_BLOCK_SIZE: u32 = 512;
4144

4245
#[usdt::provider(provider = "propolis")]
4346
mod probes {
47+
fn block_attach(dev_id: u32, backend_id: u32) {}
48+
fn block_detach(dev_id: u32, backend_id: u32) {}
49+
4450
fn block_begin_read(devq_id: u64, req_id: u64, offset: u64, len: u64) {}
4551
fn block_begin_write(devq_id: u64, req_id: u64, offset: u64, len: u64) {}
4652
fn block_begin_flush(devq_id: u64, req_id: u64) {}
@@ -181,12 +187,11 @@ impl From<QueueId> for u16 {
181187
}
182188
}
183189

184-
pub type DeviceId = u32;
185190
pub type WorkerId = usize;
186191

187192
/// Combine device and queue IDs into single u64 for probes
188193
pub(crate) fn devq_id(dev: DeviceId, queue: QueueId) -> u64 {
189-
((dev as u64) << 8) | (queue.0 as u64)
194+
((dev.0 as u64) << 8) | (queue.0 as u64)
190195
}
191196

192197
/// Block device operation request

lib/propolis/src/hw/nvme/mod.rs

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
use std::convert::TryInto;
66
use std::mem::size_of;
77
use std::num::NonZeroUsize;
8-
use std::sync::atomic::AtomicU32;
98
use std::sync::atomic::{AtomicBool, Ordering};
109
use std::sync::{Arc, Mutex, MutexGuard, Weak};
1110

@@ -16,6 +15,7 @@ use crate::hw::ids::pci::{PROPOLIS_NVME_DEV_ID, VENDOR_OXIDE};
1615
use crate::hw::ids::OXIDE_OUI;
1716
use crate::hw::pci;
1817
use crate::migrate::*;
18+
use crate::util::id::define_id;
1919
use crate::util::regmap::RegMap;
2020
use crate::vmm::MemAccessed;
2121

@@ -32,10 +32,16 @@ mod requests;
3232
use bits::*;
3333
use queue::{CompQueue, QueueId, SubQueue};
3434

35-
/// Static for generating unique NVMe device identifiers across a VM.
36-
static NEXT_DEVICE_ID: AtomicU32 = AtomicU32::new(0);
37-
38-
type DeviceId = u32;
35+
define_id! {
36+
/// Identifier for which NVMe controller in the VM an operation is happening
37+
/// on.
38+
///
39+
/// This is mostly useful for NVMe-related DTrace probes, where otherwise a
40+
/// queue number or command ID may be ambiguous across distinct NVMe
41+
/// controllers in a VM.
42+
#[derive(Copy, Clone)]
43+
pub struct DeviceId(u32);
44+
}
3945

4046
#[usdt::provider(provider = "propolis")]
4147
mod probes {
@@ -58,7 +64,7 @@ pub(crate) fn devq_id(dev: DeviceId, queue: QueueId) -> u64 {
5864
static_assertions::const_assert!(QueueId::MAX <= u16::MAX);
5965
}
6066

61-
((dev as u64) << 16) | (queue as u64)
67+
((dev.0 as u64) << 16) | (queue as u64)
6268
}
6369

6470
/// The max number of MSI-X interrupts we support
@@ -885,7 +891,7 @@ impl PciNvme {
885891
let csts = Status(0);
886892

887893
let state = NvmeCtrl {
888-
device_id: NEXT_DEVICE_ID.fetch_add(1, Ordering::Relaxed),
894+
device_id: DeviceId::new(),
889895
ctrl: CtrlState { cap, cc, csts, ..Default::default() },
890896
doorbell_buf: None,
891897
msix_hdl: None,

lib/propolis/src/util/id.rs

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
// Define a newtype for a kind of run-time identifier. Exact semantics for a
6+
// given identifier depend on what uses it, but they are expected to be used
7+
// only in places internal to a VM's instantiation. DTrace probes for VM
8+
// operations, device state that is expected to be lost across migration, that
9+
// kind of thing.
10+
//
11+
// As an example of what not to do, these identifiers must not be used as field
12+
// on Oximeter metrics: because the identifier for some item in the VM may
13+
// change across instantiations, the field may become meaningless across
14+
// stop/start or migration. In such cases a more persistent identifier (PCI BDF,
15+
// NIC MAC, vCPU ACPI ID, etc) must be used instead.
16+
//
17+
// If an item needs stable identifiers, this is not the tool to use.
18+
//
19+
// This macro takes syntax matching the newtype definition primarily so that
20+
// grepping for the newtype like `struct DeviceId` finds corresponding macro
21+
// invocations.
22+
macro_rules! define_id {
23+
{
24+
$(#[$meta_items:meta])*
25+
pub struct $id_name:ident($visibility:vis u32);
26+
} => {
27+
::paste::paste! {
28+
$(#[$meta_items])*
29+
pub struct $id_name($visibility u32);
30+
31+
impl $id_name {
32+
pub const INVALID: $id_name = $id_name(u32::MAX);
33+
pub fn new() -> Self {
34+
static [<_NEXT_ $id_name:upper>]: ::std::sync::atomic::AtomicU32 =
35+
::std::sync::atomic::AtomicU32::new(0);
36+
37+
let id = [<_NEXT_ $id_name:upper>].fetch_add(
38+
1,
39+
::std::sync::atomic::Ordering::Relaxed
40+
);
41+
$id_name(id)
42+
}
43+
}
44+
}
45+
}
46+
}
47+
pub(crate) use define_id;

lib/propolis/src/util/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
44

55
pub mod aspace;
6+
pub mod id;
67
pub mod regmap;
78

89
mod ioctl {

0 commit comments

Comments
 (0)