From f39a608b2e0eb74321ecad3c5da05c3a9a36ff7d Mon Sep 17 00:00:00 2001 From: iximeow Date: Fri, 19 Dec 2025 22:22:58 +0000 Subject: [PATCH 1/4] distinguish probes across NVMe devices NVMe commands are unique in a device across queue ID (`qid` in various parts of Propolis) and command ID (`cid` in various parts of Propolis), but are ambiguous when looking at an entire VM: two different NVMe devices will have a "queue 1", and their distinct "queue 1" may also have "command 1234" submitted at the same time. Pick a device ID for NVMe controllers, and plumb that through to distinguish queues similar to how we distinguish queues for block devices: a `devq_id` construction that makes a device and queue unique across the VM. In the NVMe case, a `devq_id` is not uniquely identifying across the life of a VM, as the queues can be destroyed, recreated, the controller reset, etc. But absent such an administrative operation, this helps untangle what's happening across a VM with many disks. Note that block attachment IDs and NVMe IDs are similar in name, and probably similar in practice, but are tracked distinctly. This is similar to the difference between block queue IDs and NVMe queue IDs: the architecture is similar, but the mapping between them is more arbitrary and may change as an implementation detail of Propolis. 
--- Cargo.lock | 1 + lib/propolis/Cargo.toml | 1 + lib/propolis/src/block/attachment.rs | 5 +- lib/propolis/src/hw/nvme/admin.rs | 8 +- lib/propolis/src/hw/nvme/mod.rs | 69 ++++++++++++++-- lib/propolis/src/hw/nvme/queue.rs | 116 +++++++++++++++++---------- lib/propolis/src/hw/nvme/requests.rs | 45 +++++++---- 7 files changed, 177 insertions(+), 68 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4cc5c378c..303213fe4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5350,6 +5350,7 @@ dependencies = [ "slog-async", "slog-term", "softnpu", + "static_assertions", "strum 0.26.3", "tempfile", "thiserror 1.0.64", diff --git a/lib/propolis/Cargo.toml b/lib/propolis/Cargo.toml index 5ac6f0e96..cf863ca8c 100644 --- a/lib/propolis/Cargo.toml +++ b/lib/propolis/Cargo.toml @@ -45,6 +45,7 @@ ispf = { workspace = true, optional = true } rand = { workspace = true, optional = true } softnpu = { workspace = true, optional = true } dlpi = { workspace = true, optional = true } +static_assertions = "1.1.0" [dev-dependencies] crossbeam-channel.workspace = true diff --git a/lib/propolis/src/block/attachment.rs b/lib/propolis/src/block/attachment.rs index 6bb00fa06..35d6daeee 100644 --- a/lib/propolis/src/block/attachment.rs +++ b/lib/propolis/src/block/attachment.rs @@ -39,7 +39,10 @@ use thiserror::Error; use tokio::sync::futures::Notified; use tokio::sync::Notify; -/// Static for generating unique block [DeviceId]s with a process +/// Static for generating unique block [DeviceId]s within a process +/// +/// Numbering across block devices means that a block `DeviceId` and the queue +/// ID in a block attachment are unique across a VM. 
static NEXT_DEVICE_ID: AtomicU32 = AtomicU32::new(0); pub const MAX_WORKERS: NonZeroUsize = NonZeroUsize::new(64).unwrap(); diff --git a/lib/propolis/src/hw/nvme/admin.rs b/lib/propolis/src/hw/nvme/admin.rs index 63607707b..df8746a62 100644 --- a/lib/propolis/src/hw/nvme/admin.rs +++ b/lib/propolis/src/hw/nvme/admin.rs @@ -5,6 +5,7 @@ use std::mem::size_of; use crate::common::{GuestAddr, GuestRegion, PAGE_SIZE}; +use crate::hw::nvme; use crate::vmm::MemCtx; use super::bits::*; @@ -17,7 +18,7 @@ use super::{ #[usdt::provider(provider = "propolis")] mod probes { - fn nvme_abort(cid: u16, sqid: u16) {} + fn nvme_abort(cid: u16, devsq_id: u64) {} } impl NvmeCtrl { @@ -25,7 +26,8 @@ impl NvmeCtrl { /// /// See NVMe 1.0e Section 5.1 Abort command pub(super) fn acmd_abort(&self, cmd: &cmds::AbortCmd) -> cmds::Completion { - probes::nvme_abort!(|| (cmd.cid, cmd.sqid)); + let devsq_id = nvme::devq_id(self.device_id, cmd.sqid); + probes::nvme_abort!(|| (cmd.cid, devsq_id)); // Verify the SQ in question currently exists let sqid = cmd.sqid as usize; @@ -74,6 +76,7 @@ impl NvmeCtrl { match self.create_cq( super::queue::CreateParams { id: cmd.qid, + device_id: self.device_id, base: GuestAddr(cmd.prp), size: cmd.qsize, }, @@ -119,6 +122,7 @@ impl NvmeCtrl { match self.create_sq( super::queue::CreateParams { id: cmd.qid, + device_id: self.device_id, base: GuestAddr(cmd.prp), size: cmd.qsize, }, diff --git a/lib/propolis/src/hw/nvme/mod.rs b/lib/propolis/src/hw/nvme/mod.rs index d7e61b923..37456fa00 100644 --- a/lib/propolis/src/hw/nvme/mod.rs +++ b/lib/propolis/src/hw/nvme/mod.rs @@ -5,6 +5,7 @@ use std::convert::TryInto; use std::mem::size_of; use std::num::NonZeroUsize; +use std::sync::atomic::AtomicU32; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex, MutexGuard, Weak}; @@ -31,13 +32,40 @@ mod requests; use bits::*; use queue::{CompQueue, QueueId, SubQueue}; +/// Static for generating unique NVMe device identifiers within a process. 
+/// +/// Numbering across NVMe devices means that the combination of NVMe +/// `device_id` and queue ID identify either an NVMe submission or completion +/// queue across a VM. +/// +/// Guests typically will configure submission queue N to complete into +/// completion queue N, but this is not a mandated device configuration. +static NEXT_DEVICE_ID: AtomicU32 = AtomicU32::new(0); + +type DeviceId = u32; + #[usdt::provider(provider = "propolis")] mod probes { - fn nvme_doorbell(off: u64, qid: u16, is_cq: u8, val: u16) {} + fn nvme_doorbell(off: u64, devq_id: u64, is_cq: u8, val: u16) {} fn nvme_doorbell_admin_cq(val: u16) {} fn nvme_doorbell_admin_sq(val: u16) {} fn nvme_admin_cmd(opcode: u8, prp1: u64, prp2: u64) {} - fn nvme_block_notify(sqid: u16, block_qid: u16, occupied_hint: u16) {} + fn nvme_block_notify(devsq_id: u64, block_devqid: u64, occupied_hint: u16) { + } +} + +/// Combine an NVMe device and queue ID into a single u64 for probes +pub(crate) fn devq_id(dev: DeviceId, queue: QueueId) -> u64 { + // We'll use the low 16 bits for the queue ID. Assert at compile time that + // queue IDs cannot go above that. Clippy helpfully asserts this is an + // absurd comparison, so silence that. If you see a rustc error here you + // must have changed the type of QueueId such this is no longer absurd! + #[allow(clippy::absurd_extreme_comparisons)] + { + static_assertions::const_assert!(QueueId::MAX <= u16::MAX); + } + + ((dev as u64) << 16) | (queue as u64) } /// The max number of MSI-X interrupts we support @@ -154,6 +182,10 @@ const MAX_NUM_IO_QUEUES: usize = MAX_NUM_QUEUES - 1; /// NVMe Controller struct NvmeCtrl { + /// A distinguishing identifier for this NVMe controller across the VM. + /// Useful mostly to distinguish queues and commands as seen in probes. 
+ device_id: DeviceId, + /// Internal NVMe Controller state ctrl: CtrlState, @@ -185,6 +217,7 @@ impl NvmeCtrl { self.create_cq( queue::CreateParams { id: queue::ADMIN_QUEUE_ID, + device_id: self.device_id, base: GuestAddr(self.ctrl.admin_cq_base), // Convert from 0's based size: u32::from(self.ctrl.aqa.acqs()) + 1, @@ -196,6 +229,7 @@ impl NvmeCtrl { self.create_sq( queue::CreateParams { id: queue::ADMIN_QUEUE_ID, + device_id: self.device_id, base: GuestAddr(self.ctrl.admin_sq_base), // Convert from 0's based size: u32::from(self.ctrl.aqa.asqs()) + 1, @@ -589,6 +623,7 @@ impl NvmeCtrl { self.create_cq( queue::CreateParams { id: cqi.id, + device_id: self.device_id, base: GuestAddr(cqi.base), size: cqi.size, }, @@ -611,6 +646,7 @@ impl NvmeCtrl { .create_sq( queue::CreateParams { id: sqi.id, + device_id: self.device_id, base: GuestAddr(sqi.base), size: sqi.size, }, @@ -742,6 +778,11 @@ pub struct PciNvme { /// accesses without stacking them up behind one central lock. is_enabled: AtomicBool, + /// Duplicate of the controller NVMe device ID, but not requiring locking + /// [NvmeCtrl] to read. This is used to provide additional context in + /// NVMe-related probes. + device_id: DeviceId, + /// Access to NVMe Submission and Completion queues. /// /// These are protected with per-slot (queue ID) locks, so actions taken on @@ -849,6 +890,7 @@ impl PciNvme { let csts = Status(0); let state = NvmeCtrl { + device_id: NEXT_DEVICE_ID.fetch_add(1, Ordering::Relaxed), ctrl: CtrlState { cap, cc, csts, ..Default::default() }, doorbell_buf: None, msix_hdl: None, @@ -879,9 +921,13 @@ impl PciNvme { } })); + // Peel out device ID before we move it into the Mutex below. 
+ let device_id = state.device_id; + PciNvme { state: Mutex::new(state), is_enabled: AtomicBool::new(false), + device_id, pci_state, queues: NvmeQueues::default(), block_attach, @@ -1110,9 +1156,12 @@ impl PciNvme { // 32-bit register but ignore reserved top 16-bits let val = wo.read_u32() as u16; + // Mix in the device ID for probe purposes + let devq_id = devq_id(self.device_id, qid); + probes::nvme_doorbell!(|| ( off as u64, - qid, + devq_id, u8::from(is_cq), val )); @@ -1188,9 +1237,12 @@ impl PciNvme { }) { let block_qid = queue::sqid_to_block_qid(sqid); + let devsq_id = devq_id(self.device_id, sqid); + let block_devqid = + block::devq_id(self.block_attach.device_id(), block_qid); probes::nvme_block_notify!(|| ( - sqid, - u16::from(block_qid), + devsq_id, + block_devqid, num_occupied )); self.block_attach.notify( @@ -1207,14 +1259,17 @@ impl PciNvme { let sq = guard.as_ref().unwrap(); let num_occupied = sq.notify_tail(val)?; + let devsq_id = sq.devq_id(); // Notification to block layer cannot be issued with SQ lock held drop(guard); assert_ne!(qid, queue::ADMIN_QUEUE_ID); let block_qid = queue::sqid_to_block_qid(qid); + let block_devqid = + block::devq_id(self.block_attach.device_id(), block_qid); probes::nvme_block_notify!(|| ( - qid, - u16::from(block_qid), + devsq_id, + block_devqid, num_occupied )); self.block_attach diff --git a/lib/propolis/src/hw/nvme/queue.rs b/lib/propolis/src/hw/nvme/queue.rs index 064950bf6..7aa4e97ab 100644 --- a/lib/propolis/src/hw/nvme/queue.rs +++ b/lib/propolis/src/hw/nvme/queue.rs @@ -12,6 +12,7 @@ use super::cmds::Completion; use crate::accessors::MemAccessor; use crate::block; use crate::common::*; +use crate::hw::nvme::DeviceId; use crate::hw::pci; use crate::migrate::MigrateStateError; use crate::vmm::MemCtx; @@ -20,13 +21,13 @@ use thiserror::Error; #[usdt::provider(provider = "propolis")] mod probes { - fn nvme_cqe(qid: u16, idx: u16, phase: u8) {} - fn nvme_sq_dbbuf_read(qid: u16, val: u32, tail: u16) {} - fn 
nvme_sq_dbbuf_write(qid: u16, head: u16) {} - fn nvme_sq_dbbuf_write_shadow(qid: u16, head: u16) {} - fn nvme_cq_dbbuf_read(qid: u16, val: u32, tail: u16) {} - fn nvme_cq_dbbuf_write(qid: u16, head: u16) {} - fn nvme_cq_dbbuf_write_shadow(qid: u16, head: u16) {} + fn nvme_cqe(devcq_id: u64, idx: u16, phase: u8) {} + fn nvme_sq_dbbuf_read(devsq_id: u64, val: u32, tail: u16) {} + fn nvme_sq_dbbuf_write(devsq_id: u64, head: u16) {} + fn nvme_sq_dbbuf_write_shadow(devsq_id: u64, head: u16) {} + fn nvme_cq_dbbuf_read(devcq_id: u64, val: u32, tail: u16) {} + fn nvme_cq_dbbuf_write(devcq_id: u64, head: u16) {} + fn nvme_cq_dbbuf_write_shadow(devcq_id: u64, head: u16) {} } /// Each queue is identified by a 16-bit ID. @@ -330,9 +331,9 @@ impl QueueGuard<'_, CompQueueState> { } /// Write update to the EventIdx in Doorbell Buffer page, if possible - fn db_buf_write(&mut self, qid: QueueId, mem: &MemCtx) { + fn db_buf_write(&mut self, devq_id: u64, mem: &MemCtx) { if let Some(db_buf) = self.state.db_buf { - probes::nvme_cq_dbbuf_write!(|| (qid, self.state.tail)); + probes::nvme_cq_dbbuf_write!(|| (devq_id, self.state.tail)); // Keep EventIdx populated with the position of the CQ tail. We are // not especially concerned with receiving timely (doorbell) updates // from the guest about where the head pointer sits. We keep our @@ -352,21 +353,21 @@ impl QueueGuard<'_, CompQueueState> { /// We would expect the guest driver to keep this value in a valid state per /// the specification, but qemu notes that certain consumers fail to do so /// on the admin queue. We follow their lead to avoid issues. 
- fn db_buf_write_shadow(&mut self, qid: QueueId, mem: &MemCtx) { + fn db_buf_write_shadow(&mut self, devq_id: u64, mem: &MemCtx) { if let Some(db_buf) = self.state.db_buf { - probes::nvme_cq_dbbuf_write_shadow!(|| (qid, self.state.head)); + probes::nvme_cq_dbbuf_write_shadow!(|| (devq_id, self.state.head)); fence(Ordering::Release); mem.write(db_buf.shadow, &self.state.head); } } /// Read update from the Shadow in Doorbell Buffer page, if possible - fn db_buf_read(&mut self, qid: QueueId, mem: &MemCtx) { + fn db_buf_read(&mut self, devq_id: u64, mem: &MemCtx) { if let Some(db_buf) = self.state.db_buf { if let Some(new_head) = mem.read::(db_buf.shadow) { let new_head = *new_head; probes::nvme_cq_dbbuf_read!(|| ( - qid, + devq_id, new_head, self.state.head )); @@ -450,9 +451,9 @@ impl QueueGuard<'_, SubQueueState> { } /// Write update to the EventIdx in Doorbell Buffer page, if possible - fn db_buf_write(&mut self, qid: QueueId, mem: &MemCtx) { + fn db_buf_write(&mut self, devq_id: u64, mem: &MemCtx) { if let Some(db_buf) = self.state.db_buf { - probes::nvme_sq_dbbuf_write!(|| (qid, self.state.head)); + probes::nvme_sq_dbbuf_write!(|| (devq_id, self.state.head)); // Keep EventIdx populated with the position of the SQ head. As // long as there are entries available between the head and tail, we // do not want the guest taking exits for ultimately redundant @@ -470,21 +471,21 @@ impl QueueGuard<'_, SubQueueState> { /// /// See [QueueGuard::db_buf_write_shadow()] for why we would /// write to a "guest-owned" page. 
- fn db_buf_write_shadow(&mut self, qid: QueueId, mem: &MemCtx) { + fn db_buf_write_shadow(&mut self, devq_id: u64, mem: &MemCtx) { if let Some(db_buf) = self.state.db_buf { - probes::nvme_sq_dbbuf_write_shadow!(|| (qid, self.state.tail)); + probes::nvme_sq_dbbuf_write_shadow!(|| (devq_id, self.state.tail)); fence(Ordering::Release); mem.write(db_buf.shadow, &self.state.tail); } } /// Read update from the Shadow in Doorbell Buffer page, if possible - fn db_buf_read(&mut self, qid: QueueId, mem: &MemCtx) { + fn db_buf_read(&mut self, devq_id: u64, mem: &MemCtx) { if let Some(db_buf) = self.state.db_buf { if let Some(new_tail) = mem.read::(db_buf.shadow) { let new_tail = *new_tail; probes::nvme_sq_dbbuf_read!(|| ( - qid, + devq_id, new_tail, self.state.head )); @@ -527,6 +528,9 @@ pub enum QueueUpdateError { #[derive(Copy, Clone)] pub struct CreateParams { pub id: QueueId, + // Not strictly necessary for submission or completion queues, but helpful + // to disambiguate the queue in probes. + pub device_id: DeviceId, pub base: GuestAddr, pub size: u32, } @@ -536,6 +540,10 @@ pub struct SubQueue { /// The ID of this Submission Queue. id: QueueId, + /// The ID of the device this is a submission queue for. Not semantically + /// interesting for submission queues, but useful context in probes. + device_id: DeviceId, + /// The corresponding Completion Queue. 
cq: Arc, @@ -566,10 +574,11 @@ impl SubQueue { cq: Arc, acc_mem: MemAccessor, ) -> Result, QueueCreateErr> { - let CreateParams { id, base, size } = params; + let CreateParams { id, device_id, base, size } = params; validate(id == ADMIN_QUEUE_ID, base, size)?; let sq = Arc::new(Self { id, + device_id, cq, state: SubQueueState::new(size, acc_mem), cur_head: AtomicU16::new(0), @@ -597,7 +606,7 @@ impl SubQueue { state.push_tail_to(idx)?; if self.id == ADMIN_QUEUE_ID { if let Some(mem) = state.acc_mem.access() { - state.db_buf_write_shadow(self.id, &mem); + state.db_buf_write_shadow(self.devq_id(), &mem); } } @@ -620,14 +629,15 @@ impl SubQueue { // Check for last-minute updates to the tail via any configured doorbell // page, prior to attempting the pop itself. - state.db_buf_read(self.id, &mem); + state.db_buf_read(self.devq_id(), &mem); if let Some(idx) = state.pop_head(&self.cur_head) { let addr = self.base.offset::(idx as usize); if let Some(ent) = mem.read::(addr) { - state.db_buf_write(self.id, &mem); - state.db_buf_read(self.id, &mem); + let devq_id = self.devq_id(); + state.db_buf_write(devq_id, &mem); + state.db_buf_read(devq_id, &mem); return Some((ent, permit.promote(ent.cid()), idx)); } // TODO: set error state on queue/ctrl if we cannot read entry @@ -678,6 +688,11 @@ impl SubQueue { cqe.sqhd = self.cur_head.load(Ordering::Acquire); } + /// Return a VM-unique identifier for this submission queue + pub(crate) fn devq_id(&self) -> u64 { + super::devq_id(self.device_id, self.id) + } + pub(super) fn export(&self) -> migrate::NvmeSubQueueV1 { let inner = self.state.inner.lock().unwrap(); migrate::NvmeSubQueueV1 { @@ -713,6 +728,10 @@ pub struct CompQueue { /// The ID of this Completion Queue. id: QueueId, + /// The ID of the device this is a completion queue for. Not semantically + /// interesting for completion queues, but useful context in probes. 
+ device_id: DeviceId, + /// The Interrupt Vector used to signal to the host (VM) upon pushing /// entries onto the Completion Queue. iv: u16, @@ -739,10 +758,11 @@ impl CompQueue { hdl: pci::MsixHdl, acc_mem: MemAccessor, ) -> Result { - let CreateParams { id, base, size } = params; + let CreateParams { id, device_id, base, size } = params; validate(id == ADMIN_QUEUE_ID, base, size)?; Ok(Self { id, + device_id, iv, state: CompQueueState::new(size, acc_mem), base, @@ -757,7 +777,7 @@ impl CompQueue { state.pop_head_to(idx)?; if self.id == ADMIN_QUEUE_ID { if let Some(mem) = state.acc_mem.access() { - state.db_buf_write_shadow(self.id, &mem) + state.db_buf_write_shadow(self.devq_id(), &mem) } } Ok(()) @@ -804,7 +824,7 @@ impl CompQueue { // If the CQ appears full, but the db_buf shadow is configured, do a // last-minute check to see if entries have been consumed/freed // without a doorbell call. - state.db_buf_read(self.id, mem); + state.db_buf_read(self.devq_id(), mem); } if state.take_avail(sq) { Some(ProtoPermit::new(self, sq)) @@ -824,7 +844,7 @@ impl CompQueue { .push_tail() .expect("CQ should have available space for assigned permit"); - probes::nvme_cqe!(|| (self.id, idx, u8::from(phase))); + probes::nvme_cqe!(|| (self.devq_id(), idx, u8::from(phase))); // The only definite indicator that a CQE has become valid is the phase // bit being toggled. Since the interface for writing to guest memory @@ -844,8 +864,9 @@ impl CompQueue { cqe.set_phase(phase); mem.write(addr, &cqe); - state.db_buf_read(self.id, &mem); - state.db_buf_write(self.id, &mem); + let devq_id = self.devq_id(); + state.db_buf_read(devq_id, &mem); + state.db_buf_write(devq_id, &mem); } else { // TODO: mark the queue/controller in error state? 
} @@ -870,6 +891,11 @@ impl CompQueue { } } + /// Return a VM-unique identifier for this completion queue + pub(crate) fn devq_id(&self) -> u64 { + super::devq_id(self.device_id, self.id) + } + pub(super) fn export(&self) -> migrate::NvmeCompQueueV1 { let guard = self.state.lock(); migrate::NvmeCompQueueV1 { @@ -918,13 +944,18 @@ pub struct ProtoPermit { /// The Submission Queue for which this entry is reserved. sq: Weak, - /// The Submission Queue's ID (stored separately to avoid going through - /// the Weak ref). - sqid: u16, + /// The ID for the device and Submission Queue the command associated with + /// this permit was submitted from. Stored separately to avoid going through + /// the Weak ref. + devsq_id: u64, } impl ProtoPermit { fn new(cq: &Arc, sq: &Arc) -> Self { - Self { cq: Arc::downgrade(cq), sq: Arc::downgrade(sq), sqid: sq.id } + Self { + cq: Arc::downgrade(cq), + sq: Arc::downgrade(sq), + devsq_id: sq.devq_id(), + } } /// Promote a "proto" permit to a [Permit]. @@ -936,7 +967,7 @@ impl ProtoPermit { Permit { cq: self.cq, sq: self.sq, - sqid: self.sqid, + devsq_id: self.devsq_id, cid, _nodrop: NoDropPermit, } @@ -963,8 +994,9 @@ pub struct Permit { /// The Submission Queue for which this entry is reserved. sq: Weak, - /// The Submission Queue ID the request came in on. - sqid: u16, + /// The Submission Queue and device ID the request came in on. Retained as a + /// consistent source identifier for probes. + devsq_id: u64, /// ID of command holding this permit. Used to populate `cid` field in /// Completion Queue Entry. @@ -1009,10 +1041,10 @@ impl Permit { self.cid } - /// Get the ID of the Submission Queue the command associated with this - /// permit was submitted on. - pub fn sqid(&self) -> u16 { - self.sqid + /// Get the ID of the device and Submission Queue the command associated + /// with this permit was submitted on. 
+ pub fn devsq_id(&self) -> u64 { + self.devsq_id } /// A device reset may cause us to abandon some in-flight I/O, dropping the @@ -1046,7 +1078,7 @@ impl Debug for Permit { f.debug_struct("Permit") .field("sq", &self.sq) .field("cq", &self.cq) - .field("sqid", &self.sqid) + .field("devsq_id", &self.devsq_id) .field("cid", &self.cid) .finish() } diff --git a/lib/propolis/src/hw/nvme/requests.rs b/lib/propolis/src/hw/nvme/requests.rs index f755a3549..48a5aaffa 100644 --- a/lib/propolis/src/hw/nvme/requests.rs +++ b/lib/propolis/src/hw/nvme/requests.rs @@ -12,17 +12,18 @@ use crate::hw::nvme::{bits, cmds::Completion, queue::SubQueue}; #[usdt::provider(provider = "propolis")] mod probes { - fn nvme_read_enqueue(qid: u16, idx: u16, cid: u16, off: u64, sz: u64) {} - fn nvme_read_complete(qid: u16, cid: u16, res: u8) {} + fn nvme_read_enqueue(devq_id: u64, idx: u16, cid: u16, off: u64, sz: u64) {} + fn nvme_read_complete(devq_id: u64, cid: u16, res: u8) {} - fn nvme_write_enqueue(qid: u16, idx: u16, cid: u16, off: u64, sz: u64) {} - fn nvme_write_complete(qid: u16, cid: u16, res: u8) {} + fn nvme_write_enqueue(devq_id: u64, idx: u16, cid: u16, off: u64, sz: u64) { + } + fn nvme_write_complete(devq_id: u64, cid: u16, res: u8) {} - fn nvme_flush_enqueue(qid: u16, idx: u16, cid: u16) {} - fn nvme_flush_complete(qid: u16, cid: u16, res: u8) {} + fn nvme_flush_enqueue(devq_id: u64, idx: u16, cid: u16) {} + fn nvme_flush_complete(devq_id: u64, cid: u16, res: u8) {} fn nvme_raw_cmd( - qid: u16, + devq_id: u64, cdw0nsid: u64, prp1: u64, prp2: u64, @@ -57,10 +58,10 @@ impl block::DeviceQueue for NvmeBlockQueue { let params = self.sq.params(); while let Some((sub, permit, idx)) = sq.pop() { - let qid = sq.id(); + let devq_id = sq.devq_id(); probes::nvme_raw_cmd!(|| { ( - qid, + devq_id, u64::from(sub.cdw0) | (u64::from(sub.nsid) << 32), sub.prp1, sub.prp2, @@ -83,7 +84,13 @@ impl block::DeviceQueue for NvmeBlockQueue { continue; } - probes::nvme_write_enqueue!(|| (qid, idx, cid, 
off, size)); + probes::nvme_write_enqueue!(|| ( + sq.devq_id(), + idx, + cid, + off, + size + )); let bufs = cmd.data(size, &mem).collect(); let req = @@ -102,7 +109,13 @@ impl block::DeviceQueue for NvmeBlockQueue { continue; } - probes::nvme_read_enqueue!(|| (qid, idx, cid, off, size)); + probes::nvme_read_enqueue!(|| ( + sq.devq_id(), + idx, + cid, + off, + size + )); let bufs = cmd.data(size, &mem).collect(); let req = @@ -110,7 +123,7 @@ impl block::DeviceQueue for NvmeBlockQueue { return Some((req, permit, None)); } Ok(NvmCmd::Flush) => { - probes::nvme_flush_enqueue!(|| (qid, idx, cid)); + probes::nvme_flush_enqueue!(|| (sq.devq_id(), idx, cid)); let req = Request::new_flush(); return Some((req, permit, None)); } @@ -133,18 +146,18 @@ impl block::DeviceQueue for NvmeBlockQueue { result: block::Result, permit: Self::Token, ) { - let qid = permit.sqid(); + let devsq_id = permit.devsq_id(); let cid = permit.cid(); let resnum = result as u8; match op { Operation::Read(..) => { - probes::nvme_read_complete!(|| (qid, cid, resnum)); + probes::nvme_read_complete!(|| (devsq_id, cid, resnum)); } Operation::Write(..) => { - probes::nvme_write_complete!(|| (qid, cid, resnum)); + probes::nvme_write_complete!(|| (devsq_id, cid, resnum)); } Operation::Flush => { - probes::nvme_flush_complete!(|| (qid, cid, resnum)); + probes::nvme_flush_complete!(|| (devsq_id, cid, resnum)); } Operation::Discard(..) 
=> { unreachable!("discard not supported in NVMe for now"); From 243fffa4cd00835a5cfd5ab4656f88be6e29ca8e Mon Sep 17 00:00:00 2001 From: iximeow Date: Fri, 19 Dec 2025 23:41:27 +0000 Subject: [PATCH 2/4] tests should work --- lib/propolis/src/hw/nvme/queue.rs | 40 ++++++++++++++++--------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/lib/propolis/src/hw/nvme/queue.rs b/lib/propolis/src/hw/nvme/queue.rs index 7aa4e97ab..07332b539 100644 --- a/lib/propolis/src/hw/nvme/queue.rs +++ b/lib/propolis/src/hw/nvme/queue.rs @@ -1172,20 +1172,20 @@ mod test { let hdl = pci::MsixHdl::new_test(); let write_base = GuestAddr(1024 * 1024); let tmpl = - CreateParams { id: ADMIN_QUEUE_ID, base: write_base, size: 0 }; + CreateParams { id: ADMIN_QUEUE_ID, device_id: 0, base: write_base, size: 0 }; let acc_mem = || machine.acc_mem.child(None); // Admin queues must be less than 4K let cq = CompQueue::new( - CreateParams { id: ADMIN_QUEUE_ID, size: 1024, ..tmpl }, + CreateParams { id: ADMIN_QUEUE_ID, device_id: 0, size: 1024, ..tmpl }, 0, hdl.clone(), acc_mem(), ); assert!(matches!(cq, Ok(_))); let cq = CompQueue::new( - CreateParams { id: ADMIN_QUEUE_ID, size: 5 * 1024, ..tmpl }, + CreateParams { id: ADMIN_QUEUE_ID, device_id: 0, size: 5 * 1024, ..tmpl }, 0, hdl.clone(), acc_mem(), @@ -1194,14 +1194,14 @@ mod test { // I/O queues must be less than 64K let cq = CompQueue::new( - CreateParams { id: 1, size: 1024, ..tmpl }, + CreateParams { id: 1, device_id: 0, size: 1024, ..tmpl }, 0, hdl.clone(), acc_mem(), ); assert!(matches!(cq, Ok(_))); let cq = CompQueue::new( - CreateParams { id: 1, size: 65 * 1024, ..tmpl }, + CreateParams { id: 1, device_id: 0, size: 65 * 1024, ..tmpl }, 0, hdl.clone(), acc_mem(), @@ -1210,14 +1210,14 @@ mod test { // Neither must be less than 2 let cq = CompQueue::new( - CreateParams { id: ADMIN_QUEUE_ID, size: 1, ..tmpl }, + CreateParams { id: ADMIN_QUEUE_ID, device_id: 0, size: 1, ..tmpl }, 0, hdl.clone(), acc_mem(), ); 
assert!(matches!(cq, Err(QueueCreateErr::InvalidSize))); let cq = CompQueue::new( - CreateParams { id: 1, size: 1, ..tmpl }, + CreateParams { id: 1, device_id: 0, size: 1, ..tmpl }, 0, hdl.clone(), acc_mem(), @@ -1241,6 +1241,7 @@ mod test { CompQueue::new( CreateParams { id: ADMIN_QUEUE_ID, + device_id: 0, base: write_base, size: 1024, }, @@ -1252,7 +1253,7 @@ mod test { ); let io_cq = Arc::new( CompQueue::new( - CreateParams { id: 1, base: write_base, size: 1024 }, + CreateParams { id: 1, device_id: 0, base: write_base, size: 1024 }, 0, hdl, acc_mem(), @@ -1262,7 +1263,7 @@ mod test { // Admin queues must be less than 4K let sq = SubQueue::new( - CreateParams { id: ADMIN_QUEUE_ID, base: read_base, size: 1024 }, + CreateParams { id: ADMIN_QUEUE_ID, device_id: 0, base: read_base, size: 1024 }, admin_cq.clone(), acc_mem(), ); @@ -1270,6 +1271,7 @@ mod test { let sq = SubQueue::new( CreateParams { id: ADMIN_QUEUE_ID, + device_id: 0, base: read_base, size: 5 * 1024, }, @@ -1280,13 +1282,13 @@ mod test { // I/O queues must be less than 64K let sq = SubQueue::new( - CreateParams { id: 1, base: read_base, size: 1024 }, + CreateParams { id: 1, device_id: 0, base: read_base, size: 1024 }, io_cq.clone(), acc_mem(), ); assert!(matches!(sq, Ok(_))); let sq = SubQueue::new( - CreateParams { id: 1, base: read_base, size: 65 * 1024 }, + CreateParams { id: 1, device_id: 0, base: read_base, size: 65 * 1024 }, io_cq, acc_mem(), ); @@ -1294,13 +1296,13 @@ mod test { // Neither must be less than 2 let sq = SubQueue::new( - CreateParams { id: ADMIN_QUEUE_ID, base: read_base, size: 1 }, + CreateParams { id: ADMIN_QUEUE_ID, device_id: 0, base: read_base, size: 1 }, admin_cq.clone(), acc_mem(), ); assert!(matches!(sq, Err(QueueCreateErr::InvalidSize))); let sq = SubQueue::new( - CreateParams { id: 1, base: read_base, size: 1 }, + CreateParams { id: 1, device_id: 0, base: read_base, size: 1 }, admin_cq, acc_mem(), ); @@ -1338,7 +1340,7 @@ mod test { // Create our queues let cq = Arc::new( 
CompQueue::new( - CreateParams { id: 1, base: write_base, size: 4 }, + CreateParams { id: 1, device_id: 0, base: write_base, size: 4 }, 0, hdl, acc_mem(), @@ -1347,7 +1349,7 @@ mod test { ); let sq = Arc::new( SubQueue::new( - CreateParams { id: 1, base: read_base, size: 4 }, + CreateParams { id: 1, device_id: 0, base: read_base, size: 4 }, cq.clone(), acc_mem(), ) @@ -1417,7 +1419,7 @@ mod test { // Purposely make the CQ smaller to test kicks let cq = Arc::new( CompQueue::new( - CreateParams { id: 1, base: write_base, size: 2 }, + CreateParams { id: 1, device_id: 0, base: write_base, size: 2 }, 0, hdl, acc_mem(), @@ -1426,7 +1428,7 @@ mod test { ); let sq = Arc::new( SubQueue::new( - CreateParams { id: 1, base: read_base, size: 4 }, + CreateParams { id: 1, device_id: 0, base: read_base, size: 4 }, cq.clone(), acc_mem(), ) @@ -1488,7 +1490,7 @@ mod test { let sq_size = rng.random_range(512..2048); let cq = Arc::new( CompQueue::new( - CreateParams { id: 1, base: write_base, size: 4 }, + CreateParams { id: 1, device_id: 0, base: write_base, size: 4 }, 0, hdl, acc_mem(), @@ -1497,7 +1499,7 @@ mod test { ); let sq = Arc::new( SubQueue::new( - CreateParams { id: 1, base: read_base, size: sq_size }, + CreateParams { id: 1, device_id: 0, base: read_base, size: sq_size }, cq.clone(), acc_mem(), ) From 6ccc658f1f8384e6c31404e66936cb26371ff802 Mon Sep 17 00:00:00 2001 From: iximeow Date: Sat, 20 Dec 2025 00:14:51 +0000 Subject: [PATCH 3/4] review feedback --- lib/propolis/src/hw/nvme/queue.rs | 75 +++++++++++++++++++++++----- lib/propolis/src/hw/nvme/requests.rs | 39 +++++++++++---- 2 files changed, 91 insertions(+), 23 deletions(-) diff --git a/lib/propolis/src/hw/nvme/queue.rs b/lib/propolis/src/hw/nvme/queue.rs index 07332b539..2218ddbcb 100644 --- a/lib/propolis/src/hw/nvme/queue.rs +++ b/lib/propolis/src/hw/nvme/queue.rs @@ -33,6 +33,16 @@ mod probes { /// Each queue is identified by a 16-bit ID. 
 ///
 /// See NVMe 1.0e Section 4.1.4 Queue Identifier
+///
+/// Submission and completion queue IDs are distinct namespaces, so a device
+/// can have both a "Submission Queue 1" and "Completion Queue 1".
+///
+/// For USDT probes, we combine this ID with an NVMe controller ID to produce a
+/// `devq_id`. This combined identifier is still ambiguous between one submission
+/// queue and one completion queue. Contextually there is typically only one
+/// reasonable interpretation of the ID, but the probe arguments are named
+/// `devsq_id` or `devcq_id` to be explicit about identifying a Submission or
+/// Completion queue, respectively.
 pub type QueueId = u16;
 
 /// The minimum number of entries in either a Completion or Submission Queue.
@@ -540,8 +550,8 @@ pub struct SubQueue {
     /// The ID of this Submission Queue.
     id: QueueId,
 
-    /// The ID of the device this is a submission queue for. Not semantically
-    /// interesting for submission queues, but useful context in probes.
+    /// The ID of the device that owns this submission queue. Kept here only to
+    /// produce `devsq_id` for DTrace probes.
     device_id: DeviceId,
 
     /// The corresponding Completion Queue.
@@ -728,8 +738,8 @@ pub struct CompQueue {
     /// The ID of this Completion Queue.
     id: QueueId,
 
-    /// The ID of the device this is a completion queue for. Not semantically
-    /// interesting for completion queues, but useful context in probes.
+    /// The ID of the device that owns this completion queue. Kept here only to
+    /// produce `devcq_id` for DTrace probes.
device_id: DeviceId, /// The Interrupt Vector used to signal to the host (VM) upon pushing @@ -1171,21 +1181,35 @@ mod test { let machine = Machine::new_test()?; let hdl = pci::MsixHdl::new_test(); let write_base = GuestAddr(1024 * 1024); - let tmpl = - CreateParams { id: ADMIN_QUEUE_ID, device_id: 0, base: write_base, size: 0 }; + let tmpl = CreateParams { + id: ADMIN_QUEUE_ID, + device_id: 0, + base: write_base, + size: 0, + }; let acc_mem = || machine.acc_mem.child(None); // Admin queues must be less than 4K let cq = CompQueue::new( - CreateParams { id: ADMIN_QUEUE_ID, device_id: 0, size: 1024, ..tmpl }, + CreateParams { + id: ADMIN_QUEUE_ID, + device_id: 0, + size: 1024, + ..tmpl + }, 0, hdl.clone(), acc_mem(), ); assert!(matches!(cq, Ok(_))); let cq = CompQueue::new( - CreateParams { id: ADMIN_QUEUE_ID, device_id: 0, size: 5 * 1024, ..tmpl }, + CreateParams { + id: ADMIN_QUEUE_ID, + device_id: 0, + size: 5 * 1024, + ..tmpl + }, 0, hdl.clone(), acc_mem(), @@ -1253,7 +1277,12 @@ mod test { ); let io_cq = Arc::new( CompQueue::new( - CreateParams { id: 1, device_id: 0, base: write_base, size: 1024 }, + CreateParams { + id: 1, + device_id: 0, + base: write_base, + size: 1024, + }, 0, hdl, acc_mem(), @@ -1263,7 +1292,12 @@ mod test { // Admin queues must be less than 4K let sq = SubQueue::new( - CreateParams { id: ADMIN_QUEUE_ID, device_id: 0, base: read_base, size: 1024 }, + CreateParams { + id: ADMIN_QUEUE_ID, + device_id: 0, + base: read_base, + size: 1024, + }, admin_cq.clone(), acc_mem(), ); @@ -1288,7 +1322,12 @@ mod test { ); assert!(matches!(sq, Ok(_))); let sq = SubQueue::new( - CreateParams { id: 1, device_id: 0, base: read_base, size: 65 * 1024 }, + CreateParams { + id: 1, + device_id: 0, + base: read_base, + size: 65 * 1024, + }, io_cq, acc_mem(), ); @@ -1296,7 +1335,12 @@ mod test { // Neither must be less than 2 let sq = SubQueue::new( - CreateParams { id: ADMIN_QUEUE_ID, device_id: 0, base: read_base, size: 1 }, + CreateParams { + id: ADMIN_QUEUE_ID, 
+                device_id: 0,
+                base: read_base,
+                size: 1,
+            },
             admin_cq.clone(),
             acc_mem(),
         );
@@ -1499,7 +1543,12 @@
         );
         let sq = Arc::new(
             SubQueue::new(
-                CreateParams { id: 1, device_id: 0, base: read_base, size: sq_size },
+                CreateParams {
+                    id: 1,
+                    device_id: 0,
+                    base: read_base,
+                    size: sq_size,
+                },
                 cq.clone(),
                 acc_mem(),
             )
diff --git a/lib/propolis/src/hw/nvme/requests.rs b/lib/propolis/src/hw/nvme/requests.rs
index 48a5aaffa..97ee8f4f6 100644
--- a/lib/propolis/src/hw/nvme/requests.rs
+++ b/lib/propolis/src/hw/nvme/requests.rs
@@ -12,18 +12,37 @@ use crate::hw::nvme::{bits, cmds::Completion, queue::SubQueue};
 
 #[usdt::provider(provider = "propolis")]
 mod probes {
-    fn nvme_read_enqueue(devq_id: u64, idx: u16, cid: u16, off: u64, sz: u64) {}
-    fn nvme_read_complete(devq_id: u64, cid: u16, res: u8) {}
-
-    fn nvme_write_enqueue(devq_id: u64, idx: u16, cid: u16, off: u64, sz: u64) {
+    // Note that unlike the probes in `queue.rs`, the probes here provide a
+    // `devsq_id` for completion as well as enqueueing. The submission queue is
+    // the one the command was originally submitted on.
+    //
+    // As long as queues are not destroyed (and the device is not reset), a
+    // `(devsq_id, cid)` tuple seen in an `nvme_*_enqueue` probe probably will
+    // not be reused before that same tuple is used in a corresponding
+    // `nvme_*_complete` probe. It is possible, but such a case is a
+    // guest error and unlikely. From the NVM Express Base Specification:
+    //
+    // > The Command Identifier field in the SQE shall be unique among all
+    // > outstanding commands associated with that queue.
+ fn nvme_read_enqueue(devsq_id: u64, idx: u16, cid: u16, off: u64, sz: u64) { + } + fn nvme_read_complete(devsq_id: u64, cid: u16, res: u8) {} + + fn nvme_write_enqueue( + devsq_id: u64, + idx: u16, + cid: u16, + off: u64, + sz: u64, + ) { } - fn nvme_write_complete(devq_id: u64, cid: u16, res: u8) {} + fn nvme_write_complete(devsq_id: u64, cid: u16, res: u8) {} - fn nvme_flush_enqueue(devq_id: u64, idx: u16, cid: u16) {} - fn nvme_flush_complete(devq_id: u64, cid: u16, res: u8) {} + fn nvme_flush_enqueue(devsq_id: u64, idx: u16, cid: u16) {} + fn nvme_flush_complete(devsq_id: u64, cid: u16, res: u8) {} fn nvme_raw_cmd( - devq_id: u64, + devsq_id: u64, cdw0nsid: u64, prp1: u64, prp2: u64, @@ -58,10 +77,10 @@ impl block::DeviceQueue for NvmeBlockQueue { let params = self.sq.params(); while let Some((sub, permit, idx)) = sq.pop() { - let devq_id = sq.devq_id(); + let devsq_id = sq.devq_id(); probes::nvme_raw_cmd!(|| { ( - devq_id, + devsq_id, u64::from(sub.cdw0) | (u64::from(sub.nsid) << 32), sub.prp1, sub.prp2, From c47deb605730d950a03d4b6526a8d955df72c988 Mon Sep 17 00:00:00 2001 From: iximeow Date: Sat, 20 Dec 2025 00:21:57 +0000 Subject: [PATCH 4/4] one more wording tweak --- lib/propolis/src/hw/nvme/mod.rs | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/lib/propolis/src/hw/nvme/mod.rs b/lib/propolis/src/hw/nvme/mod.rs index 37456fa00..83317dd9a 100644 --- a/lib/propolis/src/hw/nvme/mod.rs +++ b/lib/propolis/src/hw/nvme/mod.rs @@ -32,14 +32,7 @@ mod requests; use bits::*; use queue::{CompQueue, QueueId, SubQueue}; -/// Static for generating unique NVMe device identifiers within a process. -/// -/// Numbering across NVMe devices means that the combination of NVMe -/// `device_id` and queue ID identify either an NVMe submission or completion -/// queue across a VM. -/// -/// Guests typically will configure submission queue N to complete into -/// completion queue N, but this is not a mandated device configuration. 
+/// Static for generating unique NVMe device identifiers across a VM. static NEXT_DEVICE_ID: AtomicU32 = AtomicU32::new(0); type DeviceId = u32; @@ -184,6 +177,8 @@ const MAX_NUM_IO_QUEUES: usize = MAX_NUM_QUEUES - 1; struct NvmeCtrl { /// A distinguishing identifier for this NVMe controller across the VM. /// Useful mostly to distinguish queues and commands as seen in probes. + /// `device_id` is held constant across NVMe resets, but not persisted + /// across export and import. device_id: DeviceId, /// Internal NVMe Controller state @@ -921,7 +916,7 @@ impl PciNvme { } })); - // Peel out device ID before we move it into the Mutex below. + // Cache device ID before we move it into the Mutex below. let device_id = state.device_id; PciNvme {