diff --git a/.github/workflows/dependency_modification_check.yml b/.github/workflows/dependency_modification_check.yml deleted file mode 100644 index ac6537af102..00000000000 --- a/.github/workflows/dependency_modification_check.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: Check no dependencies were modified - -on: pull_request - -jobs: - dependency_changed_check: - runs-on: ubuntu-latest - steps: - - name: "Checkout repository" - uses: actions/checkout@v3 - with: - ref: ${{ github.event.pull_request.head.sha }} - - - name: "Check Cargo.lock not in changeset" - run: | - git fetch origin - git diff origin/$GITHUB_BASE_REF.. --name-only| ( ! grep "Cargo.lock") diff --git a/Cargo.lock b/Cargo.lock index 708e381624b..0135a2dfb9d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -569,7 +569,7 @@ dependencies = [ "serde_json", "thiserror 2.0.17", "timerfd", - "userfaultfd", + "userfaultfd 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", "utils", "vmm", "vmm-sys-util", @@ -1451,7 +1451,20 @@ dependencies = [ "libc", "nix", "thiserror 1.0.69", - "userfaultfd-sys", + "userfaultfd-sys 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "userfaultfd" +version = "0.9.0" +source = "git+https://github.com/e2b-dev/userfaultfd-rs?branch=feat_write_protection#9f4f7b42adbb9bea59016f4af248ed547cf160f0" +dependencies = [ + "bitflags 2.10.0", + "cfg-if", + "libc", + "nix", + "thiserror 1.0.69", + "userfaultfd-sys 0.6.0 (git+https://github.com/e2b-dev/userfaultfd-rs?branch=feat_write_protection)", ] [[package]] @@ -1465,6 +1478,16 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "userfaultfd-sys" +version = "0.6.0" +source = "git+https://github.com/e2b-dev/userfaultfd-rs?branch=feat_write_protection#9f4f7b42adbb9bea59016f4af248ed547cf160f0" +dependencies = [ + "bindgen 0.69.5", + "cc", + "cfg-if", +] + [[package]] name = "utf8parse" version = "0.2.2" @@ -1583,7 +1606,7 @@ dependencies = [ "slab", "thiserror 2.0.17", "timerfd", - 
"userfaultfd", + "userfaultfd 0.9.0 (git+https://github.com/e2b-dev/userfaultfd-rs?branch=feat_write_protection)", "utils", "uuid", "vhost", diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index dcd6753a4c5..1eb2d83e0f2 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -31,6 +31,9 @@ { "syscall": "mincore" }, + { + "syscall": "pread64" + }, { "syscall": "writev", "comment": "Used by the VirtIO net device to write to tap" diff --git a/src/firecracker/src/api_server/mod.rs b/src/firecracker/src/api_server/mod.rs index 60daaa26639..961fc68e836 100644 --- a/src/firecracker/src/api_server/mod.rs +++ b/src/firecracker/src/api_server/mod.rs @@ -275,7 +275,7 @@ mod tests { Box::new(VmmAction::CreateSnapshot(CreateSnapshotParams { snapshot_type: SnapshotType::Diff, snapshot_path: PathBuf::new(), - mem_file_path: PathBuf::new(), + mem_file_path: Some(PathBuf::new()), })), start_time_us, ); @@ -288,7 +288,7 @@ mod tests { Box::new(VmmAction::CreateSnapshot(CreateSnapshotParams { snapshot_type: SnapshotType::Diff, snapshot_path: PathBuf::new(), - mem_file_path: PathBuf::new(), + mem_file_path: Some(PathBuf::new()), })), start_time_us, ); diff --git a/src/firecracker/src/api_server/parsed_request.rs b/src/firecracker/src/api_server/parsed_request.rs index f98170ccbea..478483e9ad9 100644 --- a/src/firecracker/src/api_server/parsed_request.rs +++ b/src/firecracker/src/api_server/parsed_request.rs @@ -31,6 +31,7 @@ use super::request::vsock::parse_put_vsock; use crate::api_server::request::hotplug::memory::{ parse_get_memory_hotplug, parse_patch_memory_hotplug, parse_put_memory_hotplug, }; +use crate::api_server::request::memory_info::parse_get_memory; use crate::api_server::request::serial::parse_put_serial; #[derive(Debug)] @@ -91,6 +92,7 @@ impl TryFrom<&Request> for ParsedRequest { (Method::Get, "hotplug", None) if path_tokens.next() == 
Some("memory") => { parse_get_memory_hotplug() } + (Method::Get, "memory", None) => parse_get_memory(path_tokens), (Method::Get, _, Some(_)) => method_to_error(Method::Get), (Method::Put, "actions", Some(body)) => parse_put_actions(body), (Method::Put, "balloon", Some(body)) => parse_put_balloon(body), @@ -196,6 +198,9 @@ impl ParsedRequest { &serde_json::json!({ "firecracker_version": version.as_str() }), ), VmmData::FullVmConfig(config) => Self::success_response_with_data(config), + VmmData::MemoryMappings(mappings) => Self::success_response_with_data(mappings), + VmmData::Memory(meminfo) => Self::success_response_with_data(meminfo), + VmmData::MemoryDirty(dirty) => Self::success_response_with_data(dirty), }, Err(vmm_action_error) => { let mut response = match vmm_action_error { @@ -610,6 +615,15 @@ pub mod tests { &serde_json::json!({ "firecracker_version": version.as_str() }).to_string(), 200, ), + VmmData::MemoryMappings(mappings) => { + http_response(&serde_json::to_string(mappings).unwrap(), 200) + } + VmmData::Memory(meminfo) => { + http_response(&serde_json::to_string(meminfo).unwrap(), 200) + } + VmmData::MemoryDirty(dirty) => { + http_response(&serde_json::to_string(dirty).unwrap(), 200) + } }; let response = ParsedRequest::convert_to_response(&data); response.write_all(&mut buf).unwrap(); diff --git a/src/firecracker/src/api_server/request/memory_info.rs b/src/firecracker/src/api_server/request/memory_info.rs new file mode 100644 index 00000000000..2d8e55a420e --- /dev/null +++ b/src/firecracker/src/api_server/request/memory_info.rs @@ -0,0 +1,19 @@ +use micro_http::Method; +use vmm::rpc_interface::VmmAction; + +use crate::api_server::parsed_request::{ParsedRequest, RequestError}; + +pub(crate) fn parse_get_memory<'a, T>(mut path_tokens: T) -> Result +where + T: Iterator, +{ + match path_tokens.next() { + Some("mappings") => Ok(ParsedRequest::new_sync(VmmAction::GetMemoryMappings)), + Some("dirty") => 
Ok(ParsedRequest::new_sync(VmmAction::GetMemoryDirty)), + Some(unknown_path) => Err(RequestError::InvalidPathMethod( + format!("/memory/{}", unknown_path), + Method::Get, + )), + None => Ok(ParsedRequest::new_sync(VmmAction::GetMemory)), + } +} diff --git a/src/firecracker/src/api_server/request/mod.rs b/src/firecracker/src/api_server/request/mod.rs index 9be4617bd8e..89472c52d8e 100644 --- a/src/firecracker/src/api_server/request/mod.rs +++ b/src/firecracker/src/api_server/request/mod.rs @@ -11,6 +11,7 @@ pub mod hotplug; pub mod instance_info; pub mod logger; pub mod machine_configuration; +pub mod memory_info; pub mod metrics; pub mod mmds; pub mod net; diff --git a/src/firecracker/src/api_server/request/snapshot.rs b/src/firecracker/src/api_server/request/snapshot.rs index 8284aa66287..cc7c1c28762 100644 --- a/src/firecracker/src/api_server/request/snapshot.rs +++ b/src/firecracker/src/api_server/request/snapshot.rs @@ -144,7 +144,7 @@ mod tests { let expected_config = CreateSnapshotParams { snapshot_type: SnapshotType::Diff, snapshot_path: PathBuf::from("foo"), - mem_file_path: PathBuf::from("bar"), + mem_file_path: Some(PathBuf::from("bar")), }; assert_eq!( vmm_action_from_request(parse_put_snapshot(&Body::new(body), Some("create")).unwrap()), @@ -158,7 +158,7 @@ mod tests { let expected_config = CreateSnapshotParams { snapshot_type: SnapshotType::Full, snapshot_path: PathBuf::from("foo"), - mem_file_path: PathBuf::from("bar"), + mem_file_path: Some(PathBuf::from("bar")), }; assert_eq!( vmm_action_from_request(parse_put_snapshot(&Body::new(body), Some("create")).unwrap()), diff --git a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index 0523dd9b08e..f674bed01c3 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ b/src/firecracker/swagger/firecracker.yaml @@ -786,6 +786,50 @@ paths: schema: $ref: "#/definitions/Error" + /memory/mappings: + get: + summary: Gets the memory mappings with skippable pages bitmap. 
+ operationId: getMemoryMappings + responses: + 200: + description: OK + schema: + $ref: "#/definitions/MemoryMappingsResponse" + default: + description: Internal server error + schema: + $ref: "#/definitions/Error" + + /memory: + get: + summary: Gets the memory info (resident and empty pages). + description: Returns an object with resident and empty bitmaps. The resident bitmap marks all pages that are resident. The empty bitmap marks zero pages (subset of resident pages). This is checked at the pageSize of each region. All regions must have the same page size. + operationId: getMemory + responses: + 200: + description: OK + schema: + $ref: "#/definitions/MemoryResponse" + default: + description: Internal server error + schema: + $ref: "#/definitions/Error" + + /memory/dirty: + get: + summary: Gets the dirty guest memory + description: This returns the resident memory that has been written since last snapshot. + operationId: getDirtyMemory + responses: + 200: + description: OK + schema: + $ref: "#/definitions/DirtyMemory" + default: + description: Internal server error + schema: + $ref: "#/definitions/Error" + /version: get: summary: Gets the Firecracker version. @@ -1347,6 +1391,72 @@ definitions: description: MicroVM hypervisor build version. type: string + GuestMemoryRegionMapping: + type: object + description: Describes the region of guest memory that can be used for creating the memfile. + required: + - base_host_virt_addr + - size + - offset + - page_size + properties: + base_host_virt_addr: + type: integer + size: + description: The size of the region in bytes. + type: integer + offset: + description: The offset of the region in bytes. + type: integer + page_size: + description: The page size in bytes. + type: integer + + MemoryMappingsResponse: + type: object + description: Response containing memory region mappings. + required: + - mappings + properties: + mappings: + type: array + description: The memory region mappings. 
+ items: + $ref: "#/definitions/GuestMemoryRegionMapping" + + MemoryResponse: + type: object + description: Response containing the memory info (resident and empty pages). + required: + - resident + - empty + properties: + resident: + type: array + description: The resident bitmap as a vector of u64 values. Each bit represents if the page is resident. + items: + type: integer + format: uint64 + empty: + type: array + description: The empty bitmap as a vector of u64 values. Each bit represents if the page is zero (empty). This is a subset of the resident pages. + items: + type: integer + format: uint64 + + DirtyMemory: + type: object + description: Response containing the bitmap (one bit per page) of dirty pages of guest memory + required: + - bitmap + properties: + bitmap: + type: array + description: The dirty bitmap as a vector of u64 values. Each bit represents if the page is dirty. + items: + type: integer + format: uint64 + Logger: type: object description: @@ -1555,12 +1665,14 @@ definitions: SnapshotCreateParams: type: object required: - - mem_file_path - snapshot_path properties: mem_file_path: type: string - description: Path to the file that will contain the guest memory. + description: + Path to the file that will contain the guest memory. It is optional. + In case that a user doesn't provide a path, they are responsible to + ensure they store the microVM's memory state via external means. snapshot_path: type: string description: Path to the file that will contain the microVM state. 
diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index b6ab412a862..e621c4cb82f 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -47,7 +47,11 @@ serde_json = "1.0.145" slab = "0.4.11" thiserror = "2.0.17" timerfd = "1.5.0" -userfaultfd = "0.9.0" +userfaultfd = { git = "https://github.com/e2b-dev/userfaultfd-rs", branch = "feat_write_protection", features = [ + "linux5_7", + "linux5_13", + "linux6_7" +] } utils = { path = "../utils" } uuid = "1.18.1" vhost = { version = "0.15.0", features = ["vhost-user-frontend"] } diff --git a/src/vmm/src/arch/x86_64/vm.rs b/src/vmm/src/arch/x86_64/vm.rs index 37d97d8c212..9241ab27112 100644 --- a/src/vmm/src/arch/x86_64/vm.rs +++ b/src/vmm/src/arch/x86_64/vm.rs @@ -224,13 +224,18 @@ pub struct VmState { pub memory: GuestMemoryState, /// resource allocator pub resource_allocator: ResourceAllocator, - pitstate: kvm_pit_state2, - clock: kvm_clock_data, + /// KVM interrupt timer + pub pitstate: kvm_pit_state2, + /// KVM clock data + pub clock: kvm_clock_data, // TODO: rename this field to adopt inclusive language once Linux updates it, too. - pic_master: kvm_irqchip, + /// Master PIC controller + pub pic_master: kvm_irqchip, // TODO: rename this field to adopt inclusive language once Linux updates it, too. 
- pic_slave: kvm_irqchip, - ioapic: kvm_irqchip, + /// Slave PIC controller + pub pic_slave: kvm_irqchip, + /// IOAPIC + pub ioapic: kvm_irqchip, } impl fmt::Debug for VmState { diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 332b1ac3cc3..63944bfff83 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -318,6 +318,7 @@ pub fn build_microvm_for_boot( vcpus_handles: Vec::new(), vcpus_exit_evt, device_manager, + page_size: vm_resources.machine_config.huge_pages.page_size(), }; let vmm = Arc::new(Mutex::new(vmm)); @@ -518,6 +519,7 @@ pub fn build_microvm_from_snapshot( vcpus_handles: Vec::new(), vcpus_exit_evt, device_manager, + page_size: vm_resources.machine_config.huge_pages.page_size(), }; // Move vcpus to their own threads and start their state machine in the 'Paused' state. @@ -751,6 +753,7 @@ pub(crate) mod tests { use vmm_sys_util::tempfile::TempFile; use super::*; + use crate::arch::host_page_size; use crate::device_manager::tests::default_device_manager; use crate::devices::virtio::block::CacheType; use crate::devices::virtio::generated::virtio_ids; @@ -836,6 +839,7 @@ pub(crate) mod tests { vcpus_handles: Vec::new(), vcpus_exit_evt, device_manager: default_device_manager(), + page_size: host_page_size(), } } diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 2a0393e57f2..8e815bf8b6c 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -168,9 +168,9 @@ impl fmt::Debug for MMIODevManagerConstructorArgs<'_> { #[derive(Default, Debug, Clone, Serialize, Deserialize)] pub struct ACPIDeviceManagerState { - vmgenid: VMGenIDState, + pub vmgenid: VMGenIDState, #[cfg(target_arch = "x86_64")] - vmclock: VmClockState, + pub vmclock: VmClockState, } impl<'a> Persist<'a> for ACPIDeviceManager { diff --git a/src/vmm/src/devices/acpi/mod.rs b/src/vmm/src/devices/acpi/mod.rs index 8eba26ac41d..4e8c62922e6 100644 --- a/src/vmm/src/devices/acpi/mod.rs +++ 
b/src/vmm/src/devices/acpi/mod.rs @@ -1,6 +1,6 @@ // Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 -mod generated; +pub mod generated; pub mod vmclock; pub mod vmgenid; diff --git a/src/vmm/src/devices/acpi/vmclock.rs b/src/vmm/src/devices/acpi/vmclock.rs index d7882a78ded..56aee6e44d4 100644 --- a/src/vmm/src/devices/acpi/vmclock.rs +++ b/src/vmm/src/devices/acpi/vmclock.rs @@ -22,7 +22,7 @@ use crate::vstate::resources::ResourceAllocator; unsafe impl ByteValued for vmclock_abi {} // We are reserving a physical page to expose the [`VmClock`] data -const VMCLOCK_SIZE: u32 = 0x1000; +pub const VMCLOCK_SIZE: u32 = 0x1000; // Write a value in `vmclock_abi` both in the Firecracker-managed state // and inside guest memory address that corresponds to it. diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index 4d83075fa0f..5a960ef0130 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -68,7 +68,7 @@ unsafe impl ByteValued for ConfigSpace {} /// Holds state of the free page hinting run #[derive(Copy, Clone, Debug, Default, Serialize, Deserialize)] -pub(crate) struct HintingState { +pub struct HintingState { /// The command requested by us. Set to STOP by default. pub host_cmd: u32, /// The last command supplied by guest. diff --git a/src/vmm/src/devices/virtio/balloon/persist.rs b/src/vmm/src/devices/virtio/balloon/persist.rs index 2314a98aa33..f044c99494b 100644 --- a/src/vmm/src/devices/virtio/balloon/persist.rs +++ b/src/vmm/src/devices/virtio/balloon/persist.rs @@ -31,22 +31,22 @@ pub struct BalloonConfigSpaceState { /// at snapshot. 
#[derive(Debug, Clone, Serialize, Deserialize)] pub struct BalloonStatsState { - swap_in: Option, - swap_out: Option, - major_faults: Option, - minor_faults: Option, - free_memory: Option, - total_memory: Option, - available_memory: Option, - disk_caches: Option, - hugetlb_allocations: Option, - hugetlb_failures: Option, - oom_kill: Option, - alloc_stall: Option, - async_scan: Option, - direct_scan: Option, - async_reclaim: Option, - direct_reclaim: Option, + pub swap_in: Option, + pub swap_out: Option, + pub major_faults: Option, + pub minor_faults: Option, + pub free_memory: Option, + pub total_memory: Option, + pub available_memory: Option, + pub disk_caches: Option, + pub hugetlb_allocations: Option, + pub hugetlb_failures: Option, + pub oom_kill: Option, + pub alloc_stall: Option, + pub async_scan: Option, + pub direct_scan: Option, + pub async_reclaim: Option, + pub direct_reclaim: Option, } impl BalloonStatsState { @@ -101,11 +101,11 @@ impl BalloonStatsState { /// at snapshot. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct BalloonState { - stats_polling_interval_s: u16, - stats_desc_index: Option, - latest_stats: BalloonStatsState, - config_space: BalloonConfigSpaceState, - hinting_state: HintingState, + pub stats_polling_interval_s: u16, + pub stats_desc_index: Option, + pub latest_stats: BalloonStatsState, + pub config_space: BalloonConfigSpaceState, + pub hinting_state: HintingState, pub virtio_state: VirtioDeviceState, } diff --git a/src/vmm/src/devices/virtio/block/vhost_user/persist.rs b/src/vmm/src/devices/virtio/block/vhost_user/persist.rs index d507fa9577b..230e6caf47b 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/persist.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/persist.rs @@ -15,14 +15,14 @@ use crate::snapshot::Persist; /// vhost-user block device state. 
#[derive(Debug, Clone, Serialize, Deserialize)] pub struct VhostUserBlockState { - id: String, - partuuid: Option, - cache_type: CacheType, - root_device: bool, - socket_path: String, - vu_acked_protocol_features: u64, - config_space: Vec, - virtio_state: VirtioDeviceState, + pub id: String, + pub partuuid: Option, + pub cache_type: CacheType, + pub root_device: bool, + pub socket_path: String, + pub vu_acked_protocol_features: u64, + pub config_space: Vec, + pub virtio_state: VirtioDeviceState, } impl Persist<'_> for VhostUserBlock { diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index 380fe1de0e8..98f17c258ad 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -53,14 +53,14 @@ impl From for FileEngineType { /// Holds info about the block device. Gets saved in snapshot. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct VirtioBlockState { - id: String, - partuuid: Option, - cache_type: CacheType, - root_device: bool, - disk_path: String, + pub id: String, + pub partuuid: Option, + pub cache_type: CacheType, + pub root_device: bool, + pub disk_path: String, pub virtio_state: VirtioDeviceState, - rate_limiter_state: RateLimiterState, - file_engine_type: FileEngineTypeState, + pub rate_limiter_state: RateLimiterState, + pub file_engine_type: FileEngineTypeState, } impl Persist<'_> for VirtioBlock { diff --git a/src/vmm/src/devices/virtio/net/persist.rs b/src/vmm/src/devices/virtio/net/persist.rs index ba56cc39aac..1af7a2cc081 100644 --- a/src/vmm/src/devices/virtio/net/persist.rs +++ b/src/vmm/src/devices/virtio/net/persist.rs @@ -30,18 +30,29 @@ pub struct NetConfigSpaceState { guest_mac: Option, } +#[derive(Debug, Default, Clone, Serialize, Deserialize)] +pub struct RxBufferState { + // Number of iovecs we have parsed from the guest + parsed_descriptor_chains_nr: u16, + // Number of used descriptors + 
used_descriptors: u16, + // Number of used bytes + used_bytes: u32, +} + /// Information about the network device that are saved /// at snapshot. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct NetState { pub id: String, pub tap_if_name: String, - rx_rate_limiter_state: RateLimiterState, - tx_rate_limiter_state: RateLimiterState, + pub rx_rate_limiter_state: RateLimiterState, + pub tx_rate_limiter_state: RateLimiterState, /// The associated MMDS network stack. pub mmds_ns: Option, - config_space: NetConfigSpaceState, + pub config_space: NetConfigSpaceState, pub virtio_state: VirtioDeviceState, + pub rx_buffers_state: RxBufferState, } /// Auxiliary structure for creating a device when resuming from a snapshot. @@ -84,6 +95,7 @@ impl Persist<'_> for Net { guest_mac: self.guest_mac, }, virtio_state: VirtioDeviceState::from_device(self), + rx_buffers_state: RxBufferState::default(), } } @@ -128,6 +140,10 @@ impl Persist<'_> for Net { net.avail_features = state.virtio_state.avail_features; net.acked_features = state.virtio_state.acked_features; + if state.virtio_state.activated { + net.queues[RX_INDEX].next_avail -= state.rx_buffers_state.parsed_descriptor_chains_nr; + } + Ok(net) } } diff --git a/src/vmm/src/devices/virtio/persist.rs b/src/vmm/src/devices/virtio/persist.rs index 85c4940f305..4306b60961b 100644 --- a/src/vmm/src/devices/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/persist.rs @@ -194,13 +194,13 @@ impl VirtioDeviceState { #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct MmioTransportState { // The register where feature bits are stored. - features_select: u32, + pub features_select: u32, // The register where features page is selected. 
- acked_features_select: u32, - queue_select: u32, - device_status: u32, - config_generation: u32, - interrupt_status: u32, + pub acked_features_select: u32, + pub queue_select: u32, + pub device_status: u32, + pub config_generation: u32, + pub interrupt_status: u32, } /// Auxiliary structure for initializing the transport when resuming from a snapshot. diff --git a/src/vmm/src/devices/virtio/rng/persist.rs b/src/vmm/src/devices/virtio/rng/persist.rs index 27df145eb81..e841af4926b 100644 --- a/src/vmm/src/devices/virtio/rng/persist.rs +++ b/src/vmm/src/devices/virtio/rng/persist.rs @@ -20,7 +20,7 @@ use crate::vstate::memory::GuestMemoryMmap; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct EntropyState { pub virtio_state: VirtioDeviceState, - rate_limiter_state: RateLimiterState, + pub rate_limiter_state: RateLimiterState, } #[derive(Debug)] diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 30273e92c06..b0031931200 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -144,13 +144,15 @@ use crate::devices::virtio::block::device::Block; use crate::devices::virtio::mem::{VIRTIO_MEM_DEV_ID, VirtioMem, VirtioMemError, VirtioMemStatus}; use crate::devices::virtio::net::Net; use crate::logger::{METRICS, MetricsError, error, info, warn}; -use crate::persist::{MicrovmState, MicrovmStateError, VmInfo}; +use crate::persist::{GuestRegionUffdMapping, MicrovmState, MicrovmStateError, VmInfo}; use crate::rate_limiter::BucketUpdate; +use crate::utils::usize_to_u64; use crate::vmm_config::instance_info::{InstanceInfo, VmState}; use crate::vstate::memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; use crate::vstate::vcpu::VcpuState; pub use crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuEvent, VcpuHandle, VcpuResponse}; pub use crate::vstate::vm::Vm; +use crate::vstate::vm::mincore_bitmap; /// Shorthand type for the EventManager flavour used by Firecracker. 
pub type EventManager = BaseEventManager>>; @@ -254,6 +256,8 @@ pub enum VmmError { Block(#[from] BlockError), /// Balloon: {0} Balloon(#[from] BalloonError), + /// Pagemap error: {0} + Pagemap(#[from] utils::pagemap::PagemapError), /// Failed to create memory hotplug device: {0} VirtioMem(#[from] VirtioMemError), } @@ -313,6 +317,8 @@ pub struct Vmm { vcpus_exit_evt: EventFd, // Device manager device_manager: DeviceManager, + /// Page size used for backing guest memory + pub page_size: usize, } impl Vmm { @@ -690,6 +696,130 @@ impl Vmm { pub fn vm(&self) -> &Vm { &self.vm } + + /// Get the list of mappings for guest memory + pub fn guest_memory_mappings(&self, page_size: usize) -> Vec { + let mut mappings = vec![]; + let mut offset = 0; + + for region in self + .vm + .guest_memory() + .iter() + .flat_map(|region| region.plugged_slots()) + { + let size = region.slice.len(); + #[allow(deprecated)] + mappings.push(GuestRegionUffdMapping { + base_host_virt_addr: region.slice.ptr_guard_mut().as_ptr() as u64, + size, + offset, + page_size, + page_size_kib: page_size, + }); + + offset += usize_to_u64(size); + } + + mappings + } + + /// Get info regarding resident and empty pages for guest memory + pub fn guest_memory_info(&self, page_size: usize) -> Result<(Vec, Vec), VmmError> { + let mut resident = vec![]; + let mut empty = vec![]; + let zero_page = vec![0u8; page_size]; + + for mem_slot in self + .vm + .guest_memory() + .iter() + .flat_map(|region| region.plugged_slots()) + { + debug_assert!(mem_slot.slice.len().is_multiple_of(page_size)); + debug_assert!( + (mem_slot.slice.ptr_guard_mut().as_ptr() as usize).is_multiple_of(page_size) + ); + + let len = mem_slot.slice.len(); + let nr_pages = len / page_size; + let addr = mem_slot.slice.ptr_guard_mut().as_ptr(); + let mut curr_empty = vec![0u64; nr_pages.div_ceil(64)]; + let curr_resident = mincore_bitmap(addr, mem_slot.slice.len(), page_size)?; + + for page_idx in 0..nr_pages { + if (curr_resident[page_idx / 64] & 
(1u64 << (page_idx % 64))) == 0 { + continue; + } + + // SAFETY: `addr` points to a memory region that is `nr_pages * page_size` long. + let curr_addr = unsafe { addr.add(page_idx * page_size) }; + + // SAFETY: both addresses are valid and they point to a memory region + // that is (at least) `page_size` long + let ret = unsafe { + libc::memcmp( + curr_addr.cast::(), + zero_page.as_ptr().cast::(), + page_size, + ) + }; + + if ret == 0 { + curr_empty[page_idx / 64] |= 1u64 << (page_idx % 64); + } + } + + resident.extend_from_slice(&curr_resident); + empty.extend_from_slice(&curr_empty); + } + + Ok((resident, empty)) + } + + /// Get dirty pages bitmap for guest memory + pub fn get_dirty_memory(&self, page_size: usize) -> Result, VmmError> { + let pagemap = utils::pagemap::PagemapReader::new(page_size)?; + let mut dirty_bitmap = vec![]; + + for mem_slot in self + .vm + .guest_memory() + .iter() + .flat_map(|region| region.plugged_slots()) + { + let base_addr = mem_slot.slice.ptr_guard_mut().as_ptr() as usize; + let len = mem_slot.slice.len(); + let nr_pages = len / page_size; + + // Use mincore_bitmap to get resident pages at guest page size granularity + let resident_bitmap = vstate::vm::mincore_bitmap(base_addr as *mut u8, len, page_size)?; + + // TODO: if we don't support UFFD/async WP, we can completely skip this bit, as the + // UFFD handler already tracks dirty pages through the WriteProtected events. For the + // time being, we always do. + // + // Build dirty bitmap: check pagemap only for pages that mincore reports resident. + // This way we reduce the amount of times we read out of /proc/<pid>/pagemap. + let mut slot_bitmap = vec![0u64; nr_pages.div_ceil(64)]; + for page_idx in 0..nr_pages { + // Check if page is resident in the bitmap. + // TODO: These operations (add to bitmap, check for presence, etc.) merit their own + // implementation, somewhere within a bitmap type. 
+ let is_resident = (resident_bitmap[page_idx / 64] & (1u64 << (page_idx % 64))) != 0; + if is_resident { + let virt_addr = base_addr + (page_idx * page_size); + if pagemap.is_page_dirty(virt_addr)? { + slot_bitmap[page_idx / 64] |= 1u64 << (page_idx % 64); + } + } + } + + dirty_bitmap.extend_from_slice(&slot_bitmap); + } + + Ok(dirty_bitmap) + } } /// Process the content of the MPIDR_EL1 register in order to be able to pass it to KVM diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist/mod.rs similarity index 91% rename from src/vmm/src/persist.rs rename to src/vmm/src/persist/mod.rs index ba2608070c6..4c4a64928e6 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist/mod.rs @@ -11,10 +11,11 @@ use std::os::unix::io::AsRawFd; use std::os::unix::net::UnixStream; use std::path::Path; use std::sync::{Arc, Mutex}; +use std::time::Instant; use semver::Version; use serde::{Deserialize, Serialize}; -use userfaultfd::{FeatureFlags, Uffd, UffdBuilder}; +use userfaultfd::{FeatureFlags, RegisterMode, Uffd, UffdBuilder}; use vmm_sys_util::sock_ctrl_msg::ScmSocket; #[cfg(target_arch = "aarch64")] @@ -29,7 +30,7 @@ use crate::device_manager::{DevicePersistError, DevicesState}; use crate::logger::{info, warn}; use crate::resources::VmResources; use crate::seccomp::BpfThreadMap; -use crate::snapshot::Snapshot; +use crate::snapshot::{Snapshot, SnapshotError, SnapshotHdr}; use crate::utils::u64_to_usize; use crate::vmm_config::boot_source::BootSourceConfig; use crate::vmm_config::instance_info::InstanceInfo; @@ -43,6 +44,10 @@ use crate::vstate::vcpu::{VcpuSendEventError, VcpuState}; use crate::vstate::vm::{VmError, VmState}; use crate::{EventManager, Vmm, vstate}; +pub(crate) mod v1_10; +pub(crate) mod v1_12; +pub(crate) mod v1_14; + /// Holds information related to the VM that is not part of VmState. 
#[derive(Clone, Debug, Default, Deserialize, PartialEq, Eq, Serialize)] pub struct VmInfo { @@ -161,8 +166,10 @@ pub fn create_snapshot( snapshot_state_to_file(&microvm_state, &params.snapshot_path)?; - vmm.vm - .snapshot_memory_to_file(&params.mem_file_path, params.snapshot_type)?; + if let Some(mem_file_path) = params.mem_file_path.as_ref() { + vmm.vm + .snapshot_memory_to_file(mem_file_path, params.snapshot_type, vmm.page_size)?; + } // We need to mark queues as dirty again for all activated devices. The reason we // do it here is that we don't mark pages as dirty during runtime @@ -445,10 +452,36 @@ pub enum SnapshotStateFromFileError { fn snapshot_state_from_file( snapshot_path: &Path, ) -> Result { - let mut snapshot_reader = File::open(snapshot_path)?; - let snapshot = Snapshot::load(&mut snapshot_reader)?; + let start = Instant::now(); + + let data = std::fs::read(snapshot_path)?; + let version = SnapshotHdr::load(&mut data.as_slice())?.version; - Ok(snapshot.data) + let mut snapshot_reader = data.as_slice(); + let data = match (version.major, version.minor) { + (8, 0) => Snapshot::load(&mut snapshot_reader)?.data, + (6, 0) => { + let v12_state = Snapshot::::load(&mut snapshot_reader)?; + MicrovmState::try_from(v12_state.data).unwrap() + } + (4, 0) => { + let v10_state = Snapshot::::load(&mut snapshot_reader)?; + let v12_state = v1_12::MicrovmState::from(v10_state.data); + MicrovmState::try_from(v12_state).unwrap() + } + _ => { + return Err(SnapshotStateFromFileError::Load( + SnapshotError::InvalidFormatVersion(version), + )); + } + }; + + info!( + "Loading snapshot file took {} usec", + start.elapsed().as_micros() + ); + + Ok(data) } /// Error type for [`guest_memory_from_file`]. 
@@ -481,6 +514,8 @@ pub enum GuestMemoryFromUffdError { Create(userfaultfd::Error), /// Failed to register memory address range with the userfaultfd object: {0} Register(userfaultfd::Error), + /// Failed to enable write protection on memory address range with the userfaultfd object: {0} + WriteProtect(userfaultfd::Error), /// Failed to connect to UDS Unix stream: {0} Connect(#[from] std::io::Error), /// Failed to sends file descriptor: {0} @@ -502,7 +537,9 @@ fn guest_memory_from_uffd( // because the only place the kernel checks this is in a hook from madvise, e.g. it doesn't // actively change the behavior of UFFD, only passively. Without balloon devices // we never call madvise anyway, so no need to put this into a conditional. - uffd_builder.require_features(FeatureFlags::EVENT_REMOVE); + uffd_builder.require_features( + FeatureFlags::EVENT_REMOVE | FeatureFlags::MISSING_HUGETLBFS | FeatureFlags::WP_ASYNC, + ); let uffd = uffd_builder .close_on_exec(true) @@ -512,8 +549,22 @@ fn guest_memory_from_uffd( .map_err(GuestMemoryFromUffdError::Create)?; for mem_region in guest_memory.iter() { - uffd.register(mem_region.as_ptr().cast(), mem_region.size() as _) - .map_err(GuestMemoryFromUffdError::Register)?; + uffd.register_with_mode( + mem_region.as_ptr().cast(), + mem_region.size() as _, + RegisterMode::MISSING | RegisterMode::WRITE_PROTECT, + ) + .map_err(GuestMemoryFromUffdError::Register)?; + + // If memory is backed by huge pages, we can immediately write protect it. + // Otherwise (memory is backed by anonymous memory), write protecting here + // won't have any effect, as the write-protection bit for a page will be + // wiped when the first page fault occurs. These cases need to be handled + // directly from the UFFD handler. 
+ if huge_pages.is_hugetlbfs() { + uffd.write_protect(mem_region.as_ptr().cast(), mem_region.size() as _) + .map_err(GuestMemoryFromUffdError::WriteProtect)?; + } } send_uffd_handshake(mem_uds_path, &backend_mappings, &uffd)?; diff --git a/src/vmm/src/persist/v1_10/aarch64.rs b/src/vmm/src/persist/v1_10/aarch64.rs new file mode 100644 index 00000000000..ff7ab011a78 --- /dev/null +++ b/src/vmm/src/persist/v1_10/aarch64.rs @@ -0,0 +1,45 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use serde::{Deserialize, Serialize}; + +use super::{KvmCapability, MMIODeviceInfo}; + +// Types that are identical across all versions — canonical definitions in v1_14. +pub use crate::v1_14::{ + StaticCpuTemplate, + DeviceType, + GicRegState, + VgicSysRegsState, + GicVcpuState, + Aarch64RegisterVec, +}; + +// Types that are identical in v1.10 and v1.12 — canonical definitions in v1_12. +pub use crate::v1_12::{ + // aarch64 GicState is identical in v1.10 and v1.12 (gains its_state in v1.14) + GicState, + // aarch64 VcpuState is identical in v1.10 and v1.12 (gains pvtime_ipa in v1.14) + VcpuState, +}; + +// ─────────────────────────────────────────────────────────────────── +// aarch64 legacy device info (v1.10 layout: uses v1.10 MMIODeviceInfo with irqs: Vec) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedLegacyState { + pub type_: DeviceType, + pub device_info: MMIODeviceInfo, +} + +// ─────────────────────────────────────────────────────────────────── +// VM state (aarch64, v1.10) +// In v1.10, VmState holds kvm_cap_modifiers; memory_state is at MicrovmState level. 
+// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VmState { + pub gic: GicState, + pub kvm_cap_modifiers: Vec, +} diff --git a/src/vmm/src/persist/v1_10/mod.rs b/src/vmm/src/persist/v1_10/mod.rs new file mode 100644 index 00000000000..f95ce37bdca --- /dev/null +++ b/src/vmm/src/persist/v1_10/mod.rs @@ -0,0 +1,154 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Serializable state types for Firecracker v1.10 (snapshot format version 4.0.0). +//! +//! Types that are identical to v1.14 are imported from that module (the canonical source). +//! Types that are the same in v1.10 and v1.12 (but different from v1.14) are imported +//! from v1.12 (the canonical source for that version pair). +//! Only types that are truly v1.10-specific are defined here. +//! +//! Key differences from v1.12: +//! - `GuestMemoryRegionState` includes an `offset` field (removed in v1.11) +//! - `MMIODeviceInfo` uses `irqs: Vec` (changed to `irq: Option` in v1.11) +//! - `VmState` (both arches) has `kvm_cap_modifiers` instead of `memory` +//! - `MicrovmState` has `memory_state: GuestMemoryState` at the top level (not inside VmState) +//! - x86_64 `VcpuState.xsave` is `kvm_xsave` (changed to `Xsave` in v1.12) +//! 
- No `KvmState` wrapper struct + +use serde::{Deserialize, Serialize}; + +#[cfg(target_arch = "x86_64")] +pub(crate) mod x86_64; +#[cfg(target_arch = "x86_64")] +pub use x86_64::*; + +#[cfg(target_arch = "aarch64")] +pub(crate) mod aarch64; +#[cfg(target_arch = "aarch64")] +pub use aarch64::*; + +// ─────────────────────────────────────────────────────────────────── +// Types identical to v1.12 — imported from that module (canonical source) +// ─────────────────────────────────────────────────────────────────── + +use crate::persist::VmInfo; + +pub use super::v1_12::{ + // ACPI device manager state (used in MicrovmState defined below) + ACPIDeviceManagerState, + BalloonState, + // Device inner states (used in Connected* wrappers defined below) + BlockState, + EntropyState, + // MMDS version (used in DeviceStates defined below) + MmdsVersionState, + // Virtio transport state (used in Connected* wrappers defined below) + MmioTransportState, + NetState, + VsockState, +}; + +// ─────────────────────────────────────────────────────────────────── +// MMIO device info (v1.10 uses `irqs: Vec`, changed to `irq: Option` in v1.11) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct MMIODeviceInfo { + pub addr: u64, + pub len: u64, + pub irqs: Vec, +} + +// ─────────────────────────────────────────────────────────────────── +// Connected device state wrappers (use v1.10 MMIODeviceInfo with irqs: Vec) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedBlockState { + pub device_id: String, + pub device_state: BlockState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedNetState { + pub device_id: String, + pub device_state: NetState, + pub transport_state: MmioTransportState, + pub 
device_info: MMIODeviceInfo, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedVsockState { + pub device_id: String, + pub device_state: VsockState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedBalloonState { + pub device_id: String, + pub device_state: BalloonState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedEntropyState { + pub device_id: String, + pub device_state: EntropyState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +// ─────────────────────────────────────────────────────────────────── +// Device states (v1.10 layout) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Default, Clone, Serialize, Deserialize)] +pub struct DeviceStates { + #[cfg(target_arch = "aarch64")] + pub legacy_devices: Vec, + pub block_devices: Vec, + pub net_devices: Vec, + pub vsock_device: Option, + pub balloon_device: Option, + pub mmds_version: Option, + pub entropy_device: Option, +} + +// ─────────────────────────────────────────────────────────────────── +// Memory state (v1.10: GuestMemoryRegionState has `offset` field) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct GuestMemoryRegionState { + pub base_address: u64, + pub size: usize, + /// File offset into the memory snapshot file (present in v1.10, removed in v1.11) + pub offset: u64, +} + +#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct GuestMemoryState { + pub regions: Vec, +} + +// ─────────────────────────────────────────────────────────────────── +// Top-level MicrovmState (v1.10) +// Note: `memory_state` is at this level (not inside VmState), and there is no 
`kvm_state`. +// The kvm_cap_modifiers field lives inside VmState. +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Serialize, Deserialize)] +pub struct MicrovmState { + pub vm_info: VmInfo, + pub memory_state: GuestMemoryState, + pub vm_state: VmState, + pub vcpu_states: Vec, + pub device_states: DeviceStates, + pub acpi_dev_state: ACPIDeviceManagerState, +} diff --git a/src/vmm/src/persist/v1_10/x86_64.rs b/src/vmm/src/persist/v1_10/x86_64.rs new file mode 100644 index 00000000000..d66d1c36eec --- /dev/null +++ b/src/vmm/src/persist/v1_10/x86_64.rs @@ -0,0 +1,55 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use kvm_bindings::{ + CpuId, Msrs, kvm_clock_data, kvm_debugregs, kvm_irqchip, kvm_lapic_state, kvm_mp_state, + kvm_pit_state2, kvm_regs, kvm_sregs, kvm_vcpu_events, kvm_xcrs, kvm_xsave, +}; +use serde::{Deserialize, Serialize}; + +use crate::cpu_config::templates::KvmCapability; + +// ─────────────────────────────────────────────────────────────────── +// VM state (x86_64, v1.10) +// In v1.10, VmState holds kvm_cap_modifiers; memory_state is at MicrovmState level. 
+// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VmState { + pub pitstate: kvm_pit_state2, + pub clock: kvm_clock_data, + pub pic_master: kvm_irqchip, + pub pic_slave: kvm_irqchip, + pub ioapic: kvm_irqchip, + pub kvm_cap_modifiers: Vec, +} + +// ─────────────────────────────────────────────────────────────────── +// vCPU state (x86_64, v1.10) +// xsave is kvm_xsave (not Xsave/FamStructWrapper) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Serialize, Deserialize)] +pub struct VcpuState { + pub cpuid: CpuId, + pub saved_msrs: Vec, + pub debug_regs: kvm_debugregs, + pub lapic: kvm_lapic_state, + pub mp_state: kvm_mp_state, + pub regs: kvm_regs, + pub sregs: kvm_sregs, + pub vcpu_events: kvm_vcpu_events, + pub xcrs: kvm_xcrs, + /// In v1.10, xsave is stored as kvm_xsave (4096-byte opaque blob). + /// In v1.12+, it became Xsave = FamStructWrapper to support Intel AMX. + pub xsave: kvm_xsave, + pub tsc_khz: Option, +} + +impl std::fmt::Debug for VcpuState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("VcpuState") + .field("tsc_khz", &self.tsc_khz) + .finish_non_exhaustive() + } +} diff --git a/src/vmm/src/persist/v1_12/aarch64.rs b/src/vmm/src/persist/v1_12/aarch64.rs new file mode 100644 index 00000000000..7c66c2f1dae --- /dev/null +++ b/src/vmm/src/persist/v1_12/aarch64.rs @@ -0,0 +1,64 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +use kvm_bindings::{kvm_mp_state, kvm_vcpu_init}; +use serde::{Deserialize, Serialize}; + +use super::{GuestMemoryState, MMIODeviceInfo}; + +// Types that are canonical in v1_14 and unchanged through all versions +pub use crate::v1_14::{ + // Legacy device type enum + DeviceType, + // GIC helper types (GicState itself changed — its_state added — so redefined in v1_14) + GicRegState, + VgicSysRegsState, + GicVcpuState, + // Register vector with custom serde + Aarch64RegisterVec, +}; + +// ─────────────────────────────────────────────────────────────────── +// aarch64 GIC types (identical to v1.10; its_state added in v1.14) +// Canonical definitions are here; v1.10 imports from this module. +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct GicState { + pub dist: Vec>, + pub gic_vcpu_states: Vec, +} + +// ─────────────────────────────────────────────────────────────────── +// vCPU state (aarch64, v1.10 = v1.12) +// Canonical definition is here; v1.10 imports from this module. +// Gains `pvtime_ipa` in v1.14. 
+// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VcpuState { + pub mp_state: kvm_mp_state, + pub regs: Aarch64RegisterVec, + pub mpidr: u64, + pub kvi: kvm_vcpu_init, +} + +// ─────────────────────────────────────────────────────────────────── +// Changed in v1.12: memory moved into VmState; kvm_cap_modifiers → KvmState +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VmState { + pub memory: GuestMemoryState, + pub gic: GicState, +} + +// ─────────────────────────────────────────────────────────────────── +// aarch64 ConnectedLegacyState uses updated MMIODeviceInfo +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedLegacyState { + pub type_: DeviceType, + pub device_info: MMIODeviceInfo, +} diff --git a/src/vmm/src/persist/v1_12/mod.rs b/src/vmm/src/persist/v1_12/mod.rs new file mode 100644 index 00000000000..85ba0d00b31 --- /dev/null +++ b/src/vmm/src/persist/v1_12/mod.rs @@ -0,0 +1,435 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Serializable state types for Firecracker v1.12 (snapshot format version 6.0.0). +//! +//! Types that are structurally identical to v1.14 are imported from that module. +//! Types that are the same in v1.10 and v1.12 (but different from v1.14) are defined +//! here as the canonical source; v1.10 imports them from this module. +//! Only types that are truly v1.12-specific are also defined here. +//! +//! Changes from v1.10: +//! - `MMIODeviceInfo`: `irqs: Vec` → `irq: Option` (v1.11) +//! - `GuestMemoryRegionState`: `offset` field removed (v1.11) +//! - `VmState`: memory moved here from `MicrovmState`, `kvm_cap_modifiers` moved to `KvmState` +//! 
- x86_64 `VcpuState.xsave`: `kvm_xsave` → `Xsave` (v1.12) +//! - `KvmState`: new wrapper for `kvm_cap_modifiers` +//! - `MicrovmState`: adds `kvm_state`, removes `memory_state` + +use serde::{Deserialize, Serialize}; + +use super::v1_10; +use crate::arch::VcpuState; +use crate::devices::acpi::vmgenid::VMGenIDState; +use crate::devices::virtio::balloon::persist::BalloonConfigSpaceState; +use crate::devices::virtio::block::CacheType; +use crate::devices::virtio::block::virtio::persist::FileEngineTypeState; +use crate::devices::virtio::net::persist::{NetConfigSpaceState, RxBufferState}; +use crate::devices::virtio::persist::QueueState; +use crate::devices::virtio::vsock::persist::VsockBackendState; +use crate::mmds::persist::MmdsNetworkStackState; +use crate::persist::VmInfo; +use crate::rate_limiter::persist::RateLimiterState; +use crate::vstate::kvm::KvmState; + +#[cfg(target_arch = "x86_64")] +pub(crate) mod x86_64; +#[cfg(target_arch = "x86_64")] +pub use x86_64::*; + +#[cfg(target_arch = "aarch64")] +pub(crate) mod aarch64; +#[cfg(target_arch = "aarch64")] +pub use aarch64::*; + +// ─────────────────────────────────────────────────────────────────── +// Shared simple types — same in v1.10 and v1.12; differs in v1.14 +// Canonical definitions are here; v1.10 imports from this module. 
+// ─────────────────────────────────────────────────────────────────── +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct VirtioDeviceState { + pub device_type: u32, + pub avail_features: u64, + pub acked_features: u64, + pub queues: Vec, + pub interrupt_status: u32, + pub activated: bool, +} + +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct MmioTransportState { + pub features_select: u32, + pub acked_features_select: u32, + pub queue_select: u32, + pub device_status: u32, + pub config_generation: u32, +} + +// ─────────────────────────────────────────────────────────────────── +// Block device +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VirtioBlockState { + pub id: String, + pub partuuid: Option, + pub cache_type: CacheType, + pub root_device: bool, + pub disk_path: String, + pub virtio_state: VirtioDeviceState, + pub rate_limiter_state: RateLimiterState, + pub file_engine_type: FileEngineTypeState, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VhostUserBlockState { + pub id: String, + pub partuuid: Option, + pub cache_type: CacheType, + pub root_device: bool, + pub socket_path: String, + pub vu_acked_protocol_features: u64, + pub config_space: Vec, + pub virtio_state: VirtioDeviceState, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum BlockState { + Virtio(VirtioBlockState), + VhostUser(VhostUserBlockState), +} + +// ─────────────────────────────────────────────────────────────────── +// Net device +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct NetState { + pub id: String, + pub tap_if_name: String, + pub rx_rate_limiter_state: RateLimiterState, + pub tx_rate_limiter_state: RateLimiterState, + pub mmds_ns: Option, + pub config_space: NetConfigSpaceState, + pub virtio_state: 
VirtioDeviceState, + pub rx_buffers_state: RxBufferState, +} + +// ─────────────────────────────────────────────────────────────────── +// Vsock device +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VsockFrontendState { + pub cid: u64, + pub virtio_state: VirtioDeviceState, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VsockState { + pub backend: VsockBackendState, + pub frontend: VsockFrontendState, +} + +// ─────────────────────────────────────────────────────────────────── +// Balloon device +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Default, Clone, Serialize, Deserialize)] +pub struct BalloonStatsState { + pub swap_in: Option, + pub swap_out: Option, + pub major_faults: Option, + pub minor_faults: Option, + pub free_memory: Option, + pub total_memory: Option, + pub available_memory: Option, + pub disk_caches: Option, + pub hugetlb_allocations: Option, + pub hugetlb_failures: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BalloonState { + pub stats_polling_interval_s: u16, + pub stats_desc_index: Option, + pub latest_stats: BalloonStatsState, + pub config_space: BalloonConfigSpaceState, + pub virtio_state: VirtioDeviceState, +} + +// ─────────────────────────────────────────────────────────────────── +// Entropy device +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct EntropyState { + pub virtio_state: VirtioDeviceState, + pub rate_limiter_state: RateLimiterState, +} + +// ─────────────────────────────────────────────────────────────────── +// MMDS +// ─────────────────────────────────────────────────────────────────── + +/// MMDS version (renamed to `MmdsVersion` and restructured in v1.14). 
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum MmdsVersionState { + V1, + V2, +} + +// ─────────────────────────────────────────────────────────────────── +// ACPI devices state (same as v1.10; vmgenid becomes mandatory in v1.14) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct ACPIDeviceManagerState { + pub vmgenid: Option, +} + +// ─────────────────────────────────────────────────────────────────── +// Changed in v1.11: irqs: Vec → irq: Option +// ─────────────────────────────────────────────────────────────────── + +/// MMIO device info. +/// +/// Note: stored as `Option` in Firecracker source, but `NonZeroU32` has +/// the same bincode wire format as `u32`, so we use `Option` here. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct MMIODeviceInfo { + pub addr: u64, + pub len: u64, + pub irq: Option, +} + +impl MMIODeviceInfo { + pub(crate) fn from(old: v1_10::MMIODeviceInfo) -> MMIODeviceInfo { + MMIODeviceInfo { + addr: old.addr, + len: old.len, + // v1.10 stored a Vec of IRQs; v1.11+ uses a single optional IRQ. + // In practice exactly one IRQ was always present for devices that have one. + irq: old.irqs.into_iter().next(), + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Changed in v1.11: `offset` field removed from GuestMemoryRegionState +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct GuestMemoryRegionState { + pub base_address: u64, + pub size: usize, +} + +impl From for GuestMemoryRegionState { + fn from(old: v1_10::GuestMemoryRegionState) -> Self { + // Drop the `offset` field which was removed in v1.11. 
+ GuestMemoryRegionState { + base_address: old.base_address, + size: old.size, + } + } +} + +#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct GuestMemoryState { + pub regions: Vec, +} + +impl From for GuestMemoryState { + fn from(old: v1_10::GuestMemoryState) -> Self { + GuestMemoryState { + regions: old + .regions + .into_iter() + .map(GuestMemoryRegionState::from) + .collect(), + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Connected device state wrappers — redefined because MMIODeviceInfo changed. +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedBlockState { + pub device_id: String, + pub device_state: BlockState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedNetState { + pub device_id: String, + pub device_state: NetState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedVsockState { + pub device_id: String, + pub device_state: VsockState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedBalloonState { + pub device_id: String, + pub device_state: BalloonState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedEntropyState { + pub device_id: String, + pub device_state: EntropyState, + pub transport_state: MmioTransportState, + pub device_info: MMIODeviceInfo, +} + +#[derive(Debug, Default, Clone, Serialize, Deserialize)] +pub struct DeviceStates { + #[cfg(target_arch = "aarch64")] + pub legacy_devices: Vec, + pub block_devices: Vec, + pub net_devices: Vec, + pub vsock_device: Option, 
+ pub balloon_device: Option, + pub mmds_version: Option, + pub entropy_device: Option, +} + +impl From for DeviceStates { + fn from(old: v1_10::DeviceStates) -> Self { + DeviceStates { + #[cfg(target_arch = "aarch64")] + legacy_devices: old + .legacy_devices + .into_iter() + .map(|ld| ConnectedLegacyState { + type_: ld.type_, + device_info: MMIODeviceInfo::from(ld.device_info), + }) + .collect(), + block_devices: old + .block_devices + .into_iter() + .map(|d| ConnectedBlockState { + device_id: d.device_id, + device_state: d.device_state, + transport_state: d.transport_state, + device_info: MMIODeviceInfo::from(d.device_info), + }) + .collect(), + net_devices: old + .net_devices + .into_iter() + .map(|d| ConnectedNetState { + device_id: d.device_id, + device_state: d.device_state, + transport_state: d.transport_state, + device_info: MMIODeviceInfo::from(d.device_info), + }) + .collect(), + vsock_device: old.vsock_device.map(|d| ConnectedVsockState { + device_id: d.device_id, + device_state: d.device_state, + transport_state: d.transport_state, + device_info: MMIODeviceInfo::from(d.device_info), + }), + balloon_device: old.balloon_device.map(|d| ConnectedBalloonState { + device_id: d.device_id, + device_state: d.device_state, + transport_state: d.transport_state, + device_info: MMIODeviceInfo::from(d.device_info), + }), + mmds_version: old.mmds_version, + entropy_device: old.entropy_device.map(|d| ConnectedEntropyState { + device_id: d.device_id, + device_state: d.device_state, + transport_state: d.transport_state, + device_info: MMIODeviceInfo::from(d.device_info), + }), + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Top-level MicrovmState (v1.12) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Serialize, Deserialize)] +pub struct MicrovmState { + /// Imported from v1_14; unchanged through all versions. 
+ pub vm_info: VmInfo, + /// Imported from v1_14; wraps `kvm_cap_modifiers`, extracted from v1.10's `VmState`. + pub kvm_state: KvmState, + /// Redefined in v1.12: `memory` moved in from top-level `MicrovmState.memory_state`, + /// `kvm_cap_modifiers` moved out to `KvmState`. Redefined again in v1.14: gains + /// `resource_allocator`; `GuestMemoryRegionState` gains `region_type` and `plugged`. + pub vm_state: VmState, + /// x86_64: redefined here (`xsave` type changed from `kvm_xsave` to `Xsave`); + /// imported into v1.14 (same type). + /// aarch64: canonical definition here (same as v1.10; gains `pvtime_ipa` in v1.14). + pub vcpu_states: Vec, + /// Redefined here: all `ConnectedXxxState` wrappers rebuilt because `MMIODeviceInfo` + /// changed (`irqs: Vec` → `irq: Option`). Inner device states (BlockState, + /// NetState, etc.) are defined in this module as the v1.10/v1.12 canonical source. + pub device_states: DeviceStates, + /// Defined in this module as the v1.10/v1.12 canonical source. Redefined in v1.14: + /// `vmgenid` becomes mandatory, x86_64 gains `vmclock`; moved inside + /// `DevicesState.acpi_state` (no longer top-level). + pub acpi_dev_state: ACPIDeviceManagerState, +} + +impl From for MicrovmState { + fn from(old: v1_10::MicrovmState) -> Self { + // In v1.10, kvm_cap_modifiers lives in VmState; in v1.12 it moves to KvmState. + // KvmCapability is the same type in all versions (imported from v1_14). + let kvm_cap_modifiers = old.vm_state.kvm_cap_modifiers; + + let memory = GuestMemoryState::from(old.memory_state); + + #[cfg(target_arch = "x86_64")] + let vm_state = VmState { + memory, + pitstate: old.vm_state.pitstate, + clock: old.vm_state.clock, + pic_master: old.vm_state.pic_master, + pic_slave: old.vm_state.pic_slave, + ioapic: old.vm_state.ioapic, + }; + + #[cfg(target_arch = "aarch64")] + let vm_state = VmState { + memory, + gic: old.vm_state.gic, + }; + + // x86_64: xsave type changed from kvm_xsave → Xsave, needs conversion. 
+ // aarch64: VcpuState is identical in v1.10 and v1.12 (v1_12 is canonical source). + #[cfg(target_arch = "x86_64")] + let vcpu_states: Vec = + old.vcpu_states.into_iter().map(VcpuState::from).collect(); + #[cfg(target_arch = "aarch64")] + let vcpu_states = old.vcpu_states; + + MicrovmState { + vm_info: old.vm_info, + kvm_state: KvmState { kvm_cap_modifiers }, + vm_state, + vcpu_states, + device_states: DeviceStates::from(old.device_states), + acpi_dev_state: old.acpi_dev_state, + } + } +} diff --git a/src/vmm/src/persist/v1_12/x86_64.rs b/src/vmm/src/persist/v1_12/x86_64.rs new file mode 100644 index 00000000000..912bb10b7ab --- /dev/null +++ b/src/vmm/src/persist/v1_12/x86_64.rs @@ -0,0 +1,46 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use kvm_bindings::{kvm_clock_data, kvm_irqchip, kvm_pit_state2}; +use serde::{Deserialize, Serialize}; + +use crate::{arch::VcpuState, persist::v1_14::x86_64::xsave_from_v1_10}; + +use super::{GuestMemoryState, v1_10}; + +// ─────────────────────────────────────────────────────────────────── +// Changed in v1.12: memory moved into VmState; kvm_cap_modifiers → KvmState +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VmState { + pub memory: GuestMemoryState, + pub pitstate: kvm_pit_state2, + pub clock: kvm_clock_data, + pub pic_master: kvm_irqchip, + pub pic_slave: kvm_irqchip, + pub ioapic: kvm_irqchip, +} + +// ─────────────────────────────────────────────────────────────────── +// Changed in v1.12: xsave type changed from kvm_xsave → Xsave +// VcpuState is defined in v1_14 (same in v1.12 and v1.14); conversion from v1.10 is here. 
+// ─────────────────────────────────────────────────────────────────── + +impl VcpuState { + pub(crate) fn from(old: v1_10::VcpuState) -> VcpuState { + VcpuState { + cpuid: old.cpuid, + saved_msrs: old.saved_msrs, + debug_regs: old.debug_regs, + lapic: old.lapic, + mp_state: old.mp_state, + regs: old.regs, + sregs: old.sregs, + vcpu_events: old.vcpu_events, + xcrs: old.xcrs, + xsave: xsave_from_v1_10(old.xsave), + tsc_khz: old.tsc_khz, + } + } +} diff --git a/src/vmm/src/persist/v1_14/aarch64.rs b/src/vmm/src/persist/v1_14/aarch64.rs new file mode 100644 index 00000000000..f203ea6fee6 --- /dev/null +++ b/src/vmm/src/persist/v1_14/aarch64.rs @@ -0,0 +1,201 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use kvm_bindings::{kvm_mp_state, kvm_vcpu_init}; +use serde::{Deserialize, Serialize}; + +use crate::convert::{ConvertError, irq_to_gsi}; +use crate::v1_12; + +use super::{ + ACPIDeviceManagerState, GuestMemoryState, MMIODeviceInfo, ResourceAllocator, VMGenIDState, +}; + +// ─────────────────────────────────────────────────────────────────── +// StaticCpuTemplate — canonical definition (identical in v1.10, v1.12, v1.14) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum StaticCpuTemplate { + V1N1, + #[default] + None, +} + +// ─────────────────────────────────────────────────────────────────── +// aarch64 legacy device types — canonical definitions +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum DeviceType { + Virtio(u32), + Serial, + Rtc, +} + +// ─────────────────────────────────────────────────────────────────── +// GIC helper types — canonical definitions (unchanged since v1.10) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] 
+#[serde(bound(serialize = "T: Serialize", deserialize = "T: for<'a> Deserialize<'a>"))] +pub struct GicRegState Deserialize<'a>> { + pub chunks: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VgicSysRegsState { + pub main_icc_regs: Vec>, + pub ap_icc_regs: Vec>>, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct GicVcpuState { + pub rdist: Vec>, + pub icc: VgicSysRegsState, +} + +// ─────────────────────────────────────────────────────────────────── +// aarch64 register vector — canonical definition (unchanged since v1.10) +// ─────────────────────────────────────────────────────────────────── + +/// aarch64 register vector with custom serde: serialized as (Vec, Vec) +#[derive(Debug, Clone)] +pub struct Aarch64RegisterVec { + pub ids: Vec, + pub data: Vec, +} + +impl Serialize for Aarch64RegisterVec { + fn serialize(&self, serializer: S) -> Result { + (&self.ids, &self.data).serialize(serializer) + } +} + +impl<'de> Deserialize<'de> for Aarch64RegisterVec { + fn deserialize>(deserializer: D) -> Result { + let (ids, data) = <(Vec, Vec)>::deserialize(deserializer)?; + Ok(Aarch64RegisterVec { ids, data }) + } +} + +// ─────────────────────────────────────────────────────────────────── +// aarch64 ConnectedLegacyState (uses updated MMIODeviceInfo with gsi) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConnectedLegacyState { + pub type_: DeviceType, + pub device_info: MMIODeviceInfo, +} + +impl From for ConnectedLegacyState { + fn from(s: v1_12::ConnectedLegacyState) -> Self { + ConnectedLegacyState { + type_: s.type_, + device_info: MMIODeviceInfo::from(s.device_info), + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// aarch64 GIC state (v1.14: adds its_state) +// GicRegState, VgicSysRegsState, GicVcpuState are defined above +// ─────────────────────────────────────────────────────────────────── + 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ItsRegisterState { + pub iidr: u64, + pub cbaser: u64, + pub creadr: u64, + pub cwriter: u64, + pub baser: [u64; 8], + pub ctlr: u64, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct GicState { + pub dist: Vec>, + pub gic_vcpu_states: Vec, + /// ITS state (GICv3 only). None for GICv2 or when converted from v1.12. + pub its_state: Option, +} + +impl GicState { + pub(crate) fn from(old_state: v1_12::GicState) -> GicState { + GicState { + dist: old_state.dist, + gic_vcpu_states: old_state.gic_vcpu_states, + its_state: None, // v1.12 had no ITS support + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// vCPU state (aarch64, v1.14: gains pvtime_ipa) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VcpuState { + pub mp_state: kvm_mp_state, + pub regs: Aarch64RegisterVec, + pub mpidr: u64, + pub kvi: kvm_vcpu_init, + pub pvtime_ipa: Option, +} + +impl VcpuState { + pub(crate) fn from(old_state: v1_12::VcpuState) -> VcpuState { + VcpuState { + mp_state: old_state.mp_state, + regs: old_state.regs, + mpidr: old_state.mpidr, + kvi: old_state.kvi, + pvtime_ipa: None, // new in v1.14; default to None (not configured) + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// ACPI device state impl (aarch64: no vmclock) +// ─────────────────────────────────────────────────────────────────── + +impl ACPIDeviceManagerState { + pub(crate) fn from( + s: v1_12::ACPIDeviceManagerState, + _resource_allocator: &mut ResourceAllocator, + ) -> Result { + let vmgenid = s.vmgenid.ok_or(ConvertError::MissingVmGenId)?; + Ok(ACPIDeviceManagerState { + vmgenid: VMGenIDState { + // v1.12 aarch64 uses IRQ_BASE=32-based numbers; v1.14 uses 0-based GSIs + gsi: irq_to_gsi(vmgenid.gsi), + addr: vmgenid.addr, + }, + }) + } +} + +// 
─────────────────────────────────────────────────────────────────── +// VM state (aarch64, v1.14: adds resource_allocator) +// ─────────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VmState { + pub memory: GuestMemoryState, + pub gic: GicState, + pub resource_allocator: ResourceAllocator, +} + +impl VmState { + pub(crate) fn from( + old_state: v1_12::VmState, + resource_allocator: ResourceAllocator, + ) -> VmState { + VmState { + memory: GuestMemoryState::from(old_state.memory), + gic: GicState::from(old_state.gic), + resource_allocator, + } + } +} diff --git a/src/vmm/src/persist/v1_14/mod.rs b/src/vmm/src/persist/v1_14/mod.rs new file mode 100644 index 00000000000..dcaddb6e8c4 --- /dev/null +++ b/src/vmm/src/persist/v1_14/mod.rs @@ -0,0 +1,593 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Serializable state types for Firecracker v1.14 (snapshot format version 8.0.0). +//! +//! This module is the **canonical source** for types shared across all snapshot versions. +//! Older modules (v1_12, v1_10) import unchanged types from here rather than defining +//! their own copies. +//! +//! Types that are unique to v1.14 or changed from v1.12: +//! - `VirtioDeviceState`: `interrupt_status` removed (moved to `MmioTransportState`) +//! - `MmioTransportState`: gains `interrupt_status` +//! - `MMIODeviceInfo`: `irq` → `gsi` +//! - `NetState`: `rx_buffers_state` retained +//! - `BalloonStatsState`: 6 new fields +//! - `BalloonState`: gains `hinting_state` +//! - aarch64 `GicState`: gains `its_state` +//! - aarch64 `VcpuState`: gains `pvtime_ipa` +//! - `GuestMemoryRegionState`: gains `region_type` and `plugged` +//! - `ACPIDeviceManagerState`: vmgenid now mandatory, adds vmclock (x86_64) +//! - New types: `ConnectedDeviceState`, `DevicesState`, `ResourceAllocator`, +//! 
`PmemState`, `VirtioMemState`, `MmdsState`, `GuestRegionType`, etc. + +use vm_allocator::{AddressAllocator, AllocPolicy, IdAllocator}; + +#[cfg(target_arch = "x86_64")] +pub(crate) mod x86_64; + +#[cfg(target_arch = "aarch64")] +pub(crate) mod aarch64; +#[cfg(target_arch = "aarch64")] +pub use aarch64::*; + +use crate::arch::{ + FIRST_ADDR_PAST_64BITS_MMIO, GSI_LEGACY_END, GSI_LEGACY_START, GSI_MSI_END, GSI_MSI_START, + MEM_32BIT_DEVICES_SIZE, MEM_32BIT_DEVICES_START, MEM_64BIT_DEVICES_SIZE, + MEM_64BIT_DEVICES_START, PAST_64BITS_MMIO_SIZE, SYSTEM_MEM_SIZE, SYSTEM_MEM_START, VmState, +}; +use crate::device_manager::DevicesState; +use crate::device_manager::mmio::MMIODeviceInfo; +use crate::device_manager::pci_mngr::PciDevicesState; +use crate::device_manager::persist::{ + ACPIDeviceManagerState, DeviceStates, MmdsState, VirtioDeviceState as ConnectedDeviceState, +}; +use crate::devices::acpi::vmgenid::VMGENID_MEM_SIZE; +use crate::devices::virtio::balloon::device::HintingState; +use crate::devices::virtio::balloon::persist::{BalloonState, BalloonStatsState}; +use crate::devices::virtio::block::persist::BlockState; +use crate::devices::virtio::block::vhost_user::persist::VhostUserBlockState; +use crate::devices::virtio::block::virtio::persist::VirtioBlockState; +use crate::devices::virtio::net::persist::NetState; +use crate::devices::virtio::persist::{MmioTransportState, VirtioDeviceState}; +use crate::devices::virtio::rng::persist::EntropyState; +use crate::devices::virtio::vsock::persist::{VsockFrontendState, VsockState}; +use crate::mmds::data_store::MmdsVersion; +use crate::persist::{MicrovmState, v1_12}; +use crate::vstate::memory::{GuestMemoryRegionState, GuestMemoryState, GuestRegionType}; +use crate::vstate::resources::ResourceAllocator; + +#[derive(Debug, thiserror::Error)] +pub enum ConvertError { + #[error("VMGenID state is missing; cannot convert snapshot (v1.12 snapshot must have VMGenID)")] + MissingVmGenId, + #[error("vm-allocator error during 
/// Offset subtracted from a v1.12 IRQ number to obtain the v1.14 GSI number.
///
/// aarch64: v1.12 stored IRQ_BASE (32)-based numbers while v1.14 GSIs are 0-based,
/// so the offset is 32.
// NOTE(review): 32 is taken from the conversion notes in this module; confirm it
// matches the arch layout's SPI start.
#[cfg(target_arch = "aarch64")]
pub const SPI_START: u32 = 32;

/// Offset subtracted from a v1.12 IRQ number to obtain the v1.14 GSI number.
///
/// x86_64: IRQ_BASE (5) == GSI_LEGACY_START (5), so the offset is a no-op.
#[cfg(not(target_arch = "aarch64"))]
pub const SPI_START: u32 = 0;

/// Convert a v1.12 IRQ number to a v1.14 GSI number.
///
/// x86_64: IRQ_BASE (5) == GSI_LEGACY_START (5) — no transformation needed.
/// aarch64: IRQ_BASE (32) != GSI_LEGACY_START (0) — subtract SPI_START (32).
///
/// The subtraction saturates, so an IRQ below `SPI_START` maps to GSI 0 rather than
/// wrapping around.
pub(crate) fn irq_to_gsi(irq: u32) -> u32 {
    irq.saturating_sub(SPI_START)
}
+impl MmioTransportState { + pub(crate) fn from(old_state: v1_12::MmioTransportState, interrupt_status: u32) -> Self { + MmioTransportState { + features_select: old_state.features_select, + acked_features_select: old_state.acked_features_select, + queue_select: old_state.queue_select, + device_status: old_state.device_status, + config_generation: old_state.config_generation, + interrupt_status, + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Changed in v1.14: irq → gsi +// ─────────────────────────────────────────────────────────────────── +impl MMIODeviceInfo { + /// Convert v1.12 MMIODeviceInfo → v1.14 MMIODeviceInfo. + /// irq (Option, same wire format as Option) → gsi: Option + pub(crate) fn from(old_state: v1_12::MMIODeviceInfo) -> MMIODeviceInfo { + MMIODeviceInfo { + addr: old_state.addr, + len: old_state.len, + gsi: old_state.irq.map(irq_to_gsi), + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Block device — redefined because VirtioDeviceState changed +// ─────────────────────────────────────────────────────────────────── +impl VirtioBlockState { + pub(crate) fn from(old_state: v1_12::VirtioBlockState) -> (VirtioBlockState, u32) { + let (virtio_state, interrupt_status) = VirtioDeviceState::from(old_state.virtio_state); + let new = VirtioBlockState { + id: old_state.id, + partuuid: old_state.partuuid, + cache_type: old_state.cache_type, + root_device: old_state.root_device, + disk_path: old_state.disk_path, + virtio_state, + rate_limiter_state: old_state.rate_limiter_state, + file_engine_type: old_state.file_engine_type, + }; + (new, interrupt_status) + } +} + +impl VhostUserBlockState { + pub(crate) fn from(old_state: v1_12::VhostUserBlockState) -> (VhostUserBlockState, u32) { + let (virtio_state, interrupt_status) = VirtioDeviceState::from(old_state.virtio_state); + let new = VhostUserBlockState { + id: old_state.id, + partuuid: old_state.partuuid, + cache_type: 
old_state.cache_type, + root_device: old_state.root_device, + socket_path: old_state.socket_path, + vu_acked_protocol_features: old_state.vu_acked_protocol_features, + config_space: old_state.config_space, + virtio_state, + }; + (new, interrupt_status) + } +} + +impl BlockState { + pub(crate) fn from(old_state: v1_12::BlockState) -> (BlockState, u32) { + match old_state { + v1_12::BlockState::Virtio(b) => { + let (new, irq) = VirtioBlockState::from(b); + (BlockState::Virtio(new), irq) + } + v1_12::BlockState::VhostUser(b) => { + let (new, irq) = VhostUserBlockState::from(b); + (BlockState::VhostUser(new), irq) + } + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// MMDS — MmdsVersionState renamed/restructured to MmdsState +// ─────────────────────────────────────────────────────────────────── +impl MmdsVersion { + pub(crate) fn from(old_state: v1_12::MmdsVersionState) -> MmdsVersion { + match old_state { + v1_12::MmdsVersionState::V1 => MmdsVersion::V1, + v1_12::MmdsVersionState::V2 => MmdsVersion::V2, + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Net device — changed: VirtioDeviceState changed; rx_buffers_state retained +// ─────────────────────────────────────────────────────────────────── +impl NetState { + pub(crate) fn from(old_state: v1_12::NetState) -> (NetState, u32) { + let (virtio_state, interrupt_status) = VirtioDeviceState::from(old_state.virtio_state); + let new = NetState { + id: old_state.id, + tap_if_name: old_state.tap_if_name, + rx_rate_limiter_state: old_state.rx_rate_limiter_state, + tx_rate_limiter_state: old_state.tx_rate_limiter_state, + mmds_ns: old_state.mmds_ns, + config_space: old_state.config_space, + virtio_state, + rx_buffers_state: old_state.rx_buffers_state, + }; + (new, interrupt_status) + } +} + +// ─────────────────────────────────────────────────────────────────── +// Vsock device — VsockFrontendState/VsockState redefined (VirtioDeviceState changed) 
+// VsockUdsState and VsockBackendState are unchanged and defined above +// ─────────────────────────────────────────────────────────────────── +impl VsockState { + pub(crate) fn from(old_state: v1_12::VsockState) -> (VsockState, u32) { + let (virtio_state, interrupt_status) = + VirtioDeviceState::from(old_state.frontend.virtio_state); + let new = VsockState { + backend: old_state.backend, + frontend: VsockFrontendState { + cid: old_state.frontend.cid, + virtio_state, + }, + }; + (new, interrupt_status) + } +} + +// ─────────────────────────────────────────────────────────────────── +// Balloon device — BalloonStatsState gains 6 new fields; BalloonState gains hinting_state +// ─────────────────────────────────────────────────────────────────── +impl BalloonStatsState { + pub(crate) fn from(old_state: v1_12::BalloonStatsState) -> BalloonStatsState { + BalloonStatsState { + swap_in: old_state.swap_in, + swap_out: old_state.swap_out, + major_faults: old_state.major_faults, + minor_faults: old_state.minor_faults, + free_memory: old_state.free_memory, + total_memory: old_state.total_memory, + available_memory: old_state.available_memory, + disk_caches: old_state.disk_caches, + hugetlb_allocations: old_state.hugetlb_allocations, + hugetlb_failures: old_state.hugetlb_failures, + oom_kill: None, + alloc_stall: None, + async_scan: None, + direct_scan: None, + async_reclaim: None, + direct_reclaim: None, + } + } +} + +impl BalloonState { + pub(crate) fn from(old_state: v1_12::BalloonState) -> (BalloonState, u32) { + let (virtio_state, interrupt_status) = VirtioDeviceState::from(old_state.virtio_state); + let new = BalloonState { + stats_polling_interval_s: old_state.stats_polling_interval_s, + stats_desc_index: old_state.stats_desc_index, + latest_stats: BalloonStatsState::from(old_state.latest_stats), + config_space: old_state.config_space, + hinting_state: HintingState { + host_cmd: 0, + last_cmd_id: 0, + guest_cmd: None, + // Default: acknowledge on finish (matches 
/// Generates `From<$old_type> for ConnectedDeviceState<$new_type>`.
///
/// The generated conversion turns the old device state into `$new_type` (which also yields the
/// interrupt status), feeds that interrupt status into the new transport state, and converts
/// the MMIO device info.
macro_rules! convert_connected_state {
    ($old_type:ty, $new_type:ty) => {
        impl From<$old_type> for ConnectedDeviceState<$new_type> {
            fn from(old_type: $old_type) -> Self {
                // The device conversion hands back the interrupt status that now belongs to
                // the transport state.
                let (device_state, interrupt_status) = <$new_type>::from(old_type.device_state);

                Self {
                    device_id: old_type.device_id,
                    device_state,
                    transport_state: MmioTransportState::from(
                        old_type.transport_state,
                        interrupt_status,
                    ),
                    device_info: MMIODeviceInfo::from(old_type.device_info),
                }
            }
        }
    };
}
.block_devices + .into_iter() + .map(ConnectedDeviceState::::from) + .collect(), + net_devices: old_state + .net_devices + .into_iter() + .map(ConnectedDeviceState::::from) + .collect(), + vsock_device: old_state + .vsock_device + .map(ConnectedDeviceState::::from), + balloon_device: old_state + .balloon_device + .map(ConnectedDeviceState::::from), + mmds: old_state.mmds_version.map(|v| MmdsState { + version: MmdsVersion::from(v), + imds_compat: false, + }), + entropy_device: old_state + .entropy_device + .map(ConnectedDeviceState::::from), + // pmem and memory devices are new in v1.14, not present in v1.12 + pmem_devices: Vec::new(), + memory_device: None, + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Memory state (v1.14: region_type and plugged added) +// ─────────────────────────────────────────────────────────────────── +impl From for GuestMemoryState { + fn from(old_state: v1_12::GuestMemoryState) -> Self { + GuestMemoryState { + regions: old_state + .regions + .into_iter() + .map(|r| GuestMemoryRegionState { + base_address: r.base_address, + size: r.size, + // v1.12 snapshots don't have memory hotplug, all regions are Dram + region_type: GuestRegionType::Dram, + // No slots were plugged/unplugged; Dram regions have a single slot + // of size == region size, so there's 1 plugged slot + plugged: vec![true], + }) + .collect(), + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// ResourceAllocator (new in v1.14) +// ─────────────────────────────────────────────────────────────────── +impl ResourceAllocator { + /// Reconstruct the v1.14 ResourceAllocator from v1.12 device information. + /// + /// In v1.12, the ResourceAllocator state wasn't persisted; in v1.14 it is. + /// We reconstruct it by marking all allocations that were made during VM setup. 
+ pub(crate) fn from( + device_states: &v1_12::DeviceStates, + acpi_state: &v1_12::ACPIDeviceManagerState, + ) -> Result { + // Initialize fresh allocators matching ResourceAllocator::new() + let mut gsi_legacy = + IdAllocator::new(GSI_LEGACY_START, GSI_LEGACY_END).map_err(ConvertError::Allocator)?; + let mut gsi_msi = + IdAllocator::new(GSI_MSI_START, GSI_MSI_END).map_err(ConvertError::Allocator)?; + let mut mmio32 = AddressAllocator::new(MEM_32BIT_DEVICES_START, MEM_32BIT_DEVICES_SIZE) + .map_err(ConvertError::Allocator)?; + + // 64-bit MMIO space + let mmio64_start = MEM_64BIT_DEVICES_START; + let mmio64_size = MEM_64BIT_DEVICES_SIZE; + let mmio64 = + AddressAllocator::new(mmio64_start, mmio64_size).map_err(ConvertError::Allocator)?; + + // Past 64-bit MMIO space + let past_mmio64_start = FIRST_ADDR_PAST_64BITS_MMIO; + let past_mmio64_size = PAST_64BITS_MMIO_SIZE; + let past_mmio64 = AddressAllocator::new(past_mmio64_start, past_mmio64_size) + .map_err(ConvertError::Allocator)?; + + // System memory allocator + let mut system_mem = AddressAllocator::new(SYSTEM_MEM_START, SYSTEM_MEM_SIZE) + .map_err(ConvertError::Allocator)?; + + // Collect all used GSIs and MMIO addresses from devices + let mut used_legacy_gsis: Vec = Vec::new(); + let mut used_msi_gsis: Vec = Vec::new(); + let mut used_mmio32_addrs: Vec<(u64, u64)> = Vec::new(); // (addr, len) + + // Helper to classify and record a device's MMIODeviceInfo. + // On aarch64, v1.12 stores IRQ numbers starting from IRQ_BASE=32 (physical SPI), + // while v1.14 uses 0-based GSI numbers. We convert with irq_to_gsi(). + // Also: only record MMIO addresses within the v1.14 mmio32_memory range + // [MEM_32BIT_DEVICES_START, ...). Addresses below that (serial, RTC, early virtio + // devices allocated from v1.12's single MMIO allocator) are not tracked by the + // v1.14 mmio32_memory allocator and must be skipped. 
+ let mut record_device_info = |info: &v1_12::MMIODeviceInfo| { + if let Some(irq) = info.irq { + let gsi = irq_to_gsi(irq); + if (GSI_LEGACY_START..=GSI_LEGACY_END).contains(&gsi) { + used_legacy_gsis.push(gsi); + } else if (GSI_MSI_START..=GSI_MSI_END).contains(&gsi) { + used_msi_gsis.push(gsi); + } + } + // Only record addresses within the v1.14 mmio32_memory range + if info.addr >= MEM_32BIT_DEVICES_START { + used_mmio32_addrs.push((info.addr, info.len)); + } + }; + + for dev in &device_states.block_devices { + record_device_info(&dev.device_info); + } + for dev in &device_states.net_devices { + record_device_info(&dev.device_info); + } + if let Some(dev) = &device_states.vsock_device { + record_device_info(&dev.device_info); + } + if let Some(dev) = &device_states.balloon_device { + record_device_info(&dev.device_info); + } + if let Some(dev) = &device_states.entropy_device { + record_device_info(&dev.device_info); + } + + #[cfg(target_arch = "aarch64")] + for dev in &device_states.legacy_devices { + record_device_info(&dev.device_info); + } + + // Also account for VMGenID's legacy GSI. + // v1.12 stores IRQ_BASE-based values; convert to v1.14 0-based GSI. + if let Some(vmgenid) = &acpi_state.vmgenid { + let gsi = irq_to_gsi(vmgenid.gsi); + if (GSI_LEGACY_START..=GSI_LEGACY_END).contains(&gsi) { + used_legacy_gsis.push(gsi); + } + } + + // Reconstruct legacy GSI allocator + // IdAllocator allocates sequentially. To reconstruct it, we allocate IDs up to + // max(used_ids) and free the ones we didn't use. 
+ if !used_legacy_gsis.is_empty() { + let max_gsi = *used_legacy_gsis.iter().max().unwrap(); + let used_set: std::collections::HashSet = + used_legacy_gsis.iter().cloned().collect(); + + // Allocate all IDs from start to max + let mut allocated = Vec::new(); + for id in GSI_LEGACY_START..=max_gsi { + let got = gsi_legacy.allocate_id().map_err(ConvertError::Allocator)?; + allocated.push(got); + assert_eq!(got, id, "IdAllocator must allocate sequentially"); + } + // Free the ones not in use + for id in GSI_LEGACY_START..=max_gsi { + if !used_set.contains(&id) { + gsi_legacy.free_id(id).map_err(ConvertError::Allocator)?; + } + } + } + + // Reconstruct MSI GSI allocator (similarly) + if !used_msi_gsis.is_empty() { + let max_gsi = *used_msi_gsis.iter().max().unwrap(); + let used_set: std::collections::HashSet = used_msi_gsis.iter().cloned().collect(); + + for id in GSI_MSI_START..=max_gsi { + let got = gsi_msi.allocate_id().map_err(ConvertError::Allocator)?; + assert_eq!(got, id); + } + for id in GSI_MSI_START..=max_gsi { + if !used_set.contains(&id) { + gsi_msi.free_id(id).map_err(ConvertError::Allocator)?; + } + } + } + + // Reconstruct 32-bit MMIO allocator + // Each MMIO device was allocated with FirstMatch policy, so they were assigned + // sequentially. We use ExactMatch to mark each address as used. + for (addr, len) in &used_mmio32_addrs { + mmio32 + .allocate(*len, 1, AllocPolicy::ExactMatch(*addr)) + .map_err(|_| ConvertError::DuplicateAddress(*addr))?; + } + + // Reconstruct system memory allocator. + // In v1.12, VMGenID was allocated with LastMatch (highest addr in system_memory). + // VmClock (x86_64 only, new in v1.14) will be allocated in ACPIDeviceManagerState::from + // using LastMatch, which will place it just below the VMGenID region. + // We mark the VMGenID address as used here so the VmClock allocation in + // ACPIDeviceManagerState::from gets the correct (lower) address. 
+ if let Some(vmgenid) = &acpi_state.vmgenid { + system_mem + .allocate(VMGENID_MEM_SIZE, 8, AllocPolicy::ExactMatch(vmgenid.addr)) + .map_err(|_| ConvertError::DuplicateAddress(vmgenid.addr))?; + } + + Ok(ResourceAllocator { + gsi_legacy_allocator: gsi_legacy, + gsi_msi_allocator: gsi_msi, + mmio32_memory: mmio32, + mmio64_memory: mmio64, + past_mmio64_memory: past_mmio64, + system_memory: system_mem, + }) + } +} + +// ─────────────────────────────────────────────────────────────────── +// Top-level MicrovmState (v1.14) +// ─────────────────────────────────────────────────────────────────── +impl TryFrom for MicrovmState { + type Error = ConvertError; + + fn try_from(old: v1_12::MicrovmState) -> Result { + // Reconstruct ResourceAllocator from device info + let mut resource_allocator = + ResourceAllocator::from(&old.device_states, &old.acpi_dev_state)?; + + // Convert ACPI state (also allocates VmClock from resource_allocator on x86_64) + let acpi_state = ACPIDeviceManagerState::from(old.acpi_dev_state, &mut resource_allocator)?; + + // Convert device states + let mmio_state = DeviceStates::from(old.device_states); + + let device_states = DevicesState { + mmio_state, + acpi_state, + pci_state: PciDevicesState::default(), + }; + + // Convert VM state (embeds the reconstructed resource allocator) + let vm_state = VmState::from(old.vm_state, resource_allocator); + + // x86_64: VcpuState is the same type in v1.12 and v1.14. + // aarch64: VcpuState gains pvtime_ipa field, needs conversion. 
+ #[cfg(target_arch = "x86_64")] + let vcpu_states = old.vcpu_states; + #[cfg(target_arch = "aarch64")] + let vcpu_states: Vec = + old.vcpu_states.into_iter().map(VcpuState::from).collect(); + + Ok(MicrovmState { + vm_info: old.vm_info, + kvm_state: old.kvm_state, + vm_state, + vcpu_states, + device_states, + }) + } +} diff --git a/src/vmm/src/persist/v1_14/x86_64.rs b/src/vmm/src/persist/v1_14/x86_64.rs new file mode 100644 index 00000000000..d772c78016e --- /dev/null +++ b/src/vmm/src/persist/v1_14/x86_64.rs @@ -0,0 +1,93 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use kvm_bindings::kvm_xsave; +use vm_allocator::AllocPolicy; + +use super::v1_12; +use crate::devices::acpi::generated::vmclock_abi::{ + VMCLOCK_COUNTER_INVALID, VMCLOCK_MAGIC, VMCLOCK_STATUS_UNKNOWN, vmclock_abi, +}; +use crate::{ + arch::VmState, + devices::acpi::vmclock::{VMCLOCK_SIZE, VmClockState}, + persist::v1_14::ConvertError, +}; + +use super::{ACPIDeviceManagerState, GuestMemoryState, ResourceAllocator}; + +pub use kvm_bindings::Xsave; + +// ─────────────────────────────────────────────────────────────────── +// ACPI device state impl (x86_64: allocates vmclock) +// ─────────────────────────────────────────────────────────────────── + +impl ACPIDeviceManagerState { + pub(crate) fn from( + s: v1_12::ACPIDeviceManagerState, + resource_allocator: &mut ResourceAllocator, + ) -> Result { + let vmgenid = s.vmgenid.ok_or(ConvertError::MissingVmGenId)?; + + // Allocate VmClock from system memory using LastMatch (same as VmClock::new()) + // VmClock must be allocated after VMGenID in the system memory allocator reconstruction. + let vmclock_addr = resource_allocator + .system_memory + .allocate( + VMCLOCK_SIZE as u64, + VMCLOCK_SIZE as u64, + AllocPolicy::LastMatch, + ) + .map_err(ConvertError::Allocator)? 
+ .start(); + + let vmclock = VmClockState { + guest_address: vmclock_addr, + inner: vmclock_abi { + magic: VMCLOCK_MAGIC, + size: VMCLOCK_SIZE, + version: 1, + clock_status: VMCLOCK_STATUS_UNKNOWN, + counter_id: VMCLOCK_COUNTER_INVALID, + ..Default::default() + }, + }; + + Ok(ACPIDeviceManagerState { vmgenid, vmclock }) + } +} + +// ─────────────────────────────────────────────────────────────────── +// VM state (x86_64, v1.14: adds resource_allocator) +// ─────────────────────────────────────────────────────────────────── +impl VmState { + pub(crate) fn from(s: v1_12::VmState, resource_allocator: ResourceAllocator) -> VmState { + VmState { + memory: GuestMemoryState::from(s.memory), + resource_allocator, + pitstate: s.pitstate, + clock: s.clock, + pic_master: s.pic_master, + pic_slave: s.pic_slave, + ioapic: s.ioapic, + } + } +} + +// ─────────────────────────────────────────────────────────────────── +// Helper used by v1_12::VcpuState::from(v1_10::VcpuState) +// ─────────────────────────────────────────────────────────────────── + +/// Convert a v1.10 `kvm_xsave` into a v1.12/v1.14 `Xsave` (= `FamStructWrapper`). +/// +/// v1.12 introduced `Xsave` to support Intel AMX extended save state (extra FAM entries). +/// A snapshot from v1.10 has no AMX state, so `len = 0` (zero FAM entries). +pub(crate) fn xsave_from_v1_10(old: kvm_xsave) -> Xsave { + let mut xsave = Xsave::new(0).expect("failed to allocate Xsave wrapper"); + // SAFETY: We only overwrite the `xsave` sub-field, not `len`, so the + // FamStructWrapper length invariant is preserved. 
+ unsafe { + xsave.as_mut_fam_struct().xsave = old; + } + xsave +} diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index fdd0862a9d4..18621d7a74d 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -28,8 +28,9 @@ use crate::vmm_config::balloon::{ use crate::vmm_config::boot_source::{BootSourceConfig, BootSourceConfigError}; use crate::vmm_config::drive::{BlockDeviceConfig, BlockDeviceUpdateConfig, DriveError}; use crate::vmm_config::entropy::{EntropyDeviceConfig, EntropyDeviceError}; -use crate::vmm_config::instance_info::InstanceInfo; +use crate::vmm_config::instance_info::{InstanceInfo, VmState}; use crate::vmm_config::machine_config::{MachineConfig, MachineConfigError, MachineConfigUpdate}; +use crate::vmm_config::meminfo::{MemoryDirty, MemoryMapingsResponse, MemoryResponse}; use crate::vmm_config::memory_hotplug::{ MemoryHotplugConfig, MemoryHotplugConfigError, MemoryHotplugSizeUpdate, }; @@ -146,6 +147,12 @@ pub enum VmmAction { /// Update the microVM configuration (memory & vcpu) using `VmUpdateConfig` as input. This /// action can only be called before the microVM has booted. UpdateMachineConfiguration(MachineConfigUpdate), + /// Get the guest memory mappings to host memory + GetMemoryMappings, + /// Get guest memory resident and empty pages information + GetMemory, + /// Get guest memory dirty pages information + GetMemoryDirty, } /// Wrapper for all errors associated with VMM actions. @@ -197,6 +204,8 @@ pub enum VmmActionError { OperationNotSupportedPostBoot, /// The requested operation is not supported before starting the microVM. OperationNotSupportedPreBoot, + /// The requested operation is not supported while the microVM is running. 
+ OperationNotSupportedWhileRunning, /// Start microvm error: {0} StartMicrovm(#[from] StartMicrovmError), /// Vsock config error: {0} @@ -228,6 +237,12 @@ pub enum VmmData { VirtioMemStatus(VirtioMemStatus), /// The status of the virtio-balloon hinting run HintingStatus(HintingStatus), + /// The guest memory mapping information. + MemoryMappings(MemoryMapingsResponse), + /// The guest memory resident and empty pages information + Memory(MemoryResponse), + /// The guest memory dirty pages information + MemoryDirty(MemoryDirty), } /// Trait used for deduplicating the MMDS request handling across the two ApiControllers. @@ -495,7 +510,10 @@ impl<'a> PrebootApiController<'a> { | UpdateNetworkInterface(_) | StartFreePageHinting(_) | GetFreePageHintingStatus - | StopFreePageHinting => Err(VmmActionError::OperationNotSupportedPreBoot), + | StopFreePageHinting + | GetMemoryMappings + | GetMemory + | GetMemoryDirty => Err(VmmActionError::OperationNotSupportedPreBoot), #[cfg(target_arch = "x86_64")] SendCtrlAltDel => Err(VmmActionError::OperationNotSupportedPreBoot), } @@ -771,6 +789,9 @@ impl RuntimeApiController { .update_memory_hotplug_size(cfg.requested_size_mib) .map(|_| VmmData::Empty) .map_err(VmmActionError::MemoryHotplugUpdate), + GetMemoryMappings => self.get_guest_memory_mappings(), + GetMemory => self.get_guest_memory_info(), + GetMemoryDirty => self.get_dirty_memory_info(), // Operations not allowed post-boot. 
ConfigureBootSource(_) | ConfigureLogger(_) @@ -937,6 +958,57 @@ impl RuntimeApiController { .map_err(NetworkInterfaceError::DeviceUpdate) .map_err(VmmActionError::NetworkConfig) } + + /// Get guest memory mappings + fn get_guest_memory_mappings(&self) -> Result { + let start_us = get_time_us(ClockType::Monotonic); + + let vmm = self.vmm.lock().expect("Poisoned lock"); + let page_size = self.vm_resources.machine_config.huge_pages.page_size(); + let mappings = vmm.guest_memory_mappings(page_size); + + let elapsed_time_us = get_time_us(ClockType::Monotonic) - start_us; + info!("'get memory mappings' VMM action took {elapsed_time_us} us."); + Ok(VmmData::MemoryMappings(MemoryMapingsResponse { mappings })) + } + + /// Get resident and empty pages information for guest memory + fn get_guest_memory_info(&self) -> Result { + let start_us = get_time_us(ClockType::Monotonic); + let vmm = self.vmm.lock().expect("Poisoned lock"); + + // Check if VM is paused + if vmm.instance_info.state != VmState::Paused { + return Err(VmmActionError::OperationNotSupportedWhileRunning); + } + + let page_size = self.vm_resources.machine_config.huge_pages.page_size(); + let (resident, empty) = vmm.guest_memory_info(page_size)?; + + let elapsed_time_us = get_time_us(ClockType::Monotonic) - start_us; + info!("'get memory info' VMM action took {elapsed_time_us} us."); + + Ok(VmmData::Memory(MemoryResponse { resident, empty })) + } + + /// Get dirty pages information for guest memory + fn get_dirty_memory_info(&self) -> Result { + let start_us = get_time_us(ClockType::Monotonic); + let vmm = self.vmm.lock().expect("Poisoned lock"); + + // Check if VM is paused + if vmm.instance_info.state != VmState::Paused { + return Err(VmmActionError::OperationNotSupportedWhileRunning); + } + + let page_size = self.vm_resources.machine_config.huge_pages.page_size(); + let bitmap = vmm.get_dirty_memory(page_size)?; + + let elapsed_time_us = get_time_us(ClockType::Monotonic) - start_us; + info!("'get dirty 
memory' VMM action took {elapsed_time_us} us."); + + Ok(VmmData::MemoryDirty(MemoryDirty { bitmap })) + } } #[cfg(test)] @@ -1243,7 +1315,7 @@ mod tests { CreateSnapshotParams { snapshot_type: SnapshotType::Full, snapshot_path: PathBuf::new(), - mem_file_path: PathBuf::new(), + mem_file_path: Some(PathBuf::new()), }, ))); #[cfg(target_arch = "x86_64")] diff --git a/src/vmm/src/snapshot/mod.rs b/src/vmm/src/snapshot/mod.rs index 76b5203298d..360b823712b 100644 --- a/src/vmm/src/snapshot/mod.rs +++ b/src/vmm/src/snapshot/mod.rs @@ -81,26 +81,21 @@ fn serialize(data: &S, write: &mut W) -> Result<(), Snap /// Firecracker snapshot header #[derive(Debug, Serialize, Deserialize)] -struct SnapshotHdr { +pub struct SnapshotHdr { /// magic value - magic: u64, + pub magic: u64, /// Snapshot data version - version: Version, + pub version: Version, } impl SnapshotHdr { - fn load(buf: &mut &[u8]) -> Result { + pub(crate) fn load(buf: &mut &[u8]) -> Result { let (hdr, bytes_read) = bincode::serde::decode_from_slice::(buf, BINCODE_CONFIG)?; if hdr.magic != SNAPSHOT_MAGIC_ID { return Err(SnapshotError::InvalidMagic(hdr.magic)); } - if hdr.version.major != SNAPSHOT_VERSION.major || hdr.version.minor > SNAPSHOT_VERSION.minor - { - return Err(SnapshotError::InvalidFormatVersion(hdr.version)); - } - *buf = &buf[bytes_read..]; Ok(hdr) diff --git a/src/vmm/src/utils/mod.rs b/src/vmm/src/utils/mod.rs index 1288abef0ba..6718100e2cf 100644 --- a/src/vmm/src/utils/mod.rs +++ b/src/vmm/src/utils/mod.rs @@ -9,6 +9,8 @@ pub mod net; pub mod signal; /// Module with state machine pub mod sm; +/// Module with pagemap utilities +pub mod pagemap; use std::fs::{File, OpenOptions}; use std::num::Wrapping; diff --git a/src/vmm/src/utils/pagemap.rs b/src/vmm/src/utils/pagemap.rs new file mode 100644 index 00000000000..fff9e1f5cb2 --- /dev/null +++ b/src/vmm/src/utils/pagemap.rs @@ -0,0 +1,115 @@ +//! Utilities for reading /proc/self/pagemap to track dirty pages. 
+ +#![allow(clippy::cast_possible_wrap)] + +use std::fs::File; +use std::os::unix::io::AsRawFd; + +use crate::arch::host_page_size; + +const PAGEMAP_ENTRY_SIZE: usize = 8; + +/// Errors related to pagemap operations +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum PagemapError { + /// Failed to open /proc/self/pagemap: {0} + OpenPagemap(#[source] std::io::Error), + /// Failed to read pagemap entry: {0} + ReadEntry(#[source] std::io::Error), + /// Failed to open /proc/self/clear_refs: {0} + OpenClearRefs(#[source] std::io::Error), + /// Failed to clear soft-dirty bits: {0} + ClearSoftDirty(#[source] std::io::Error), +} + +/// Represents a single entry in /proc/pid/pagemap. +/// +/// Each virtual page has an 8-byte entry with the following layout: +/// - Bits 0-54: Page frame number (PFN) if present +/// - Bit 55: Page is soft-dirty (written to since last clear) +/// - Bit 56: Page is exclusively mapped +/// - Bit 57: Page is write-protected via userfaultfd +/// - Bit 58: Unused +/// - Bit 59-60: Unused +/// - Bit 61: Page is file-page or shared-anon +/// - Bit 62: Page is swapped +/// - Bit 63: Page is present in RAM +#[derive(Debug, Clone, Copy)] +pub struct PagemapEntry { + raw: u64, +} + +impl PagemapEntry { + /// Create a PagemapEntry from bytes (native-endian) + pub fn from_bytes(bytes: [u8; 8]) -> Self { + Self { + raw: u64::from_ne_bytes(bytes), + } + } + + /// Check if page is write-protected via userfaultfd + pub fn is_write_protected(&self) -> bool { + (self.raw & (1u64 << 57)) != 0 + } + + /// Check if page is present in RAM (bit 63) + pub fn is_present(&self) -> bool { + (self.raw & (1u64 << 63)) != 0 + } +} + +/// Reader for /proc/self/pagemap +#[derive(Debug)] +pub struct PagemapReader { + pagemap_fd: File, +} + +impl PagemapReader { + /// Create a new PagemapReader + pub fn new(_page_size: usize) -> Result { + let pagemap_fd = File::open("/proc/self/pagemap").map_err(PagemapError::OpenPagemap)?; + + Ok(Self { pagemap_fd }) + } + +
/// Check if a single page is dirty (write-protected bit cleared). + /// + /// Checks the first host page (4K) of the guest page at the given address. + /// For huge pages, all host pages within the huge page typically have the same + /// dirty status, so sampling the first is sufficient. + /// + /// # Arguments + /// * `virt_addr` - Virtual address of the page to check + /// + /// # Returns + /// True if the page is present and write-protected bit is cleared (dirty). + pub fn is_page_dirty(&self, virt_addr: usize) -> Result { + // Pagemap always uses host (4K) page size + let host_page_size = host_page_size(); + + // Calculate offset for this virtual page (using host page size) + let host_vpn = virt_addr / host_page_size; + let offset = (host_vpn * PAGEMAP_ENTRY_SIZE) as i64; + + let mut entry_bytes = [0u8; 8]; + + // SAFETY: pread is safe as long as the fd is valid and the buffer is properly sized + let ret = unsafe { + libc::pread( + self.pagemap_fd.as_raw_fd(), + entry_bytes.as_mut_ptr().cast(), + PAGEMAP_ENTRY_SIZE, + offset, + ) + }; + + if ret != PAGEMAP_ENTRY_SIZE as isize { + return Err(PagemapError::ReadEntry(std::io::Error::last_os_error())); + } + + let entry = PagemapEntry::from_bytes(entry_bytes); + + // Page must be present and the write_protected bit cleared (indicating it was written to) + Ok(entry.is_present() && !entry.is_write_protected()) + } +} diff --git a/src/vmm/src/vmm_config/meminfo.rs b/src/vmm/src/vmm_config/meminfo.rs new file mode 100644 index 00000000000..693ece6b4d4 --- /dev/null +++ b/src/vmm/src/vmm_config/meminfo.rs @@ -0,0 +1,29 @@ +use serde::Serialize; + +use crate::persist::GuestRegionUffdMapping; + +/// Serializable struct that contains information about guest's memory mappings +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize)] +pub struct MemoryMapingsResponse { + /// Vector with mappings from guest physical to host virtual memory + pub mappings: Vec, +} + +/// Information about guest memory resident pages and
pages that are all-0s +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize)] +pub struct MemoryResponse { + /// Bitmap for resident pages. The bitmap is encoded as a vector of u64 values. + /// Each bit represents whether a page is present in the resident memory set + pub resident: Vec, + /// Bitmap for empty pages. The bitmap is encoded as a vector of u64 values. + /// Each bit represents whether a page is empty (all 0s). + pub empty: Vec, +} + +/// Information about dirty guest memory pages +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize)] +pub struct MemoryDirty { + /// Bitmap for dirty pages. The bitmap is encoded as a vector of u64 values. + /// Each bit represents whether a page has been written since the last snapshot. + pub bitmap: Vec, +} diff --git a/src/vmm/src/vmm_config/mod.rs b/src/vmm/src/vmm_config/mod.rs index 9a4c104ce3a..c593b3ec0dc 100644 --- a/src/vmm/src/vmm_config/mod.rs +++ b/src/vmm/src/vmm_config/mod.rs @@ -20,6 +20,8 @@ pub mod entropy; pub mod instance_info; /// Wrapper for configuring the memory and CPU of the microVM. pub mod machine_config; +/// Wrapper for getting memory-related information. +pub mod meminfo; /// Wrapper for configuring memory hotplug. pub mod memory_hotplug; /// Wrapper for configuring the metrics. diff --git a/src/vmm/src/vmm_config/snapshot.rs b/src/vmm/src/vmm_config/snapshot.rs index 13a87ba30c4..dcd1d5b2268 100644 --- a/src/vmm/src/vmm_config/snapshot.rs +++ b/src/vmm/src/vmm_config/snapshot.rs @@ -44,7 +44,7 @@ pub struct CreateSnapshotParams { /// Path to the file that will contain the microVM state. pub snapshot_path: PathBuf, /// Path to the file that will contain the guest memory. 
- pub mem_file_path: PathBuf, + pub mem_file_path: Option, } /// Allows for changing the mapping between tap devices and host devices diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 83e899eff1d..0cca1eb02d6 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -305,7 +305,7 @@ impl Vm { } /// Retrieves the KVM dirty bitmap for each of the guest's memory regions. - pub fn get_dirty_bitmap(&self) -> Result { + pub fn get_dirty_bitmap(&self, page_size: usize) -> Result { self.guest_memory() .iter() .flat_map(|region| region.plugged_slots()) @@ -318,6 +318,7 @@ impl Vm { None => mincore_bitmap( mem_slot.slice.ptr_guard_mut().as_ptr(), mem_slot.slice.len(), + page_size, )?, }; Ok((mem_slot.slot, bitmap)) @@ -335,6 +336,7 @@ impl Vm { &self, mem_file_path: &Path, snapshot_type: SnapshotType, + page_size: usize, ) -> Result<(), CreateSnapshotError> { use self::CreateSnapshotError::*; @@ -377,7 +379,7 @@ impl Vm { match snapshot_type { SnapshotType::Diff => { - let dirty_bitmap = self.get_dirty_bitmap()?; + let dirty_bitmap = self.get_dirty_bitmap(page_size)?; self.guest_memory().dump_dirty(&mut file, &dirty_bitmap)?; } SnapshotType::Full => { @@ -503,7 +505,11 @@ impl Vm { /// Use `mincore(2)` to overapproximate the dirty bitmap for the given memslot. To be used /// if a diff snapshot is requested, but dirty page tracking wasn't enabled. -fn mincore_bitmap(addr: *mut u8, len: usize) -> Result, VmError> { +pub(crate) fn mincore_bitmap( + addr: *mut u8, + len: usize, + page_size: usize, +) -> Result, VmError> { // TODO: Once Host 5.10 goes out of support, we can make this more robust and work on // swap-enabled systems, by doing mlock2(MLOCK_ONFAULT)/munlock() in this function (to // force swapped-out pages to get paged in, so that mincore will consider them incore). 
@@ -513,8 +519,11 @@ fn mincore_bitmap(addr: *mut u8, len: usize) -> Result, VmError> { // Mincore always works at PAGE_SIZE granularity, even if the VMA we are dealing with // is a hugetlbfs VMA (e.g. to report a single hugepage as "present", mincore will // give us 512 4k markers with the lowest bit set). - let page_size = host_page_size(); - let mut mincore_bitmap = vec![0u8; len / page_size]; + let host_page_size = host_page_size(); + let mut mincore_bitmap = vec![0u8; len / host_page_size]; + // The bitmap we return though tracks pages in terms of the actually used page size. In + // the case of a hugetlbfs VMA, we just need to check if the first of the reported pages + // is present. let mut bitmap = vec![0u64; (len / page_size).div_ceil(64)]; // SAFETY: The safety invariants of GuestRegionMmap ensure that region.as_ptr() is a valid @@ -529,7 +538,8 @@ fn mincore_bitmap(addr: *mut u8, len: usize) -> Result, VmError> { return Err(VmError::Mincore(vmm_sys_util::errno::Error::last())); } - for (page_idx, b) in mincore_bitmap.iter().enumerate() { + let step = page_size / host_page_size; + for (page_idx, b) in mincore_bitmap.iter().step_by(step).enumerate() { bitmap[page_idx / 64] |= (*b as u64 & 0x1) << (page_idx as u64 % 64); } diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 6a5e6a08a14..a7a4a8c1d73 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -235,7 +235,7 @@ fn verify_create_snapshot( let snapshot_params = CreateSnapshotParams { snapshot_type, snapshot_path: snapshot_file.as_path().to_path_buf(), - mem_file_path: memory_file.as_path().to_path_buf(), + mem_file_path: Some(memory_file.as_path().to_path_buf()), }; controller