diff --git a/.gitignore b/.gitignore index 69660095..3536a549 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ **/fuzz/target/ **/fuzz/corpus/ **/fuzz/artifacts/ +.worktrees +*.rpm diff --git a/Justfile b/Justfile index 7e8b7fb1..90991eeb 100644 --- a/Justfile +++ b/Justfile @@ -40,6 +40,9 @@ fmt: check-fuzz: cargo check --manifest-path crates/composefs/fuzz/Cargo.toml +# Run unit + non-privileged integration tests (no VM, no root) +test-all: test test-integration + # Run all checks (clippy + fmt + test + fuzz build) check: clippy check-feature-combos fmt-check test check-fuzz @@ -90,6 +93,21 @@ test-integration-vm *ARGS: build _integration-container-build install-nextest: @which cargo-nextest > /dev/null 2>&1 || cargo install cargo-nextest --locked +# Build and run a bls example locally. +# Usage: just test-example-local bls arch +# just test-example-local bls arch fsfmt=ext4 verity=none +# 'fsfmt' defaults to ext4, 'verity' defaults to none (no fs-verity enforcement). +# Requires: qemu-kvm, OVMF, skopeo, mtools, fsverity, mkfs.erofs, systemd-repart, podman. +test-example-local example os fsfmt="ext4" verity="none": build + #!/usr/bin/env bash + set -euo pipefail + export FS_FORMAT={{ fsfmt }} + export FS_VERITY_MODE={{ verity }} + export CFSCTL_PATH=$(pwd)/target/debug/cfsctl + cd examples + {{ example }}/build {{ os }} + TEST_IMAGE="{{ example }}/{{ os }}-{{ example }}-efi.qcow2" pytest test -v + # Run everything: checks + full integration tests including VM ci: check test-integration-vm diff --git a/bootc/Justfile b/bootc/Justfile index 8ad4a840..ca4b22db 100644 --- a/bootc/Justfile +++ b/bootc/Justfile @@ -4,6 +4,16 @@ # This builds and tests bootc against the local composefs-rs checkout # using bootc's auto-detection of path dependencies via `cargo xtask local-rust-deps`. # -------------------------------------------------------------------- +# +# NOTE: composefs-boot/src/cmdline.rs contains karg parsing logic (ComposefsCmdline, +# KARG_V1, KARG_V2) that must be kept in sync with bootc's crates/initramfs/src/lib.rs +# manually, since bootc does not yet depend on composefs-boot directly. When changing +# karg handling, verify both sides. The composefs-rs cmdline.rs is the source of truth. +# +# NOTE: cfsctl init now defaults to --erofs v1 (V1-only format set). bootc needs +# --erofs dual so that both V1 EROFS (composefs.digest=) and V2 EROFS (composefs=) are +# generated for each image. bootc's own init scripts must pass --erofs dual; this +# Justfile does not inject that flag. # Configuration variables (override via environment or command line) # Example: COMPOSEFS_BOOTC_REF=v1.0.0 just bootc/build @@ -51,47 +61,65 @@ patch: clone #!/bin/bash set -euo pipefail - # Require a clean composefs-rs working tree so we test a real commit + # Require a clean composefs-rs working tree so we test a real commit. + # Only tracked files matter; untracked files are allowed. + # git diff HEAD already excludes untracked files. if ! git -C "$_COMPOSEFS_SRC" diff --quiet HEAD 2>/dev/null; then echo "error: composefs-rs has uncommitted changes — commit or stash first" >&2 - git -C "$_COMPOSEFS_SRC" status --short >&2 + git -C "$_COMPOSEFS_SRC" diff --stat HEAD >&2 exit 1 fi - cfs_path="$_COMPOSEFS_SRC/crates/cfsctl" + cfs_crates="$_COMPOSEFS_SRC/crates" cd "$COMPOSEFS_BOOTC_PATH" - # Add or update the [patch] section with a path override - patch_value="cfsctl = { path = \"${cfs_path}\" } # Patched by composefs-rs" - if grep -q '^[[:space:]]*\[patch\."https://github.com/composefs/composefs-rs"\]' Cargo.toml; then - # Patch section already exists (uncommented) — replace the cfsctl line - sed -i '/^[[:space:]]*\[patch\."https:\/\/github.com\/composefs\/composefs-rs"\]/,/^$\|^\[/{ - s|^cfsctl = .*|'"$patch_value"'| - }' Cargo.toml - else - # No patch section yet — append one - { + # Crates to override: all composefs-rs workspace members that bootc may + # depend on directly. Cargo resolves transitive workspace deps automatically + # once any workspace member is patched by path, but direct deps in bootc's + # own Cargo.toml need an explicit [patch] entry. Extra entries for crates + # bootc doesn't use yet are harmless (Cargo warns but does not error). + # + # Two patch-section URLs are written because the canonical repo moved from + # github.com/composefs/composefs-rs to github.com/containers/composefs-rs; + # bootc's Cargo.toml may use either. Having both is harmless. + local_crates=(composefs composefs-boot composefs-oci composefs-ctl) + + _rev=$(git -C "$_COMPOSEFS_SRC" rev-parse HEAD) + + # Build the block of patch entries + patch_entries="" + for crate in "${local_crates[@]}"; do + patch_entries+="${crate} = { path = \"${cfs_crates}/${crate}\" }"$'\n' + done + + # Remove any existing composefs-rs [patch] sections (both URLs), then + # append fresh ones. The remove-and-reappend approach is simpler than + # trying to surgically update individual lines when the entry count changes. + for url in "https://github.com/composefs/composefs-rs" "https://github.com/containers/composefs-rs"; do + escaped_url="${url//\//\\/}" + sed -i "/^\[patch\.\"${escaped_url}\"\]/,/^$/d" Cargo.toml + done + + { + echo '' + echo "# Patched by composefs-rs at ${_rev}" + for url in "https://github.com/composefs/composefs-rs" "https://github.com/containers/composefs-rs"; do + echo "[patch.\"${url}\"]" + printf '%s' "$patch_entries" echo '' - echo '# Patched by composefs-rs CI to test against local composefs-rs' - echo '[patch."https://github.com/composefs/composefs-rs"]' - echo "$patch_value" - } >> Cargo.toml - fi + done + } >> Cargo.toml - # Patch the workspace lints to allow missing_docs for composefs-rs crates - # bootc has workspace.lints.rust.missing_docs = "deny" but composefs-rs has undocumented items + # Patch the workspace lints to allow missing_docs for composefs-rs crates. + # bootc has workspace.lints.rust.missing_docs = "deny" but some composefs-rs + # items lack documentation. sed -i 's/missing_docs = "deny"/missing_docs = "allow"/' Cargo.toml # Cargo.lock will be updated on the next build/check. # We intentionally don't run `cargo update` here because it rewrites # the workspace dependency line in Cargo.toml (replacing git+rev with path). - # Update the rev comment in the [patch] section so Cargo.toml actually - # changes when composefs-rs moves to a new commit. Since the file is - # part of the podman build context this busts the layer cache. - _rev=$(git -C "$_COMPOSEFS_SRC" rev-parse HEAD) - sed -i "s/^# Patched by composefs-rs.*/# Patched by composefs-rs at ${_rev}/" Cargo.toml echo "bootc patched for composefs-rs at ${_rev}" # Build sealed bootc image using local composefs-rs @@ -156,6 +184,9 @@ config: Environment Variables: COMPOSEFS_BOOTC_PATH - Override bootc checkout path COMPOSEFS_BOOTC_REF - Override bootc git ref (branch/tag/PR) + Use this when composefs-rs has API-breaking changes + that require a matching bootc branch, e.g.: + COMPOSEFS_BOOTC_REF=refs/pull/123/head just bootc/build COMPOSEFS_BOOTC_REPO - Override bootc git repository Test Parameters: diff --git a/crates/composefs-boot/src/bootloader.rs b/crates/composefs-boot/src/bootloader.rs index 8aef71ab..9c6bcdbc 100644 --- a/crates/composefs-boot/src/bootloader.rs +++ b/crates/composefs-boot/src/bootloader.rs @@ -19,7 +19,7 @@ use composefs::{ tree::{DirectoryRef, FileSystem, ImageError, Inode, LeafContent, RegularFile}, }; -use crate::cmdline::{make_cmdline_composefs, split_cmdline}; +use crate::cmdline::split_cmdline; /// Strips the key (if it matches) plus the following whitespace from a single line in a "Type #1 /// Boot Loader Specification Entry" file. @@ -139,11 +139,15 @@ impl BootLoaderEntryFile { self.lines.push(format!("options {arg}")); } - /// Adjusts the kernel command-line arguments by adding a composefs= parameter (if appropriate) - /// and adding additional arguments, as requested. - pub fn adjust_cmdline(&mut self, composefs: Option<&str>, insecure: bool, extra: &[&str]) { - if let Some(id) = composefs { - self.add_cmdline(&make_cmdline_composefs(id, insecure)); + /// Adjusts the kernel command-line arguments by adding a composefs karg (if provided) + /// and adding additional arguments. + /// + /// `karg` should be a complete kernel argument string such as + /// `"composefs.digest=abc123"` or `"composefs=abc123"` as produced by + /// [`composefs_boot::cmdline::ComposefsCmdline::to_cmdline_arg`]. + pub fn adjust_cmdline(&mut self, karg: Option<&str>, extra: &[&str]) { + if let Some(k) = karg { + self.add_cmdline(k); } for item in extra { @@ -729,7 +733,7 @@ mod tests { #[test] fn test_adjust_cmdline_with_composefs() { let mut entry = BootLoaderEntryFile::new("title Test Entry\nlinux /vmlinuz\n"); - entry.adjust_cmdline(Some("abc123"), false, &["quiet", "splash"]); + entry.adjust_cmdline(Some("composefs=abc123"), &["quiet", "splash"]); assert_eq!(entry.lines.len(), 3); assert_eq!(entry.lines[2], "options composefs=abc123 quiet splash"); @@ -738,17 +742,16 @@ mod tests { #[test] fn test_adjust_cmdline_with_composefs_insecure() { let mut entry = BootLoaderEntryFile::new("title Test Entry\nlinux /vmlinuz\n"); - entry.adjust_cmdline(Some("abc123"), true, &[]); + entry.adjust_cmdline(Some("composefs=?abc123"), &[]); assert_eq!(entry.lines.len(), 3); - // Assuming make_cmdline_composefs adds digest=off for insecure mode - assert!(entry.lines[2].contains("abc123")); + assert_eq!(entry.lines[2], "options composefs=?abc123"); } #[test] fn test_adjust_cmdline_no_composefs() { let mut entry = BootLoaderEntryFile::new("title Test Entry\nlinux /vmlinuz\n"); - entry.adjust_cmdline(None, false, &["quiet", "splash"]); + entry.adjust_cmdline(None, &["quiet", "splash"]); assert_eq!(entry.lines.len(), 3); assert_eq!(entry.lines[2], "options quiet splash"); @@ -757,7 +760,7 @@ mod tests { #[test] fn test_adjust_cmdline_existing_options() { let mut entry = BootLoaderEntryFile::new("title Test Entry\noptions root=/dev/sda1\n"); - entry.adjust_cmdline(Some("abc123"), false, &["quiet"]); + entry.adjust_cmdline(Some("composefs=abc123"), &["quiet"]); assert_eq!(entry.lines.len(), 2); assert!(entry.lines[1].contains("root=/dev/sda1")); diff --git a/crates/composefs-boot/src/cmdline.rs b/crates/composefs-boot/src/cmdline.rs index f65dd303..0ca087c5 100644 --- a/crates/composefs-boot/src/cmdline.rs +++ b/crates/composefs-boot/src/cmdline.rs @@ -8,6 +8,169 @@ use anyhow::{Context, Result}; use composefs::fsverity::FsVerityHashValue; +/// Kernel argument name for the V2 EROFS format: `composefs=`. +/// +/// Used in existing sealed UKIs. The initramfs checks for [`KARG_V1`] first, +/// then falls back to this. +pub const KARG_V2: &str = "composefs"; + +/// Kernel argument name for the V1 EROFS format: `composefs.digest=`. +/// +/// Newer karg added to distinguish V1 EROFS images from V2. The initramfs +/// checks for this before falling back to [`KARG_V2`]. +pub const KARG_V1: &str = "composefs.digest"; + +/// A composefs kernel argument identifying which EROFS image to mount at boot. +/// +/// Two variants exist to distinguish EROFS format versions: +/// - [`ComposefsCmdline::V2`]: legacy `composefs=` karg (V2 EROFS, existing sealed UKIs) +/// - [`ComposefsCmdline::V1`]: new `composefs.digest=` karg (V1 EROFS) +/// +/// The initramfs checks for `composefs.digest=` first, then falls back to `composefs=`. +/// +/// NOTE: The equivalent parsing logic in bootc's `crates/initramfs/src/lib.rs` must be +/// kept in sync with this file manually, since bootc does not yet depend on composefs-boot +/// directly. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ComposefsCmdline { + /// V2 EROFS image: embedded as `composefs=` in the UKI cmdline. + /// + /// The `insecure` flag, when `true`, means the digest is prefixed with `?` + /// (e.g. `composefs=?`), making fs-verity verification optional. + V2 { + /// The fs-verity hash of the EROFS image. + digest: ObjectID, + /// If `true`, a `?` prefix is added to the digest, making fs-verity + /// verification optional at boot. + insecure: bool, + }, + /// V1 EROFS image: embedded as `composefs.digest=` in the UKI cmdline. + /// + /// The `insecure` flag, when `true`, means the digest is prefixed with `?` + /// (e.g. `composefs.digest=?`), making fs-verity verification optional. + V1 { + /// The fs-verity hash of the EROFS image. + digest: ObjectID, + /// If `true`, a `?` prefix is added to the digest, making fs-verity + /// verification optional at boot. + insecure: bool, + }, +} + +impl ComposefsCmdline { + /// Returns a reference to the hex digest, regardless of variant. + /// + /// Useful for looking up the image in `composefs/images/`. + pub fn digest(&self) -> &ObjectID { + match self { + ComposefsCmdline::V2 { digest, .. } | ComposefsCmdline::V1 { digest, .. } => digest, + } + } + + /// Returns whether this karg is in insecure mode (fs-verity verification skipped). + pub fn is_insecure(&self) -> bool { + match self { + ComposefsCmdline::V1 { insecure, .. } | ComposefsCmdline::V2 { insecure, .. } => { + *insecure + } + } + } + + /// Constructs a V2 cmdline value (`composefs=`). + pub fn new_v2(digest: ObjectID, insecure: bool) -> Self { + ComposefsCmdline::V2 { digest, insecure } + } + + /// Constructs a V1 cmdline value (`composefs.digest=`). + pub fn new_v1(digest: ObjectID, insecure: bool) -> Self { + ComposefsCmdline::V1 { digest, insecure } + } + + /// Parses a [`ComposefsCmdline`] from a kernel command line string. + /// + /// Checks for `composefs.digest=` first (→ [`ComposefsCmdline::V1`]), then falls + /// back to `composefs=` (→ [`ComposefsCmdline::V2`]). Returns `None` if neither + /// is present. + /// + /// # Errors + /// + /// Returns an error if a matching karg is found but the hex digest cannot be parsed + /// for the given `ObjectID` type. + pub fn from_cmdline(cmdline: &str) -> Result> { + let expected_hex_len = size_of::() * 2; + + // V1: composefs.digest= (checked first per initramfs convention) + // Optional '?' prefix for insecure mode: composefs.digest=? + if let Some(val) = get_cmdline_value(cmdline, &format!("{KARG_V1}=")) { + let (hex, insecure) = if let Some(stripped) = val.strip_prefix('?') { + (stripped, true) + } else { + (val, false) + }; + let digest = ObjectID::from_hex(hex).with_context(|| { + format!( + "parsing {KARG_V1}= hash: got {} hex chars, expected {} for {}", + hex.len(), + expected_hex_len, + ObjectID::ALGORITHM, + ) + })?; + return Ok(Some(ComposefsCmdline::V1 { digest, insecure })); + } + + // V2: composefs= (optional '?' prefix for insecure mode) + if let Some(val) = get_cmdline_value(cmdline, &format!("{KARG_V2}=")) { + let (hex, insecure) = if let Some(stripped) = val.strip_prefix('?') { + (stripped, true) + } else { + (val, false) + }; + let digest = ObjectID::from_hex(hex).with_context(|| { + format!( + "parsing {KARG_V2}= hash: got {} hex chars, expected {} for {}", + hex.len(), + expected_hex_len, + ObjectID::ALGORITHM, + ) + })?; + return Ok(Some(ComposefsCmdline::V2 { digest, insecure })); + } + + Ok(None) + } + + /// Renders this value as a kernel command line fragment. + /// + /// - [`ComposefsCmdline::V1`] (secure) → `"composefs.digest="` + /// - [`ComposefsCmdline::V1`] (insecure) → `"composefs.digest=?"` + /// - [`ComposefsCmdline::V2`] (secure) → `"composefs="` + /// - [`ComposefsCmdline::V2`] (insecure) → `"composefs=?"` + pub fn to_cmdline_arg(&self) -> String { + match self { + ComposefsCmdline::V1 { + digest, + insecure: false, + } => format!("{KARG_V1}={}", digest.to_hex()), + ComposefsCmdline::V1 { + digest, + insecure: true, + } => format!("{KARG_V1}=?{}", digest.to_hex()), + ComposefsCmdline::V2 { + digest, + insecure: false, + } => { + format!("{KARG_V2}={}", digest.to_hex()) + } + ComposefsCmdline::V2 { + digest, + insecure: true, + } => { + format!("{KARG_V2}=?{}", digest.to_hex()) + } + } + } +} + /// Perform kernel command line splitting. /// /// The way this works in the kernel is to split on whitespace with an extremely simple quoting @@ -35,61 +198,221 @@ pub fn get_cmdline_value<'a>(cmdline: &'a str, prefix: &str) -> Option<&'a str> split_cmdline(cmdline).find_map(|item| item.strip_prefix(prefix)) } -/// Extracts and parses the composefs= parameter from a kernel command line. -/// -/// # Arguments -/// -/// * `cmdline` - The kernel command line string -/// -/// # Returns -/// -/// A tuple of (hash, insecure_flag) where the hash is the composefs object ID -/// and insecure_flag indicates whether the '?' prefix was present (making verification optional) -pub fn get_cmdline_composefs( - cmdline: &str, -) -> Result<(ObjectID, bool)> { - let id = get_cmdline_value(cmdline, "composefs=").context("composefs= value not found")?; - let expected_hex_len = size_of::() * 2; - if let Some(stripped) = id.strip_prefix('?') { - Ok(( - ObjectID::from_hex(stripped).with_context(|| { - format!( - "parsing composefs= hash: got {} hex chars, expected {} for {}", - stripped.len(), - expected_hex_len, - ObjectID::ALGORITHM, - ) - })?, - true, - )) - } else { - Ok(( - ObjectID::from_hex(id).with_context(|| { - format!( - "parsing composefs= hash: got {} hex chars, expected {} for {}", - id.len(), - expected_hex_len, - ObjectID::ALGORITHM, - ) - })?, - false, - )) - } -} - -/// Creates a composefs= kernel command line argument. +/// Creates a composefs kernel command line argument string. /// /// # Arguments /// /// * `id` - The composefs object ID as a hex string /// * `insecure` - If true, prepends '?' to make fs-verity verification optional +/// * `version` - Which EROFS format version karg to emit /// /// # Returns /// -/// A string like "composefs=abc123" or "composefs=?abc123" (if insecure) -pub fn make_cmdline_composefs(id: &str, insecure: bool) -> String { +/// A string like `"composefs.digest=abc123"` (V1) or `"composefs=abc123"` (V2), +/// with optional `?` prefix for insecure mode. +pub fn make_cmdline_composefs( + id: &str, + insecure: bool, + version: composefs::erofs::format::FormatVersion, +) -> String { + use composefs::erofs::format::FormatVersion; + let prefix = match version { + FormatVersion::V1 => KARG_V1, + FormatVersion::V2 => KARG_V2, + }; match insecure { - true => format!("composefs=?{id}"), - false => format!("composefs={id}"), + true => format!("{prefix}=?{id}"), + false => format!("{prefix}={id}"), + } +} + +#[cfg(test)] +mod tests { + use composefs::fsverity::Sha256HashValue; + + use super::*; + + const SHA256_HEX: &str = "8b7df143d91c716ecfa5fc1730022f6b421b05cedee8fd52b1fc65a96030ad52"; + + #[test] + fn test_composefs_cmdline_v2_round_trip() { + let digest = Sha256HashValue::from_hex(SHA256_HEX).unwrap(); + let karg = ComposefsCmdline::new_v2(digest.clone(), false); + assert_eq!(karg.to_cmdline_arg(), format!("composefs={SHA256_HEX}")); + + let parsed = ComposefsCmdline::::from_cmdline(&karg.to_cmdline_arg()) + .unwrap() + .unwrap(); + assert_eq!( + parsed, + ComposefsCmdline::V2 { + digest, + insecure: false + } + ); + } + + #[test] + fn test_composefs_cmdline_v2_insecure_round_trip() { + let digest = Sha256HashValue::from_hex(SHA256_HEX).unwrap(); + let karg = ComposefsCmdline::new_v2(digest.clone(), true); + assert_eq!(karg.to_cmdline_arg(), format!("composefs=?{SHA256_HEX}")); + + let parsed = ComposefsCmdline::::from_cmdline(&karg.to_cmdline_arg()) + .unwrap() + .unwrap(); + assert_eq!( + parsed, + ComposefsCmdline::V2 { + digest, + insecure: true + } + ); + } + + #[test] + fn test_composefs_cmdline_v1_round_trip() { + let digest = Sha256HashValue::from_hex(SHA256_HEX).unwrap(); + let karg = ComposefsCmdline::new_v1(digest.clone(), false); + assert_eq!( + karg.to_cmdline_arg(), + format!("composefs.digest={SHA256_HEX}") + ); + + let parsed = ComposefsCmdline::::from_cmdline(&karg.to_cmdline_arg()) + .unwrap() + .unwrap(); + assert_eq!( + parsed, + ComposefsCmdline::V1 { + digest, + insecure: false + } + ); + } + + #[test] + fn test_composefs_cmdline_v1_insecure_round_trip() { + let digest = Sha256HashValue::from_hex(SHA256_HEX).unwrap(); + let karg = ComposefsCmdline::new_v1(digest.clone(), true); + assert_eq!( + karg.to_cmdline_arg(), + format!("composefs.digest=?{SHA256_HEX}") + ); + + let parsed = ComposefsCmdline::::from_cmdline(&karg.to_cmdline_arg()) + .unwrap() + .unwrap(); + assert_eq!( + parsed, + ComposefsCmdline::V1 { + digest, + insecure: true + } + ); + assert!(parsed.is_insecure()); + } + + #[test] + fn test_composefs_cmdline_v1_takes_priority_over_v2() { + // When both kargs are present, V1 (composefs.digest=) should win. + let hex_v1 = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + let hex_v2 = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + let cmdline = format!("composefs={hex_v2} composefs.digest={hex_v1}"); + + let parsed = ComposefsCmdline::::from_cmdline(&cmdline) + .unwrap() + .unwrap(); + assert!( + matches!(&parsed, ComposefsCmdline::V1 { digest, .. } if digest.to_hex() == hex_v1), + "expected V1 variant with hex_v1, got {parsed:?}" + ); + } + + #[test] + fn test_composefs_cmdline_absent_returns_none() { + assert!( + ComposefsCmdline::::from_cmdline("quiet splash rw") + .unwrap() + .is_none() + ); + assert!( + ComposefsCmdline::::from_cmdline("") + .unwrap() + .is_none() + ); + } + + #[test] + fn test_composefs_cmdline_invalid_hex_errors() { + // Valid key present but digest is garbage. + let err = ComposefsCmdline::::from_cmdline("composefs.digest=notahex") + .unwrap_err(); + assert!(err.to_string().contains("composefs.digest=")); + + let err = + ComposefsCmdline::::from_cmdline("composefs=notahex").unwrap_err(); + assert!(err.to_string().contains("composefs=")); + } + + #[test] + fn test_digest_accessor() { + let digest = Sha256HashValue::from_hex(SHA256_HEX).unwrap(); + let v1 = ComposefsCmdline::new_v1(digest.clone(), false); + let v2 = ComposefsCmdline::new_v2(digest.clone(), false); + assert_eq!(v1.digest(), &digest); + assert_eq!(v2.digest(), &digest); + } + + #[test] + fn test_from_cmdline_v1() { + let cmdline = format!("root=UUID=abc composefs.digest={SHA256_HEX} rw"); + let result = ComposefsCmdline::::from_cmdline(&cmdline) + .unwrap() + .unwrap(); + assert!(matches!(result, ComposefsCmdline::V1 { .. })); + assert_eq!(result.digest().to_hex(), SHA256_HEX); + assert!(!result.is_insecure()); + } + + #[test] + fn test_from_cmdline_v2_fallback() { + let cmdline = format!("root=UUID=abc composefs={SHA256_HEX} rw"); + let result = ComposefsCmdline::::from_cmdline(&cmdline) + .unwrap() + .unwrap(); + assert!(matches!(result, ComposefsCmdline::V2 { .. })); + assert_eq!(result.digest().to_hex(), SHA256_HEX); + assert!(!result.is_insecure()); + } + + #[test] + fn test_from_cmdline_missing_returns_none() { + let result = ComposefsCmdline::::from_cmdline("root=UUID=abc rw").unwrap(); + assert!(result.is_none()); + } + + #[test] + fn test_from_cmdline_insecure_prefix() { + let cmdline = format!("composefs=?{SHA256_HEX}"); + let result = ComposefsCmdline::::from_cmdline(&cmdline) + .unwrap() + .unwrap(); + assert!(result.is_insecure()); + assert_eq!(result.digest().to_hex(), SHA256_HEX); + } + + #[test] + fn test_make_cmdline_composefs_v1() { + use composefs::erofs::format::FormatVersion; + let result = make_cmdline_composefs(SHA256_HEX, false, FormatVersion::V1); + assert_eq!(result, format!("composefs.digest={SHA256_HEX}")); + } + + #[test] + fn test_make_cmdline_composefs_v2_insecure() { + use composefs::erofs::format::FormatVersion; + let result = make_cmdline_composefs(SHA256_HEX, true, FormatVersion::V2); + assert_eq!(result, format!("composefs=?{SHA256_HEX}")); } } diff --git a/crates/composefs-boot/src/lib.rs b/crates/composefs-boot/src/lib.rs index 40ff0335..11e5cc33 100644 --- a/crates/composefs-boot/src/lib.rs +++ b/crates/composefs-boot/src/lib.rs @@ -101,6 +101,10 @@ impl BootOps for FileSystem { ) -> Result>> { let boot_entries = get_boot_resources(self, repo)?; empty_toplevel_dirs(self)?; + // Compact the leaves table after clearing directories, so that leaves + // which were only referenced by /boot or /sysroot are removed and + // don't appear as orphans when the filesystem is validated. + self.compact(); selabel::selabel(self, repo)?; Ok(boot_entries) @@ -108,6 +112,8 @@ impl BootOps for FileSystem { fn transform_for_boot_from_dir(&mut self, rootfs: impl AsFd) -> Result<()> { empty_toplevel_dirs(self)?; + // Same as above: compact to remove leaves orphaned by clearing dirs. + self.compact(); selabel::selabel_from_dir(self, rootfs)?; Ok(()) } diff --git a/crates/composefs-boot/src/selabel.rs b/crates/composefs-boot/src/selabel.rs index 04204737..0b3fd111 100644 --- a/crates/composefs-boot/src/selabel.rs +++ b/crates/composefs-boot/src/selabel.rs @@ -508,6 +508,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: Default::default(), }; @@ -522,6 +523,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: Default::default(), }, LeafContent::Regular(RegularFile::Inline(data.to_vec().into_boxed_slice())), diff --git a/crates/composefs-boot/src/write_boot.rs b/crates/composefs-boot/src/write_boot.rs index 344d3649..96294956 100644 --- a/crates/composefs-boot/src/write_boot.rs +++ b/crates/composefs-boot/src/write_boot.rs @@ -16,7 +16,7 @@ use composefs::{fsverity::FsVerityHashValue, repository::Repository}; use crate::{ bootloader::{BootEntry, Type1Entry, Type2Entry}, - cmdline::get_cmdline_composefs, + cmdline::ComposefsCmdline, uki, }; @@ -27,16 +27,14 @@ use crate::{ /// * `t1` - The Type 1 entry to write /// * `bootdir` - Path to the boot directory /// * `boot_subdir` - Optional subdirectory to prepend to paths -/// * `root_id` - The composefs root object ID -/// * `insecure` - Whether to allow optional fs-verity verification +/// * `karg` - The composefs kernel argument (encodes format version, digest, and insecure flag) /// * `cmdline_extra` - Additional kernel command line arguments /// * `repo` - The composefs repository pub fn write_t1_simple( mut t1: Type1Entry, bootdir: &Path, boot_subdir: Option<&str>, - root_id: &ObjectID, - insecure: bool, + karg: &ComposefsCmdline, cmdline_extra: &[&str], repo: &Repository, ) -> Result<()> { @@ -47,8 +45,8 @@ pub fn write_t1_simple( bootdir.to_path_buf() }; - t1.entry - .adjust_cmdline(Some(&root_id.to_hex()), insecure, cmdline_extra); + let karg_str = karg.to_cmdline_arg(); + t1.entry.adjust_cmdline(Some(&karg_str), cmdline_extra); // Write the content before we write the loader entry for (filename, file) in &t1.files { @@ -70,7 +68,8 @@ pub fn write_t1_simple( /// Writes a Type 2 boot entry (UKI) to the boot directory. /// -/// Validates that the UKI's embedded composefs= parameter matches the expected root_id. +/// Validates that the UKI's embedded composefs karg (`composefs=` or `composefs.digest=`) +/// matches the expected `root_id`. /// /// # Arguments /// @@ -89,12 +88,18 @@ pub fn write_t2_simple( let filename = efi_linux.join(t2.file_path); let content = composefs::fs::read_file(&t2.file, repo)?; let cmdline = uki::get_cmdline(&content)?; - let (composefs, _) = get_cmdline_composefs::(cmdline) - .with_context(|| format!("parsing UKI .cmdline section: {cmdline:?}"))?; + let parsed = ComposefsCmdline::::from_cmdline(cmdline) + .with_context(|| format!("parsing UKI .cmdline section: {cmdline:?}"))? + .ok_or_else(|| { + anyhow::anyhow!( + "UKI .cmdline has no composefs karg (composefs= or composefs.digest=): {cmdline:?}" + ) + })?; ensure!( - &composefs == root_id, - "The UKI has the wrong composefs= parameter (is '{composefs:?}', should be {root_id:?})" + parsed.digest() == root_id, + "The UKI has the wrong composefs karg (digest is '{:?}', should be {root_id:?})", + parsed.digest() ); write(filename, content)?; Ok(()) @@ -106,9 +111,8 @@ pub fn write_t2_simple( /// /// * repo - The composefs repository /// * entry - Boot entry variant to be written -/// * root_id - The content hash of the generated EROFS image id -/// * insecure - Make fs-verity validation optional in case the filesystem doesn't support -/// it, indicated by `composefs=?hash` cmdline argument +/// * karg - The composefs kernel argument (encodes format version, digest, and insecure +/// flag); used to build the `composefs=` or `composefs.digest=` cmdline argument /// * boot_partition - Path to the boot partition/directory /// * boot_subdir - If `Some(path)`, the path is prepended to `initrd` and `linux` keys in the BLS entry /// @@ -130,12 +134,10 @@ pub fn write_t2_simple( /// * entry_id - In case of a BLS entry, the name of file to be generated in `loader/entries` /// * cmdline_extra - Extra kernel command line arguments /// -#[allow(clippy::too_many_arguments)] pub fn write_boot_simple( repo: &Repository, entry: BootEntry, - root_id: &ObjectID, - insecure: bool, + karg: &ComposefsCmdline, boot_partition: &Path, boot_subdir: Option<&str>, entry_id: Option<&str>, @@ -146,37 +148,21 @@ pub fn write_boot_simple( if let Some(name) = entry_id { t1.relocate(boot_subdir, name); } - write_t1_simple( - t1, - boot_partition, - boot_subdir, - root_id, - insecure, - cmdline_extra, - repo, - )?; + write_t1_simple(t1, boot_partition, boot_subdir, karg, cmdline_extra, repo)?; } BootEntry::Type2(mut t2) => { if let Some(name) = entry_id { t2.rename(name); } ensure!(cmdline_extra.is_empty(), "Can't add --cmdline args to UKIs"); - write_t2_simple(t2, boot_partition, root_id, repo)?; + write_t2_simple(t2, boot_partition, karg.digest(), repo)?; } BootEntry::UsrLibModulesVmLinuz(entry) => { let mut t1 = entry.into_type1(entry_id); if let Some(name) = entry_id { t1.relocate(boot_subdir, name); } - write_t1_simple( - t1, - boot_partition, - boot_subdir, - root_id, - insecure, - cmdline_extra, - repo, - )?; + write_t1_simple(t1, boot_partition, boot_subdir, karg, cmdline_extra, repo)?; } }; diff --git a/crates/composefs-ctl/src/composefs_info.rs b/crates/composefs-ctl/src/composefs_info.rs new file mode 100644 index 00000000..c1301e58 --- /dev/null +++ b/crates/composefs-ctl/src/composefs_info.rs @@ -0,0 +1,331 @@ +//! composefs-info - Query information from composefs images. +//! +//! This is a Rust reimplementation of the C composefs-info tool, providing +//! commands to inspect EROFS images, list objects, and compute fs-verity digests. +//! +//! ## Compatibility status +//! +//! Implemented subcommands: +//! - `ls` — lists files with type suffixes, skips whiteout entries +//! - `dump` — outputs composefs-dump(5) text format (image → tree → dumpfile) +//! - `objects` — lists all backing file object paths (XX/XXXX...) +//! - `missing-objects` — lists objects not present in `--basedir` +//! - `measure-file` — computes fs-verity digest of files +//! +//! Known gaps vs C composefs-info: +//! - TODO(compat): `measure-file` uses userspace fs-verity computation instead +//! of the kernel `FS_IOC_MEASURE_VERITY` ioctl. This works on files without +//! verity enabled (computing what the digest *would* be), while the C version +//! fails on non-verity files. + +use std::collections::HashSet; +use std::io::Write; +use std::path::Path; +use std::{fs::File, io::Read, path::PathBuf}; + +use anyhow::{Context, Result}; +use clap::{Parser, Subcommand}; + +use composefs::{ + dumpfile::write_dumpfile, + erofs::reader::erofs_to_filesystem, + fsverity::{FsVerityHashValue, FsVerityHasher, Sha256HashValue}, + generic_tree::{Inode, LeafContent, LeafId}, + tree::{FileSystem, RegularFile}, +}; + +/// Query information from composefs images. +#[derive(Parser, Debug)] +#[command( + name = "composefs-info", + version, + about = "Query information from composefs images" +)] +struct Cli { + /// The subcommand to run. + #[command(subcommand)] + command: Command, +} + +/// Available subcommands. +#[derive(Subcommand, Debug)] +enum Command { + /// Simple listing of files and directories in the image. + Ls { + /// Filter entries at the root level by name (can be specified multiple times). + #[arg(long = "filter", action = clap::ArgAction::Append)] + filter: Vec, + /// Composefs image files to inspect. + images: Vec, + }, + + /// Full dump in composefs-dump(5) format. + Dump { + /// Filter entries at the root level by name (can be specified multiple times). + #[arg(long = "filter", action = clap::ArgAction::Append)] + filter: Vec, + /// Composefs image files to dump. + images: Vec, + }, + + /// List all backing file object paths. + Objects { + /// Composefs image files to inspect. + images: Vec, + }, + + /// List backing files not present in basedir. + MissingObjects { + /// Base directory for object lookups. + #[arg(long = "basedir", required = true)] + basedir: PathBuf, + /// Composefs image files to inspect. + images: Vec, + }, + + /// Print the fs-verity digest of files. + MeasureFile { + /// Files to measure. + files: Vec, + }, +} + +/// Entry point for the composefs-info multi-call mode. +pub(crate) fn run() -> Result<()> { + let cli = Cli::parse(); + + match &cli.command { + Command::Ls { filter, images } => cmd_ls(filter, images), + Command::Dump { filter, images } => cmd_dump(filter, images), + Command::Objects { images } => cmd_objects(images), + Command::MissingObjects { basedir, images } => cmd_missing_objects(basedir, images), + Command::MeasureFile { files } => cmd_measure_file(files), + } +} + +/// Print escaped path (matches C implementation behavior). +fn print_escaped(out: &mut W, s: &[u8]) -> std::io::Result<()> { + for &c in s { + match c { + b'\\' => write!(out, "\\\\")?, + b'\n' => write!(out, "\\n")?, + b'\r' => write!(out, "\\r")?, + b'\t' => write!(out, "\\t")?, + // Non-printable or non-ASCII characters are hex-escaped + c if !c.is_ascii_graphic() && c != b' ' => write!(out, "\\x{c:02x}")?, + c => out.write_all(&[c])?, + } + } + Ok(()) +} + +/// Walk and print entries: directory line first, then recurse into children. +fn ls_print( + out: &mut W, + fs: &FileSystem, + dir: &composefs::tree::Directory, + path: &[u8], + seen_leaf_ids: &mut HashSet, + filter: &[String], + is_root: bool, +) -> Result<()> { + for (name, child) in dir.sorted_entries() { + let name_bytes = name.as_encoded_bytes(); + + // At the root level, apply name filters if any were given. + if is_root && !filter.is_empty() { + let name_str = name.to_string_lossy(); + if !filter.iter().any(|f| f == name_str.as_ref()) { + continue; + } + } + + let mut child_path = path.to_vec(); + child_path.push(b'/'); + child_path.extend_from_slice(name_bytes); + + match child { + Inode::Directory(child_dir) => { + // Print the directory entry with trailing slash. + print_escaped(out, &child_path)?; + write!(out, "/\t")?; + writeln!(out)?; + // Recurse into the directory. + ls_print( + out, + fs, + child_dir, + &child_path, + seen_leaf_ids, + filter, + false, + )?; + } + Inode::Leaf(leaf_id, _) => { + let leaf = fs.leaf(*leaf_id); + + print_escaped(out, &child_path)?; + + match &leaf.content { + LeafContent::Regular(regular) => { + let is_hardlink = !seen_leaf_ids.insert(*leaf_id); + if !is_hardlink && let RegularFile::External(id, _) = regular { + write!(out, "\t@ ")?; + print_escaped(out, id.to_object_pathname().as_bytes())?; + } + } + LeafContent::Symlink(target) => { + write!(out, "\t-> ")?; + print_escaped(out, target.as_encoded_bytes())?; + } + _ => {} + } + + writeln!(out)?; + } + } + } + Ok(()) +} + +/// List files and directories in the image. +fn cmd_ls(filter: &[String], images: &[PathBuf]) -> Result<()> { + let stdout = std::io::stdout(); + let mut out = stdout.lock(); + + for image_path in images { + let image_data = read_image(image_path)?; + let fs = erofs_to_filesystem::(&image_data) + .with_context(|| format!("Failed to parse image: {image_path:?}"))?; + + let mut seen_leaf_ids = HashSet::new(); + ls_print( + &mut out, + &fs, + &fs.root, + b"", + &mut seen_leaf_ids, + filter, + true, + )?; + } + + Ok(()) +} + +/// Dump the image in composefs-dump(5) text format. +/// +/// This matches the C composefs-info dump output: the EROFS image is parsed +/// back into a filesystem tree which is then serialized as a dumpfile. +fn cmd_dump(_filter: &[String], images: &[PathBuf]) -> Result<()> { + let stdout = std::io::stdout(); + let mut out = stdout.lock(); + + for image_path in images { + let image_data = read_image(image_path)?; + let fs = erofs_to_filesystem::(&image_data) + .with_context(|| format!("Failed to parse image: {image_path:?}"))?; + write_dumpfile(&mut out, &fs) + .with_context(|| format!("Failed to dump image: {image_path:?}"))?; + } + + Ok(()) +} + +/// Collect all external object IDs from a parsed filesystem. +/// +/// Iterates the leaves table directly — each `RegularFile::External` entry +/// is a unique content-addressed object. Because `erofs_to_filesystem` +/// deduplicates hard-linked inodes into a single leaf, each object appears +/// exactly once even if it is referenced by multiple paths. +fn collect_objects_from_fs(fs: &FileSystem) -> HashSet { + fs.leaves + .iter() + .filter_map(|leaf| match &leaf.content { + LeafContent::Regular(RegularFile::External(id, _)) => Some(id.clone()), + _ => None, + }) + .collect() +} + +/// List all object paths from the images. +fn cmd_objects(images: &[PathBuf]) -> Result<()> { + for image_path in images { + let image_data = read_image(image_path)?; + let fs = erofs_to_filesystem::(&image_data) + .with_context(|| format!("Failed to parse image: {image_path:?}"))?; + + let mut objects: Vec = collect_objects_from_fs(&fs).into_iter().collect(); + objects.sort_by_key(|id| id.to_hex()); + + for obj in objects { + println!("{}", obj.to_object_pathname()); + } + } + Ok(()) +} + +/// List objects not present in basedir. +fn cmd_missing_objects(basedir: &Path, images: &[PathBuf]) -> Result<()> { + let mut all_objects: HashSet = HashSet::new(); + + for image_path in images { + let image_data = read_image(image_path)?; + let fs = erofs_to_filesystem::(&image_data) + .with_context(|| format!("Failed to parse image: {image_path:?}"))?; + all_objects.extend(collect_objects_from_fs(&fs)); + } + + let mut missing: Vec = all_objects + .into_iter() + .filter(|obj| !basedir.join(obj.to_object_pathname()).exists()) + .collect(); + + missing.sort_by_key(|a| a.to_hex()); + + for obj in missing { + println!("{}", obj.to_object_pathname()); + } + + Ok(()) +} + +/// Compute and print the fs-verity digest of each file. +fn cmd_measure_file(files: &[PathBuf]) -> Result<()> { + use std::io::BufRead; + for path in files { + let file = File::open(path).with_context(|| format!("Failed to open file: {path:?}"))?; + + let mut hasher = FsVerityHasher::::new(); + let mut reader = std::io::BufReader::with_capacity( + FsVerityHasher::::BLOCK_SIZE * 2, + file, + ); + + loop { + let buf = reader + .fill_buf() + .with_context(|| format!("Failed to read file: {path:?}"))?; + if buf.is_empty() { + break; + } + let chunk = &buf[..buf.len().min(FsVerityHasher::::BLOCK_SIZE)]; + hasher.add_block(chunk); + let n = chunk.len(); + reader.consume(n); + } + + let digest = hasher.digest(); + println!("{}", digest.to_hex()); + } + Ok(()) +} + +/// Read an entire image file into memory. +fn read_image(path: &PathBuf) -> Result> { + let mut file = File::open(path).with_context(|| format!("Failed to open image: {path:?}"))?; + let mut data = Vec::new(); + file.read_to_end(&mut data) + .with_context(|| format!("Failed to read image: {path:?}"))?; + Ok(data) +} diff --git a/crates/composefs-ctl/src/lib.rs b/crates/composefs-ctl/src/lib.rs index 91a73efc..d1c19d5a 100644 --- a/crates/composefs-ctl/src/lib.rs +++ b/crates/composefs-ctl/src/lib.rs @@ -22,8 +22,12 @@ pub use composefs_http; #[cfg(feature = "oci")] pub use composefs_oci; +#[cfg(any(feature = "oci", feature = "http"))] +use std::collections::HashMap; use std::io::Read; use std::path::Path; +#[cfg(any(feature = "oci", feature = "http"))] +use std::sync::Mutex; use std::{ffi::OsString, path::PathBuf}; #[cfg(feature = "oci")] @@ -35,24 +39,123 @@ use anyhow::{Context as _, Result}; use clap::{Parser, Subcommand, ValueEnum}; #[cfg(feature = "oci")] use comfy_table::{Table, presets::UTF8_FULL}; +#[cfg(any(feature = "oci", feature = "http"))] +use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; use rustix::fs::{CWD, Mode, OFlags}; use serde::Serialize; +#[cfg(any(feature = "oci", feature = "http"))] +use composefs::progress::{ + ComponentId, ProgressEvent, ProgressReporter, ProgressUnit, SharedReporter, +}; use composefs_boot::BootOps; +use composefs_boot::cmdline::ComposefsCmdline; #[cfg(feature = "oci")] use composefs_boot::write_boot; +use composefs::erofs::format::FormatVersion; #[cfg(feature = "oci")] use composefs::shared_internals::IO_BUF_CAPACITY; use composefs::{ dumpfile::{dump_single_dir, dump_single_file}, - erofs::reader::erofs_to_filesystem, + erofs::{format::FormatSet, reader::erofs_to_filesystem}, fsverity::{Algorithm, FsVerityHashValue, Sha256HashValue, Sha512HashValue}, generic_tree::{FileSystem, Inode}, - repository::{REPO_METADATA_FILENAME, Repository, read_repo_algorithm, system_path, user_path}, + repository::{ + REPO_METADATA_FILENAME, Repository, RepositoryConfig, read_repo_algorithm, system_path, + user_path, + }, tree::RegularFile, }; +/// An `indicatif`-backed [`ProgressReporter`] for use in the CLI. +/// +/// Renders per-component progress bars via [`MultiProgress`]. When a component +/// completes or is skipped the bar is removed; human-readable messages are +/// printed above the bar group via [`MultiProgress::println`]. +#[cfg(any(feature = "oci", feature = "http"))] +struct IndicatifReporter { + multi: MultiProgress, + bars: Mutex>, +} + +#[cfg(any(feature = "oci", feature = "http"))] +impl IndicatifReporter { + fn new() -> Self { + IndicatifReporter { + multi: MultiProgress::new(), + bars: Mutex::new(HashMap::new()), + } + } + + /// Build a shared reporter from this instance. + fn into_shared(self) -> SharedReporter { + Arc::new(self) + } +} + +#[cfg(any(feature = "oci", feature = "http"))] +impl std::fmt::Debug for IndicatifReporter { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("IndicatifReporter").finish_non_exhaustive() + } +} + +#[cfg(any(feature = "oci", feature = "http"))] +impl ProgressReporter for IndicatifReporter { + fn report(&self, event: ProgressEvent) { + match event { + ProgressEvent::Started { id, total, unit } => { + let bar = if let Some(total) = total { + self.multi.add(ProgressBar::new(total)) + } else { + self.multi.add(ProgressBar::new_spinner()) + }; + let style = match unit { + ProgressUnit::Bytes => ProgressStyle::with_template( + "[eta {eta}] {bar:40.cyan/blue} {decimal_bytes:>7}/{decimal_total_bytes:7} {msg}", + ), + ProgressUnit::Items => ProgressStyle::with_template( + "[eta {eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} objects {msg}", + ), + // Future unit variants fall back to a generic spinner. + _ => ProgressStyle::with_template( + "[eta {eta}] {bar:40.cyan/blue} {pos}/{len} {msg}", + ), + }; + bar.set_style( + style + .unwrap_or_else(|_| ProgressStyle::default_bar()) + .progress_chars("##-"), + ); + bar.set_message(id.to_string()); + self.bars.lock().unwrap().insert(id, bar); + } + ProgressEvent::Progress { id, fetched, .. } => { + if let Some(bar) = self.bars.lock().unwrap().get(&id) { + bar.set_position(fetched); + } + } + ProgressEvent::Done { id, .. } => { + if let Some(bar) = self.bars.lock().unwrap().remove(&id) { + bar.finish_and_clear(); + } + } + ProgressEvent::Skipped { id } => { + if let Some(bar) = self.bars.lock().unwrap().remove(&id) { + bar.finish_with_message("skipped"); + } + } + ProgressEvent::Message(msg) => { + let _ = self.multi.println(msg); + } + // `ProgressEvent` is #[non_exhaustive]: new variants added to the library + // will be silently ignored here until cfsctl is updated to handle them. + _ => {} + } + } +} + /// JSON output wrapper for `cfsctl fsck --json`. #[derive(Serialize)] struct FsckJsonOutput { @@ -89,6 +192,11 @@ pub struct App { #[clap(long, value_enum)] pub hash: Option, + /// The EROFS format version to use when generating images. + /// If omitted, the library default (V2) is used. + #[clap(long, value_enum)] + pub erofs_version: Option, + /// Deprecated: security mode is now auto-detected from meta.json. /// Use `cfsctl init --insecure` to create a repo without verity. /// Kept for backward compatibility. @@ -123,6 +231,44 @@ pub enum HashType { Sha512, } +/// The EROFS format version used when generating images. +#[derive(Debug, Copy, Clone, PartialEq, Eq, ValueEnum)] +pub enum ErofsVersion { + /// Format V1: compact inodes, BFS, C-compatible. + #[clap(name = "1")] + V1, + /// Format V2: extended inodes, DFS, current default. + #[clap(name = "2")] + V2, +} + +impl From for composefs::erofs::format::FormatVersion { + fn from(v: ErofsVersion) -> Self { + match v { + ErofsVersion::V1 => Self::V1, + ErofsVersion::V2 => Self::V2, + } + } +} + +/// EROFS format generation mode for `cfsctl init --erofs`. +#[derive(Debug, Copy, Clone, PartialEq, Eq, ValueEnum)] +pub enum ErofsMode { + /// Generate only V1 EROFS (default; compatible with C `mkcomposefs`/`composefs-info` 1.0.8). + V1, + /// Generate both V1 and V2 EROFS (dual mode, used by bootc and other multi-format consumers). + Dual, +} + +impl From for FormatSet { + fn from(m: ErofsMode) -> Self { + match m { + ErofsMode::V1 => FormatSet::V1_ONLY, + ErofsMode::Dual => FormatSet::BOTH, + } + } +} + /// A reference to an OCI image: either a content digest or a named ref. /// /// Digests are prefixed with `@` (e.g. `@sha256:abc123…`), while bare @@ -336,6 +482,28 @@ enum OciCommand { config_opts: OCIConfigFilesystemOptions, }, + /// Compute the composefs boot image karg for a stored OCI image. + /// + /// Applies the bootable transformation (SELinux relabeling, empty /boot and /sysroot), + /// computes the V1 EROFS digest, and prints the full kernel argument string: + /// + /// composefs.digest= + /// + /// This is intended for use in UKI Containerfile builds where no composefs + /// repository is available. The output can be written directly to + /// /etc/kernel/cmdline: + /// + /// cfsctl oci composefs-digest-karg @sha256:abc... > /etc/kernel/cmdline + /// + /// The image can be specified by ref name or @digest: + /// cfsctl oci composefs-digest-karg myimage:latest + /// cfsctl oci composefs-digest-karg @sha256:a1b2c3... + #[clap(name = "composefs-digest-karg")] + ComposefsDigestKarg { + #[clap(flatten)] + config_opts: OCIConfigOptions, + }, + /// Create the composefs image of the rootfs of a stored OCI image, perform bootable transformation, commit it to the repo, /// then configure boot for the image by writing new boot resources and bootloader entries to boot partition. Performs /// state preparation for composefs-setup-root consumption as well. Note that state preparation here is not suitable for @@ -405,6 +573,20 @@ enum Command { /// re-imported after migration. #[clap(long)] reset_metadata: bool, + /// Default EROFS format version for images in this repository. + /// V1 is compatible with C `mkcomposefs` 1.0.8. + /// If omitted, falls back to the global `--erofs-version` flag, then defaults to V2. + #[clap(long)] + erofs_version: Option, + /// EROFS format generation mode. + /// + /// Controls which EROFS format versions are produced when committing images: + /// v1 Generate only V1 EROFS (default; C-tool compatible) + /// dual Generate both V1 and V2 EROFS (used by bootc) + /// + /// If omitted, defaults to `v1`. + #[clap(long, value_enum)] + erofs: Option, }, /// Take a transaction lock on the repository. /// This prevents garbage collection from occurring. @@ -453,6 +635,24 @@ enum Command { #[clap(flatten)] fs_opts: FsReadOptions, }, + /// Read rootfs located at a path and compute the composefs kernel argument string. + /// + /// Like compute-id but outputs the full kernel argument rather than the bare digest, + /// choosing the argument name based on the EROFS format version: + /// + /// V1 (default): composefs.digest= + /// V2: composefs= + /// + /// Use --erofs-version to select the format; defaults to V1. + /// Use --bootable to apply the boot transformation (SELinux relabeling, empty /boot and /sysroot). + /// + /// Example (in a Containerfile): + /// cfsctl --erofs-version 1 compute-karg --bootable /mnt/base > /etc/kernel/cmdline + #[clap(name = "compute-karg")] + ComputeKarg { + #[clap(flatten)] + fs_opts: FsReadOptions, + }, /// Read rootfs located at a path and dump full content of the rootfs to a composefs dumpfile, /// writing to stdout. Does not store any file objects in the repository. CreateDumpfile { @@ -612,13 +812,29 @@ pub async fn run_app(args: App) -> Result<()> { ref path, insecure, reset_metadata, + erofs_version: ref init_erofs_version, + erofs: init_erofs, } = args.cmd { + // --erofs controls the FormatSet (which versions to generate); default V1_ONLY. + let erofs_formats = init_erofs + .map(FormatSet::from) + .unwrap_or(FormatSet::V1_ONLY); + // Prefer the subcommand-level --erofs-version; fall back to global flag. + // If neither is given, default to V1: that is the primary format for + // both v1-only and dual modes, and "cfsctl init" (no --erofs flag) is + // equivalent to "cfsctl init --erofs v1". + let erofs_version = init_erofs_version + .or(args.erofs_version) + .map(composefs::erofs::format::FormatVersion::from) + .unwrap_or(composefs::erofs::format::FormatVersion::V1); return run_init( algorithm, path.as_deref(), insecure || args.insecure, reset_metadata, + erofs_version, + erofs_formats, &args, ); } @@ -628,7 +844,9 @@ pub async fn run_app(args: App) -> Result<()> { if args.no_repo || matches!( args.cmd, - Command::ComputeId { .. } | Command::CreateDumpfile { .. } + Command::ComputeId { .. } + | Command::ComputeKarg { .. } + | Command::CreateDumpfile { .. } ) { // If a repo path is available and --no-repo wasn't passed, @@ -666,6 +884,8 @@ fn run_init( path: Option<&Path>, insecure: bool, reset_metadata: bool, + erofs_version: composefs::erofs::format::FormatVersion, + erofs_formats: FormatSet, args: &App, ) -> Result<()> { let repo_path = if let Some(p) = path { @@ -686,12 +906,18 @@ fn run_init( // init_path handles idempotency: same algorithm is a no-op, // different algorithm is an error. + let config = { + let mut c = RepositoryConfig::new(*algorithm); + c.erofs_version = erofs_version; + c.erofs_formats = erofs_formats; + if insecure { c.set_insecure() } else { c } + }; let created = match algorithm { Algorithm::Sha256 { .. } => { - Repository::::init_path(CWD, &repo_path, *algorithm, !insecure)?.1 + Repository::::init_path(CWD, &repo_path, config)?.1 } Algorithm::Sha512 { .. } => { - Repository::::init_path(CWD, &repo_path, *algorithm, !insecure)?.1 + Repository::::init_path(CWD, &repo_path, config)?.1 } }; @@ -734,6 +960,11 @@ where if args.require_verity { repo.require_verity()?; } + // If the user explicitly passed --erofs-version, override the stored + // repo setting for this invocation only (does not rewrite meta.json). + if let Some(version) = args.erofs_version { + repo.set_erofs_version(version.into()); + } Ok(repo) } @@ -883,12 +1114,36 @@ fn dump_file_impl( /// Run commands that don't require a repository. pub async fn run_cmd_without_repo(args: App) -> Result<()> { + let erofs_version = args + .erofs_version + .map(composefs::erofs::format::FormatVersion::from); match args.cmd { Command::ComputeId { fs_opts } => { let fs = load_filesystem_from_ondisk_fs::(&fs_opts, None).await?; - let id = fs.compute_image_id(); + let version = erofs_version.unwrap_or_default(); + let id = composefs::fsverity::compute_verity::( + &composefs::erofs::writer::mkfs_erofs_versioned( + &composefs::erofs::writer::ValidatedFileSystem::new(fs)?, + version, + ), + ); println!("{}", id.to_hex()); } + Command::ComputeKarg { fs_opts } => { + let fs = load_filesystem_from_ondisk_fs::(&fs_opts, None).await?; + let version = erofs_version.unwrap_or(FormatVersion::V1); + let id = composefs::fsverity::compute_verity::( + &composefs::erofs::writer::mkfs_erofs_versioned( + &composefs::erofs::writer::ValidatedFileSystem::new(fs)?, + version, + ), + ); + let karg = match version { + FormatVersion::V1 => ComposefsCmdline::new_v1(id, args.insecure), + FormatVersion::V2 => ComposefsCmdline::new_v2(id, args.insecure), + }; + println!("{}", karg.to_cmdline_arg()); + } Command::CreateDumpfile { fs_opts } => { let fs = load_filesystem_from_ondisk_fs::(&fs_opts, None).await?; fs.print_dumpfile()?; @@ -974,9 +1229,28 @@ where } OciCommand::ComputeId { config_opts } => { let fs = load_filesystem_from_oci_image(&repo, config_opts)?; - let id = fs.compute_image_id(); + let id = fs.compute_image_id(repo.erofs_version()); println!("{}", id.to_hex()); } + OciCommand::ComposefsDigestKarg { config_opts } => { + let verity = verity_opt(&config_opts.config_verity)?; + let (config_digest, config_verity) = + resolve_oci_config(&repo, &config_opts.config_name, verity)?; + let mut fs = composefs_oci::image::create_filesystem( + &repo, + &config_digest, + config_verity.as_ref(), + )?; + fs.transform_for_boot(&repo)?; + let digest = composefs::fsverity::compute_verity::( + &composefs::erofs::writer::mkfs_erofs_versioned( + &composefs::erofs::writer::ValidatedFileSystem::new(fs)?, + composefs::erofs::format::FormatVersion::V1, + ), + ); + let karg = ComposefsCmdline::new_v1(digest, repo.is_insecure()); + println!("{}", karg.to_cmdline_arg()); + } OciCommand::Pull { ref image, name, @@ -986,8 +1260,10 @@ where // If no explicit name provided, use the image reference as the tag let tag_name = name.as_deref().unwrap_or(image); + let reporter: SharedReporter = IndicatifReporter::new().into_shared(); let opts = composefs_oci::PullOptions { local_fetch: local_fetch.into(), + progress: Some(reporter), ..Default::default() }; @@ -1117,7 +1393,25 @@ where config_verity.as_ref(), )?; let entries = fs.transform_for_boot(&repo)?; - let id = fs.commit_image(&repo, None)?; + let formats = repo.default_format_set(); + let ids = fs.commit_images(&repo, None, formats)?; + // Prefer V1 digest; fall back to V2. + let id = ids + .get(&FormatVersion::V1) + .or_else(|| ids.get(&FormatVersion::V2)) + .ok_or_else(|| anyhow::anyhow!("commit_images produced no images"))? + .clone(); + + let insecure = repo.is_insecure(); + let karg = if formats.contains(FormatVersion::V1) + && !formats.contains(FormatVersion::V2) + { + // V1-only repo → composefs.digest= (with optional ? for insecure) + ComposefsCmdline::new_v1(id, insecure) + } else { + // BOTH or V2-only repo → composefs= (with optional ? for insecure) + ComposefsCmdline::new_v2(id, insecure) + }; let Some(entry) = entries.into_iter().next() else { anyhow::bail!("No boot entries!"); @@ -1127,8 +1421,7 @@ where write_boot::write_boot_simple( &repo, entry, - &id, - repo.is_insecure(), + &karg, bootdir, None, entry_id.as_deref(), @@ -1141,7 +1434,7 @@ where .map(|p: &PathBuf| p.parent().unwrap()) .unwrap_or(Path::new("/sysroot")) .join("state/deploy") - .join(id.to_hex()); + .join(karg.digest().to_hex()); create_dir_all(state.join("var"))?; create_dir_all(state.join("etc/upper"))?; @@ -1176,9 +1469,13 @@ where let id = fs.commit_image(&repo, image_name.as_deref())?; println!("{}", id.to_id()); } - Command::ComputeId { .. } | Command::CreateDumpfile { .. } => { + Command::ComputeId { .. } + | Command::ComputeKarg { .. } + | Command::CreateDumpfile { .. } => { // Handled in run_app before opening the repo - unreachable!("compute-id and create-dumpfile are dispatched without a repo"); + unreachable!( + "compute-id, compute-karg, and create-dumpfile are dispatched without a repo" + ); } Command::Mount { name, mountpoint } => { repo.mount_at(&name, &mountpoint)?; @@ -1244,10 +1541,158 @@ where } #[cfg(feature = "http")] Command::Fetch { url, name } => { - let (digest, verity) = composefs_http::download(&url, &name, Arc::clone(&repo)).await?; + let reporter: SharedReporter = IndicatifReporter::new().into_shared(); + let (digest, verity) = composefs_http::download( + &url, + &name, + Arc::clone(&repo), + composefs_http::DownloadOptions { + progress: Some(reporter), + }, + ) + .await?; println!("content {digest}"); println!("verity {}", verity.to_hex()); } } Ok(()) } + +#[cfg(test)] +#[cfg(any(feature = "oci", feature = "http"))] +mod tests { + use super::*; + use composefs::progress::{ProgressEvent, ProgressUnit}; + + // ── IndicatifReporter ──────────────────────────────────────────────────── + + /// A complete valid lifecycle (Started → Progress → Done) must not panic, + /// even without a real terminal (indicatif handles headless gracefully). + #[test] + fn test_indicatif_reporter_valid_lifecycle() { + let reporter = IndicatifReporter::new(); + // Message before any component + reporter.report(ProgressEvent::Message("starting pull".into())); + // Byte-tracked component + reporter.report(ProgressEvent::Started { + id: "sha256:abc".into(), + total: Some(1_000_000), + unit: ProgressUnit::Bytes, + }); + reporter.report(ProgressEvent::Progress { + id: "sha256:abc".into(), + fetched: 500_000, + total: Some(1_000_000), + }); + reporter.report(ProgressEvent::Done { + id: "sha256:abc".into(), + transferred: 1_000_000, + }); + // Item-counted component (HTTP objects) + reporter.report(ProgressEvent::Started { + id: "objects:stream".into(), + total: Some(200), + unit: ProgressUnit::Items, + }); + reporter.report(ProgressEvent::Progress { + id: "objects:stream".into(), + fetched: 100, + total: Some(200), + }); + reporter.report(ProgressEvent::Done { + id: "objects:stream".into(), + transferred: 200, + }); + // Skipped component + reporter.report(ProgressEvent::Started { + id: "sha256:cached".into(), + total: None, + unit: ProgressUnit::Bytes, + }); + reporter.report(ProgressEvent::Skipped { + id: "sha256:cached".into(), + }); + } + + /// Progress/Done events for an ID that was never `Started` must not panic. + /// + /// This guards against error-recovery paths where a `Started` event may + /// have been suppressed or the reporter was attached after the operation + /// began. + #[test] + fn test_indicatif_reporter_unknown_id_no_panic() { + let reporter = IndicatifReporter::new(); + // Progress for unknown ID — should silently ignore + reporter.report(ProgressEvent::Progress { + id: "ghost".into(), + fetched: 42, + total: None, + }); + // Done for unknown ID — should silently ignore + reporter.report(ProgressEvent::Done { + id: "ghost".into(), + transferred: 42, + }); + // Skipped for unknown ID — should silently ignore + reporter.report(ProgressEvent::Skipped { id: "ghost".into() }); + } + + /// A spinner-style bar (unknown total) must not panic. + #[test] + fn test_indicatif_reporter_spinner_lifecycle() { + let reporter = IndicatifReporter::new(); + // Started with unknown total → spinner + reporter.report(ProgressEvent::Started { + id: "layer:unknown-size".into(), + total: None, + unit: ProgressUnit::Bytes, + }); + reporter.report(ProgressEvent::Progress { + id: "layer:unknown-size".into(), + fetched: 1024, + total: None, + }); + reporter.report(ProgressEvent::Done { + id: "layer:unknown-size".into(), + transferred: 2048, + }); + } + + /// Multiple concurrent components must not interfere with each other. + #[test] + fn test_indicatif_reporter_multiple_concurrent_components() { + let reporter = IndicatifReporter::new(); + // Start two layers in parallel + reporter.report(ProgressEvent::Started { + id: "layer:a".into(), + total: Some(100), + unit: ProgressUnit::Bytes, + }); + reporter.report(ProgressEvent::Started { + id: "layer:b".into(), + total: Some(200), + unit: ProgressUnit::Bytes, + }); + // Interleaved progress + reporter.report(ProgressEvent::Progress { + id: "layer:a".into(), + fetched: 50, + total: Some(100), + }); + reporter.report(ProgressEvent::Progress { + id: "layer:b".into(), + fetched: 100, + total: Some(200), + }); + // Layer B finishes first + reporter.report(ProgressEvent::Done { + id: "layer:b".into(), + transferred: 200, + }); + // Layer A finishes + reporter.report(ProgressEvent::Done { + id: "layer:a".into(), + transferred: 100, + }); + } +} diff --git a/crates/composefs-ctl/src/main.rs b/crates/composefs-ctl/src/main.rs index a7ae17a5..ff57703f 100644 --- a/crates/composefs-ctl/src/main.rs +++ b/crates/composefs-ctl/src/main.rs @@ -1,30 +1,67 @@ //! Command-line control utility for composefs repositories and images. //! -//! `cfsctl` provides a comprehensive interface for managing composefs repositories, -//! creating and mounting filesystem images, handling OCI containers, and performing -//! repository maintenance operations like garbage collection. +//! `cfsctl` is a multi-call binary: when invoked as `mkcomposefs` or +//! `composefs-info` (via symlink or hardlink), it dispatches to the +//! corresponding tool. Otherwise it runs the normal `cfsctl` interface. +//! +//! ## C composefs compatibility roadmap +//! +//! This work aims to provide a Rust implementation that is a drop-in for the +//! C composefs tools and library. See: +//! +//! +//! Status: +//! 1. **CLI interfaces** (`mkcomposefs`, `composefs-info`): Substantially +//! implemented. V1 EROFS output is byte-for-byte identical to C mkcomposefs. +//! See individual module docs for remaining gaps. +//! 2. **EROFS output format**: V1 (C-compatible) writer with compact inodes, +//! BFS ordering, whiteout table, and overlay xattr escaping is complete and +//! tested. V2 (Rust-native) is the default for the composefs-rs repository. +//! 3. **C shared library (`libcomposefs`)**: TODO(compat): Not yet started. +//! This is the next major milestone — providing a C-ABI compatible shared +//! library so that existing C consumers (e.g. ostree, bootc) can link +//! against the Rust implementation. Will require `#[no_mangle]` exports, +//! a `cdylib` crate, and C header generation (e.g. via cbindgen). -use composefs_ctl::App; +use std::path::Path; use anyhow::Result; -use clap::Parser; -fn main() -> Result<()> { - // If we were spawned as a userns helper process, handle that and exit. - // This MUST be called before the tokio runtime is created. - #[cfg(feature = "containers-storage")] - cstorage::init_if_helper(); - - // Now we can create the tokio runtime for the main application - tokio::runtime::Builder::new_multi_thread() - .enable_all() - .build()? - .block_on(async_main()) +mod composefs_info; +mod mkcomposefs; + +/// Extract the binary name from argv[0], stripping any directory prefix. +fn binary_name() -> Option { + std::env::args_os().next().and_then(|arg0| { + Path::new(&arg0) + .file_name() + .map(|f| f.to_string_lossy().into_owned()) + }) } -async fn async_main() -> Result<()> { - env_logger::init(); +fn main() -> Result<()> { + match binary_name().as_deref() { + Some("mkcomposefs") => mkcomposefs::run(), + Some("composefs-info") => composefs_info::run(), + _ => { + use clap::Parser; + use composefs_ctl::App; + + // If we were spawned as a userns helper process, handle that and exit. + // This MUST be called before the tokio runtime is created. + #[cfg(feature = "containers-storage")] + cstorage::init_if_helper(); + + env_logger::init(); - let args = App::parse(); - composefs_ctl::run_app(args).await + // Now we can create the tokio runtime for the main application + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()? + .block_on(async { + let args = App::parse(); + composefs_ctl::run_app(args).await + }) + } + } } diff --git a/crates/composefs-ctl/src/mkcomposefs.rs b/crates/composefs-ctl/src/mkcomposefs.rs new file mode 100644 index 00000000..a71f90c7 --- /dev/null +++ b/crates/composefs-ctl/src/mkcomposefs.rs @@ -0,0 +1,420 @@ +//! mkcomposefs - Create composefs images from directories or dumpfiles. +//! +//! This is a Rust reimplementation of the C mkcomposefs tool, providing +//! compatible command-line interface and output format. +//! +//! ## Compatibility status +//! +//! See for context. +//! +//! Implemented and tested (byte-for-byte match with C mkcomposefs): +//! - `--from-file`, `--print-digest`, `--print-digest-only` +//! - `--skip-devices`, `--skip-xattrs`, `--user-xattrs` +//! - `--min-version` / `--max-version` (V1 compact inodes, BFS ordering, whiteout table) +//! - `--digest-store` (C-compatible flat `XX/digest` layout via [`FlatDigestStore`]) +//! - `--threads` (controls tokio worker threads and verity-computation concurrency) +//! - Source from directory or dumpfile, output to file or stdout +//! +//! All known compatibility gaps have been resolved. + +use std::{ + ffi::OsString, + fs::File, + io::{self, BufReader, IsTerminal, Read, Write}, + path::{Path, PathBuf}, + sync::Arc, + thread::available_parallelism, +}; + +use anyhow::{Context, Result, bail}; +use clap::Parser; +use rustix::fs::CWD; +use tokio::sync::Semaphore; + +use composefs::{ + dumpfile::dumpfile_to_filesystem, + erofs::{ + format::FormatVersion, + writer::{ValidatedFileSystem, mkfs_erofs_versioned}, + }, + fs::{ + FlatDigestStore, ObjectStore, read_filesystem_with_semaphore, read_filesystem_with_store, + }, + fsverity::{FsVerityHashValue, Sha256HashValue, compute_verity}, + tree::FileSystem, +}; + +/// Create a composefs image from a source directory or dumpfile. +/// +/// Composefs uses EROFS image files for metadata and separate content-addressed +/// backing directories for regular file data. +#[derive(Parser, Debug)] +#[command(name = "mkcomposefs", version, about)] +struct Args { + /// Treat SOURCE as a dumpfile in composefs-dump(5) format. + /// + /// If SOURCE is `-`, reads from stdin. + #[arg(long)] + from_file: bool, + + /// Print the fsverity digest of the image after writing. + #[arg(long)] + print_digest: bool, + + /// Print the fsverity digest without writing the image. + /// + /// When set, IMAGE must be omitted. + #[arg(long)] + print_digest_only: bool, + + /// Set modification time to zero (Unix epoch) for all files. + #[arg(long)] + use_epoch: bool, + + /// Exclude device nodes from the image. + #[arg(long)] + skip_devices: bool, + + /// Exclude all extended attributes. + #[arg(long)] + skip_xattrs: bool, + + /// Only include xattrs with the `user.` prefix. + #[arg(long)] + user_xattrs: bool, + + /// Minimum image format version to use (0 or 1). + #[arg(long, default_value = "0")] + min_version: u32, + + /// Maximum image format version (for auto-upgrade). + #[arg(long, default_value = "1")] + max_version: u32, + + /// Copy regular file content to the given object store directory. + /// + /// Files are stored by their fsverity digest using the same flat layout + /// as C mkcomposefs: `XX/DIGEST` where XX is the first byte of the digest. + /// The directory is created if it doesn't exist. The layout is compatible + /// with digest stores written by the C mkcomposefs tool. + #[arg(long)] + digest_store: Option, + + /// Number of threads to use for digest calculation and file copying. + #[arg(long)] + threads: Option, + + /// The source directory or dumpfile. + source: PathBuf, + + /// The output image path (use `-` for stdout). + /// + /// Must be omitted when using --print-digest-only. + image: Option, +} + +/// Entry point for the mkcomposefs multi-call mode. +pub(crate) fn run() -> Result<()> { + let args = Args::parse(); + + // Validate arguments + if args.print_digest_only && args.image.is_some() { + bail!("IMAGE must be omitted when using --print-digest-only"); + } + + if !args.print_digest_only && args.image.is_none() { + bail!("IMAGE is required (or use --print-digest-only)"); + } + + if args.min_version > args.max_version { + bail!( + "Invalid version range: --min-version ({}) must not exceed --max-version ({})", + args.min_version, + args.max_version + ); + } + + // Determine format version based on min/max version flags. + // min_version=0 means we use Format 1.0 / V1 (composefs_version=0): + // compact inodes, BFS ordering, whiteout table, build_time + // min_version=1+ means we use Format 1.1 / V2 (composefs_version=2): + // extended inodes, DFS ordering, no whiteouts + // + // No content-driven upgrade from V1→V2 is needed: V1 already supports + // extended inodes (64-byte) natively for entries that don't fit in compact + // (32-byte) inodes, so every filesystem can be represented in V1 without + // loss. Starting at min_version and going up to max_version is therefore + // equivalent to simply using min_version. + let format_version = if args.min_version == 0 { + FormatVersion::V1 + } else { + FormatVersion::V2 + }; + + // Open or create the digest store if specified. + // Always uses the C-compatible flat layout (XX/DIGEST) so that the store + // is interchangeable with the one written by C mkcomposefs. + let store: Option>> = + if let Some(store_path) = &args.digest_store { + let n = args + .threads + .unwrap_or_else(|| available_parallelism().map(|n| n.get()).unwrap_or(4)); + Some(Arc::new(FlatDigestStore::open(store_path, n, true)?)) + } else { + None + }; + + // Warn if --digest-store is combined with --from-file (store is unused in that case) + if args.from_file && args.digest_store.is_some() { + eprintln!("warning: --digest-store is ignored when --from-file is specified"); + } + + // Read input + let mut fs = if args.from_file { + read_dumpfile(&args)? + } else { + read_directory(&args.source, store, args.threads)? + }; + + // Apply transformations based on flags + apply_transformations(&mut fs, &args, format_version)?; + + // Generate EROFS image + let image = mkfs_erofs_versioned(&ValidatedFileSystem::new(fs)?, format_version); + + // Handle output + if args.print_digest_only { + let digest = compute_fsverity_digest(&image); + println!("{digest}"); + return Ok(()); + } + + // Write image + let image_path = args.image.as_ref().unwrap(); + write_image(image_path, &image)?; + + // Optionally print digest + if args.print_digest { + let digest = compute_fsverity_digest(&image); + println!("{digest}"); + } + + Ok(()) +} + +/// Read and parse a dumpfile from the given source. +fn read_dumpfile(args: &Args) -> Result> { + let content = if args.source.as_os_str() == "-" { + // Read from stdin + let stdin = io::stdin(); + let mut content = String::new(); + stdin.lock().read_to_string(&mut content)?; + content + } else { + // Read from file + let file = File::open(&args.source) + .with_context(|| format!("Failed to open dumpfile: {:?}", args.source))?; + let mut reader = BufReader::new(file); + let mut content = String::new(); + reader.read_to_string(&mut content)?; + content + }; + + dumpfile_to_filesystem(&content).context("Failed to parse dumpfile") +} + +/// Read a filesystem tree from a directory path. +/// +/// If a store is provided, large file contents are copied there and +/// referenced by digest. The store must implement [`ObjectStore`]. +/// +/// The `threads` argument controls both the tokio worker thread count and the +/// semaphore used to limit concurrent verity computations. `Some(1)` uses a +/// single-threaded runtime; `None` or `Some(n > 1)` uses the multi-threaded +/// scheduler. +fn read_directory( + path: &Path, + store: Option>>, + threads: Option, +) -> Result> { + use rustix::fs::{Mode, OFlags}; + + // Verify the path exists and is a directory + let metadata = std::fs::metadata(path) + .with_context(|| format!("Failed to access source directory: {path:?}"))?; + + if !metadata.is_dir() { + bail!("Source path is not a directory: {path:?}"); + } + + // Open a dirfd for the current directory (required by the async API) + let dirfd = rustix::fs::openat( + CWD, + ".", + OFlags::RDONLY | OFlags::DIRECTORY | OFlags::CLOEXEC, + Mode::empty(), + ) + .context("Failed to open current directory")?; + + // Build a tokio runtime appropriate for the requested thread count. + // --threads 1 → current_thread (no extra OS threads, minimal overhead). + // --threads N → multi_thread with exactly N worker threads. + // (default) → multi_thread with the tokio default (one per logical CPU). + let rt = match threads { + Some(1) => tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .context("Failed to create single-threaded tokio runtime")?, + Some(n) => tokio::runtime::Builder::new_multi_thread() + .worker_threads(n) + .enable_all() + .build() + .context("Failed to create multi-threaded tokio runtime")?, + None => tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .context("Failed to create multi-threaded tokio runtime")?, + }; + + let path = path.to_path_buf(); + + // When a store is present its semaphore is already configured; + // delegate entirely to read_filesystem_with_store. + // When there is no store we build the semaphore ourselves so the + // requested thread count is honoured. + if store.is_some() { + rt.block_on(read_filesystem_with_store(dirfd, path, store)) + .context("Failed to read directory tree") + } else { + let n = threads.unwrap_or_else(|| available_parallelism().map(|n| n.get()).unwrap_or(4)); + let semaphore = Arc::new(Semaphore::new(n)); + rt.block_on(read_filesystem_with_semaphore(dirfd, path, None, semaphore)) + .context("Failed to read directory tree") + } +} + +/// Write the image to the specified path (or stdout if `-`). +fn write_image(path: &PathBuf, image: &[u8]) -> Result<()> { + if path.as_os_str() == "-" { + let stdout = io::stdout(); + if stdout.is_terminal() { + bail!( + "Refusing to write binary image to terminal. Redirect stdout or use a file path." + ); + } + stdout.lock().write_all(image)?; + } else { + let mut file = + File::create(path).with_context(|| format!("Failed to create image file: {path:?}"))?; + file.write_all(image)?; + } + Ok(()) +} + +/// Compute the fsverity digest of the image. +fn compute_fsverity_digest(image: &[u8]) -> String { + let digest: Sha256HashValue = compute_verity(image); + digest.to_hex() +} + +/// Apply filesystem transformations based on command-line flags. +fn apply_transformations( + fs: &mut FileSystem, + args: &Args, + format_version: FormatVersion, +) -> Result<()> { + // Handle xattr filtering + if args.skip_xattrs { + // Remove all xattrs + fs.filter_xattrs(|_| false); + } else if args.user_xattrs { + // Keep only user.* xattrs + fs.filter_xattrs(|name| name.as_encoded_bytes().starts_with(b"user.")); + } + + // Handle --use-epoch (set all mtimes to 0) + if args.use_epoch { + set_all_mtimes_to_epoch(fs); + } + + // Handle --skip-devices (remove device nodes) + if args.skip_devices { + remove_device_nodes(fs); + } + + // For Format 1.0, add overlay whiteout entries for compatibility + // with the C mkcomposefs tool. + // Note: The overlay.opaque xattr is added by the writer (not here) to ensure + // it's not escaped by the trusted.overlay.* escaping logic. + if format_version == FormatVersion::V1 { + fs.add_overlay_whiteouts(); + } + + Ok(()) +} + +/// Set all modification times in the filesystem to Unix epoch (0). +fn set_all_mtimes_to_epoch(fs: &mut FileSystem) { + fs.for_each_stat_mut(|stat| { + stat.st_mtim_sec = 0; + stat.st_mtim_nsec = 0; + }); +} + +/// Remove all device nodes (block and character devices) from the filesystem. +fn remove_device_nodes(fs: &mut FileSystem) { + use composefs::generic_tree::{Inode, LeafContent}; + + type Leaf = composefs::generic_tree::Leaf>; + type Dir = composefs::generic_tree::Directory>; + + fn process_dir(dir: &mut Dir, leaves: &[Leaf]) { + // First, collect names of subdirectories to process + let subdir_names: Vec = dir + .entries() + .filter_map(|(name, inode)| { + if matches!(inode, Inode::Directory(_)) { + Some(name.to_os_string()) + } else { + None + } + }) + .collect(); + + // Recursively process subdirectories + for name in subdir_names { + if let Ok(subdir) = dir.get_directory_mut(&name) { + process_dir(subdir, leaves); + } + } + + // Collect names of device nodes to remove + let devices_to_remove: Vec = dir + .entries() + .filter_map(|(name, inode)| { + if let Inode::Leaf(leaf_id, _) = inode + && matches!( + leaves[leaf_id.0].content, + LeafContent::BlockDevice(_) | LeafContent::CharacterDevice(_) + ) + { + return Some(name.to_os_string()); + } + None + }) + .collect(); + + // Remove device nodes + for name in devices_to_remove { + dir.remove(&name); + } + } + + // Split struct field borrows: Rust allows borrowing different fields simultaneously. + let FileSystem { root, leaves, .. } = fs; + process_dir(root, leaves); + + // Compact the leaves table to remove entries now unreferenced after + // device-node removal. Without this, fs.fsck() would report orphaned leaves. + fs.compact(); +} diff --git a/crates/composefs-http/Cargo.toml b/crates/composefs-http/Cargo.toml index d6838856..f7048327 100644 --- a/crates/composefs-http/Cargo.toml +++ b/crates/composefs-http/Cargo.toml @@ -15,7 +15,6 @@ anyhow = { version = "1.0.87", default-features = false } bytes = { version = "1.7.1", default-features = false } composefs = { workspace = true } hex = { version = "0.4.0", default-features = false } -indicatif = { version = "0.18.0", default-features = false } reqwest = { version = "0.13.0", features = ["zstd"] } sha2 = { version = "0.11.0", default-features = false } tokio = { version = "1.24.2", default-features = false } diff --git a/crates/composefs-http/src/lib.rs b/crates/composefs-http/src/lib.rs index cced211b..4c880db8 100644 --- a/crates/composefs-http/src/lib.rs +++ b/crates/composefs-http/src/lib.rs @@ -15,19 +15,53 @@ use std::{ use anyhow::{Result, bail}; use bytes::Bytes; use composefs::util::DigestWrite; -use indicatif::{ProgressBar, ProgressStyle}; use reqwest::{Client, Response, Url}; use sha2::{Digest, Sha256}; use tokio::task::JoinSet; +use composefs::progress::{ComponentId, NullReporter, ProgressEvent, ProgressUnit, SharedReporter}; use composefs::{ fsverity::FsVerityHashValue, repository::Repository, splitstream::SplitStreamReader, }; +/// Initial number of concurrent HTTP object fetch requests. +/// +/// Matches the default `SETTINGS_MAX_CONCURRENT_STREAMS` value from RFC 7540 +/// §6.5.2. This bounds the JoinSet backlog while new tasks are queued as +/// existing ones complete. +const INITIAL_CONCURRENT_REQUESTS: usize = 100; + +/// Options for a [`download`] operation. +#[derive(Default)] +pub struct DownloadOptions { + /// Progress reporter for this download operation. + /// + /// When `None`, all progress events are silently discarded. Supply a + /// [`SharedReporter`] implementation (e.g. an `indicatif`-backed renderer) + /// to receive [`ProgressEvent`]s as the download proceeds. + pub progress: Option, +} + +impl std::fmt::Debug for DownloadOptions { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("DownloadOptions") + .field( + "progress", + if self.progress.is_some() { + &"Some()" + } else { + &"None" + }, + ) + .finish() + } +} + struct Downloader { client: Client, repo: Arc>, url: Url, + reporter: SharedReporter, } impl Downloader { @@ -70,15 +104,6 @@ impl Downloader { } async fn ensure_stream(self: &Arc, name: &str) -> Result<(String, ObjectID)> { - let progress = ProgressBar::new(2); // the first object gets "ensured" twice - progress.set_style( - ProgressStyle::with_template( - "[eta {eta}] {bar:40.cyan/blue} Fetching {pos} / {len} splitstreams", - ) - .unwrap() - .progress_chars("##-"), - ); - // Ideally we'll get a symlink, but we might get the data directly let (data, is_symlink) = self.fetch("streams/", name).await?; let my_id = if is_symlink { @@ -86,7 +111,10 @@ impl Downloader { } else { self.repo.ensure_object_async(data.into()).await? }; - progress.inc(1); + + self.reporter.report(ProgressEvent::Message(format!( + "Fetching splitstreams for {name}" + ))); let mut objects_todo = HashSet::new(); @@ -99,9 +127,9 @@ impl Downloader { while let Some(id) = splitstreams_todo.pop() { // this is the slow part (downloads, writing to disk, etc.) if self.ensure_object(&id).await? { - progress.inc(1); - } else { - progress.dec_length(1); + self.reporter.report(ProgressEvent::Message(format!( + "Fetched splitstream {id:?}" + ))); } // this part is fast: it only touches the header @@ -111,7 +139,6 @@ impl Downloader { // This is the (normal) case if we encounter a splitstream we didn't see yet... None => { splitstreams_todo.push(verity.clone()); - progress.inc_length(1); } // This is the case where we've already been asked to fetch this stream. We'll @@ -143,25 +170,21 @@ impl Downloader { })?; } - progress.finish(); - - let progress = ProgressBar::new(objects_todo.len() as u64); - progress.set_style( - ProgressStyle::with_template( - "[eta {eta}] {bar:40.cyan/blue} Fetching {pos} / {len} objects", - ) - .unwrap() - .progress_chars("##-"), - ); + let objects_total = objects_todo.len() as u64; + let fetch_id = ComponentId::from(format!("objects:{name}")); + self.reporter.report(ProgressEvent::Started { + id: fetch_id.clone(), + total: Some(objects_total), + unit: ProgressUnit::Items, + }); // Fetch all the objects let mut set = JoinSet::>::new(); let mut iter = objects_todo.into_iter(); + let mut fetched: u64 = 0; - // Queue up 100 initial requests - // See SETTINGS_MAX_CONCURRENT_STREAMS in RFC 7540 - // We might actually want to increase this... - for id in iter.by_ref().take(100) { + // Queue up the initial batch of concurrent requests. + for id in iter.by_ref().take(INITIAL_CONCURRENT_REQUESTS) { let self_ = Arc::clone(self); set.spawn(async move { self_.ensure_object(&id).await }); } @@ -171,10 +194,12 @@ impl Downloader { while let Some(result) = set.join_next().await { if result?? { // a download - progress.inc(1); - } else { - // a not-download - progress.dec_length(1); + fetched += 1; + self.reporter.report(ProgressEvent::Progress { + id: fetch_id.clone(), + fetched, + total: Some(objects_total), + }); } if let Some(id) = iter.next() { @@ -183,18 +208,17 @@ impl Downloader { } } - progress.finish(); + self.reporter.report(ProgressEvent::Done { + id: fetch_id, + transferred: fetched, + }); // Now that we have all of the objects, we can verify that the merged-content of each // splitstream corresponds to its claimed body content checksum, if any... - let progress = ProgressBar::new(splitstreams.len() as u64); - progress.set_style( - ProgressStyle::with_template( - "[eta {eta}] {bar:40.cyan/blue} Verifying {pos} / {len} splitstreams", - ) - .unwrap() - .progress_chars("##-"), - ); + self.reporter.report(ProgressEvent::Message(format!( + "Verifying {} splitstreams", + splitstreams.len() + ))); let mut my_sha256 = None; // TODO: This can definitely happen in parallel... @@ -217,12 +241,8 @@ impl Downloader { if id == my_id { my_sha256 = Some(measured_checksum); } - - progress.inc(1); } - progress.finish(); - // We've definitely set this by now: `my_id` is in `splitstreams`. let my_sha256 = my_sha256.unwrap(); @@ -241,6 +261,7 @@ impl Downloader { /// * `url` - The base HTTP URL where the splitstream repository is hosted /// * `name` - The name of the splitstream to download (located under `streams/` on the server) /// * `repo` - The repository where downloaded objects will be stored +/// * `opts` - Download options including an optional progress reporter /// /// # Returns /// @@ -258,11 +279,15 @@ pub async fn download( url: &str, name: &str, repo: Arc>, + opts: DownloadOptions, ) -> Result<(String, ObjectID)> { + let reporter: SharedReporter = opts.progress.unwrap_or_else(|| Arc::new(NullReporter)); + let downloader = Arc::new(Downloader { client: Client::new(), repo, url: Url::parse(url)?, + reporter, }); downloader.ensure_stream(name).await diff --git a/crates/composefs-integration-tests/src/lib.rs b/crates/composefs-integration-tests/src/lib.rs index c86ebfb1..b9decb88 100644 --- a/crates/composefs-integration-tests/src/lib.rs +++ b/crates/composefs-integration-tests/src/lib.rs @@ -11,8 +11,8 @@ use std::process::Command; use std::sync::Arc; use anyhow::Result; -use composefs_oci::composefs::fsverity::{Algorithm, Sha256HashValue}; -use composefs_oci::composefs::repository::Repository; +use composefs_oci::composefs::fsverity::Sha256HashValue; +use composefs_oci::composefs::repository::{Repository, RepositoryConfig}; use tempfile::TempDir; /// A test function that returns a Result. @@ -110,9 +110,11 @@ pub fn create_test_repository(tempdir: &TempDir) -> Result::init_path(&fd, ".", Algorithm::SHA256, false)?; - repo.set_insecure(); + let (repo, _created) = Repository::::init_path( + &fd, + ".", + RepositoryConfig::default().set_insecure(), + )?; Ok(Arc::new(repo)) } diff --git a/crates/composefs-integration-tests/src/tests/cli.rs b/crates/composefs-integration-tests/src/tests/cli.rs index a3dda2e9..65166d0d 100644 --- a/crates/composefs-integration-tests/src/tests/cli.rs +++ b/crates/composefs-integration-tests/src/tests/cli.rs @@ -16,13 +16,26 @@ use crate::{cfsctl, create_test_rootfs, integration_test}; const OCI_LAYOUT_COMPOSEFS_ID: &str = "f26c6eb439749b82f0d1520e83455bb21766572fb2b5cfe009dd7749a61caf74e0c42c56f1a2cbd9d\ 359e7d172c8e2c65641666c9a18cc484a8b0f6e4e6d47ab"; +// Pinned V1 EROFS composefs image ID for the same OCI layout. Differs from +// OCI_LAYOUT_COMPOSEFS_ID because the V1 EROFS writer produces a different +// on-disk layout than V2. +const OCI_LAYOUT_COMPOSEFS_V1_ID: &str = "5973d67c99d847461d7b51cbe7b38b537e64f74cf4b42ddc63670d98e053202cc77ae195b7f10f619808d33aa25f11f428d42de7eaee08e2af5da4e1014ce68b"; + /// Create a fresh initialized insecure repository in a tempdir. /// /// Returns the tempdir (for lifetime) and the path to the repo. +/// +/// Creates a V2 (legacy) EROFS repo explicitly so that tests which compare +/// against pinned V2 digests (e.g. `OCI_LAYOUT_COMPOSEFS_ID`) continue to +/// work correctly now that `cfsctl init` defaults to V1. fn init_insecure_repo(sh: &Shell, cfsctl: &std::path::Path) -> Result { let repo_dir = tempfile::tempdir()?; let repo = repo_dir.path(); - cmd!(sh, "{cfsctl} --repo {repo} init --insecure").read()?; + cmd!( + sh, + "{cfsctl} --repo {repo} init --insecure --erofs-version 2" + ) + .read()?; Ok(repo_dir) } @@ -1688,3 +1701,385 @@ fn test_compute_image_id() -> Result<()> { Ok(()) } integration_test!(test_compute_image_id); + +/// Creates an OCI image layout that satisfies the `transform_for_boot` requirements: +/// the image must contain `/boot`, `/sysroot`, and `/usr` as top-level directories. +fn create_bootable_oci_layout(parent: &std::path::Path) -> Result { + use cap_std_ext::cap_std; + use ocidir::oci_spec::image::{ + ConfigBuilder, ImageConfigurationBuilder, Platform, PlatformBuilder, RootFsBuilder, + }; + + let oci_dir = parent.join("oci-bootable"); + std::fs::create_dir_all(&oci_dir)?; + + let dir = cap_std::fs::Dir::open_ambient_dir(&oci_dir, cap_std::ambient_authority())?; + let ocidir = ocidir::OciDir::ensure(dir)?; + + let mut manifest = ocidir.new_empty_manifest()?.build()?; + + let runtime_config = ConfigBuilder::default().build()?; + let rootfs = RootFsBuilder::default() + .typ("layers") + .diff_ids(Vec::::new()) + .build()?; + let mut config = ImageConfigurationBuilder::default() + .architecture("amd64") + .os("linux") + .rootfs(rootfs) + .config(runtime_config) + .build()?; + + let mut layer_builder = ocidir.create_layer(None)?; + // Add the directories required by transform_for_boot + for dir_name in &["usr/", "boot/", "sysroot/"] { + let mut dir_header = tar::Header::new_gnu(); + dir_header.set_entry_type(tar::EntryType::Directory); + dir_header.set_size(0); + dir_header.set_mode(0o755); + dir_header.set_uid(0); + dir_header.set_gid(0); + dir_header.set_mtime(1234567890); + dir_header.set_cksum(); + layer_builder.append_data(&mut dir_header, dir_name, &[] as &[u8])?; + } + // Add a file under usr/ so the image has non-trivial content + { + let data = b"hello from bootable test layer\n"; + let mut header = tar::Header::new_gnu(); + header.set_size(data.len() as u64); + header.set_mode(0o644); + header.set_uid(0); + header.set_gid(0); + header.set_mtime(1234567890); + header.set_cksum(); + layer_builder.append_data(&mut header, "usr/hello.txt", &data[..])?; + } + let layer = layer_builder.into_inner()?.complete()?; + + ocidir.push_layer( + &mut manifest, + &mut config, + layer, + "bootable test layer", + None, + ); + + let platform: Platform = PlatformBuilder::default() + .architecture("amd64") + .os("linux") + .build()?; + ocidir.insert_manifest_and_config(manifest, config, None, platform)?; + + Ok(oci_dir) +} + +/// Test that `oci composefs-digest-karg` emits a valid V1 karg string. +/// +/// Pulls a small local OCI image using an insecure repo and verifies the subcommand outputs +/// `composefs.digest=?<128-char-hex>` (the `?` marks fs-verity verification as optional). +fn test_oci_composefs_digest_karg() -> Result<()> { + let sh = Shell::new()?; + let cfsctl = cfsctl()?; + let repo_dir = init_insecure_repo(&sh, &cfsctl)?; + let repo = repo_dir.path(); + let fixture_dir = tempfile::tempdir()?; + let oci_layout = create_bootable_oci_layout(fixture_dir.path())?; + + let pull_output = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci pull oci:{oci_layout} karg-test-image" + ) + .read()?; + + // Extract the config digest from pull output (e.g. "config sha256:abc...") + let config_digest = pull_output + .lines() + .find_map(|l| l.strip_prefix("config").map(|s| s.trim().to_string())) + .expect("config digest in pull output"); + let at_config_digest = format!("@{config_digest}"); + + let karg = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci composefs-digest-karg {at_config_digest}" + ) + .read()?; + let karg = karg.trim(); + + // Parse like ComposefsCmdline would: strip the key, then the optional '?' insecure marker, + // then validate the hex digest. + let after_key = karg + .strip_prefix("composefs.digest=") + .unwrap_or_else(|| panic!("expected karg to start with 'composefs.digest=', got: {karg}")); + // This test uses an insecure repo, so the '?' insecure marker must be present. + let hex = after_key + .strip_prefix('?') + .unwrap_or_else(|| panic!("expected insecure '?' prefix in karg, got: {karg}")); + // The default repo algorithm is fsverity-sha512-12 (SHA-512), which produces + // a 64-byte digest encoded as 128 hex characters. + assert_eq!( + hex.len(), + 128, + "expected 128-char sha512 hex digest, got: {hex}" + ); + assert!( + hex.chars().all(|c| c.is_ascii_hexdigit()), + "expected hex digest, got: {hex}" + ); + + // Should be deterministic + let karg2 = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci composefs-digest-karg {at_config_digest}" + ) + .read()?; + assert_eq!( + karg, + karg2.trim(), + "composefs-digest-karg should be deterministic" + ); + + Ok(()) +} +integration_test!(test_oci_composefs_digest_karg); + +/// Test that --erofs-version 1 and 2 produce different deterministic digests, +/// for both `compute-id` (no-repo) and `create-image` (repo-based). +fn test_erofs_versions() -> Result<()> { + let sh = Shell::new()?; + let cfsctl = cfsctl()?; + let fixture_dir = tempfile::tempdir()?; + let rootfs = create_test_rootfs(fixture_dir.path())?; + + // V1 digest via compute-id (no repo) + let id1 = cmd!( + sh, + "{cfsctl} --no-repo --erofs-version 1 compute-id --no-propagate-usr-to-root {rootfs}" + ) + .read()?; + + // V2 digest via compute-id (no repo) + let id2 = cmd!( + sh, + "{cfsctl} --no-repo --erofs-version 2 compute-id --no-propagate-usr-to-root {rootfs}" + ) + .read()?; + + // Default digest (should be V2) + let id_default = cmd!( + sh, + "{cfsctl} --no-repo compute-id --no-propagate-usr-to-root {rootfs}" + ) + .read()?; + + assert_ne!( + id1.trim(), + id2.trim(), + "V1 and V2 should produce different digests" + ); + assert_eq!(id2.trim(), id_default.trim(), "Default should be V2"); + + // Also verify via create-image in a real repo + let repo_dir = init_insecure_repo(&sh, &cfsctl)?; + let repo = repo_dir.path(); + + let img_v1 = cmd!( + sh, + "{cfsctl} --insecure --erofs-version 1 --repo {repo} create-image {rootfs}" + ) + .read()?; + let img_v2 = cmd!( + sh, + "{cfsctl} --insecure --erofs-version 2 --repo {repo} create-image {rootfs}" + ) + .read()?; + let img_default = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} create-image {rootfs}" + ) + .read()?; + + assert_ne!( + img_v1.trim(), + img_v2.trim(), + "create-image: V1 and V2 should produce different image IDs" + ); + assert_eq!( + img_v2.trim(), + img_default.trim(), + "create-image: default should match V2" + ); + + Ok(()) +} +integration_test!(test_erofs_versions); + +/// Verify that `create-image --erofs-version 1` is idempotent and differs from V2. +fn test_create_image_v1_idempotent() -> Result<()> { + let sh = Shell::new()?; + let cfsctl = cfsctl()?; + let repo_dir = init_insecure_repo(&sh, &cfsctl)?; + let repo = repo_dir.path(); + let fixture_dir = tempfile::tempdir()?; + let rootfs = create_test_rootfs(fixture_dir.path())?; + + let v1_id_a = cmd!( + sh, + "{cfsctl} --insecure --erofs-version 1 --repo {repo} create-image {rootfs}" + ) + .read()?; + let v1_id_b = cmd!( + sh, + "{cfsctl} --insecure --erofs-version 1 --repo {repo} create-image {rootfs}" + ) + .read()?; + let v2_id = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} create-image {rootfs}" + ) + .read()?; + + assert_eq!( + v1_id_a.trim(), + v1_id_b.trim(), + "create-image V1 must be idempotent" + ); + assert_ne!( + v1_id_a.trim(), + v2_id.trim(), + "create-image V1 and V2 must produce different image IDs" + ); + + Ok(()) +} +integration_test!(test_create_image_v1_idempotent); + +/// Verify that a repository initialized with `--erofs-version 1` produces V1 images +/// by default, without needing `--erofs-version` on every subsequent command. +fn test_v1_repo_uses_v1_by_default() -> Result<()> { + let sh = Shell::new()?; + let cfsctl = cfsctl()?; + let fixture_dir = tempfile::tempdir()?; + let rootfs = create_test_rootfs(fixture_dir.path())?; + + // Init a V1-only repo using the init subcommand's --erofs flag + let repo_dir = tempfile::tempdir()?; + let repo = repo_dir.path(); + cmd!(sh, "{cfsctl} --repo {repo} init --insecure --erofs v1").read()?; + + // Verify meta.json records v1_erofs in ro_compat (V1-only mode) + let meta_json = std::fs::read_to_string(repo.join("meta.json"))?; + assert!( + meta_json.contains("v1_erofs"), + "meta.json should contain v1_erofs in ro_compat for a V1-only repo, got: {meta_json}" + ); + + // create-image WITHOUT --erofs-version flag — should use the repo's default (V1) + let id_default = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} create-image {rootfs}" + ) + .read()?; + let id_default = id_default.trim(); + + // create-image WITH explicit --erofs-version 1 — should be identical + let id_explicit_v1 = cmd!( + sh, + "{cfsctl} --insecure --erofs-version 1 --repo {repo} create-image {rootfs}" + ) + .read()?; + let id_explicit_v1 = id_explicit_v1.trim(); + + assert_eq!( + id_default, id_explicit_v1, + "repo initialized as V1 must produce V1 images by default (no flag needed)" + ); + + // create-image with explicit --erofs-version 2 — should differ + let id_v2 = cmd!( + sh, + "{cfsctl} --insecure --erofs-version 2 --repo {repo} create-image {rootfs}" + ) + .read()?; + let id_v2 = id_v2.trim(); + + assert_ne!( + id_default, id_v2, + "V1 repo default must not equal explicit V2 output" + ); + + Ok(()) +} +integration_test!(test_v1_repo_uses_v1_by_default); + +/// Verify `oci compute-id --erofs-version 1` is idempotent, differs from V2, +/// and matches the pinned V1 digest for the deterministic test OCI layout. +fn test_oci_pull_v1_digest_stability() -> Result<()> { + let sh = Shell::new()?; + let cfsctl = cfsctl()?; + let repo_dir = init_insecure_repo(&sh, &cfsctl)?; + let repo = repo_dir.path(); + let fixture_dir = tempfile::tempdir()?; + let oci_layout = create_oci_layout(fixture_dir.path())?; + + // Pull the OCI layout + let pull_output = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci pull oci:{oci_layout} test-v1-image" + ) + .read()?; + + // Extract config digest from pull output (e.g. "config sha256:abc...") + let config_digest = pull_output + .lines() + .find_map(|l| l.strip_prefix("config").map(|s| s.trim().to_string())) + .expect("config digest in pull output"); + let at_config_digest = format!("@{config_digest}"); + + // Compute V1 digest twice — must be identical (idempotency) + let v1_id_a = cmd!( + sh, + "{cfsctl} --insecure --erofs-version 1 --repo {repo} oci compute-id {at_config_digest}" + ) + .read()?; + let v1_id_b = cmd!( + sh, + "{cfsctl} --insecure --erofs-version 1 --repo {repo} oci compute-id {at_config_digest}" + ) + .read()?; + assert_eq!( + v1_id_a.trim(), + v1_id_b.trim(), + "V1 oci compute-id must be idempotent" + ); + + // V2 (default) must differ from V1 + let v2_id = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci compute-id {at_config_digest}" + ) + .read()?; + assert_ne!( + v1_id_a.trim(), + v2_id.trim(), + "V1 and V2 oci compute-id must produce different digests" + ); + + // V2 must still match the existing pinned constant + assert_eq!( + v2_id.trim(), + OCI_LAYOUT_COMPOSEFS_ID, + "V2 OCI layout composefs image ID changed" + ); + + // V1 must match the pinned V1 constant (stability across code changes) + assert_eq!( + v1_id_a.trim(), + OCI_LAYOUT_COMPOSEFS_V1_ID, + "V1 OCI layout composefs image ID changed — \ + the V1 EROFS writer produced different output for the same deterministic OCI image" + ); + + Ok(()) +} +integration_test!(test_oci_pull_v1_digest_stability); diff --git a/crates/composefs-integration-tests/src/tests/cstor.rs b/crates/composefs-integration-tests/src/tests/cstor.rs index 0ae5e1b3..554cd79d 100644 --- a/crates/composefs-integration-tests/src/tests/cstor.rs +++ b/crates/composefs-integration-tests/src/tests/cstor.rs @@ -92,8 +92,14 @@ fn privileged_test_cstor_vs_skopeo_equivalence() -> Result<()> { // Import from the OCI directory via skopeo/tar path let skopeo_image_ref = format!("oci:{}:test", oci_path.display()); println!("Importing via skopeo/OCI: {}", skopeo_image_ref); - let (skopeo_pull_result, _skopeo_stats) = - composefs_oci::pull_image(&skopeo_repo, &skopeo_image_ref, None, None).await?; + let (skopeo_pull_result, _skopeo_stats) = composefs_oci::pull_image( + &skopeo_repo, + &skopeo_image_ref, + None, + None, + std::sync::Arc::new(composefs_oci::NullReporter), + ) + .await?; let (skopeo_config_digest, skopeo_config_verity) = skopeo_pull_result.into_config(); // Get layer maps from both configs diff --git a/crates/composefs-integration-tests/src/tests/digest_stability.rs b/crates/composefs-integration-tests/src/tests/digest_stability.rs index 11fb5b95..dd1db2c3 100644 --- a/crates/composefs-integration-tests/src/tests/digest_stability.rs +++ b/crates/composefs-integration-tests/src/tests/digest_stability.rs @@ -20,11 +20,18 @@ struct ContainerImage { /// unavailable (e.g. a PR that adds a new mirror entry before it has been /// pushed). Should be pinned by digest for reproducibility. upstream_ref: &'static str, - /// Expected composefs image ID without `--bootable`. + /// Expected composefs image ID without `--bootable` (V2/default EROFS). expected_id: &'static str, - /// Expected composefs image ID with `--bootable`, or `None` if the - /// image lacks /sysroot and doesn't support bootable transformation. + /// Expected composefs image ID with `--bootable` (V2/default EROFS), or + /// `None` if the image lacks /sysroot and doesn't support bootable + /// transformation. expected_bootable_id: Option<&'static str>, + /// Expected composefs image ID without `--bootable` using V1 EROFS writer. + expected_v1_id: &'static str, + /// Expected composefs image ID with `--bootable` using V1 EROFS writer, or + /// `None` if the image lacks /sysroot and doesn't support bootable + /// transformation. + expected_v1_bootable_id: Option<&'static str>, } // RHEL UBI 10.1, build 1772441712 (amd64). @@ -38,6 +45,9 @@ const UBI10: ContainerImage = ContainerImage { expected_id: "ff8dad033a3e6015d63d6b00c16918da27bf96cc8ddd824e521549db01013227\ 87c30a3f49e5716f8f6052d78b46308dfaaccf0dfc504d26fe58d468810c0b0e", expected_bootable_id: None, + expected_v1_id: "b4143dd605a376f878665a962b9db2de1b385178a26d00073c591e296aae4ee\ + c4fac3bee41e14019ee9422e5a0d0d1427aa2b4040ad2bb304ac6e07b73d5dbef", + expected_v1_bootable_id: None, }; // centos-bootc stream10, pinned by manifest digest so the test is @@ -54,6 +64,12 @@ const CENTOS_BOOTC: ContainerImage = ContainerImage { "79c840369bf1ef414d71731166967a01f6616039bc0e1d4c5353bed02e0d2bd9\ 4459e22407bb885f1d6ce44a04add35adf0d00ca8a23f90544a99a76fdadb65b", ), + expected_v1_id: "98d0b699c81f9f03cea3345d578a49d926338cd0f1d5670d5fcb381c276ba8d\ + 2c2762587d186ff255b9852a747272531ff268dadfef62092c061f2ef8d974143", + expected_v1_bootable_id: Some( + "58814924afc5454971abd5cc1c823bd20a691bb54413e0bc453fc5e5b6e71175\ + 43c521b97e8eff39cc872ea3ee88f2df32e345802cf488d445b0a91a885992e8", + ), }; // Ubuntu 26.04 (resolute), pinned by manifest digest. @@ -70,6 +86,9 @@ const UBUNTU_RESOLUTE: ContainerImage = ContainerImage { expected_id: "150caabb982d7005db1a1d0480d57a95e84b160aa2b1159f9aae66e92ba07b36\ 11ea38e1836eff923dc3a1a617c18494757be0f5e3db16cc7a522981b3f42d40", expected_bootable_id: None, + expected_v1_id: "86baa5b1d06edb7a941e0afcca22b2312cc8a77ec30244c03c8be6bb6184a1a\ + 4744a1c028309136c46ae6e4c186ac9e995aa44ae3940d2729e53ac17d129b524", + expected_v1_bootable_id: None, }; /// All container images to test. @@ -133,7 +152,7 @@ fn try_pull_image( bail!("could not find config digest in pull output:\n{output}") } -/// Compute the composefs image ID for a pulled OCI image. +/// Compute the composefs image ID for a pulled OCI image (default V2 EROFS). /// /// The `config_digest` should be a bare OCI digest (e.g. `sha256:abc...`); /// this function adds the `@` prefix required by the CLI. @@ -161,11 +180,38 @@ fn compute_id( Ok(output.trim().to_string()) } +/// Compute the composefs image ID using the V1 EROFS writer. +/// +/// `--erofs-version 1` is a global flag and must appear before the subcommand. +fn compute_id_v1( + sh: &Shell, + cfsctl: &std::path::Path, + repo: &std::path::Path, + config_digest: &str, + bootable: bool, +) -> Result { + let at_digest = format!("@{config_digest}"); + let output = if bootable { + cmd!( + sh, + "{cfsctl} --insecure --erofs-version 1 --repo {repo} oci compute-id --bootable {at_digest}" + ) + .read()? + } else { + cmd!( + sh, + "{cfsctl} --insecure --erofs-version 1 --repo {repo} oci compute-id {at_digest}" + ) + .read()? + }; + Ok(output.trim().to_string()) +} + /// Table-driven OCI container digest stability test. /// /// Pulls each pinned container image from a registry, computes the composefs -/// image ID for both plain and `--bootable` transforms, and asserts they -/// match the expected values. +/// image ID for both plain and `--bootable` transforms using both the default +/// (V2) and V1 EROFS writers, and asserts they match the expected values. /// /// Skipped when `COMPOSEFS_SKIP_NETWORK=1` is set. fn test_oci_container_digest_stability() -> Result<()> { @@ -181,31 +227,37 @@ fn test_oci_container_digest_stability() -> Result<()> { eprintln!("--- {} ---", image.label); let repo_dir = tempfile::tempdir()?; let repo = repo_dir.path(); - cmd!(sh, "{cfsctl} --repo {repo} init --insecure").read()?; + // Use V2 explicitly: compute_id() tests V2 (default) hashes; V1 is + // tested separately via compute_id_v1() with --erofs-version 1. + cmd!( + sh, + "{cfsctl} --repo {repo} init --insecure --erofs-version 2" + ) + .read()?; eprintln!("Pulling {} (this may take a while)...", image.label); let config = pull_image(&sh, &cfsctl, repo, image, image.label)?; - // Plain (non-bootable) image ID + // V2 (default): plain image ID let plain_id = compute_id(&sh, &cfsctl, repo, &config, false)?; - eprintln!("{} composefs image ID: {plain_id}", image.label); + eprintln!("{} composefs V2 image ID: {plain_id}", image.label); assert_eq!( plain_id, image.expected_id, - "{}: composefs image ID changed — the EROFS writer or OCI \ + "{}: composefs image ID changed — the EROFS V2 writer or OCI \ pipeline produced different output for the same image", image.label, ); - // Bootable image ID (only for images that support it) + // V2 (default): bootable image ID (only for images that support it) if let Some(expected_bootable) = image.expected_bootable_id { let bootable_id = compute_id(&sh, &cfsctl, repo, &config, true)?; eprintln!( - "{} composefs image ID (bootable): {bootable_id}", + "{} composefs V2 image ID (bootable): {bootable_id}", image.label ); assert_eq!( bootable_id, expected_bootable, - "{}: bootable composefs image ID changed — the EROFS writer or \ + "{}: bootable composefs image ID changed — the EROFS V2 writer or \ boot transform produced different output for the same image", image.label, ); @@ -217,6 +269,45 @@ fn test_oci_container_digest_stability() -> Result<()> { image.label, ); } + + // V1: plain image ID + let v1_plain_id = compute_id_v1(&sh, &cfsctl, repo, &config, false)?; + eprintln!("{} composefs V1 image ID: {v1_plain_id}", image.label); + assert_eq!( + v1_plain_id, image.expected_v1_id, + "{}: composefs V1 image ID changed — the EROFS V1 writer or OCI \ + pipeline produced different output for the same image", + image.label, + ); + + // V1 and V2 must produce different digests for the same image. + assert_ne!( + v1_plain_id, plain_id, + "{}: V1 and V2 EROFS image IDs should differ", + image.label, + ); + + // V1: bootable image ID (only for images that support it) + if let Some(expected_v1_bootable) = image.expected_v1_bootable_id { + let v1_bootable_id = compute_id_v1(&sh, &cfsctl, repo, &config, true)?; + eprintln!( + "{} composefs V1 image ID (bootable): {v1_bootable_id}", + image.label + ); + assert_eq!( + v1_bootable_id, expected_v1_bootable, + "{}: bootable composefs V1 image ID changed — the EROFS V1 writer or \ + boot transform produced different output for the same image", + image.label, + ); + + assert_ne!( + v1_plain_id, v1_bootable_id, + "{}: plain and --bootable V1 image IDs should differ \ + (bootable applies SELinux relabeling, empties /boot and /sysroot)", + image.label, + ); + } } Ok(()) diff --git a/crates/composefs-integration-tests/src/tests/mod.rs b/crates/composefs-integration-tests/src/tests/mod.rs index 0b99e04a..90ee0a58 100644 --- a/crates/composefs-integration-tests/src/tests/mod.rs +++ b/crates/composefs-integration-tests/src/tests/mod.rs @@ -3,5 +3,6 @@ pub mod cli; pub mod cstor; pub mod digest_stability; +pub mod oci_compat; pub mod old_format; pub mod privileged; diff --git a/crates/composefs-integration-tests/src/tests/oci_compat.rs b/crates/composefs-integration-tests/src/tests/oci_compat.rs new file mode 100644 index 00000000..42a37cc3 --- /dev/null +++ b/crates/composefs-integration-tests/src/tests/oci_compat.rs @@ -0,0 +1,515 @@ +//! Real filesystem compatibility tests. +//! +//! These tests create realistic filesystem structures (similar to what you'd find +//! in container images) and verify bit-for-bit compatibility between the Rust +//! mkfs_erofs and C mkcomposefs implementations. +//! +//! Requirements: +//! - C mkcomposefs binary (/usr/bin/mkcomposefs or set C_MKCOMPOSEFS_PATH) +//! - cfsctl binary (built from this project; invoked as "mkcomposefs" via symlink) +//! +//! Install the C mkcomposefs with: `sudo apt install composefs` + +use std::fs; +use std::io::Write; +use std::os::unix::fs::symlink; +use std::path::PathBuf; +use std::process::{Command, Stdio}; +use std::sync::OnceLock; + +use anyhow::{Context, Result, bail}; +use xshell::{Shell, cmd}; + +use crate::{cfsctl, integration_test}; + +/// Cached path to C mkcomposefs binary, computed once. +/// `None` means the binary is not available and compat tests should be skipped. +static C_MKCOMPOSEFS_PATH: OnceLock> = OnceLock::new(); + +/// Get the path to C mkcomposefs binary, or `None` if not available. +/// +/// Priority: +/// 1. C_MKCOMPOSEFS_PATH environment variable (if set; error if it doesn't exist) +/// 2. /usr/bin/mkcomposefs (system installation) +/// 3. None — caller should skip the test +fn c_mkcomposefs_path() -> Option<&'static PathBuf> { + C_MKCOMPOSEFS_PATH + .get_or_init(|| { + // Check env var first + if let Ok(path) = std::env::var("C_MKCOMPOSEFS_PATH") { + let path = PathBuf::from(path); + if path.exists() { + return Some(path); + } + panic!( + "C_MKCOMPOSEFS_PATH is set to '{}' but the file does not exist", + path.display() + ); + } + + // Check system location + let system_path = PathBuf::from("/usr/bin/mkcomposefs"); + if system_path.exists() { + return Some(system_path); + } + + None + }) + .as_ref() +} + +/// Cached symlink to cfsctl named "mkcomposefs" for multi-call dispatch. +static RUST_MKCOMPOSEFS_PATH: OnceLock> = OnceLock::new(); + +/// Get the path to the Rust mkcomposefs binary (a symlink to cfsctl). +/// +/// cfsctl is a multi-call binary that dispatches based on argv[0]. We create +/// a symlink named "mkcomposefs" pointing to cfsctl so that it runs in +/// mkcomposefs mode. +fn rust_mkcomposefs_path() -> Result { + let result = RUST_MKCOMPOSEFS_PATH.get_or_init(|| { + let cfsctl_path = cfsctl().map_err(|e| format!("{e:#}"))?; + + // Create a symlink in the same directory as cfsctl + let parent = cfsctl_path.parent().unwrap_or(std::path::Path::new(".")); + let symlink_path = parent.join("mkcomposefs"); + + // Remove any existing symlink/file (idempotent) + let _ = std::fs::remove_file(&symlink_path); + + std::os::unix::fs::symlink(&cfsctl_path, &symlink_path) + .map_err(|e| format!("Failed to create mkcomposefs symlink: {e}"))?; + + Ok(symlink_path) + }); + + match result { + Ok(path) => Ok(path.clone()), + Err(e) => bail!("{e}"), + } +} + +/// Compare Rust and C mkcomposefs output for a given dumpfile. +/// +/// Returns `Ok(true)` if the outputs are bit-for-bit identical, `Ok(false)` if +/// the C mkcomposefs binary is not available (test should be skipped). +fn compare_mkcomposefs_output(dumpfile: &str) -> Result { + let Some(c_mkcomposefs) = c_mkcomposefs_path() else { + eprintln!( + "Skipping: C mkcomposefs not found (install composefs or set C_MKCOMPOSEFS_PATH)" + ); + return Ok(false); + }; + let rust_mkcomposefs = rust_mkcomposefs_path()?; + + // Run Rust mkcomposefs + let mut rust_cmd = Command::new(&rust_mkcomposefs) + .args(["--from-file", "-", "-"]) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .context("Failed to spawn Rust mkcomposefs")?; + + { + let stdin = rust_cmd.stdin.as_mut().unwrap(); + stdin + .write_all(dumpfile.as_bytes()) + .context("Failed to write to Rust mkcomposefs stdin")?; + } + + let rust_output = rust_cmd + .wait_with_output() + .context("Failed to wait for Rust mkcomposefs")?; + + if !rust_output.status.success() { + bail!( + "Rust mkcomposefs failed: {}", + String::from_utf8_lossy(&rust_output.stderr) + ); + } + + // Run C mkcomposefs + let mut c_cmd = Command::new(c_mkcomposefs) + .args(["--min-version=0", "--from-file", "-", "-"]) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .context("Failed to spawn C mkcomposefs")?; + + { + let stdin = c_cmd.stdin.as_mut().unwrap(); + stdin + .write_all(dumpfile.as_bytes()) + .context("Failed to write to C mkcomposefs stdin")?; + } + + let c_output = c_cmd + .wait_with_output() + .context("Failed to wait for C mkcomposefs")?; + + if !c_output.status.success() { + bail!( + "C mkcomposefs failed: {}", + String::from_utf8_lossy(&c_output.stderr) + ); + } + + // Compare outputs + let rust_image = rust_output.stdout; + let c_image = c_output.stdout; + + if rust_image != c_image { + // Find first difference for debugging + let first_diff = rust_image + .iter() + .zip(c_image.iter()) + .position(|(a, b)| a != b) + .unwrap_or(std::cmp::min(rust_image.len(), c_image.len())); + + bail!( + "Images differ! Rust: {} bytes, C: {} bytes. First difference at byte {}.\n\ + Dumpfile has {} lines.", + rust_image.len(), + c_image.len(), + first_diff, + dumpfile.lines().count() + ); + } + + Ok(true) +} + +/// Create a realistic test filesystem with container-like structure. +/// +/// This creates a directory structure similar to what you'd find in a container: +/// - Nested directories (/usr/bin, /usr/lib, /etc, /var/log) +/// - Symlinks (absolute and relative) +/// - Large files (for external content) +/// - Various file permissions +fn create_container_like_rootfs(root: &std::path::Path) -> Result<()> { + // Create directory structure + fs::create_dir_all(root.join("usr/bin"))?; + fs::create_dir_all(root.join("usr/lib/x86_64-linux-gnu"))?; + fs::create_dir_all(root.join("usr/share/doc/test"))?; + fs::create_dir_all(root.join("etc/default"))?; + fs::create_dir_all(root.join("var/log"))?; + fs::create_dir_all(root.join("var/cache"))?; + fs::create_dir_all(root.join("tmp"))?; + fs::create_dir_all(root.join("home/user"))?; + + // Create various files + fs::write(root.join("usr/bin/hello"), "#!/bin/sh\necho Hello\n")?; + fs::write(root.join("usr/bin/world"), "#!/bin/sh\necho World\n")?; + + // Create a large file (128KB) that won't be inlined + let large_content = "x".repeat(128 * 1024); + fs::write(root.join("usr/lib/libtest.so"), &large_content)?; + + // Create files in nested directories + fs::write( + root.join("usr/lib/x86_64-linux-gnu/libc.so.6"), + &large_content, + )?; + fs::write( + root.join("usr/share/doc/test/README"), + "Test documentation\n", + )?; + fs::write( + root.join("usr/share/doc/test/LICENSE"), + "MIT License\n...\n", + )?; + + // Create config files + fs::write(root.join("etc/hostname"), "container\n")?; + fs::write(root.join("etc/passwd"), "root:x:0:0:root:/root:/bin/sh\n")?; + fs::write(root.join("etc/default/locale"), "LANG=en_US.UTF-8\n")?; + + // Create log files + fs::write(root.join("var/log/messages"), "")?; + fs::write(root.join("var/log/auth.log"), "")?; + + // Create symlinks + symlink("/usr/bin/hello", root.join("usr/bin/hi"))?; + symlink("../lib/libtest.so", root.join("usr/bin/libtest-link"))?; + symlink("/etc/hostname", root.join("etc/HOSTNAME"))?; + + // Create home directory files + fs::write(root.join("home/user/.bashrc"), "# Bash config\n")?; + fs::write(root.join("home/user/.profile"), "# Profile\n")?; + + Ok(()) +} + +/// Create a dumpfile from a directory using cfsctl. +fn create_dumpfile_from_dir(sh: &Shell, root: &std::path::Path) -> Result { + let cfsctl = cfsctl()?; + let repo_dir = tempfile::tempdir()?; + let repo = repo_dir.path(); + + // Use cfsctl to create a dumpfile from the directory. + // Use --no-propagate-usr-to-root because test directories may not have /usr. + let dumpfile = cmd!( + sh, + "{cfsctl} --insecure --hash sha256 --repo {repo} create-dumpfile --no-propagate-usr-to-root {root}" + ) + .read() + .with_context(|| format!("Failed to create dumpfile from {:?}", root))?; + + Ok(dumpfile) +} + +/// Test bit-for-bit compatibility with a container-like filesystem. +/// +/// Creates a realistic filesystem structure and verifies that both +/// Rust and C mkcomposefs produce identical output. +fn test_container_rootfs_compat() -> Result<()> { + let sh = Shell::new()?; + let rootfs_dir = tempfile::tempdir()?; + let rootfs = rootfs_dir.path().join("rootfs"); + fs::create_dir_all(&rootfs)?; + + // Create the test filesystem + create_container_like_rootfs(&rootfs)?; + + // Generate dumpfile + let dumpfile = create_dumpfile_from_dir(&sh, &rootfs)?; + + eprintln!( + "Container rootfs dumpfile: {} lines, {} bytes", + dumpfile.lines().count(), + dumpfile.len() + ); + + if compare_mkcomposefs_output(&dumpfile)? { + eprintln!("Container rootfs: bit-for-bit match!"); + } + Ok(()) +} +integration_test!(test_container_rootfs_compat); + +/// Test with deeply nested directory structure. +/// +/// This exercises the BFS inode ordering with many levels of nesting. +fn test_deep_nesting_compat() -> Result<()> { + let sh = Shell::new()?; + let rootfs_dir = tempfile::tempdir()?; + let rootfs = rootfs_dir.path().join("rootfs"); + + // Create deeply nested structure: /a/b/c/d/e/f/g/h/file + let deep_path = rootfs.join("a/b/c/d/e/f/g/h"); + fs::create_dir_all(&deep_path)?; + fs::write(deep_path.join("file"), "deep content")?; + + // Add files at various levels + fs::write(rootfs.join("a/file1"), "level 1")?; + fs::write(rootfs.join("a/b/file2"), "level 2")?; + fs::write(rootfs.join("a/b/c/file3"), "level 3")?; + fs::write(rootfs.join("a/b/c/d/file4"), "level 4")?; + + // Add parallel directory trees + fs::create_dir_all(rootfs.join("x/y/z"))?; + fs::write(rootfs.join("x/file"), "x tree")?; + fs::write(rootfs.join("x/y/file"), "y tree")?; + fs::write(rootfs.join("x/y/z/file"), "z tree")?; + + let dumpfile = create_dumpfile_from_dir(&sh, &rootfs)?; + + eprintln!( + "Deep nesting dumpfile: {} lines, {} bytes", + dumpfile.lines().count(), + dumpfile.len() + ); + + if compare_mkcomposefs_output(&dumpfile)? { + eprintln!("Deep nesting: bit-for-bit match!"); + } + Ok(()) +} +integration_test!(test_deep_nesting_compat); + +/// Test with many files in a single directory. +/// +/// This exercises the directory entry handling with many entries. +fn test_wide_directory_compat() -> Result<()> { + let sh = Shell::new()?; + let rootfs_dir = tempfile::tempdir()?; + let rootfs = rootfs_dir.path().join("rootfs"); + fs::create_dir_all(&rootfs)?; + + // Create many files in a single directory + for i in 0..100 { + fs::write(rootfs.join(format!("file{i:03}")), format!("content {i}"))?; + } + + // Add some subdirectories with files too + for i in 0..10 { + let subdir = rootfs.join(format!("dir{i:02}")); + fs::create_dir_all(&subdir)?; + for j in 0..5 { + fs::write(subdir.join(format!("file{j}")), format!("content {i}.{j}"))?; + } + } + + let dumpfile = create_dumpfile_from_dir(&sh, &rootfs)?; + + eprintln!( + "Wide directory dumpfile: {} lines, {} bytes", + dumpfile.lines().count(), + dumpfile.len() + ); + + if compare_mkcomposefs_output(&dumpfile)? { + eprintln!("Wide directory: bit-for-bit match!"); + } + Ok(()) +} +integration_test!(test_wide_directory_compat); + +/// Test with symlinks (both absolute and relative). +fn test_symlinks_compat() -> Result<()> { + let sh = Shell::new()?; + let rootfs_dir = tempfile::tempdir()?; + let rootfs = rootfs_dir.path().join("rootfs"); + + fs::create_dir_all(rootfs.join("usr/bin"))?; + fs::create_dir_all(rootfs.join("usr/lib"))?; + fs::create_dir_all(rootfs.join("bin"))?; + fs::create_dir_all(rootfs.join("lib"))?; + + // Create target files + fs::write(rootfs.join("usr/bin/real"), "real binary")?; + fs::write(rootfs.join("usr/lib/libreal.so"), "real library")?; + + // Absolute symlinks + symlink("/usr/bin/real", rootfs.join("bin/link1"))?; + symlink("/usr/lib/libreal.so", rootfs.join("lib/liblink.so"))?; + + // Relative symlinks + symlink("../usr/bin/real", rootfs.join("bin/link2"))?; + symlink("../lib/libreal.so", rootfs.join("usr/bin/liblink"))?; + + // Symlink to symlink + symlink("link1", rootfs.join("bin/link3"))?; + + // Long symlink target + let long_target = "/very/long/path/that/goes/deep/into/the/filesystem/structure"; + symlink(long_target, rootfs.join("bin/longlink"))?; + + let dumpfile = create_dumpfile_from_dir(&sh, &rootfs)?; + + eprintln!( + "Symlinks dumpfile: {} lines, {} bytes", + dumpfile.lines().count(), + dumpfile.len() + ); + + if compare_mkcomposefs_output(&dumpfile)? { + eprintln!("Symlinks: bit-for-bit match!"); + } + Ok(()) +} +integration_test!(test_symlinks_compat); + +/// Test that `--digest-store` writes files in the C-compatible flat `XX/DIGEST` layout. +/// +/// Creates a rootfs with a file large enough to be stored externally (> 4096 bytes), +/// runs Rust mkcomposefs with `--digest-store`, and verifies that the store contains +/// objects at `XX/DIGEST` paths (not under an `objects/` subdirectory). +fn test_digest_store_flat_layout() -> Result<()> { + let rust_mkcomposefs = rust_mkcomposefs_path()?; + + let td = tempfile::tempdir()?; + let rootfs = td.path().join("rootfs"); + let store = td.path().join("store"); + let image = td.path().join("image.img"); + + fs::create_dir_all(&rootfs)?; + + // Write a file large enough to be stored as an external object (> 4096 bytes). + let large_content = "x".repeat(8192); + fs::write(rootfs.join("bigfile"), &large_content)?; + // Also a small inline file. + fs::write(rootfs.join("small"), "tiny")?; + + // Run Rust mkcomposefs with --digest-store on the directory directly. + let output = Command::new(&rust_mkcomposefs) + .args([ + "--digest-store", + store.to_str().unwrap(), + rootfs.to_str().unwrap(), + image.to_str().unwrap(), + ]) + .output() + .context("Failed to run Rust mkcomposefs")?; + + if !output.status.success() { + bail!( + "Rust mkcomposefs failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + } + + // The store should exist and contain objects in flat XX/DIGEST layout. + assert!(store.exists(), "Digest store directory should exist"); + + // Walk the store and collect object paths. + let mut found_objects = Vec::new(); + for entry in fs::read_dir(&store)? { + let entry = entry?; + let name = entry.file_name(); + let name = name.to_string_lossy(); + // Should be exactly 2-char hex directories (e.g. "a3") — no "objects/" subdirectory. + if entry.file_type()?.is_dir() { + assert_eq!( + name.len(), + 2, + "Store subdirectory should be 2-char hex, got {:?}", + name + ); + assert!( + name.chars().all(|c| c.is_ascii_hexdigit()), + "Store subdirectory should be hex, got {:?}", + name + ); + for obj in fs::read_dir(entry.path())? { + let obj = obj?; + found_objects.push(format!("{}/{}", name, obj.file_name().to_string_lossy())); + } + } + } + + // The large file should have been stored externally. + assert!( + !found_objects.is_empty(), + "At least one object should be stored (large file should be external)" + ); + + eprintln!( + "Digest store flat layout: found {} object(s)", + found_objects.len() + ); + for path in &found_objects { + eprintln!(" {path}"); + // Verify it's a valid 2-char prefix / 62-char hex path. + let parts: Vec<&str> = path.splitn(2, '/').collect(); + assert_eq!(parts.len(), 2, "Expected XX/DIGEST format"); + assert_eq!(parts[0].len(), 2); + assert_eq!( + parts[1].len(), + 62, + "Expected 62-char remainder of sha256 hex" + ); + assert!( + parts + .iter() + .flat_map(|s| s.chars()) + .all(|c| c.is_ascii_hexdigit()), + "All characters should be hex" + ); + } + + Ok(()) +} +integration_test!(test_digest_store_flat_layout); diff --git a/crates/composefs-integration-tests/src/tests/privileged.rs b/crates/composefs-integration-tests/src/tests/privileged.rs index de15ba63..8609a57d 100644 --- a/crates/composefs-integration-tests/src/tests/privileged.rs +++ b/crates/composefs-integration-tests/src/tests/privileged.rs @@ -15,7 +15,7 @@ use anyhow::{Context, Result, bail, ensure}; use xshell::{Shell, cmd}; use composefs_oci::composefs::fsverity::{FsVerityHashValue, Sha256HashValue, Sha512HashValue}; -use composefs_oci::composefs::repository::Repository; +use composefs_oci::composefs::repository::{Repository, RepositoryConfig}; use crate::{cfsctl, integration_test}; @@ -657,8 +657,11 @@ fn init_insecure_repo_at( rustix::fs::OFlags::CLOEXEC | rustix::fs::OFlags::RDONLY, 0.into(), )?; - let (mut repo, _created) = Repository::::init_path(&fd, ".", algorithm, false)?; - repo.set_insecure(); + let (repo, _created) = Repository::::init_path( + &fd, + ".", + RepositoryConfig::new(algorithm).set_insecure(), + )?; Ok(Arc::new(repo)) } diff --git a/crates/composefs-oci/Cargo.toml b/crates/composefs-oci/Cargo.toml index c721295f..1419b502 100644 --- a/crates/composefs-oci/Cargo.toml +++ b/crates/composefs-oci/Cargo.toml @@ -27,7 +27,6 @@ composefs-boot = { workspace = true, optional = true } containers-image-proxy = { version = "0.10", default-features = false } cstorage = { package = "composefs-storage", path = "../composefs-storage", version = "0.4.0", optional = true } hex = { version = "0.4.0", default-features = false } -indicatif = { version = "0.18.0", default-features = false, features = ["tokio"] } rustix = { version = "1.0.0", features = ["fs"] } serde = { version = "1.0", default-features = false, features = ["derive"] } thiserror = { version = "2.0.0", default-features = false } diff --git a/crates/composefs-oci/src/boot.rs b/crates/composefs-oci/src/boot.rs index 726ded80..0d7458d7 100644 --- a/crates/composefs-oci/src/boot.rs +++ b/crates/composefs-oci/src/boot.rs @@ -60,8 +60,10 @@ pub fn remove_boot_image( repo, &config_json, img.layer_refs().clone(), - img.image_ref(), - None, // no boot image + img.image_ref_v2(), // preserve existing V2 image ref + img.image_ref_v1(), // preserve existing V1 image ref + None, // no boot image (V2) + None, // no boot image (V1) )?; let manifest_json = img.read_manifest_json(repo)?; @@ -121,7 +123,7 @@ mod test { assert_eq!(oci.boot_image_ref(), Some(&image_verity)); let plain_image = crate::image::create_filesystem(repo, &img.config_digest, None).unwrap(); - let plain_verity = plain_image.compute_image_id(); + let plain_verity = plain_image.compute_image_id(repo.erofs_version()); assert_ne!( image_verity, plain_verity, "boot-transformed image should differ from non-transformed image" diff --git a/crates/composefs-oci/src/cstor.rs b/crates/composefs-oci/src/cstor.rs index 00cdc768..18eb759a 100644 --- a/crates/composefs-oci/src/cstor.rs +++ b/crates/composefs-oci/src/cstor.rs @@ -44,7 +44,6 @@ use std::sync::Arc; use anyhow::{Context, Result}; use base64::Engine; -use indicatif::{ProgressBar, ProgressStyle}; use composefs::{ INLINE_CONTENT_MAX_V0, @@ -61,6 +60,7 @@ use cstorage::{ pub use cstorage::init_if_helper; use crate::oci_image::manifest_identifier; +use crate::progress::{ComponentId, ProgressEvent, ProgressUnit, SharedReporter}; use crate::skopeo::{OCI_CONFIG_CONTENT_TYPE, OCI_MANIFEST_CONTENT_TYPE, TAR_LAYER_CONTENT_TYPE}; use crate::{ContentAndVerity, ImportStats, OciDigest, config_identifier, layer_identifier}; @@ -98,6 +98,7 @@ pub async fn import_from_containers_storage( zerocopy: bool, storage_root: Option<&std::path::Path>, additional_image_stores: &[&std::path::Path], + reporter: SharedReporter, ) -> Result<(CstorImportResult, ImportStats)> { // Check if we can access files directly or need a proxy if can_bypass_file_permissions() { @@ -119,6 +120,7 @@ pub async fn import_from_containers_storage( zerocopy, storage_root.as_deref(), &additional_image_stores, + reporter, ) }) .await @@ -132,7 +134,7 @@ pub async fn import_from_containers_storage( "storage_root and additional_image_stores are not supported in rootless mode" ); } - import_from_containers_storage_proxied(repo, image_id, reference, zerocopy).await + import_from_containers_storage_proxied(repo, image_id, reference, zerocopy, reporter).await } } @@ -147,6 +149,7 @@ fn import_from_containers_storage_direct( zerocopy: bool, storage_root: Option<&std::path::Path>, additional_image_stores: &[std::path::PathBuf], + reporter: SharedReporter, ) -> Result<(CstorImportResult, ImportStats)> { let mut stats = ImportStats::default(); let mut ctx = ImportContext::default(); @@ -218,43 +221,41 @@ fn import_from_containers_storage_direct( stats.layers = storage_layer_ids.len() as u64; - // Import each layer with progress bar - let progress = ProgressBar::new(storage_layer_ids.len() as u64); - progress.set_style( - ProgressStyle::default_bar() - .template("{spinner:.green} [{bar:40.cyan/blue}] {pos}/{len} {msg}") - .expect("valid template") - .progress_chars("=>-"), - ); - let mut layer_refs = Vec::with_capacity(storage_layer_ids.len()); for (storage_layer_id, diff_id) in storage_layer_ids.iter().zip(diff_ids.iter()) { let content_id = layer_identifier(diff_id); - let diff_id_str: &str = diff_id.as_ref(); - let short_id = diff_id_str.get(..19).unwrap_or(diff_id_str); + let id = ComponentId::from(diff_id.to_string()); let layer_verity = if let Some(existing) = repo.has_stream(&content_id)? { - progress.set_message(format!("Already have {short_id}...")); + reporter.report(ProgressEvent::Skipped { id }); stats.layers_already_present += 1; existing } else { - progress.set_message(format!("Importing {short_id}...")); + reporter.report(ProgressEvent::Started { + id: id.clone(), + total: None, + unit: ProgressUnit::Bytes, + }); let (layer_store, layer) = stores .iter() .find_map(|s| Layer::open(s, storage_layer_id).ok().map(|l| (s, l))) .with_context(|| format!("Failed to open layer {}", storage_layer_id))?; let (verity, layer_stats) = import_layer_direct(repo, layer_store, &layer, diff_id, zerocopy, &mut ctx)?; + let bytes = layer_stats.new_bytes(); stats.merge(&layer_stats); + reporter.report(ProgressEvent::Done { + id, + transferred: bytes, + }); verity }; layer_refs.push((diff_id.clone(), layer_verity)); - progress.inc(1); } - progress.finish_with_message("Layers imported"); - finalize_import(repo, &image, &layer_refs, reference, &progress, stats) + reporter.report(ProgressEvent::Message("Layers imported".to_string())); + finalize_import(repo, &image, &layer_refs, reference, &reporter, stats) } /// Proxied (rootless) implementation of containers-storage import. @@ -266,6 +267,7 @@ async fn import_from_containers_storage_proxied( image_id: &str, reference: Option<&str>, zerocopy: bool, + reporter: SharedReporter, ) -> Result<(CstorImportResult, ImportStats)> { let mut stats = ImportStats::default(); let mut ctx = ImportContext::default(); @@ -306,15 +308,6 @@ async fn import_from_containers_storage_proxied( stats.layers = image_info.storage_layer_ids.len() as u64; - // Import each layer with progress bar - let progress = ProgressBar::new(image_info.storage_layer_ids.len() as u64); - progress.set_style( - ProgressStyle::default_bar() - .template("{spinner:.green} [{bar:40.cyan/blue}] {pos}/{len} {msg}") - .expect("valid template") - .progress_chars("=>-"), - ); - let mut layer_refs = Vec::with_capacity(image_info.storage_layer_ids.len()); for (storage_layer_id, diff_id) in image_info @@ -323,15 +316,18 @@ async fn import_from_containers_storage_proxied( .zip(image_info.layer_diff_ids.iter()) { let content_id = layer_identifier(diff_id); - let diff_id_str: &str = diff_id.as_ref(); - let short_id = diff_id_str.get(..19).unwrap_or(diff_id_str); + let id = ComponentId::from(diff_id.to_string()); let layer_verity = if let Some(existing) = repo.has_stream(&content_id)? { - progress.set_message(format!("Already have {short_id}...")); + reporter.report(ProgressEvent::Skipped { id }); stats.layers_already_present += 1; existing } else { - progress.set_message(format!("Importing {short_id}...")); + reporter.report(ProgressEvent::Started { + id: id.clone(), + total: None, + unit: ProgressUnit::Bytes, + }); let (verity, layer_stats) = import_layer_proxied( repo, &mut proxy, @@ -342,14 +338,19 @@ async fn import_from_containers_storage_proxied( &mut ctx, ) .await?; + let bytes = layer_stats.new_bytes(); stats.merge(&layer_stats); + reporter.report(ProgressEvent::Done { + id, + transferred: bytes, + }); verity }; layer_refs.push((diff_id.clone(), layer_verity)); - progress.inc(1); } - progress.finish_with_message("Layers imported"); + + reporter.report(ProgressEvent::Message("Layers imported".to_string())); // Config and manifest metadata don't have restrictive file permissions, // so we can read them directly without the proxy. @@ -362,7 +363,7 @@ async fn import_from_containers_storage_proxied( // Shutdown the proxy before the blocking finalization proxy.shutdown().await.context("Failed to shutdown proxy")?; - finalize_import(repo, &image, &layer_refs, reference, &progress, stats) + finalize_import(repo, &image, &layer_refs, reference, &reporter, stats) } /// Create config + manifest splitstreams, generate the EROFS image, and tag. @@ -378,7 +379,7 @@ fn finalize_import( image: &Image, layer_refs: &[(OciDigest, ObjectID)], reference: Option<&str>, - progress: &ProgressBar, + reporter: &SharedReporter, stats: ImportStats, ) -> Result<(CstorImportResult, ImportStats)> { // Read the raw config JSON bytes from metadata @@ -391,10 +392,14 @@ fn finalize_import( let content_id = config_identifier(&config_digest); let config_verity = if let Some(existing) = repo.has_stream(&content_id)? { - progress.println(format!("Already have config {config_digest}")); + reporter.report(ProgressEvent::Message(format!( + "Already have config {config_digest}" + ))); existing } else { - progress.println(format!("Creating config splitstream {config_digest}")); + reporter.report(ProgressEvent::Message(format!( + "Creating config splitstream {config_digest}" + ))); let mut writer = repo.create_stream(OCI_CONFIG_CONTENT_TYPE)?; for (diff_id, verity) in layer_refs { @@ -414,10 +419,14 @@ fn finalize_import( let manifest_content_id = manifest_identifier(&manifest_digest); let manifest_verity = if let Some(existing) = repo.has_stream(&manifest_content_id)? { - progress.println(format!("Already have manifest {manifest_digest}")); + reporter.report(ProgressEvent::Message(format!( + "Already have manifest {manifest_digest}" + ))); existing } else { - progress.println(format!("Creating manifest splitstream {manifest_digest}")); + reporter.report(ProgressEvent::Message(format!( + "Creating manifest splitstream {manifest_digest}" + ))); let mut writer = repo.create_stream(OCI_MANIFEST_CONTENT_TYPE)?; let config_ref_key = format!("config:{config_digest}"); diff --git a/crates/composefs-oci/src/image.rs b/crates/composefs-oci/src/image.rs index 14a8ae2f..275d73fa 100644 --- a/crates/composefs-oci/src/image.rs +++ b/crates/composefs-oci/src/image.rs @@ -171,6 +171,7 @@ mod test { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), }, item: TarItem::Leaf(LeafContent::Regular(RegularFile::Inline([].into()))), @@ -185,6 +186,7 @@ mod test { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), }, item: TarItem::Directory, @@ -331,7 +333,11 @@ mod test { /// with `get_entry()`, and verify every entry type round-trips correctly. #[tokio::test] async fn test_build_baseimage_roundtrip() -> Result<()> { - use composefs::{INLINE_CONTENT_MAX_V0, repository::Repository, test::tempdir}; + use composefs::{ + INLINE_CONTENT_MAX_V0, + repository::{Repository, RepositoryConfig}, + test::tempdir, + }; use rustix::fs::CWD; use std::ffi::OsStr; use std::sync::Arc; @@ -344,8 +350,7 @@ mod test { let (repo, _) = Repository::::init_path( CWD, &repo_path, - composefs::fsverity::Algorithm::SHA256, - false, + RepositoryConfig::default().set_insecure(), )?; let repo = Arc::new(repo); let (verity, _stats) = diff --git a/crates/composefs-oci/src/lib.rs b/crates/composefs-oci/src/lib.rs index f91cfedd..dcc92afb 100644 --- a/crates/composefs-oci/src/lib.rs +++ b/crates/composefs-oci/src/lib.rs @@ -19,6 +19,8 @@ pub mod image; pub mod layer; pub mod oci_image; pub mod oci_layout; +/// Re-exported from [`composefs::progress`]; use that path directly in new code. +pub mod progress; pub mod skopeo; pub mod tar; @@ -46,6 +48,7 @@ use containers_image_proxy::oci_spec::image::{Descriptor, MediaType}; use sha2::{Digest, Sha256}; use composefs::{ + erofs::format::FormatVersion, fsverity::FsVerityHashValue, repository::{ObjectStoreMethod, Repository}, splitstream::SplitStreamStats, @@ -54,12 +57,18 @@ use composefs::{ use crate::skopeo::{OCI_CONFIG_CONTENT_TYPE, TAR_LAYER_CONTENT_TYPE}; use crate::tar::get_entry; -/// Named ref key for the EROFS image derived from this OCI config. +/// Named ref key for the V2 EROFS image derived from this OCI config. pub const IMAGE_REF_KEY: &str = "composefs.image"; -/// Named ref key for the boot EROFS image derived from this OCI config. +/// Named ref key for the V1 EROFS image derived from this OCI config. +pub const IMAGE_REF_KEY_V1: &str = "composefs.image.v1"; + +/// Named ref key for the V2 boot EROFS image derived from this OCI config. pub const BOOT_IMAGE_REF_KEY: &str = "composefs.image.boot"; +/// Named ref key for the V1 boot EROFS image derived from this OCI config. +pub const BOOT_IMAGE_REF_KEY_V1: &str = "composefs.image.boot.v1"; + // Re-export key types for convenience #[cfg(feature = "boot")] pub use boot::generate_boot_image; @@ -70,6 +79,7 @@ pub use oci_image::{ oci_fsck, oci_fsck_image, remove_referrer, remove_referrers_for_subject, resolve_ref, tag_image, untag_image, }; +pub use progress::{ComponentId, NullReporter, ProgressEvent, ProgressReporter, SharedReporter}; pub use skopeo::pull_image; /// Statistics from an image import operation. @@ -156,6 +166,22 @@ impl ImportStats { } } +/// Format a byte count in human-readable form (e.g. "1.23 MiB"). +fn human_bytes(bytes: u64) -> String { + const UNITS: &[&str] = &["B", "KiB", "MiB", "GiB", "TiB"]; + let mut value = bytes as f64; + let mut unit_idx = 0; + while value >= 1024.0 && unit_idx + 1 < UNITS.len() { + value /= 1024.0; + unit_idx += 1; + } + if unit_idx == 0 { + format!("{} B", bytes) + } else { + format!("{:.2} {}", value, UNITS[unit_idx]) + } +} + impl std::fmt::Display for ImportStats { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let has_zerocopy = self.objects_reflinked > 0 || self.objects_hardlinked > 0; @@ -174,25 +200,13 @@ impl std::fmt::Display for ImportStats { let mut byte_parts = Vec::new(); if self.objects_reflinked > 0 { - byte_parts.push(format!( - "{} reflinked", - indicatif::HumanBytes(self.bytes_reflinked) - )); + byte_parts.push(format!("{} reflinked", human_bytes(self.bytes_reflinked))); } if self.objects_hardlinked > 0 { - byte_parts.push(format!( - "{} hardlinked", - indicatif::HumanBytes(self.bytes_hardlinked) - )); + byte_parts.push(format!("{} hardlinked", human_bytes(self.bytes_hardlinked))); } - byte_parts.push(format!( - "{} copied", - indicatif::HumanBytes(self.bytes_copied) - )); - byte_parts.push(format!( - "{} inlined", - indicatif::HumanBytes(self.bytes_inlined) - )); + byte_parts.push(format!("{} copied", human_bytes(self.bytes_copied))); + byte_parts.push(format!("{} inlined", human_bytes(self.bytes_inlined))); write!(f, "{}", byte_parts.join(", ")) } else { write!( @@ -200,8 +214,8 @@ impl std::fmt::Display for ImportStats { "{} new + {} already present objects; {} stored, {} inlined", self.objects_copied, self.objects_already_present, - indicatif::HumanBytes(self.bytes_copied), - indicatif::HumanBytes(self.bytes_inlined), + human_bytes(self.bytes_copied), + human_bytes(self.bytes_inlined), ) } } @@ -227,7 +241,7 @@ pub enum LocalFetchOpt { /// /// Use `Default::default()` for the common case (skopeo transport, no /// containers-storage import). -#[derive(Debug, Default)] +#[derive(Default)] pub struct PullOptions<'a> { /// Image proxy configuration passed to skopeo (ignored for /// `containers-storage:` references when `local_fetch` is not @@ -248,6 +262,32 @@ pub struct PullOptions<'a> { /// `additionalimagestore=` option in containers/storage. /// Only relevant when `local_fetch` is not [`Disabled`](LocalFetchOpt::Disabled). pub additional_image_stores: &'a [&'a std::path::Path], + + /// Progress reporter for this pull operation. + /// + /// When `None`, all progress events are silently discarded. Supply a + /// [`SharedReporter`] implementation (e.g. an `indicatif`-backed renderer) + /// to receive [`ProgressEvent`]s as the pull proceeds. + pub progress: Option, +} + +impl<'a> std::fmt::Debug for PullOptions<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PullOptions") + .field("img_proxy_config", &self.img_proxy_config) + .field("local_fetch", &self.local_fetch) + .field("storage_root", &self.storage_root) + .field("additional_image_stores", &self.additional_image_stores) + .field( + "progress", + if self.progress.is_some() { + &"Some()" + } else { + &"None" + }, + ) + .finish() + } } /// Result of a pull operation. @@ -274,10 +314,14 @@ pub struct OpenConfig { pub config: ImageConfiguration, /// Map from layer diff_id to its fs-verity object ID. pub layer_refs: HashMap, ObjectID>, - /// The EROFS image ObjectID linked to this config, if any. + /// The V2 EROFS image ObjectID linked to this config, if any. pub image_ref: Option, - /// The boot EROFS image ObjectID linked to this config, if any. + /// The V1 EROFS image ObjectID linked to this config, if any. + pub image_ref_v1: Option, + /// The V2 boot EROFS image ObjectID linked to this config, if any. pub boot_image_ref: Option, + /// The V1 boot EROFS image ObjectID linked to this config, if any. + pub boot_image_ref_v1: Option, } impl std::fmt::Debug for OpenConfig { @@ -285,7 +329,9 @@ impl std::fmt::Debug for OpenConfig { f.debug_struct("OpenConfig") .field("layer_refs", &self.layer_refs) .field("image_ref", &self.image_ref) + .field("image_ref_v1", &self.image_ref_v1) .field("boot_image_ref", &self.boot_image_ref) + .field("boot_image_ref_v1", &self.boot_image_ref_v1) .finish_non_exhaustive() } } @@ -367,6 +413,10 @@ pub async fn pull( reference: Option<&str>, opts: PullOptions<'_>, ) -> Result> { + let reporter: SharedReporter = opts + .progress + .unwrap_or_else(|| std::sync::Arc::new(NullReporter)); + #[cfg(feature = "containers-storage")] if opts.local_fetch != LocalFetchOpt::Disabled && let Some(image_id) = cstor::parse_containers_storage_ref(imgref) @@ -380,6 +430,7 @@ pub async fn pull( zerocopy, opts.storage_root, opts.additional_image_stores, + reporter, ) .await?; return Ok(PullResult { @@ -392,7 +443,7 @@ pub async fn pull( } let (result, stats) = - skopeo::pull_image(repo, imgref, reference, opts.img_proxy_config).await?; + skopeo::pull_image(repo, imgref, reference, opts.img_proxy_config, reporter).await?; Ok(crate::PullResult { manifest_digest: result.manifest_digest, manifest_verity: result.manifest_verity, @@ -489,24 +540,30 @@ pub fn open_config( } let image_ref = named_refs.remove(IMAGE_REF_KEY); + let image_ref_v1 = named_refs.remove(IMAGE_REF_KEY_V1); let boot_image_ref = named_refs.remove(BOOT_IMAGE_REF_KEY); + let boot_image_ref_v1 = named_refs.remove(BOOT_IMAGE_REF_KEY_V1); let config = ImageConfiguration::from_reader(&data[..])?; Ok(OpenConfig { config, layer_refs: named_refs, image_ref, + image_ref_v1, boot_image_ref, + boot_image_ref_v1, }) } /// Returns the composefs EROFS ObjectID referenced by the given OCI config, if any. +/// +/// Returns the V1 image ObjectID when present (primary format), otherwise the V2 image ObjectID. pub fn composefs_erofs_for_config( repo: &Repository, config_digest: &OciDigest, verity: Option<&ObjectID>, ) -> Result> { let oc = open_config(repo, config_digest, verity)?; - Ok(oc.image_ref) + Ok(oc.image_ref_v1.or(oc.image_ref)) } /// Returns the composefs EROFS ObjectID for an OCI image identified by manifest, if any. @@ -523,13 +580,15 @@ pub fn composefs_erofs_for_manifest( } /// Returns the boot EROFS ObjectID from the given OCI config, if any. +/// +/// Returns the V1 boot image ObjectID when present (primary format), otherwise the V2 boot image. pub fn composefs_boot_erofs_for_config( repo: &Repository, config_digest: &OciDigest, verity: Option<&ObjectID>, ) -> Result> { let oc = open_config(repo, config_digest, verity)?; - Ok(oc.boot_image_ref) + Ok(oc.boot_image_ref_v1.or(oc.boot_image_ref)) } /// Returns the boot EROFS ObjectID for an OCI image identified by manifest, if any. @@ -614,11 +673,12 @@ pub fn upgrade_repo( /// fsverity can be independently enabled on it. /// /// If `image` is provided, a named ref with key [`IMAGE_REF_KEY`] is added to the -/// splitstream pointing to the EROFS image's ObjectID. This ensures the GC walk keeps -/// the EROFS image alive as long as the config is reachable. +/// splitstream pointing to the V2 EROFS image's ObjectID. If `image_v1` is provided, +/// a named ref with key [`IMAGE_REF_KEY_V1`] is added pointing to the V1 image. +/// These named refs ensure the GC walk keeps images alive as long as the config is reachable. /// -/// If `boot_image` is provided, a named ref with key [`BOOT_IMAGE_REF_KEY`] is added -/// pointing to the boot EROFS image's ObjectID. +/// If `boot_image` / `boot_image_v1` are provided, named refs with keys +/// [`BOOT_IMAGE_REF_KEY`] / [`BOOT_IMAGE_REF_KEY_V1`] are added. /// /// Returns a tuple of (sha256 content hash, fs-verity hash value). pub fn write_config( @@ -626,10 +686,20 @@ pub fn write_config( config: &ImageConfiguration, refs: HashMap, ObjectID>, image: Option<&ObjectID>, + image_v1: Option<&ObjectID>, boot_image: Option<&ObjectID>, + boot_image_v1: Option<&ObjectID>, ) -> Result> { let json = config.to_string()?; - write_config_raw(repo, json.as_bytes(), refs, image, boot_image) + write_config_raw( + repo, + json.as_bytes(), + refs, + image, + image_v1, + boot_image, + boot_image_v1, + ) } /// Rewrites a container configuration in the repository from raw JSON bytes. @@ -643,7 +713,9 @@ pub fn write_config_raw( config_json: &[u8], refs: HashMap, ObjectID>, image: Option<&ObjectID>, + image_v1: Option<&ObjectID>, boot_image: Option<&ObjectID>, + boot_image_v1: Option<&ObjectID>, ) -> Result> { let config_digest = hash_sha256(config_json); let mut stream = repo.create_stream(OCI_CONFIG_CONTENT_TYPE)?; @@ -662,9 +734,15 @@ pub fn write_config_raw( if let Some(image_id) = image { stream.add_named_stream_ref(IMAGE_REF_KEY, image_id); } + if let Some(image_id_v1) = image_v1 { + stream.add_named_stream_ref(IMAGE_REF_KEY_V1, image_id_v1); + } if let Some(boot_id) = boot_image { stream.add_named_stream_ref(BOOT_IMAGE_REF_KEY, boot_id); } + if let Some(boot_id_v1) = boot_image_v1 { + stream.add_named_stream_ref(BOOT_IMAGE_REF_KEY_V1, boot_id_v1); + } stream.write_external(config_json)?; let id = repo.write_stream(stream, &config_identifier(&config_digest), None)?; Ok((config_digest, id)) @@ -702,22 +780,39 @@ fn ensure_oci_composefs_erofs( // Build the composefs filesystem from all layers let fs = image::create_filesystem(repo, img.config_digest(), Some(img.config_verity()))?; - // Commit as EROFS image (no name — the GC link comes from the config ref) - let erofs_id = fs.commit_image(repo, None)?; + // Commit as EROFS image(s) for all formats in the repository's default set. + // No named ref — the GC link comes from the config splitstream ref. + let formats = repo.default_format_set(); + let mut erofs_map = fs.commit_images(repo, None, formats)?; + let erofs_id_v2 = erofs_map.remove(&FormatVersion::V2); + let erofs_id_v1 = erofs_map.remove(&FormatVersion::V1); + + // The "primary" ID to return is V1 when present, otherwise V2. + let erofs_id = erofs_id_v1 + .clone() + .or_else(|| erofs_id_v2.clone()) + .ok_or_else(|| { + anyhow::anyhow!( + "commit_images produced no EROFS images for format set {:?}", + formats + ) + })?; // Read original config JSON to preserve its exact bytes (and thus its // sha256 digest) when rewriting the splitstream with the new EROFS ref. let config_json = img.read_config_json(repo)?; - // Rewrite config with the EROFS image ref, using layer refs from the + // Rewrite config with the EROFS image ref(s), using layer refs from the // OciImage (which already stripped the old image ref if any). - // Preserve any existing boot image ref. + // Preserve any existing boot image refs (using explicit V2/V1 accessors). let (_config_digest, new_config_verity) = write_config_raw( repo, &config_json, img.layer_refs().clone(), - Some(&erofs_id), - img.boot_image_ref(), + erofs_id_v2.as_ref(), + erofs_id_v1.as_ref(), + img.boot_image_ref_v2(), + img.boot_image_ref_v1(), )?; // Read original manifest JSON for rewriting @@ -764,19 +859,36 @@ fn ensure_oci_composefs_erofs_boot( let mut fs = image::create_filesystem(repo, img.config_digest(), Some(img.config_verity()))?; fs.transform_for_boot(repo)?; - // Commit as EROFS image - let boot_erofs_id = fs.commit_image(repo, None)?; + // Commit as EROFS image(s) for all formats in the repository's default set. + let formats = repo.default_format_set(); + let mut boot_erofs_map = fs.commit_images(repo, None, formats)?; + let boot_erofs_id_v2 = boot_erofs_map.remove(&FormatVersion::V2); + let boot_erofs_id_v1 = boot_erofs_map.remove(&FormatVersion::V1); + + // The "primary" ID to return is V1 when present, otherwise V2. + let boot_erofs_id = boot_erofs_id_v1 + .clone() + .or_else(|| boot_erofs_id_v2.clone()) + .ok_or_else(|| { + anyhow::anyhow!( + "commit_images produced no EROFS images for format set {:?}", + formats + ) + })?; // Read original config JSON to preserve its exact bytes let config_json = img.read_config_json(repo)?; - // Rewrite config with the boot EROFS image ref, preserving the existing image ref + // Rewrite config with the boot EROFS image ref(s), preserving the existing image refs + // (using explicit V2/V1 accessors to avoid the V1-preferred fallback). let (_config_digest, new_config_verity) = write_config_raw( repo, &config_json, img.layer_refs().clone(), - img.image_ref(), - Some(&boot_erofs_id), + img.image_ref_v2(), + img.image_ref_v1(), + boot_erofs_id_v2.as_ref(), + boot_erofs_id_v1.as_ref(), )?; // Read original manifest JSON for rewriting @@ -806,10 +918,27 @@ mod test { use rustix::fs::CWD; - use composefs::{fsverity::Sha256HashValue, repository::Repository, test::tempdir}; + use composefs::{ + fsverity::Sha256HashValue, + repository::{Repository, RepositoryConfig}, + test::tempdir, + }; use super::*; + #[test] + fn test_human_bytes() { + assert_eq!(human_bytes(0), "0 B"); + assert_eq!(human_bytes(1), "1 B"); + assert_eq!(human_bytes(1023), "1023 B"); + assert_eq!(human_bytes(1024), "1.00 KiB"); + assert_eq!(human_bytes(1024 * 1024), "1.00 MiB"); + assert_eq!(human_bytes(1024 * 1024 * 1024), "1.00 GiB"); + assert_eq!(human_bytes(1024 * 1024 * 1024 * 1024), "1.00 TiB"); + // Values above TiB stay in TiB + assert_eq!(human_bytes(2 * 1024 * 1024 * 1024 * 1024), "2.00 TiB"); + } + /// Expected composefs dumpfile output for the base test image created by /// [`test_util::create_base_image`]. Used across multiple tests to verify /// EROFS round-trip correctness. @@ -843,13 +972,9 @@ mod test { fn create_test_repo() -> (tempfile::TempDir, Arc>) { let dir = tempdir(); let repo_path = dir.path().join("repo"); - let (repo, _) = Repository::init_path( - CWD, - &repo_path, - composefs::fsverity::Algorithm::SHA256, - false, - ) - .expect("initializing test repo"); + let (repo, _) = + Repository::init_path(CWD, &repo_path, RepositoryConfig::default().set_insecure()) + .expect("initializing test repo"); (dir, Arc::new(repo)) } @@ -972,7 +1097,7 @@ mod test { refs.insert("sha256:abc123def456".into(), Sha256HashValue::EMPTY); let (config_digest, config_verity) = - write_config(&repo, &config, refs.clone(), None, None).unwrap(); + write_config(&repo, &config, refs.clone(), None, None, None, None).unwrap(); assert!(config_digest.as_ref().starts_with("sha256:")); @@ -1008,7 +1133,7 @@ mod test { .unwrap(); let (config_digest, config_verity) = - write_config(&repo, &config, HashMap::new(), None, None).unwrap(); + write_config(&repo, &config, HashMap::new(), None, None, None, None).unwrap(); // Re-open the splitstream and check that the config JSON is stored // as an external object reference (not inline). This is important @@ -1094,8 +1219,8 @@ mod test { .map(|(d, v)| (d.as_str().into(), v.clone())) .collect(); - let (_digest1, verity1) = write_config(&repo, &config, refs1, None, None)?; - let (_digest2, verity2) = write_config(&repo, &config, refs2, None, None)?; + let (_digest1, verity1) = write_config(&repo, &config, refs1, None, None, None, None)?; + let (_digest2, verity2) = write_config(&repo, &config, refs2, None, None, None, None)?; // The verity must be identical regardless of HashMap iteration order assert_eq!( @@ -1133,7 +1258,7 @@ mod test { .unwrap(); let (config_digest, _config_verity) = - write_config(&repo, &config, HashMap::new(), None, None).unwrap(); + write_config(&repo, &config, HashMap::new(), None, None, None, None).unwrap(); let bad_digest: OciDigest = "sha256:0000000000000000000000000000000000000000000000000000000000000000" @@ -1173,8 +1298,16 @@ mod test { let fake_erofs_id: Sha256HashValue = composefs::fsverity::compute_verity(b"fake-erofs-image"); - let (config_digest, config_verity) = - write_config(&repo, &config, refs.clone(), Some(&fake_erofs_id), None).unwrap(); + let (config_digest, config_verity) = write_config( + &repo, + &config, + refs.clone(), + Some(&fake_erofs_id), + None, + None, + None, + ) + .unwrap(); // Reopen and verify let oc = open_config(&repo, &config_digest, Some(&config_verity)).unwrap(); @@ -1189,6 +1322,10 @@ mod test { Some(fake_erofs_id.clone()), "image ref should be returned" ); + assert!( + oc.image_ref_v1.is_none(), + "expected no V1 image ref for a V2-only config" + ); // Also verify via the convenience function let img_ref = @@ -1219,7 +1356,7 @@ mod test { refs.insert("sha256:abc123def456".into(), Sha256HashValue::EMPTY); let (config_digest, config_verity) = - write_config(&repo, &config, refs.clone(), None, None).unwrap(); + write_config(&repo, &config, refs.clone(), None, None, None, None).unwrap(); let oc = open_config(&repo, &config_digest, Some(&config_verity)).unwrap(); assert_eq!(oc.layer_refs.len(), 1); @@ -1289,6 +1426,93 @@ mod test { similar_asserts::assert_eq!(dump, EXPECTED_BASE_IMAGE_DUMPFILE); } + /// Verify that a repository with `FormatSet::BOTH` populates both V1 and V2 + /// named refs in the config splitstream and that both image objects exist. + #[tokio::test] + async fn test_dual_format_both_image_refs() { + use composefs::erofs::format::{FormatSet, FormatVersion}; + + // Create a BOTH-format repo (insecure, SHA-256). + let dir = tempdir(); + let repo_path = dir.path().join("repo"); + let mut both_config = RepositoryConfig::default().set_insecure(); + both_config.erofs_formats = FormatSet::BOTH; + let (repo_inner, _) = Repository::init_path(CWD, &repo_path, both_config) + .expect("initializing BOTH-format test repo"); + let repo = std::sync::Arc::new(repo_inner); + + assert_eq!(repo.default_format_set(), FormatSet::BOTH); + + // Pull a base image and generate EROFS. + let img = test_util::create_base_image(&repo, Some("dual:v1")).await; + let primary_id = ensure_oci_composefs_erofs( + &repo, + &img.manifest_digest, + Some(&img.manifest_verity), + Some("dual:v1"), + ) + .unwrap() + .expect("container image should produce EROFS"); + + // Re-open the rewritten config. + let oci = oci_image::OciImage::open_ref(&repo, "dual:v1").unwrap(); + let oc = open_config(&repo, oci.config_digest(), Some(oci.config_verity())).unwrap(); + + // Both V1 and V2 refs must be populated. + let id_v1 = oc + .image_ref_v1 + .as_ref() + .expect("V1 image ref should be set for BOTH format set"); + let id_v2 = oc + .image_ref + .as_ref() + .expect("V2 image ref should be set for BOTH format set"); + + // The two digests must differ (V1 and V2 produce different wire formats). + assert_ne!( + id_v1, id_v2, + "V1 and V2 EROFS images must have different digests" + ); + + // primary returned by ensure_oci_composefs_erofs is V1 (formats.iter() yields V1 first). + assert_eq!(&primary_id, id_v1, "primary ID should be the V1 digest"); + + // composefs_erofs_for_config prefers V1. + let via_fn = + composefs_erofs_for_config(&repo, oci.config_digest(), Some(oci.config_verity())) + .unwrap(); + assert_eq!( + via_fn.as_ref(), + Some(id_v1), + "composefs_erofs_for_config should prefer V1" + ); + + // OciImage::image_ref() also prefers V1. + assert_eq!(oci.image_ref(), Some(id_v1)); + assert_eq!(oci.image_ref_v2(), Some(id_v2)); + + // Both image objects must actually exist in the repository. + assert!( + repo.open_image(&id_v1.to_hex()).is_ok(), + "V1 EROFS image should exist in repo" + ); + assert!( + repo.open_image(&id_v2.to_hex()).is_ok(), + "V2 EROFS image should exist in repo" + ); + + // Verify that commit_images with BOTH wrote V1 and V2 in the map. + let fs = image::create_filesystem(&repo, oci.config_digest(), Some(oci.config_verity())) + .unwrap(); + let map = fs + .commit_images(&repo, None, FormatSet::BOTH) + .expect("commit_images with BOTH should succeed"); + assert!(map.contains_key(&FormatVersion::V1), "map must contain V1"); + assert!(map.contains_key(&FormatVersion::V2), "map must contain V2"); + assert_eq!(map[&FormatVersion::V1], *id_v1); + assert_eq!(map[&FormatVersion::V2], *id_v2); + } + #[tokio::test] async fn test_ensure_oci_composefs_erofs_gc() { use composefs::test::TestRepo; @@ -1405,6 +1629,8 @@ mod test { oci_before.layer_refs().clone(), None, None, + None, + None, ) .unwrap(); let new_config_digest = hash_sha256(&noncanonical_json); @@ -2037,4 +2263,204 @@ mod test { "EROFS should contain hostname" ); } + + // ── Progress API integration tests ─────────────────────────────────────── + + /// Create a minimal OCI layout directory with one (empty) tar layer. + /// + /// Returns the path to the OCI layout directory. The image is pinned to + /// the current host platform so `import_oci_layout` can resolve it. + /// + /// The layer is an empty tar archive (valid tar, zero entries), which is + /// sufficient to exercise the `import_layer_from_file` progress path. + fn make_test_oci_layout(parent: &std::path::Path) -> std::path::PathBuf { + use cap_std_ext::cap_std; + use containers_image_proxy::oci_spec::image::{ + Arch, ConfigBuilder, ImageConfigurationBuilder, Os, PlatformBuilder, RootFsBuilder, + }; + use ocidir::OciDir; + + let oci_dir = parent.join("oci-layout"); + std::fs::create_dir_all(&oci_dir).unwrap(); + let dir = + cap_std::fs::Dir::open_ambient_dir(&oci_dir, cap_std::ambient_authority()).unwrap(); + let ocidir = OciDir::ensure(dir).unwrap(); + + let mut manifest = ocidir.new_empty_manifest().unwrap().build().unwrap(); + let mut config = ImageConfigurationBuilder::default() + .architecture(Arch::default()) + .os(Os::default()) + .rootfs( + RootFsBuilder::default() + .typ("layers") + .diff_ids(Vec::::new()) + .build() + .unwrap(), + ) + .config(ConfigBuilder::default().build().unwrap()) + .build() + .unwrap(); + + // Create an empty tar layer (finish the builder immediately without adding any entries) + let layer = ocidir + .create_layer(None) + .unwrap() + .into_inner() + .unwrap() + .complete() + .unwrap(); + ocidir.push_layer(&mut manifest, &mut config, layer, "layer", None); + + let platform = PlatformBuilder::default() + .architecture(Arch::default()) + .os(Os::default()) + .build() + .unwrap(); + ocidir + .insert_manifest_and_config(manifest, config, None, platform) + .unwrap(); + + oci_dir + } + + /// Pulling a fresh OCI layout image (no prior cache) must emit at least one + /// `Started` event per layer and a matching `Done` event, via the + /// `import_oci_layout` fast path. + /// + /// This is the primary integration test for the progress API: it verifies + /// that the oci_layout fast path actually emits events (previously it + /// emitted none). + #[tokio::test] + async fn test_oci_layout_pull_emits_started_and_done() { + use crate::oci_layout::import_oci_layout; + use crate::progress::ProgressEvent; + use crate::progress::test_support::RecordingReporter; + use composefs::fsverity::Sha256HashValue; + use composefs::test::TestRepo; + + let layout_dir = tempfile::tempdir().unwrap(); + let layout_path = make_test_oci_layout(layout_dir.path()); + + let test_repo = TestRepo::::new(); + let repo = &test_repo.repo; + let recorder = std::sync::Arc::new(RecordingReporter::new()); + let reporter: crate::progress::SharedReporter = + std::sync::Arc::clone(&recorder) as crate::progress::SharedReporter; + + import_oci_layout(repo, &layout_path, None, reporter) + .await + .expect("import_oci_layout should succeed"); + + let events = recorder.events(); + + // There must be at least one Started event + let started_count = events + .iter() + .filter(|e| matches!(e, ProgressEvent::Started { .. })) + .count(); + assert!( + started_count >= 1, + "expected at least one Started event, got {started_count} (total events: {})", + events.len() + ); + + // Every Started must have a matching Done or Skipped + let started_ids: std::collections::HashSet = events + .iter() + .filter_map(|e| { + if let ProgressEvent::Started { id, .. } = e { + Some(id.as_str().to_owned()) + } else { + None + } + }) + .collect(); + for started_id in &started_ids { + let has_terminal = events.iter().any(|e| match e { + ProgressEvent::Done { id, .. } | ProgressEvent::Skipped { id } => { + id.as_str() == started_id + } + _ => false, + }); + assert!( + has_terminal, + "Started for '{started_id}' has no matching Done or Skipped" + ); + } + } + + /// Re-importing the same OCI layout (layers already cached) must emit + /// `Skipped` events rather than `Started`/`Done`. + #[tokio::test] + async fn test_oci_layout_reimport_emits_skipped() { + use crate::oci_layout::import_oci_layout; + use crate::progress::test_support::RecordingReporter; + use crate::progress::{NullReporter, ProgressEvent}; + use composefs::fsverity::Sha256HashValue; + use composefs::test::TestRepo; + + let layout_dir = tempfile::tempdir().unwrap(); + let layout_path = make_test_oci_layout(layout_dir.path()); + + let test_repo = TestRepo::::new(); + let repo = &test_repo.repo; + + // First import (populates cache) + let null: crate::progress::SharedReporter = std::sync::Arc::new(NullReporter); + import_oci_layout(repo, &layout_path, None, null) + .await + .expect("first import should succeed"); + + // Second import (everything already cached) + let recorder = std::sync::Arc::new(RecordingReporter::new()); + let reporter: crate::progress::SharedReporter = + std::sync::Arc::clone(&recorder) as crate::progress::SharedReporter; + import_oci_layout(repo, &layout_path, None, reporter) + .await + .expect("second import should succeed"); + + let events = recorder.events(); + + // On reimport, layers are cached: expect Skipped, not Done + let done_count = events + .iter() + .filter(|e| matches!(e, ProgressEvent::Done { .. })) + .count(); + let skipped_count = events + .iter() + .filter(|e| matches!(e, ProgressEvent::Skipped { .. })) + .count(); + assert_eq!( + done_count, 0, + "no Done events expected on reimport (layers cached), got {done_count}" + ); + assert!( + skipped_count >= 1, + "expected at least one Skipped on reimport, got {skipped_count}" + ); + } + + /// The `import_oci_layout` function with `NullReporter` (via `SharedReporter` + /// wrapping `NullReporter`) must not panic now that it uses the reporter internally. + /// + /// This verifies the zero-overhead default path still works correctly. + #[tokio::test] + async fn test_import_oci_layout_with_null_reporter_does_not_panic() { + use crate::oci_layout::import_oci_layout; + use crate::progress::NullReporter; + use composefs::fsverity::Sha256HashValue; + use composefs::test::TestRepo; + + let layout_dir = tempfile::tempdir().unwrap(); + let layout_path = make_test_oci_layout(layout_dir.path()); + + let test_repo = TestRepo::::new(); + let repo = &test_repo.repo; + + // NullReporter: zero overhead, no events collected + let reporter: crate::progress::SharedReporter = std::sync::Arc::new(NullReporter); + import_oci_layout(repo, &layout_path, None, reporter) + .await + .expect("import_oci_layout with NullReporter should not panic"); + } } diff --git a/crates/composefs-oci/src/oci_image.rs b/crates/composefs-oci/src/oci_image.rs index 9cd31b83..66aa62fe 100644 --- a/crates/composefs-oci/src/oci_image.rs +++ b/crates/composefs-oci/src/oci_image.rs @@ -107,10 +107,14 @@ pub struct OciImage { config: Option, /// Map from layer diff_id to its fs-verity object ID layer_refs: HashMap, ObjectID>, - /// The EROFS image ObjectID linked to this config, if any + /// The V2 EROFS image ObjectID linked to this config, if any image_ref: Option, - /// The boot EROFS image ObjectID linked to this config, if any + /// The V1 EROFS image ObjectID linked to this config, if any + image_ref_v1: Option, + /// The V2 boot EROFS image ObjectID linked to this config, if any boot_image_ref: Option, + /// The V1 boot EROFS image ObjectID linked to this config, if any + boot_image_ref_v1: Option, /// The fs-verity ID of the manifest splitstream manifest_verity: ObjectID, } @@ -179,9 +183,11 @@ impl OciImage { } }; - // Strip the EROFS image ref from layer_refs (it's not a layer) + // Strip the EROFS image refs from layer_refs (they're not layers) let image_ref = layer_refs.remove(crate::IMAGE_REF_KEY); + let image_ref_v1 = layer_refs.remove(crate::IMAGE_REF_KEY_V1); let boot_image_ref = layer_refs.remove(crate::BOOT_IMAGE_REF_KEY); + let boot_image_ref_v1 = layer_refs.remove(crate::BOOT_IMAGE_REF_KEY_V1); let manifest_verity = if let Some(v) = verity { v.clone() @@ -198,7 +204,9 @@ impl OciImage { config, layer_refs, image_ref, + image_ref_v1, boot_image_ref, + boot_image_ref_v1, manifest_verity, }) } @@ -249,16 +257,42 @@ impl OciImage { &self.layer_refs } - /// Returns the EROFS image ObjectID linked to this config, if any. + /// Returns the primary EROFS image ObjectID linked to this config, if any. + /// + /// Returns the V1 image when present (primary format), otherwise the V2 image. pub fn image_ref(&self) -> Option<&ObjectID> { + self.image_ref_v1.as_ref().or(self.image_ref.as_ref()) + } + + /// Returns the V2 EROFS image ObjectID linked to this config, if any. + pub fn image_ref_v2(&self) -> Option<&ObjectID> { self.image_ref.as_ref() } - /// Returns the boot EROFS image ObjectID linked to this config, if any. + /// Returns the V1 EROFS image ObjectID linked to this config, if any. + pub fn image_ref_v1(&self) -> Option<&ObjectID> { + self.image_ref_v1.as_ref() + } + + /// Returns the primary boot EROFS image ObjectID linked to this config, if any. + /// + /// Returns the V1 boot image when present (primary format), otherwise the V2 boot image. pub fn boot_image_ref(&self) -> Option<&ObjectID> { + self.boot_image_ref_v1 + .as_ref() + .or(self.boot_image_ref.as_ref()) + } + + /// Returns the V2 boot EROFS image ObjectID linked to this config, if any. + pub fn boot_image_ref_v2(&self) -> Option<&ObjectID> { self.boot_image_ref.as_ref() } + /// Returns the V1 boot EROFS image ObjectID linked to this config, if any. + pub fn boot_image_ref_v1(&self) -> Option<&ObjectID> { + self.boot_image_ref_v1.as_ref() + } + /// Returns the image architecture (empty string for artifacts). pub fn architecture(&self) -> String { self.config @@ -413,11 +447,11 @@ impl OciImage { "referrers": referrers_value, }); - if let Some(ref erofs_id) = self.image_ref { + if let Some(erofs_id) = self.image_ref() { result["composefs_erofs"] = serde_json::json!(erofs_id.to_hex()); } - if let Some(ref boot_id) = self.boot_image_ref { + if let Some(boot_id) = self.boot_image_ref() { result["composefs_boot_erofs"] = serde_json::json!(boot_id.to_hex()); } diff --git a/crates/composefs-oci/src/oci_layout.rs b/crates/composefs-oci/src/oci_layout.rs index 08bb72f6..e4365cf7 100644 --- a/crates/composefs-oci/src/oci_layout.rs +++ b/crates/composefs-oci/src/oci_layout.rs @@ -34,6 +34,7 @@ use composefs::repository::{ObjectStoreMethod, Repository}; use crate::layer::{decompress_async, import_tar_async, is_tar_media_type, store_blob_async}; use crate::oci_image::manifest_identifier; +use crate::progress::{ComponentId, ProgressEvent, ProgressRead, ProgressUnit, SharedReporter}; use crate::skopeo::OCI_BLOB_CONTENT_TYPE; use crate::skopeo::{OCI_CONFIG_CONTENT_TYPE, OCI_MANIFEST_CONTENT_TYPE}; use crate::{ImportStats, config_identifier, layer_identifier}; @@ -71,13 +72,20 @@ fn resolve_manifest(ocidir: &OciDir, tag: Option<&str>) -> Result( repo: &Arc>, layout_path: &Path, layout_tag: Option<&str>, + reporter: SharedReporter, ) -> Result<(PullResult, ImportStats)> { + // Check writability before touching the source, so a read-only repo gives + // a clear "not writable" error rather than a misleading source-open error. + repo.ensure_writable()?; + // Open the OCI layout directory let dir = cap_std::fs::Dir::open_ambient_dir(layout_path, cap_std::ambient_authority()) .with_context(|| format!("Opening OCI layout directory {}", layout_path.display()))?; @@ -93,11 +101,17 @@ pub async fn import_oci_layout( // Import config and layers let config_descriptor = manifest.config(); let layers = manifest.layers(); + reporter.report(ProgressEvent::Message(format!( + "Importing {} layers from OCI layout", + layers.len() + ))); let (config_digest, config_verity, layer_refs, stats) = - import_config_and_layers(repo, &ocidir, layers, config_descriptor) + import_config_and_layers(repo, &ocidir, layers, config_descriptor, &reporter) .await .with_context(|| format!("Failed to import config {}", config_descriptor.digest()))?; + reporter.report(ProgressEvent::Message("Storing manifest".to_string())); + // Store the manifest let manifest_content_id = manifest_identifier(&manifest_digest); let manifest_verity = if let Some(verity) = repo.has_stream(&manifest_content_id)? { @@ -146,6 +160,7 @@ async fn import_config_and_layers( ocidir: &OciDir, manifest_layers: &[Descriptor], config_descriptor: &Descriptor, + reporter: &SharedReporter, ) -> Result<(OciDigest, ObjectID, Vec<(OciDigest, ObjectID)>, ImportStats)> { let config_digest: OciDigest = config_descriptor.digest().clone(); let content_id = config_identifier(&config_digest); @@ -187,6 +202,13 @@ async fn import_config_and_layers( layer_refs.len() ); + // Emit Skipped for each cached layer so callers can close any open progress bars + for (diff_id, _) in &layer_refs { + reporter.report(ProgressEvent::Skipped { + id: ComponentId::from(diff_id.to_string()), + }); + } + return Ok((config_digest, config_id, layer_refs, ImportStats::default())); } @@ -216,17 +238,26 @@ async fn import_config_and_layers( let diff_id = (*diff_id).clone(); let repo = Arc::clone(repo); let permit = Arc::clone(&sem).acquire_owned().await?; + let reporter = Arc::clone(reporter); let layer_file = ocidir .read_blob(descriptor) .with_context(|| format!("Opening layer blob {}", descriptor.digest()))?; let media_type = descriptor.media_type().clone(); + let layer_size = descriptor.size(); layer_tasks.spawn(async move { let _permit = permit; - let (verity, layer_stats) = - import_layer_from_file(&repo, &diff_id, layer_file, &media_type).await?; + let (verity, layer_stats) = import_layer_from_file( + &repo, + &diff_id, + layer_file, + &media_type, + layer_size, + &reporter, + ) + .await?; anyhow::Ok((idx, diff_id, verity, layer_stats)) }); } @@ -270,30 +301,55 @@ async fn import_config_and_layers( } /// Import a single layer by streaming from a file handle. +/// +/// Emits `Started`/`Done` (or `Skipped`) progress events via `reporter`. async fn import_layer_from_file( repo: &Arc>, diff_id: &OciDigest, layer_file: std::fs::File, media_type: &MediaType, + layer_size: u64, + reporter: &SharedReporter, ) -> Result<(ObjectID, ImportStats)> { let content_id = layer_identifier(diff_id); + let id = ComponentId::from(diff_id.to_string()); if let Some(layer_id) = repo.has_stream(&content_id)? { debug!("Already have layer {diff_id}"); + reporter.report(ProgressEvent::Skipped { id }); return Ok((layer_id, ImportStats::default())); } debug!("Importing layer {diff_id}"); - - // Convert std::fs::File to tokio::fs::File for async I/O - let async_file = tokio::fs::File::from_std(layer_file); + reporter.report(ProgressEvent::Started { + id: id.clone(), + total: Some(layer_size), + unit: ProgressUnit::Bytes, + }); + + // Wrap the file reader to emit Progress events as compressed bytes are read. + // This sits before decompression so `fetched` tracks bytes-on-disk, + // matching the `total` from the descriptor size above. + // + // The watch channel provides backpressure: if the renderer is slow, intermediate + // byte counts are coalesced rather than queued, keeping the I/O path non-blocking. + let (async_file, progress_driver) = ProgressRead::new( + tokio::fs::File::from_std(layer_file), + Arc::clone(reporter), + id.clone(), + Some(layer_size), + ); let (object_id, layer_stats) = if is_tar_media_type(media_type) { + // Run the progress driver concurrently with the import. let reader = decompress_async(async_file, media_type)?; - import_tar_async(repo.clone(), reader).await? + let (result, ()) = tokio::join!(import_tar_async(repo.clone(), reader), progress_driver); + result? } else { - // Non-tar blob: store as object and create splitstream wrapper - let (object_id, size, method) = store_blob_async(repo, async_file).await?; + // Non-tar blob: store as object and create splitstream wrapper. + // Run the progress driver concurrently with the blob store. + let (store_result, ()) = tokio::join!(store_blob_async(repo, async_file), progress_driver); + let (object_id, size, method) = store_result?; let mut stats = ImportStats::default(); match method { @@ -318,18 +374,27 @@ async fn import_layer_from_file( stream.add_external_size(size); stream.write_reference(object_id)?; let stream_id = repo.write_stream(stream, &content_id, None)?; + reporter.report(ProgressEvent::Done { + id, + transferred: size, + }); return Ok((stream_id, stats)); }; // Register the stream with its content identifier repo.register_stream(&object_id, &content_id, None).await?; + reporter.report(ProgressEvent::Done { + id, + transferred: layer_size, + }); Ok((object_id, layer_stats)) } #[cfg(test)] mod tests { use super::*; + use crate::progress::NullReporter; #[test] fn test_parse_oci_layout_ref() { @@ -411,13 +476,13 @@ mod tests { let (repo, _) = composefs::repository::Repository::::init_path( rustix::fs::CWD, &repo_path, - composefs::fsverity::Algorithm::SHA256, - false, + composefs::repository::RepositoryConfig::default().set_insecure(), ) .unwrap(); let repo = std::sync::Arc::new(repo); - let result = import_oci_layout(&repo, layout_path, None).await; + let reporter = std::sync::Arc::new(NullReporter); + let result = import_oci_layout(&repo, layout_path, None, reporter).await; let err = result.expect_err("should fail with no matching platform"); let err_msg = format!("{err:#}"); assert!( diff --git a/crates/composefs-oci/src/progress.rs b/crates/composefs-oci/src/progress.rs new file mode 100644 index 00000000..3fc618ea --- /dev/null +++ b/crates/composefs-oci/src/progress.rs @@ -0,0 +1,6 @@ +// Progress types now live in the core `composefs` crate. +// Re-export everything from there so existing code keeps compiling while +// callers migrate their imports. +#[cfg(any(test, feature = "test"))] +pub use composefs::progress::test_support; +pub use composefs::progress::*; diff --git a/crates/composefs-oci/src/skopeo.rs b/crates/composefs-oci/src/skopeo.rs index f4b93d4b..bd5a7269 100644 --- a/crates/composefs-oci/src/skopeo.rs +++ b/crates/composefs-oci/src/skopeo.rs @@ -18,7 +18,6 @@ use containers_image_proxy::{ ConvertedLayerInfo, ImageProxy, ImageProxyConfig, ImageReference, OpenedImage, Transport, }; use fn_error_context::context; -use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; use rustix::process::geteuid; use tokio::{io::AsyncReadExt, sync::Semaphore, task::JoinSet}; @@ -33,6 +32,7 @@ use crate::{ layer::{decompress_async, import_tar_async, is_tar_media_type, store_blob_async}, layer_identifier, oci_image::{manifest_identifier, tag_image}, + progress::{ComponentId, ProgressEvent, ProgressRead, ProgressUnit, SharedReporter}, }; /// Result of pulling an OCI image. @@ -75,7 +75,7 @@ struct ImageOp { repo: Arc>, proxy: ImageProxy, img: OpenedImage, - progress: MultiProgress, + reporter: SharedReporter, transport: Transport, } @@ -84,6 +84,7 @@ impl ImageOp { repo: &Arc>, image_ref: &ImageReference, img_proxy_config: Option, + reporter: SharedReporter, ) -> Result { // Fail fast if the repository is not writable, before starting // the image proxy or doing any network I/O. @@ -142,12 +143,11 @@ impl ImageOp { .open_image_ref(image_ref) .await .context("Opening image")?; - let progress = MultiProgress::new(); Ok(ImageOp { repo: Arc::clone(repo), proxy, img, - progress, + reporter, transport, }) } @@ -165,8 +165,9 @@ impl ImageOp { let content_id = layer_identifier(diff_id); if let Some(layer_id) = self.repo.has_stream(&content_id)? { - self.progress - .println(format!("Already have layer {diff_id}"))?; + self.reporter.report(ProgressEvent::Skipped { + id: ComponentId::from(diff_id.to_string()), + }); Ok((layer_id, ImportStats::default())) } else { // Otherwise, we need to fetch it... @@ -197,21 +198,40 @@ impl ImageOp { // See https://github.com/containers/containers-image-proxy-rs/issues/71 let blob_reader = blob_reader.take(descriptor.size()); - let bar = self.progress.add(ProgressBar::new(descriptor.size())); - bar.set_style(ProgressStyle::with_template("[eta {eta}] {bar:40.cyan/blue} {decimal_bytes:>7}/{decimal_total_bytes:7} {msg}") - .unwrap() - .progress_chars("##-")); - let progress = bar.wrap_async_read(blob_reader); - self.progress.println(format!("Fetching layer {diff_id}"))?; + let id = ComponentId::from(diff_id.to_string()); + self.reporter.report(ProgressEvent::Started { + id: id.clone(), + total: Some(descriptor.size()), + unit: ProgressUnit::Bytes, + }); + + // Wrap the blob reader to emit Progress events as compressed bytes are read. + // This sits before decompression so `fetched` tracks bytes-over-the-wire, + // matching the `total` from the descriptor size above. + // + // The watch channel provides backpressure: if the renderer is slow, intermediate + // byte counts are coalesced rather than queued, keeping the I/O path non-blocking. + let (blob_reader, progress_driver) = ProgressRead::new( + blob_reader, + Arc::clone(&self.reporter), + id.clone(), + Some(descriptor.size()), + ); let media_type = descriptor.media_type(); let (object_id, layer_stats) = if is_tar_media_type(media_type) { - // Tar layers: decompress and split into a splitstream - let reader = decompress_async(progress, media_type)?; - import_tar_async(self.repo.clone(), reader).await? + // Tar layers: decompress and split into a splitstream. + // Run the progress driver concurrently with the import. + let reader = decompress_async(blob_reader, media_type)?; + let (result, ()) = + tokio::join!(import_tar_async(self.repo.clone(), reader), progress_driver); + result? } else { - // Non-tar layers (OCI artifacts): stream raw bytes to object store - let (object_id, size, method) = store_blob_async(&self.repo, progress).await?; + // Non-tar layers (OCI artifacts): stream raw bytes to object store. + // Run the progress driver concurrently with the blob store. + let (store_result, ()) = + tokio::join!(store_blob_async(&self.repo, blob_reader), progress_driver); + let (object_id, size, method) = store_result?; driver.await?; let mut stats = ImportStats::default(); @@ -237,6 +257,10 @@ impl ImageOp { stream.add_external_size(size); stream.write_reference(object_id)?; let stream_id = self.repo.write_stream(stream, &content_id, None)?; + self.reporter.report(ProgressEvent::Done { + id, + transferred: size, + }); return Ok((stream_id, stats)); }; @@ -249,6 +273,11 @@ impl ImageOp { .register_stream(&object_id, &content_id, None) .await?; + self.reporter.report(ProgressEvent::Done { + id, + transferred: descriptor.size(), + }); + Ok((object_id, layer_stats)) } } @@ -268,8 +297,9 @@ impl ImageOp { if let Some(config_id) = self.repo.has_stream(&content_id)? { // We already got this config - need to read the layer refs and diff_ids from it - self.progress - .println(format!("Already have container config {config_digest}"))?; + self.reporter.report(ProgressEvent::Message(format!( + "Already have container config {config_digest}" + ))); let (data, named_refs) = crate::oci_image::read_external_splitstream( &self.repo, @@ -310,8 +340,9 @@ impl ImageOp { )) } else { // We need to add the config to the repo - self.progress - .println(format!("Fetching config {config_digest}"))?; + self.reporter.report(ProgressEvent::Message(format!( + "Fetching config {config_digest}" + ))); let (mut config, driver) = self.proxy.get_descriptor(&self.img, descriptor).await?; let config = async move { @@ -433,12 +464,14 @@ impl ImageOp { let manifest_content_id = manifest_identifier(&manifest_digest); let manifest_verity = if let Some(verity) = self.repo.has_stream(&manifest_content_id)? { - self.progress - .println(format!("Already have manifest {manifest_digest}"))?; + self.reporter.report(ProgressEvent::Message(format!( + "Already have manifest {manifest_digest}" + ))); verity } else { - self.progress - .println(format!("Storing manifest {manifest_digest}"))?; + self.reporter.report(ProgressEvent::Message(format!( + "Storing manifest {manifest_digest}" + ))); let mut splitstream = self.repo.create_stream(OCI_MANIFEST_CONTENT_TYPE)?; @@ -483,6 +516,7 @@ pub async fn pull_image( imgref: &str, reference: Option<&str>, img_proxy_config: Option, + reporter: SharedReporter, ) -> Result<(PullResult, ImportStats)> { // Fail fast if the repository is not writable, before doing any I/O. repo.ensure_writable()?; @@ -494,10 +528,10 @@ pub async fn pull_image( let (result, stats) = if image_ref.transport == Transport::OciDir { let (path_str, layout_tag) = crate::oci_layout::parse_oci_layout_ref(&image_ref.name); let layout_path = std::path::Path::new(path_str); - crate::oci_layout::import_oci_layout(repo, layout_path, layout_tag).await? + crate::oci_layout::import_oci_layout(repo, layout_path, layout_tag, reporter).await? } else { // Standard path: use skopeo proxy for other transports - let op = Arc::new(ImageOp::new(repo, &image_ref, img_proxy_config).await?); + let op = Arc::new(ImageOp::new(repo, &image_ref, img_proxy_config, reporter).await?); op.pull() .await .with_context(|| format!("Unable to pull container image {imgref}"))? @@ -534,7 +568,8 @@ pub async fn pull( reference: Option<&str>, img_proxy_config: Option, ) -> Result<(OciDigest, ObjectID, ImportStats)> { - let (result, stats) = pull_image(repo, imgref, reference, img_proxy_config).await?; + let reporter = Arc::new(crate::progress::NullReporter); + let (result, stats) = pull_image(repo, imgref, reference, img_proxy_config, reporter).await?; let (config_digest, config_verity) = result.into_config(); Ok((config_digest, config_verity, stats)) } diff --git a/crates/composefs-oci/src/tar.rs b/crates/composefs-oci/src/tar.rs index 1fde0246..abb4cb9a 100644 --- a/crates/composefs-oci/src/tar.rs +++ b/crates/composefs-oci/src/tar.rs @@ -23,7 +23,7 @@ use anyhow::{Context, Result, bail, ensure}; use bytes::{Bytes, BytesMut}; use rustix::fs::makedev; use tar_core::{ - EntryType, HEADER_SIZE, + EntryType, HEADER_SIZE, PaxExtensions, parse::{ParseEvent, Parser}, }; use tokio::{ @@ -42,6 +42,39 @@ use composefs::{ use crate::ImportStats; +/// Extract sub-second nanoseconds from PAX extension mtime. +/// +/// PAX mtime values have the form `"."` where `` is a +/// decimal fraction of a second with up to 9 significant digits. +/// `tar-core` keeps only the integer part in `ParsedEntry::mtime`; we read +/// the fractional part from the raw PAX bytes ourselves. +/// +/// Returns 0 if there is no PAX mtime, the value has no fractional part, +/// or the value cannot be parsed. +fn pax_mtime_nsec(pax: &[u8]) -> u32 { + for ext in PaxExtensions::new(pax).flatten() { + if ext.key_bytes() == b"mtime" { + let Ok(value) = ext.value() else { return 0 }; + // Split on '.': "1234567890.123456789" → frac = "123456789" + let Some(frac) = value.split_once('.').map(|(_, f)| f) else { + return 0; + }; + // Truncate or pad to exactly 9 digits (nanosecond precision) + let frac = if frac.len() >= 9 { + &frac[..9] + } else { + // fewer than 9 digits: treat as leading digits, e.g. "5" → 500_000_000 + return frac + .parse::() + .ok() + .map_or(0, |v| v * 10u32.pow(9 - frac.len() as u32)); + }; + return frac.parse::().unwrap_or(0); + } + } + 0 +} + /// Receive data from channel, write to tmpfile, compute verity, and store object. /// /// This runs in a blocking task to avoid blocking the async runtime. @@ -456,6 +489,7 @@ pub fn get_entry( st_gid: entry.gid as u32, st_mode: entry.mode, st_mtim_sec: entry.mtime as i64, + st_mtim_nsec: entry.pax.map_or(0, pax_mtime_nsec), xattrs, }, item, @@ -475,7 +509,9 @@ mod tests { use super::*; use composefs::{ - fsverity::Sha256HashValue, generic_tree::LeafContent, repository::Repository, + fsverity::Sha256HashValue, + generic_tree::LeafContent, + repository::{Repository, RepositoryConfig}, splitstream::SplitStreamReader, }; use std::{io::Read, path::Path, sync::Arc}; @@ -493,8 +529,7 @@ mod tests { let (repo, _) = Repository::init_path( rustix::fs::CWD, &repo_path, - composefs::fsverity::Algorithm::SHA256, - false, + RepositoryConfig::default().set_insecure(), )?; // Store tempdir in static to keep it alive @@ -542,6 +577,46 @@ mod tests { Ok(entries) } + #[test] + fn test_pax_mtime_nsec_parsing() { + // Standard 9-digit fractional part + // "30 mtime=1234567890.123456789\n": "mtime=1234567890.123456789\n" = 27 bytes, "30 " = 3 → total 30 + let pax = b"30 mtime=1234567890.123456789\n"; + assert_eq!(pax_mtime_nsec(pax), 123_456_789, "9-digit fraction"); + + // Fewer than 9 digits: "5" → 500_000_000 ns + // "mtime=1234567890.5\n" = 19 bytes, "22 " = 3 → total 22 + let pax = b"22 mtime=1234567890.5\n"; + assert_eq!(pax_mtime_nsec(pax), 500_000_000, "1-digit fraction"); + + // Exactly 9 digits (no truncation needed) + // "mtime=1234567890.000000001\n" = 27 bytes, "30 " = 3 → total 30 + let pax = b"30 mtime=1234567890.000000001\n"; + assert_eq!(pax_mtime_nsec(pax), 1, "trailing single non-zero digit"); + + // More than 9 digits (truncate to 9) + // "mtime=1234567890.1234567899\n" = 28 bytes, "31 " = 3 → total 31 + let pax = b"31 mtime=1234567890.1234567899\n"; + assert_eq!( + pax_mtime_nsec(pax), + 123_456_789, + "10-digit fraction truncated" + ); + + // No fractional part + // "mtime=1234567890\n" = 17 bytes, "20 " = 3 → total 20 + let pax = b"20 mtime=1234567890\n"; + assert_eq!(pax_mtime_nsec(pax), 0, "no fractional part"); + + // No mtime key + // "path=foo.txt\n" = 13 bytes, "16 " = 3 → total 16 + let pax = b"16 path=foo.txt\n"; + assert_eq!(pax_mtime_nsec(pax), 0, "no mtime key"); + + // Empty PAX data + assert_eq!(pax_mtime_nsec(b""), 0, "empty pax"); + } + #[test] fn test_make_absolute_path() { let cases: &[(&[u8], &str)] = &[ diff --git a/crates/composefs-oci/src/test_util.rs b/crates/composefs-oci/src/test_util.rs index 73829c4a..2df99a5c 100644 --- a/crates/composefs-oci/src/test_util.rs +++ b/crates/composefs-oci/src/test_util.rs @@ -24,7 +24,7 @@ use crate::oci_image::write_manifest; use crate::skopeo::OCI_CONFIG_CONTENT_TYPE; use composefs::dumpfile_parse::{Entry, Item}; use composefs::fsverity::Sha256HashValue; -use composefs::repository::Repository; +use composefs::repository::{Repository, RepositoryConfig}; use containers_image_proxy::oci_spec::image::{ ConfigBuilder, DescriptorBuilder, Digest as OciDigest, ImageConfigurationBuilder, ImageManifestBuilder, MediaType, RootFsBuilder, @@ -639,13 +639,11 @@ pub async fn create_bootable_image( /// paths rather than `Repository` handles. Opens the repo, creates the /// image with `create_base_image`, generates the EROFS, and returns. pub fn create_test_oci_image(repo_path: &std::path::Path, tag: &str) -> anyhow::Result<()> { - let (mut repo, _) = Repository::::init_path( + let (repo, _) = Repository::::init_path( rustix::fs::CWD, repo_path, - composefs::fsverity::Algorithm::SHA256, - false, + RepositoryConfig::default().set_insecure(), )?; - repo.set_insecure(); let repo = Arc::new(repo); let rt = tokio::runtime::Runtime::new()?; rt.block_on(create_base_image(&repo, Some(tag))); @@ -663,13 +661,11 @@ pub fn create_test_bootable_oci_image( repo_path: &std::path::Path, tag: &str, ) -> anyhow::Result<()> { - let (mut repo, _) = Repository::::init_path( + let (repo, _) = Repository::::init_path( rustix::fs::CWD, repo_path, - composefs::fsverity::Algorithm::SHA256, - false, + RepositoryConfig::default().set_insecure(), )?; - repo.set_insecure(); let repo = Arc::new(repo); let rt = tokio::runtime::Runtime::new()?; let img = rt.block_on(create_bootable_image(&repo, Some(tag), 1)); diff --git a/crates/composefs-setup-root/src/main.rs b/crates/composefs-setup-root/src/main.rs index cab37cb0..35023e23 100644 --- a/crates/composefs-setup-root/src/main.rs +++ b/crates/composefs-setup-root/src/main.rs @@ -14,7 +14,7 @@ use std::{ use anyhow::{Context, Result}; use clap::Parser; -use hex::FromHexError; + use rustix::{ fs::{CWD, Mode, OFlags, major, minor, mkdirat, openat, stat, symlink}, io::Errno, @@ -31,7 +31,7 @@ use composefs::{ mountcompat::{overlayfs_set_fd, overlayfs_set_lower_and_data_fds, prepare_mount}, repository::Repository, }; -use composefs_boot::cmdline::get_cmdline_composefs; +use composefs_boot::cmdline::ComposefsCmdline; // Config file #[derive(Clone, Copy, Debug, Deserialize)] @@ -246,19 +246,20 @@ fn gpt_workaround() -> Result<()> { Ok(()) } -// Try parse cmdline with sha512 digest address first, if failed with invalid length, parse again with legacy sha256 digest address fn parse_image_address(cmdline: &str) -> Result<(String, bool)> { - match get_cmdline_composefs::(cmdline) { - Ok((id, insecure)) => Ok((id.to_hex(), insecure)), - Err(e) => { - if let Some(FromHexError::InvalidStringLength) = e.downcast_ref::() { - let (id, insecure) = get_cmdline_composefs::(cmdline)?; - Ok((id.to_hex(), insecure)) - } else { - Err(e) - } - } + if let Some(karg) = ComposefsCmdline::::from_cmdline(cmdline) + .ok() + .flatten() + { + return Ok((karg.digest().to_hex(), karg.is_insecure())); + } + if let Some(karg) = ComposefsCmdline::::from_cmdline(cmdline) + .ok() + .flatten() + { + return Ok((karg.digest().to_hex(), karg.is_insecure())); } + anyhow::bail!("no composefs= / composefs.digest= karg found in kernel cmdline") } fn setup_root(args: Args) -> Result<()> { @@ -332,22 +333,43 @@ mod test { for case in failing { assert!(parse_image_address(case).is_err()) } + + // Legacy V2 karg: composefs= let digest_legacy = "8b7df143d91c716ecfa5fc1730022f6b421b05cedee8fd52b1fc65a96030ad52"; let cmdline_legacy = &format!("composefs={digest_legacy}"); - let (digest_cmdline_legacy, _) = - get_cmdline_composefs::(cmdline_legacy).unwrap(); + let karg_legacy = ComposefsCmdline::::from_cmdline(cmdline_legacy) + .unwrap() + .unwrap(); similar_asserts::assert_eq!( - digest_cmdline_legacy, - Sha256HashValue::from_hex(digest_legacy).unwrap() + karg_legacy.digest(), + &Sha256HashValue::from_hex(digest_legacy).unwrap() ); let (parsed_addr_legacy, _) = parse_image_address(cmdline_legacy).unwrap(); assert_eq!(digest_legacy, parsed_addr_legacy); + // Legacy V2 karg: composefs= let digest = "6f06b5e82420abec546d6e6d3ddd612c50cfa9b707c129345b7ec16f456b92fe35df68999b042e1a6a70dfe75f2fed8cf9f67afd0bf08d2374678d75e2f65a02"; let cmdline = &format!("composefs={digest}"); - let (digest_cmdline, _) = get_cmdline_composefs::(cmdline).unwrap(); - similar_asserts::assert_eq!(digest_cmdline, Sha512HashValue::from_hex(digest).unwrap()); + let karg = ComposefsCmdline::::from_cmdline(cmdline) + .unwrap() + .unwrap(); + similar_asserts::assert_eq!(karg.digest(), &Sha512HashValue::from_hex(digest).unwrap()); let (parsed_addr, _) = parse_image_address(cmdline).unwrap(); assert_eq!(digest, parsed_addr); + + // New V1 karg: composefs.digest= + let (parsed_v1_sha256, _) = + parse_image_address(&format!("composefs.digest={digest_legacy}")).unwrap(); + assert_eq!(digest_legacy, parsed_v1_sha256); + + // New V1 karg: composefs.digest= + let (parsed_v1_sha512, _) = + parse_image_address(&format!("composefs.digest={digest}")).unwrap(); + assert_eq!(digest, parsed_v1_sha512); + + // V1 takes priority when both kargs are present + let cmdline_both = format!("composefs={digest_legacy} composefs.digest={digest_legacy}"); + let (parsed_both, _) = parse_image_address(&cmdline_both).unwrap(); + assert_eq!(digest_legacy, parsed_both); } } diff --git a/crates/composefs/Cargo.toml b/crates/composefs/Cargo.toml index d8c78357..9a365956 100644 --- a/crates/composefs/Cargo.toml +++ b/crates/composefs/Cargo.toml @@ -19,6 +19,7 @@ test = ["tempfile"] anyhow = { version = "1.0.87", default-features = false } composefs-ioctls = { workspace = true } serde = { version = "1.0", default-features = false, features = ["derive", "alloc"] } +serde_repr = "0.1" fn-error-context = "0.2" hex = { version = "0.4.0", default-features = false, features = ["std"] } log = { version = "0.4.8", default-features = false } diff --git a/crates/composefs/fuzz/Cargo.lock b/crates/composefs/fuzz/Cargo.lock index e8640e0a..98909985 100644 --- a/crates/composefs/fuzz/Cargo.lock +++ b/crates/composefs/fuzz/Cargo.lock @@ -66,7 +66,7 @@ dependencies = [ [[package]] name = "composefs" -version = "0.3.0" +version = "0.4.0" dependencies = [ "anyhow", "composefs-ioctls", @@ -78,6 +78,7 @@ dependencies = [ "rustix", "serde", "serde_json", + "serde_repr", "sha2", "thiserror", "tokio", @@ -97,7 +98,7 @@ dependencies = [ [[package]] name = "composefs-ioctls" -version = "0.3.0" +version = "0.4.0" dependencies = [ "rustix", "thiserror", @@ -460,6 +461,17 @@ dependencies = [ "zmij", ] +[[package]] +name = "serde_repr" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "sha2" version = "0.11.0" diff --git a/crates/composefs/fuzz/generate_corpus.rs b/crates/composefs/fuzz/generate_corpus.rs index dc179f76..b8904290 100644 --- a/crates/composefs/fuzz/generate_corpus.rs +++ b/crates/composefs/fuzz/generate_corpus.rs @@ -12,7 +12,8 @@ use std::ffi::{OsStr, OsString}; use std::fs; use std::path::Path; -use composefs::erofs::writer::mkfs_erofs; +use composefs::erofs::format::FormatVersion; +use composefs::erofs::writer::{ValidatedFileSystem, mkfs_erofs, mkfs_erofs_versioned}; use composefs::fsverity::{FsVerityHashValue, Sha256HashValue}; use composefs::generic_tree::{self, LeafContent, Stat}; use composefs::tree::{self, FileSystem, RegularFile}; @@ -27,6 +28,7 @@ fn stat(mode: u32, uid: u32, gid: u32, mtime: i64) -> Stat { st_uid: uid, st_gid: gid, st_mtim_sec: mtime, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), } } @@ -55,20 +57,37 @@ fn insert_dir<'a>(parent: &'a mut Dir, name: &str, s: Stat) -> &'a mut Dir { parent.get_directory_mut(OsStr::new(name)).unwrap() } +/// Generate both V1 and V2 images for a filesystem, pushing them into seeds. +/// +/// The V2 image uses the name as-is. The V1 image appends "_v1" to the name. +/// For V1, overlay whiteouts are added before writing (required for C compat). +fn push_both_versions( + seeds: &mut Vec<(String, Vec)>, + name: &str, + build_fs: impl Fn() -> FileSystem, +) { + // V2 (default) + let fs = build_fs(); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); + seeds.push((name.to_string(), image.into())); + + // V1 (C-compatible) + let mut fs = build_fs(); + fs.add_overlay_whiteouts(); + let image = mkfs_erofs_versioned(&ValidatedFileSystem::new(fs).unwrap(), FormatVersion::V1); + seeds.push((format!("{name}_v1"), image.into())); +} + fn main() { let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); - let mut seeds: Vec<(&str, Vec)> = Vec::new(); + let mut seeds: Vec<(String, Vec)> = Vec::new(); // 1. Empty root - { - let fs = empty_root(); - let image = mkfs_erofs(&fs); - seeds.push(("empty_root", image.into())); - } + push_both_versions(&mut seeds, "empty_root", empty_root); // 2. Single inline file (small content stored in inode) - { + push_both_versions(&mut seeds, "single_inline_file", || { let mut fs = empty_root(); let id = fs.push_leaf( file_stat(), @@ -77,12 +96,11 @@ fn main() { )), ); fs.root.insert(OsStr::new("hello.txt"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("single_inline_file", image.into())); - } + fs + }); // 3. Single external (chunk-based) regular file - { + push_both_versions(&mut seeds, "single_external_file", || { let mut fs = empty_root(); let hash = Sha256HashValue::EMPTY; let id = fs.push_leaf( @@ -90,66 +108,60 @@ fn main() { LeafContent::Regular(RegularFile::External(hash, 65536)), ); fs.root.insert(OsStr::new("data.bin"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("single_external_file", image.into())); - } + fs + }); // 4. Symlink - { + push_both_versions(&mut seeds, "symlink", || { let mut fs = empty_root(); let id = fs.push_leaf( stat(0o777, 0, 0, 0), LeafContent::Symlink(OsString::from("/target/path").into_boxed_os_str()), ); fs.root.insert(OsStr::new("link"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("symlink", image.into())); - } + fs + }); // 5. FIFO - { + push_both_versions(&mut seeds, "fifo", || { let mut fs = empty_root(); let id = fs.push_leaf(file_stat(), LeafContent::Fifo); fs.root.insert(OsStr::new("mypipe"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("fifo", image.into())); - } + fs + }); // 6. Character device - { + push_both_versions(&mut seeds, "chardev", || { let mut fs = empty_root(); let id = fs.push_leaf( stat(0o666, 0, 0, 0), LeafContent::CharacterDevice(makedev(1, 3)), ); fs.root.insert(OsStr::new("null"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("chardev", image.into())); - } + fs + }); // 7. Block device - { + push_both_versions(&mut seeds, "blockdev", || { let mut fs = empty_root(); let id = fs.push_leaf( stat(0o660, 0, 6, 0), LeafContent::BlockDevice(makedev(8, 0)), ); fs.root.insert(OsStr::new("sda"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("blockdev", image.into())); - } + fs + }); // 8. Socket - { + push_both_versions(&mut seeds, "socket", || { let mut fs = empty_root(); let id = fs.push_leaf(file_stat(), LeafContent::Socket); fs.root.insert(OsStr::new("mysock"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("socket", image.into())); - } + fs + }); // 9. Nested directories: /a/b/c/file - { + push_both_versions(&mut seeds, "nested_dirs", || { let mut fs = empty_root(); let id = fs.push_leaf( file_stat(), @@ -161,12 +173,11 @@ fn main() { let b = insert_dir(a, "b", dir_stat()); let c = insert_dir(b, "c", dir_stat()); c.insert(OsStr::new("file"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("nested_dirs", image.into())); - } + fs + }); // 10. Many entries (20+ files to exercise multi-block directories) - { + push_both_versions(&mut seeds, "many_entries", || { let mut fs = empty_root(); for i in 0..25 { let name = format!("file_{i:03}"); @@ -179,12 +190,11 @@ fn main() { ); fs.root.insert(OsStr::new(&name), Inode::leaf(id)); } - let image = mkfs_erofs(&fs); - seeds.push(("many_entries", image.into())); - } + fs + }); // 11. Extended attributes - { + push_both_versions(&mut seeds, "xattrs", || { let mut fs = empty_root(); let mut xattrs = BTreeMap::new(); xattrs.insert( @@ -206,12 +216,11 @@ fn main() { )), ); fs.root.insert(OsStr::new("xattr_file"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("xattrs", image.into())); - } + fs + }); // 12. Mixed types — one of every file type in a single directory - { + push_both_versions(&mut seeds, "mixed_types", || { let mut fs = empty_root(); let ids = [ fs.push_leaf( @@ -246,9 +255,8 @@ fn main() { LeafContent::Regular(RegularFile::External(hash, 4096)), ); fs.root.insert(OsStr::new("external"), Inode::leaf(ext_id)); - let image = mkfs_erofs(&fs); - seeds.push(("mixed_types", image.into())); - } + fs + }); // 13. Hardlink — two entries sharing the same LeafId (nlink > 1) { @@ -263,12 +271,12 @@ fn main() { .insert(OsStr::new("original"), Inode::leaf(shared_id)); fs.root .insert(OsStr::new("hardlink"), Inode::leaf(shared_id)); - let image = mkfs_erofs(&fs); - seeds.push(("hardlink", image.into())); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); + seeds.push(("hardlink".to_string(), image.into())); } // 14. Large inline — file with maximum inline content (just under 4096 bytes) - { + push_both_versions(&mut seeds, "large_inline", || { let mut fs = empty_root(); let content = vec![0xABu8; 4000]; // just under block size let id = fs.push_leaf( @@ -277,12 +285,11 @@ fn main() { ); fs.root .insert(OsStr::new("large_inline.bin"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("large_inline", image.into())); - } + fs + }); // 15. Deep nesting — 8 levels of directories - { + push_both_versions(&mut seeds, "deep_nesting", || { let mut fs = empty_root(); let id = fs.push_leaf( file_stat(), @@ -296,12 +303,11 @@ fn main() { current = insert_dir(current, name, dir_stat()); } current.insert(OsStr::new("deep_file"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("deep_nesting", image.into())); - } + fs + }); // 16. Nonzero mtime - { + push_both_versions(&mut seeds, "nonzero_mtime", || { let mut fs = FileSystem::new(stat(0o755, 0, 0, 1000000)); let id1 = fs.push_leaf( stat(0o644, 0, 0, 500000), @@ -317,12 +323,11 @@ fn main() { ); fs.root.insert(OsStr::new("old"), Inode::leaf(id1)); fs.root.insert(OsStr::new("new"), Inode::leaf(id2)); - let image = mkfs_erofs(&fs); - seeds.push(("nonzero_mtime", image.into())); - } + fs + }); // 17. Large uid/gid — forces extended inodes - { + push_both_versions(&mut seeds, "large_uid_gid", || { let big_id = u16::MAX as u32 + 1; // 65536, won't fit in u16 let mut fs = FileSystem::new(stat(0o755, big_id, big_id, 0)); let id = fs.push_leaf( @@ -332,9 +337,8 @@ fn main() { )), ); fs.root.insert(OsStr::new("bigids.txt"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("large_uid_gid", image.into())); - } + fs + }); // Write seeds to corpus directories for both fuzz targets let targets = ["read_image", "debug_image"]; diff --git a/crates/composefs/proptest-regressions/erofs/reader.txt b/crates/composefs/proptest-regressions/erofs/reader.txt new file mode 100644 index 00000000..40e5b6ca --- /dev/null +++ b/crates/composefs/proptest-regressions/erofs/reader.txt @@ -0,0 +1,8 @@ +# Seeds for failure cases proptest has generated in the past. It is +# automatically read and these particular cases re-run before any +# novel cases are generated. +# +# It is recommended to check this file in to source control so that +# everyone who runs the test benefits from these saved cases. +cc 6938de6542fd6c10b28ba78a0b5c0a8754da1fa13340f4952df34bf43c913f6b # shrinks to spec = FsSpec { root: DirSpec { stat: Stat { st_mode: 0, st_uid: 0, st_gid: 0, st_mtim_sec: 0, st_mtim_nsec: 0, xattrs: {} }, leaves: [], subdirs: [("A", DirSpec { stat: Stat { st_mode: 0, st_uid: 0, st_gid: 0, st_mtim_sec: 0, st_mtim_nsec: 0, xattrs: {} }, leaves: [("a", LeafSpec { stat: Stat { st_mode: 0, st_uid: 0, st_gid: 0, st_mtim_sec: 0, st_mtim_nsec: 1, xattrs: {} }, content: Inline([]) })], subdirs: [] })] }, hardlinks: [HardlinkSpec { source_index: 0, link_name: "G4._s_z6._cbp" }, HardlinkSpec { source_index: 0, link_name: "C1-1Pgx_Cg2g" }, HardlinkSpec { source_index: 0, link_name: "îA\xEB\xCE$\xE6Z\x90K^\u{1d}\xC8\u{18}s\u{10}\x81\u{3}E\xAA" }] } +cc 0ddc52acd61b4976d1e5e21694863a9a3dc6fd2a0af6b620b379c6dcb5603c48 # shrinks to spec = FsSpec { root: DirSpec { stat: Stat { st_mode: 0, st_uid: 0, st_gid: 0, st_mtim_sec: 0, st_mtim_nsec: 0, xattrs: {} }, leaves: [], subdirs: [("A", DirSpec { stat: Stat { st_mode: 0, st_uid: 0, st_gid: 0, st_mtim_sec: 0, st_mtim_nsec: 0, xattrs: {} }, leaves: [("0_AA-aA-00-a0Aa_AA-0aA-aA000A-0_0_A__aAaa0a-_-__Aa0_a0A_aaa--aA0-A_aa-A0-0a0-aAaA0aAa__0AAA0A0-0aaa-a_aA000AA_-A00-a000-0aaaA--0AaAa_a0AaaA_-0Aa-_A0aaA0aA-0A0aaaaA_a0a00__A-aA__aA-A0A-00-00a0a-_A0A0A0_a0A-00aa0AAN5.n.bQHGB_-7GJbF-RrX0alT.t-KOi-S_B-_....Td", LeafSpec { stat: Stat { st_mode: 2755, st_uid: 37448, st_gid: 1536, st_mtim_sec: 341456497, st_mtim_nsec: 880834887, xattrs: {"lustre.lov": [164, 134, 7, 253, 237, 177, 226, 6, 175, 72, 217, 116], "system.posix_acl_default": [178, 49, 193, 209, 177, 17, 102, 91, 120, 161, 152], "user.test_1": [60, 197, 53], "user.test_4": [175, 169, 100, 201, 234, 81, 68, 205, 62, 158, 13]} }, content: Symlink("\x9B\u{1f}\x88\xB5K\xFC\x89uy\\\xD9\xC6\u{c}\u{7}\xA8") })], subdirs: [] })] }, hardlinks: [] } diff --git a/crates/composefs/src/dumpfile.rs b/crates/composefs/src/dumpfile.rs index 7143715d..8b3d6253 100644 --- a/crates/composefs/src/dumpfile.rs +++ b/crates/composefs/src/dumpfile.rs @@ -114,11 +114,12 @@ fn write_entry( let uid = stat.st_uid; let gid = stat.st_gid; let mtim_sec = stat.st_mtim_sec; + let mtim_nsec = stat.st_mtim_nsec; write_escaped(writer, path.as_os_str().as_bytes())?; write!( writer, - " {size} {mode:o} {nlink} {uid} {gid} {rdev} {mtim_sec}.0 " + " {size} {mode:o} {nlink} {uid} {gid} {rdev} {mtim_sec}.{mtim_nsec} " )?; write_escaped(writer, payload.as_ref().as_bytes())?; write!(writer, " ")?; @@ -422,7 +423,7 @@ pub fn add_entry_to_filesystem( // Handle root directory specially if path == Path::new("/") { - let stat = entry_to_stat(&entry); + let stat = entry_to_stat(&entry)?; fs.set_root_stat(stat); return Ok(()); } @@ -439,7 +440,7 @@ pub fn add_entry_to_filesystem( // Convert the entry to an inode let inode = match entry.item { Item::Directory { .. } => { - let stat = entry_to_stat(&entry); + let stat = entry_to_stat(&entry)?; Inode::Directory(Box::new(Directory::new(stat))) } Item::Hardlink { ref target } => { @@ -450,7 +451,7 @@ pub fn add_entry_to_filesystem( Inode::leaf(existing_id) } Item::RegularInline { ref content, .. } => { - let stat = entry_to_stat(&entry); + let stat = entry_to_stat(&entry)?; let data: Box<[u8]> = match content { std::borrow::Cow::Borrowed(d) => Box::from(*d), std::borrow::Cow::Owned(d) => d.clone().into_boxed_slice(), @@ -464,7 +465,7 @@ pub fn add_entry_to_filesystem( ref fsverity_digest, .. } => { - let stat = entry_to_stat(&entry); + let stat = entry_to_stat(&entry)?; let digest = fsverity_digest .as_ref() .ok_or_else(|| anyhow::anyhow!("External file missing fsverity digest"))?; @@ -473,10 +474,19 @@ pub fn add_entry_to_filesystem( let id = push_leaf(fs, stat, content); Inode::leaf(id) } - Item::Device { rdev, .. } => { - let stat = entry_to_stat(&entry); + Item::Device { rdev, nlink } => { // S_IFMT = 0o170000, S_IFBLK = 0o60000, S_IFCHR = 0o20000 - let content = if entry.mode & 0o170000 == 0o60000 { + let is_chardev = entry.mode & 0o170000 != 0o60000; + // A whiteout is a character device with rdev=0; hardlinked whiteouts + // are invalid because composefs cannot represent them correctly. + if is_chardev && rdev == 0 && nlink > 1 { + anyhow::bail!( + "invalid dumpfile: whiteout entry {:?} has nlink > 1", + entry.path + ); + } + let stat = entry_to_stat(&entry)?; + let content = if !is_chardev { LeafContent::BlockDevice(rdev) } else { LeafContent::CharacterDevice(rdev) @@ -485,7 +495,7 @@ pub fn add_entry_to_filesystem( Inode::leaf(id) } Item::Symlink { ref target, .. } => { - let stat = entry_to_stat(&entry); + let stat = entry_to_stat(&entry)?; let target_os: Box = match target { std::borrow::Cow::Borrowed(t) => Box::from(t.as_os_str()), std::borrow::Cow::Owned(t) => Box::from(t.as_os_str()), @@ -495,11 +505,17 @@ pub fn add_entry_to_filesystem( Inode::leaf(id) } Item::Fifo { .. } => { - let stat = entry_to_stat(&entry); + let stat = entry_to_stat(&entry)?; let content = LeafContent::Fifo; let id = push_leaf(fs, stat, content); Inode::leaf(id) } + Item::Socket { .. } => { + let stat = entry_to_stat(&entry)?; + let content = LeafContent::Socket; + let id = push_leaf(fs, stat, content); + Inode::leaf(id) + } }; // Store LeafIds in the hardlinks map for future hardlink lookups @@ -521,7 +537,7 @@ pub fn add_entry_to_filesystem( } /// Convert a dumpfile Entry's metadata into a tree Stat structure. -fn entry_to_stat(entry: &Entry<'_>) -> Stat { +fn entry_to_stat(entry: &Entry<'_>) -> Result { let mut xattrs = BTreeMap::new(); for xattr in &entry.xattrs { let key: Box = match &xattr.key { @@ -535,13 +551,19 @@ fn entry_to_stat(entry: &Entry<'_>) -> Stat { xattrs.insert(key, value); } - Stat { + let nsec = entry.mtime.nsec; + if nsec >= 1_000_000_000 { + anyhow::bail!("Invalid mtime nanoseconds: {nsec} (must be < 1_000_000_000)"); + } + + Ok(Stat { st_mode: entry.mode & 0o7777, // Keep only permission bits st_uid: entry.uid, st_gid: entry.gid, st_mtim_sec: entry.mtime.sec as i64, + st_mtim_nsec: nsec as u32, xattrs, - } + }) } /// Parse a dumpfile string and build a complete FileSystem. @@ -566,7 +588,7 @@ pub fn dumpfile_to_filesystem( "Dumpfile must start with root directory entry, found: {:?}", entry.path ); - break entry_to_stat(&entry); + break entry_to_stat(&entry)?; } None => anyhow::bail!("Dumpfile is empty, expected root directory entry"), } @@ -591,6 +613,19 @@ pub fn dumpfile_to_filesystem( Ok(fs) } +/// Parse a composefs dumpfile string and validate the resulting filesystem +/// for EROFS serialization. +/// +/// Combines [`dumpfile_to_filesystem`] with [`ValidatedFileSystem::new`]. +/// Returns an error if the dumpfile is malformed or if the resulting +/// filesystem violates EROFS invariants (e.g. hardlinked whiteouts). +pub fn dumpfile_to_validated_filesystem( + dumpfile: &str, +) -> anyhow::Result> { + let fs = dumpfile_to_filesystem(dumpfile)?; + crate::erofs::writer::ValidatedFileSystem::new(fs) +} + #[cfg(test)] mod tests { use super::*; @@ -724,6 +759,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), }); let leaf_id = fs.push_leaf( @@ -732,6 +768,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs, }, LeafContent::Regular(RegularFile::Inline(b"test".to_vec().into())), @@ -757,6 +794,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), }; @@ -793,6 +831,23 @@ mod tests { Ok(()) } + /// A whiteout (chardev, rdev=0) with nlink > 1 must be rejected. + #[test] + fn test_hardlinked_whiteout_rejected() { + // /foo 0 20000 2 0 0 0 0.0 - - - + // ^size ^mode ^nlink ^uid ^gid ^rdev ^mtime ^payload ^digest ^xattrs + // mode 20000 = S_IFCHR (character device), rdev=0 → whiteout, nlink=2 + let dumpfile = "/ 0 40755 2 0 0 0 0.0 - - -\n\ + /foo 0 20000 2 0 0 0 0.0 - - -\n"; + let result = dumpfile_to_filesystem::(dumpfile); + let err = result.expect_err("hardlinked whiteout must be rejected"); + let msg = format!("{err:#}"); + assert!( + msg.contains("nlink"), + "error should mention nlink, got: {msg}" + ); + } + /// Helper to escape bytes through write_escaped and return the result. fn escaped(bytes: &[u8]) -> String { let mut out = String::new(); diff --git a/crates/composefs/src/dumpfile_parse.rs b/crates/composefs/src/dumpfile_parse.rs index f8cccdd8..f01a28a3 100644 --- a/crates/composefs/src/dumpfile_parse.rs +++ b/crates/composefs/src/dumpfile_parse.rs @@ -121,6 +121,11 @@ pub enum Item<'p> { /// Number of links nlink: u32, }, + /// A Unix domain socket + Socket { + /// Number of links + nlink: u32, + }, /// A directory Directory { /// Number of links @@ -482,7 +487,10 @@ impl<'p> Entry<'p> { Item::Directory { nlink } } FileType::Socket => { - anyhow::bail!("sockets are not supported"); + Self::check_nonregfile(content, fsverity_digest)?; + Self::check_rdev(rdev)?; + + Item::Socket { nlink } } FileType::Unknown => { anyhow::bail!("Unhandled file type from raw mode: {mode}") @@ -532,6 +540,7 @@ impl Item<'_> { Item::Symlink { nlink, .. } => *nlink, Item::Directory { nlink, .. } => *nlink, Item::Fifo { nlink, .. } => *nlink, + Item::Socket { nlink, .. } => *nlink, _ => 0, } } diff --git a/crates/composefs/src/erofs/composefs.rs b/crates/composefs/src/erofs/composefs.rs index 3acf1844..960064fa 100644 --- a/crates/composefs/src/erofs/composefs.rs +++ b/crates/composefs/src/erofs/composefs.rs @@ -7,19 +7,23 @@ use zerocopy::{FromBytes, Immutable, IntoBytes, KnownLayout}; use crate::fsverity::FsVerityHashValue; -/* From linux/fs/overlayfs/overlayfs.h struct ovl_metacopy */ +/// Overlay metacopy xattr structure for fs-verity digest storage. +/// +/// From linux/fs/overlayfs/overlayfs.h struct ovl_metacopy #[derive(Debug, FromBytes, Immutable, KnownLayout, IntoBytes)] #[repr(C)] -pub(super) struct OverlayMetacopy { +pub struct OverlayMetacopy { version: u8, len: u8, flags: u8, digest_algo: u8, - pub(super) digest: H, + /// The fs-verity digest value. + pub digest: H, } impl OverlayMetacopy { - pub(super) fn new(digest: &H) -> Self { + /// Creates a new overlay metacopy entry with the given digest. + pub fn new(digest: &H) -> Self { Self { version: 0, len: size_of::() as u8, @@ -29,7 +33,8 @@ impl OverlayMetacopy { } } - pub(super) fn valid(&self) -> bool { + /// Checks whether this metacopy entry is valid. + pub fn valid(&self) -> bool { self.version == 0 && self.len == size_of::() as u8 && self.flags == 0 diff --git a/crates/composefs/src/erofs/debug.rs b/crates/composefs/src/erofs/debug.rs index d384d7de..e4905cf7 100644 --- a/crates/composefs/src/erofs/debug.rs +++ b/crates/composefs/src/erofs/debug.rs @@ -341,64 +341,6 @@ impl<'img> ImageVisitor<'img> { } } - fn visit_directory_block(&mut self, block: &DirectoryBlock, path: &Path) -> Result<()> { - for entry in block.entries()? { - let entry = entry?; - if entry.name == b"." || entry.name == b".." { - // TODO: maybe we want to follow those and let deduplication happen - continue; - } - self.visit_inode( - entry.header.inode_offset.get(), - &path.join(OsStr::from_bytes(entry.name)), - )?; - } - Ok(()) - } - - fn visit_inode(&mut self, id: u64, path: &Path) -> Result<()> { - let inode = self.image.inode(id)?; - let segment = match inode { - InodeType::Compact(inode) => SegmentType::CompactInode(inode), - InodeType::Extended(inode) => SegmentType::ExtendedInode(inode), - }; - if self.note(segment, Some(path))? { - // TODO: maybe we want to throw an error if we detect loops - /* already processed */ - return Ok(()); - } - - if let Some(xattrs) = inode.xattrs()? { - for id in xattrs.shared()? { - self.note( - SegmentType::XAttr(self.image.shared_xattr(id.get())?), - Some(path), - )?; - } - } - - if inode.mode().is_dir() { - if let Some(inline) = inode.inline() { - let inline_block = DirectoryBlock::ref_from_bytes(inline) - .map_err(|_| anyhow::anyhow!("invalid inline directory block"))?; - self.visit_directory_block(inline_block, path)?; - } - - for id in self.image.inode_blocks(&inode)? { - let block = self.image.directory_block(id)?; - self.visit_directory_block(block, path)?; - self.note(SegmentType::DirectoryBlock(block), Some(path))?; - } - } else { - for id in self.image.inode_blocks(&inode)? { - let block = self.image.data_block(id)?; - self.note(SegmentType::DataBlock(block), Some(path))?; - } - } - - Ok(()) - } - #[allow(clippy::type_complexity)] fn visit_image( image: &'img Image<'img>, @@ -409,7 +351,70 @@ impl<'img> ImageVisitor<'img> { }; this.note(SegmentType::Header(image.header), None)?; this.note(SegmentType::Superblock(image.sb), None)?; - this.visit_inode(image.sb.root_nid.get() as u64, &PathBuf::from("/"))?; + + // Iterative traversal: push (nid, path) pairs rather than recursing. + // The previous mutual recursion (visit_inode ↔ visit_directory_block) + // had no depth limit and would stack-overflow on deeply nested images. + // Deduplication is by byte offset via note(), so cycles and hardlinks + // are safe: note() returns true on a second visit and we skip children. + let mut stack: Vec<(u64, PathBuf)> = + vec![(image.sb.root_nid.get() as u64, PathBuf::from("/"))]; + + while let Some((id, path)) = stack.pop() { + let inode = this.image.inode(id)?; + let segment = match inode { + InodeType::Compact(inode) => SegmentType::CompactInode(inode), + InodeType::Extended(inode) => SegmentType::ExtendedInode(inode), + }; + if this.note(segment, Some(&path))? { + // Already visited this byte offset — additional path recorded, skip children. + continue; + } + + if let Some(xattrs) = inode.xattrs()? { + for xid in xattrs.shared()? { + this.note( + SegmentType::XAttr(this.image.shared_xattr(xid.get())?), + Some(&path), + )?; + } + } + + if inode.mode().is_dir() { + if let Some(inline) = inode.inline() { + let inline_block = DirectoryBlock::ref_from_bytes(inline) + .map_err(|_| anyhow::anyhow!("invalid inline directory block"))?; + for entry in inline_block.entries()? { + let entry = entry?; + if entry.name != b"." && entry.name != b".." { + stack.push(( + entry.header.inode_offset.get(), + path.join(OsStr::from_bytes(entry.name)), + )); + } + } + } + for blkid in this.image.inode_blocks(&inode)? { + let block = this.image.directory_block(blkid)?; + for entry in block.entries()? { + let entry = entry?; + if entry.name != b"." && entry.name != b".." { + stack.push(( + entry.header.inode_offset.get(), + path.join(OsStr::from_bytes(entry.name)), + )); + } + } + this.note(SegmentType::DirectoryBlock(block), Some(&path))?; + } + } else { + for blkid in this.image.inode_blocks(&inode)? { + let block = this.image.data_block(blkid)?; + this.note(SegmentType::DataBlock(block), Some(&path))?; + } + } + } + Ok(this.visited) } } diff --git a/crates/composefs/src/erofs/format.rs b/crates/composefs/src/erofs/format.rs index cc5a40a2..0b0e4241 100644 --- a/crates/composefs/src/erofs/format.rs +++ b/crates/composefs/src/erofs/format.rs @@ -81,7 +81,7 @@ const INODE_DATALAYOUT_FLAT_INLINE: u16 = 4; const INODE_DATALAYOUT_CHUNK_BASED: u16 = 8; /// Data layout method for file content storage -#[derive(Debug)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] #[repr(u16)] pub enum DataLayout { /// File data stored in separate blocks @@ -271,11 +271,103 @@ impl std::ops::BitOr for FileType { /// EROFS format version number pub const VERSION: U32 = U32::new(1); -/// Composefs-specific version number +/// Composefs-specific version number (V2, Rust-native format) pub const COMPOSEFS_VERSION: U32 = U32::new(2); +/// Composefs-specific version number for V1 (C-compatible format: compact inodes, whiteout table) +pub const COMPOSEFS_VERSION_V1: U32 = U32::new(0); /// Magic number identifying composefs images pub const COMPOSEFS_MAGIC: U32 = U32::new(0xd078629a); +/// Format version for composefs images +/// +/// This enum represents the different format versions supported by composefs. +/// The format version affects the composefs header version field and build time handling. +/// +/// Serialized as an integer: V1 → `1`, V2 → `2`. +#[repr(u32)] +#[derive( + Clone, + Copy, + Debug, + Default, + Hash, + PartialEq, + Eq, + serde_repr::Serialize_repr, + serde_repr::Deserialize_repr, +)] +pub enum FormatVersion { + /// Format V1: compact inodes, whiteout table. + /// + /// This is the original format used by older versions of composefs. + /// Build time is set to the minimum mtime across all inodes. + /// The `composefs_version` header field is 0 normally, but 1 when + /// user-land whiteout files are present (matching C mkcomposefs behavior). + V1 = 1, + /// Format V2: extended inodes, no whiteout table, composefs_version=2 + /// + /// This is the current default format. + #[default] + V2 = 2, +} + +impl FormatVersion { + /// Returns the composefs_version value for this format version + pub fn composefs_version(self) -> U32 { + match self { + FormatVersion::V1 => COMPOSEFS_VERSION_V1, + FormatVersion::V2 => COMPOSEFS_VERSION, + } + } +} + +/// The set of EROFS format versions to generate when committing images. +/// +/// Stored in `meta.json` via the `"v1_erofs"` ro_compat feature flag: +/// flag present → [`V1_ONLY`](Self::V1_ONLY), flag absent → [`BOTH`](Self::BOTH). +/// +/// A `FormatSet` is a small bitset (bit 0 = V1, bit 1 = V2) so it can be +/// cheaply copied and tested without heap allocation. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct FormatSet(u8); + +impl FormatSet { + /// Generate only V1 EROFS (default for new repos; C-tool compatible). + pub const V1_ONLY: FormatSet = FormatSet(0b01); + /// Generate both V1 and V2 EROFS (used by bootc and other multi-format consumers). + pub const BOTH: FormatSet = FormatSet(0b11); + + /// Map a [`FormatVersion`] to its bit position in the `FormatSet` bitset. + /// + /// V1 → bit 0 (`0b01`), V2 → bit 1 (`0b10`). Adding a V3 only requires + /// updating this one function. + fn version_bit(v: FormatVersion) -> u8 { + match v { + FormatVersion::V1 => 0b01, + FormatVersion::V2 => 0b10, + } + } + + /// Returns `true` if this set includes the given format version. + pub fn contains(self, v: FormatVersion) -> bool { + self.0 & Self::version_bit(v) != 0 + } + + /// Iterate over the format versions in this set, in ascending order (V1 before V2). + pub fn iter(self) -> impl Iterator { + [FormatVersion::V1, FormatVersion::V2] + .into_iter() + .filter(move |&v| self.contains(v)) + } +} + +impl From for FormatSet { + /// Create a single-version `FormatSet` from a [`FormatVersion`]. + fn from(v: FormatVersion) -> Self { + FormatSet(FormatSet::version_bit(v)) + } +} + /// Flag indicating the presence of ACL data pub const COMPOSEFS_FLAGS_HAS_ACL: U32 = U32::new(1 << 0); @@ -493,7 +585,52 @@ pub struct XAttrHeader { pub value_size: U16, } -/// Standard xattr name prefixes indexed by name_index +/// EROFS xattr prefix index for `system.posix_acl_access` (index 2). +pub const XATTR_INDEX_POSIX_ACL_ACCESS: u8 = 2; +/// EROFS xattr prefix index for `system.posix_acl_default` (index 3). +pub const XATTR_INDEX_POSIX_ACL_DEFAULT: u8 = 3; +/// EROFS xattr prefix index for `lustre.` (index 5). +/// Absent from C mkcomposefs v1.0.8's prefix table; V1 writer skips it. +pub const XATTR_INDEX_LUSTRE: u8 = 5; + +// Overlay xattr keys used by composefs V1 whiteout escaping. +// Named to match the C mkcomposefs OVERLAY_XATTR_* constants. +/// `trusted.overlay.overlay.whiteout` — V1 escaped whiteout marker. +pub const XATTR_OVERLAY_WHITEOUT: &[u8] = b"trusted.overlay.overlay.whiteout"; +/// `user.overlay.whiteout` — userxattr escaped whiteout marker. +pub const XATTR_USERXATTR_WHITEOUT: &[u8] = b"user.overlay.whiteout"; +/// `trusted.overlay.overlay.whiteouts` — escaped whiteouts directory marker. +pub const XATTR_OVERLAY_WHITEOUTS: &[u8] = b"trusted.overlay.overlay.whiteouts"; +/// `user.overlay.whiteouts` — userxattr whiteouts directory marker. +pub const XATTR_USERXATTR_WHITEOUTS: &[u8] = b"user.overlay.whiteouts"; +/// `trusted.overlay.overlay.opaque` — escaped opaque directory marker. +pub const XATTR_OVERLAY_OPAQUE: &[u8] = b"trusted.overlay.overlay.opaque"; +/// `user.overlay.opaque` — userxattr opaque directory marker. +pub const XATTR_USERXATTR_OPAQUE: &[u8] = b"user.overlay.opaque"; +/// `trusted.overlay.opaque` — root opaque marker written by V1 writer. +pub const XATTR_OVERLAY_OPAQUE_ROOT: &[u8] = b"trusted.overlay.opaque"; +/// `trusted.overlay.metacopy` — metacopy marker (C adds redirect xattr too). +pub const XATTR_OVERLAY_METACOPY: &[u8] = b"trusted.overlay.metacopy"; +/// `trusted.overlay.redirect` — redirect target xattr. +pub const XATTR_OVERLAY_REDIRECT: &[u8] = b"trusted.overlay.redirect"; +/// `trusted.overlay.` prefix — all xattrs with this prefix are escaped in V1. +pub const XATTR_OVERLAY_PREFIX: &[u8] = b"trusted.overlay."; +/// `trusted.overlay.overlay.` prefix — escaped overlay xattr prefix. +pub const XATTR_OVERLAY_ESCAPED_PREFIX: &[u8] = b"trusted.overlay.overlay."; +/// `security.selinux` — SELinux label, copied to overlay whiteout stubs. +pub const XATTR_SECURITY_SELINUX: &[u8] = b"security.selinux"; + +/// Standard xattr name prefixes indexed by EROFS name_index. +/// +/// Index 0 is the fallback (empty prefix, full name stored as suffix). +/// Indices 1–6 map to the well-known EROFS prefix constants: +/// EROFS_XATTR_INDEX_USER=1, POSIX_ACL_ACCESS=2, POSIX_ACL_DEFAULT=3, +/// EROFS_XATTR_INDEX_TRUSTED=4, EROFS_XATTR_INDEX_LUSTRE=5, EROFS_XATTR_INDEX_SECURITY=6. +/// +/// **V1 compatibility note:** C mkcomposefs v1.0.8 does NOT include `lustre.` (index 5) +/// in its prefix table. Any `lustre.*` xattr is therefore encoded with prefix index 0 +/// (raw fallback) by C. For V1 images the writer must skip index 5 during prefix +/// matching so that `lustre.*` xattrs fall through to the empty-string fallback. pub const XATTR_PREFIXES: [&[u8]; 7] = [ b"", b"user.", @@ -519,3 +656,35 @@ pub struct DirectoryEntryHeader { /// Reserved field pub reserved: u8, } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_format_set_contains() { + assert!(FormatSet::BOTH.contains(FormatVersion::V1)); + assert!(FormatSet::BOTH.contains(FormatVersion::V2)); + assert!(FormatSet::V1_ONLY.contains(FormatVersion::V1)); + assert!(!FormatSet::V1_ONLY.contains(FormatVersion::V2)); + } + + #[test] + fn test_format_set_from_version() { + assert_eq!(FormatSet::from(FormatVersion::V1), FormatSet::V1_ONLY); + // V2 alone is a single-version set (neither V1_ONLY nor BOTH). + let v2_only = FormatSet::from(FormatVersion::V2); + assert!(!v2_only.contains(FormatVersion::V1)); + assert!(v2_only.contains(FormatVersion::V2)); + } + + #[test] + fn test_format_set_iter_order() { + // iter() must yield V1 before V2. + let versions: Vec<_> = FormatSet::BOTH.iter().collect(); + assert_eq!(versions, vec![FormatVersion::V1, FormatVersion::V2]); + + let v1_only: Vec<_> = FormatSet::V1_ONLY.iter().collect(); + assert_eq!(v1_only, vec![FormatVersion::V1]); + } +} diff --git a/crates/composefs/src/erofs/reader.rs b/crates/composefs/src/erofs/reader.rs index 06f09932..65568ec1 100644 --- a/crates/composefs/src/erofs/reader.rs +++ b/crates/composefs/src/erofs/reader.rs @@ -17,10 +17,10 @@ use zerocopy::{FromBytes, Immutable, KnownLayout, little_endian::U32}; use super::{ composefs::OverlayMetacopy, format::{ - self, BLOCK_BITS, COMPOSEFS_MAGIC, CompactInodeHeader, ComposefsHeader, DataLayout, - DirectoryEntryHeader, ExtendedInodeHeader, InodeXAttrHeader, MAGIC_V1, ModeField, S_IFBLK, - S_IFCHR, S_IFIFO, S_IFLNK, S_IFMT, S_IFREG, S_IFSOCK, Superblock, VERSION, XATTR_PREFIXES, - XAttrHeader, + self, BLOCK_BITS, COMPOSEFS_MAGIC, COMPOSEFS_VERSION, COMPOSEFS_VERSION_V1, + CompactInodeHeader, ComposefsHeader, DataLayout, DirectoryEntryHeader, ExtendedInodeHeader, + InodeXAttrHeader, MAGIC_V1, ModeField, S_IFBLK, S_IFCHR, S_IFIFO, S_IFLNK, S_IFMT, S_IFREG, + S_IFSOCK, Superblock, VERSION, XATTR_PREFIXES, XAttrHeader, }, }; use crate::MAX_INLINE_CONTENT; @@ -494,8 +494,18 @@ impl<'img> Image<'img> { self.header.version.get(), ))); } - // Note: we don't enforce composefs_version here because C mkcomposefs - // writes version 0 while the Rust writer uses version 2. Both are valid. + // Reject unknown composefs versions. + // 0 = V1 (C-compatible, no user whiteouts) + // 1 = V1 (C-compatible, user whiteouts present — C bumps version when it + // encounters a char-device-rdev-0 entry in the input tree) + // 2 = V2 (Rust-native format) + let cv = self.header.composefs_version.get(); + if cv != COMPOSEFS_VERSION.get() && cv != COMPOSEFS_VERSION_V1.get() && cv != 1 { + return Err(ErofsReaderError::InvalidImage(format!( + "unknown composefs_version {cv} (expected 0, 1, or {})", + COMPOSEFS_VERSION.get(), + ))); + } // Validate EROFS superblock magic if self.sb.magic != MAGIC_V1 { @@ -649,17 +659,29 @@ impl<'img> Image<'img> { } /// Returns a data block by its ID + /// Returns a byte slice of the image at `[offset, offset+len)`, validating + /// that both the offset and the range lie within the image. + /// + /// This is the single choke point for all raw byte accesses derived from + /// image fields (block addresses, xattr offsets, etc.). All callers that + /// compute `blkaddr * block_size + delta` should go through here rather + /// than slicing `self.image` directly. + pub fn image_slice(&self, offset: usize, len: usize) -> Result<&[u8], ErofsReaderError> { + let end = offset + .checked_add(len) + .ok_or(ErofsReaderError::OutOfBounds)?; + self.image + .get(offset..end) + .ok_or(ErofsReaderError::OutOfBounds) + } + + /// Returns a block by its ID as a raw byte slice, validated against the image size. pub fn block(&self, id: u64) -> Result<&[u8], ErofsReaderError> { let start = usize::try_from(id) .ok() .and_then(|id| id.checked_mul(self.block_size)) .ok_or(ErofsReaderError::OutOfBounds)?; - let end = start - .checked_add(self.block_size) - .ok_or(ErofsReaderError::OutOfBounds)?; - self.image - .get(start..end) - .ok_or(ErofsReaderError::OutOfBounds) + self.image_slice(start, self.block_size) } /// Returns a data block by its ID as a DataBlock reference @@ -711,6 +733,150 @@ impl<'img> Image<'img> { Ok(range) } + /// Performs a full structural fsck of the image metadata by traversing the + /// entire inode tree. + /// + /// This is separate from [`Self::restrict_to_composefs`], which only checks + /// superblock and header fields without any traversal. Call this when you + /// want a thorough integrity check (e.g. during repository fsck) rather than + /// just the cheap open-time validation. + /// + /// Currently checks: + /// - V1 images: no FlatInline symlink inode has a block-boundary layout that + /// old Linux kernels (< 6.12) would reject with `EFSCORRUPTED` (`EUCLEAN`). + pub fn fsck_metadata(&self) -> Result<(), ErofsReaderError> { + self.validate_v1_inline_layout() + } + + /// Validates that the image does not contain FlatInline inodes with a layout + /// that old Linux kernels (< 6.12) would reject with `EFSCORRUPTED` (`EUCLEAN`). + /// + /// Only V1 (C-compatible, `composefs_version` = 0 or 1) images are expected to be + /// mounted on kernels that may predate the 6.12 fix; V2 images use a different + /// block-boundary strategy that is frozen for digest stability, so this check + /// is deliberately restricted to V1. + /// + /// The kernel's pre-6.12 fast-symlink path checks: + /// ```text + /// (inode_offset % block_size) + inode_and_xattr_size + inline_size > block_size + /// ``` + /// and returns `-EFSCORRUPTED` if true. This method returns an error for any + /// inode where that condition holds. + fn validate_v1_inline_layout(&self) -> Result<(), ErofsReaderError> { + // Only applies to V1 (C-compatible) images: composefs_version 0 (no user + // whiteouts) or 1 (user whiteouts present). V2 images (composefs_version=2) + // use a frozen layout strategy and are never mounted on pre-6.12 kernels. + let cv = self.header.composefs_version.get(); + if cv >= format::COMPOSEFS_VERSION.get() { + return Ok(()); + } + + let block_size = self.block_size as u64; + + // Walk all reachable inodes from the root rather than iterating raw nid slots. + // The inode table is not densely packed — gaps arise from padding — so + // iterating 0..sb.inos by slot can hit mid-inode bytes that accidentally + // parse as valid-looking headers with garbage xattr_icount values. + let mut stack = vec![self.sb.root_nid.get() as u64]; + let mut visited = std::collections::HashSet::new(); + + while let Some(nid) = stack.pop() { + if !visited.insert(nid) { + continue; + } + let inode = match self.inode(nid) { + Ok(i) => i, + Err(_) => continue, + }; + + // Recurse into directories to find all symlink inodes. + if inode.mode().is_dir() { + // Collect child nids from both inline and block directory data. + let mut child_nids: Vec = Vec::new(); + if let Some(inline) = inode.inline() + && let Ok(block) = DirectoryBlock::ref_from_bytes(inline) + && let Ok(entries) = block.entries() + { + for entry in entries.flatten() { + let name = entry.name; + if name == b"." || name == b".." { + continue; + } + child_nids.push(entry.nid()); + } + } + if let Ok(range) = self.inode_blocks(&inode) { + for blkid in range { + if let Ok(block) = self.directory_block(blkid) + && let Ok(entries) = block.entries() + { + for entry in entries.flatten() { + let name = entry.name; + if name == b"." || name == b".." { + continue; + } + child_nids.push(entry.nid()); + } + } + } + } + stack.extend(child_nids); + continue; + } + + // Only the pre-6.12 symlink fast-path checks the block boundary. + let mode = inode.mode().0.get(); + if mode & S_IFMT != S_IFLNK { + continue; + } + + let layout = match inode.data_layout() { + Ok(l) => l, + Err(_) => continue, + }; + if !matches!(layout, DataLayout::FlatInline) { + continue; // symlink stored out-of-band (long target > block_size) + } + + let inline_size = inode.size() % block_size; + if inline_size == 0 { + continue; + } + + // nid * 32 is the byte offset from meta_start (which is 0 for composefs). + let inode_offset = nid + .checked_mul(32) + .ok_or_else(|| ErofsReaderError::InvalidImage("nid overflow".into()))?; + let inode_pos_in_block = inode_offset % block_size; + + let header_size: u64 = match &inode { + InodeType::Compact(_) => size_of::() as u64, + InodeType::Extended(_) => size_of::() as u64, + }; + let xattr_size = inode.xattr_size() as u64; + let inode_and_xattr_size = header_size.checked_add(xattr_size).ok_or_else(|| { + ErofsReaderError::InvalidImage("inode+xattr size overflow".into()) + })?; + + let total = inode_pos_in_block + .checked_add(inode_and_xattr_size) + .and_then(|t| t.checked_add(inline_size)) + .ok_or_else(|| { + ErofsReaderError::InvalidImage("inline layout size overflow".into()) + })?; + if total > block_size { + return Err(ErofsReaderError::InvalidImage(format!( + "inode at nid {nid} (FlatInline symlink, inode_pos_in_block={inode_pos_in_block}, \ + inode_and_xattr_size={inode_and_xattr_size}, inline_size={inline_size}) \ + would trigger EUCLEAN on kernels older than 6.12: \ + {inode_pos_in_block} + {inode_and_xattr_size} + {inline_size} = {total} > {block_size}" + ))); + } + } + + Ok(()) + } + /// Finds a child directory entry by name within a directory inode. /// /// Returns the nid (inode number) of the child if found. @@ -743,6 +909,41 @@ impl<'img> Image<'img> { } } +/// Check if an inode is a V1 escaped whiteout (a regular file carrying the +/// `trusted.overlay.overlay.whiteout` xattr added by the V1 writer). +/// +/// C composefs v1.0.8 converts char-device-rdev-0 entries to regular files +/// on write (whiteout escaping). The reader must reverse this. +fn is_escaped_v1_whiteout(img: &Image, inode: &InodeType) -> anyhow::Result { + // Only relevant for regular files + let mode = inode.mode().0.get(); + if mode & S_IFMT != S_IFREG { + return Ok(false); + } + + let Some(xattrs_section) = inode.xattrs()? else { + return Ok(false); + }; + + // Check shared xattrs + for id in xattrs_section.shared()? { + let xattr = img.shared_xattr(id.get())?; + let full_name = construct_xattr_name(xattr)?; + if full_name == format::XATTR_OVERLAY_WHITEOUT { + return Ok(true); + } + } + // Check local xattrs + for xattr in xattrs_section.local()? { + let xattr = xattr?; + let full_name = construct_xattr_name(xattr)?; + if full_name == format::XATTR_OVERLAY_WHITEOUT { + return Ok(true); + } + } + Ok(false) +} + // TODO: there must be an easier way... #[derive(FromBytes, Immutable, KnownLayout)] #[repr(C)] @@ -1041,6 +1242,7 @@ impl ObjectCollector { /// Returns a set of all referenced object IDs. pub fn collect_objects(image: &[u8]) -> ReadResult> { let img = Image::open(image)?.restrict_to_composefs()?; + img.fsck_metadata()?; let mut this = ObjectCollector { visited_nids: HashSet::new(), nids_to_visit: BTreeSet::new(), @@ -1078,21 +1280,23 @@ fn construct_xattr_name(xattr: &XAttr) -> Result, ErofsReaderError> { /// - Strips `trusted.overlay.metacopy` and `trusted.overlay.redirect` /// - Unescapes `trusted.overlay.overlay.X` back to `trusted.overlay.X` fn stat_from_inode_for_tree(img: &Image, inode: &InodeType) -> anyhow::Result { - let (st_mode, st_uid, st_gid, st_mtim_sec) = match inode { + let (st_mode, st_uid, st_gid, st_mtim_sec, st_mtim_nsec) = match inode { InodeType::Compact(inode) => ( inode.header.mode.0.get() as u32 & 0o7777, inode.header.uid.get() as u32, inode.header.gid.get() as u32, - // Compact inodes don't store mtime; the writer uses build_time - // but for round-trip purposes, 0 matches what was written for - // compact headers (the writer always uses ExtendedInodeHeader) - 0i64, + // Compact inodes don't store mtime; use superblock build_time + // (the writer sets build_time = min mtime across all inodes) + img.sb.build_time.get() as i64, + // and build_time_nsec for the nanosecond component + img.sb.build_time_nsec.get(), ), InodeType::Extended(inode) => ( inode.header.mode.0.get() as u32 & 0o7777, inode.header.uid.get(), inode.header.gid.get(), inode.header.mtime.get() as i64, + inode.header.mtime_nsec.get(), ), }; @@ -1120,6 +1324,7 @@ fn stat_from_inode_for_tree(img: &Image, inode: &InodeType) -> anyhow::Result anyhow::Result anyhow::Result, Box<[u8]>)>> { let full_name = construct_xattr_name(xattr)?; - // Skip internal overlay xattrs added by the writer - if full_name == b"trusted.overlay.metacopy" || full_name == b"trusted.overlay.redirect" { + // Skip internal overlay xattrs added by the writer (metacopy/redirect + // are composefs-internal and should not be exposed to readers). + if full_name == format::XATTR_OVERLAY_METACOPY || full_name == format::XATTR_OVERLAY_REDIRECT { + return Ok(None); + } + + // V1 whiteout escaping artifacts: strip these internal xattrs. + // XATTR_OVERLAY_WHITEOUT signals the inode is a whiteout (handled separately). + // The *_WHITEOUTS, *_OPAQUE, and user-namespace variants are parent-dir markers + // added by the V1 writer that are composefs-internal. + // Note: XATTR_OVERLAY_OPAQUE must be listed explicitly here because the general + // unescape handler below would otherwise expose it as trusted.overlay.opaque. + if full_name == format::XATTR_OVERLAY_WHITEOUT + || full_name == format::XATTR_OVERLAY_WHITEOUTS + || full_name == format::XATTR_OVERLAY_OPAQUE + || full_name == format::XATTR_USERXATTR_WHITEOUT + || full_name == format::XATTR_USERXATTR_WHITEOUTS + || full_name == format::XATTR_USERXATTR_OPAQUE + { return Ok(None); } // Unescape: trusted.overlay.overlay.X -> trusted.overlay.X - if let Some(rest) = full_name.strip_prefix(b"trusted.overlay.overlay.") { - let mut unescaped = b"trusted.overlay.".to_vec(); + if let Some(rest) = full_name.strip_prefix(format::XATTR_OVERLAY_ESCAPED_PREFIX) { + let mut unescaped = format::XATTR_OVERLAY_PREFIX.to_vec(); unescaped.extend_from_slice(rest); let name = Box::from(OsStr::from_bytes(&unescaped)); let value = Box::from(xattr.value()?); return Ok(Some((name, value))); } // Skip all other trusted.overlay.* xattrs (internal to composefs) - if full_name.starts_with(b"trusted.overlay.") { + if full_name.starts_with(format::XATTR_OVERLAY_PREFIX) { return Ok(None); } @@ -1393,6 +1615,25 @@ fn populate_directory( let name = OsStr::from_bytes(name_bytes); let child_inode = img.inode(nid)?; + // Skip overlay whiteout entries — but only in the root directory. + // C composefs only skips hex-named (00–ff) chardev(0,0) entries in root + // (lcfs-writer-erofs.c: "Skip real whiteouts (00-ff)"). + // A chardev(0,0) in a subdirectory is a legitimate device node. + // + // In V1 images the writer escapes whiteouts to regular files with + // trusted.overlay.overlay.whiteout xattr, so we must check both + // the native chardev form and the escaped regular-file form. + let is_root_dir = dir_nid == img.sb.root_nid.get() as u64; + let is_escaped_whiteout = is_escaped_v1_whiteout(img, &child_inode)?; + let is_native_whiteout = child_inode.is_whiteout(); + if is_root_dir + && (is_native_whiteout || is_escaped_whiteout) + && name_bytes.len() == 2 + && name_bytes.iter().all(|b| b.is_ascii_hexdigit()) + { + continue; + } + if child_inode.mode().is_dir() { n_subdirs = n_subdirs .checked_add(1) @@ -1427,7 +1668,14 @@ fn populate_directory( let content = match file_type { S_IFREG => { - if let Some(digest) = extract_metacopy_digest::(img, &child_inode)? { + // V1 images escape whiteouts (char dev rdev=0) to regular files. + // The is_escaped_whiteout flag was computed above (before the + // root-dir skip check), so reuse it here. + if is_escaped_whiteout { + tree::LeafContent::CharacterDevice(0) + } else if let Some(digest) = + extract_metacopy_digest::(img, &child_inode)? + { tree::LeafContent::Regular(tree::RegularFile::External( digest, child_inode.size(), @@ -1468,10 +1716,19 @@ fn populate_directory( _ => anyhow::bail!("unknown file type {:#o} for {:?}", file_type, name), }; + // Hardlinked whiteouts are semantically invalid: a whiteout represents the + // absence of a file in an overlay, so nlink > 1 is meaningless. + let on_disk_nlink = child_inode.nlink(); + if matches!(content, tree::LeafContent::CharacterDevice(0)) && on_disk_nlink > 1 { + anyhow::bail!( + "invalid composefs image: whiteout inode {:?} has nlink > 1", + name + ); + } + let leaf_id = builder.push_leaf(stat, content); // Track for hardlink detection if nlink > 1 - let on_disk_nlink = child_inode.nlink(); if on_disk_nlink > 1 { builder.hardlinks.insert(nid, leaf_id); } @@ -1572,7 +1829,7 @@ mod tests { use super::*; use crate::{ dumpfile::{dumpfile_to_filesystem, write_dumpfile}, - erofs::writer::mkfs_erofs, + erofs::writer::{ValidatedFileSystem, mkfs_erofs}, fsverity::Sha256HashValue, }; use std::collections::HashMap; @@ -1653,7 +1910,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); let img = Image::open(&image).unwrap(); // Root should have . and .. and empty_dir @@ -1698,7 +1955,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); let img = Image::open(&image).unwrap(); // Find dir1 @@ -1743,7 +2000,7 @@ mod tests { } let fs = dumpfile_to_filesystem::(&dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); let img = Image::open(&image).unwrap(); // Find bigdir @@ -1793,7 +2050,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); let img = Image::open(&image).unwrap(); // Navigate through the structure @@ -1831,7 +2088,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); let img = Image::open(&image).unwrap(); let root_inode = img.root().unwrap(); @@ -1877,7 +2134,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); // This should traverse all directories without error let result = collect_objects::(&image); @@ -1953,7 +2210,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); let img = Image::open(&image).unwrap(); // Verify root entries @@ -2000,7 +2257,7 @@ mod tests { write_dumpfile(&mut orig_output, &fs_orig).unwrap(); let orig_str = String::from_utf8(orig_output).unwrap(); - let image = mkfs_erofs(&fs_orig); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs_orig).unwrap()); let fs_rt = erofs_to_filesystem::(&image).unwrap(); let mut rt_output = Vec::new(); @@ -2105,7 +2362,8 @@ mod tests { "#; let fs_orig = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs_orig); + let vfs_orig = ValidatedFileSystem::new(fs_orig).unwrap(); + let image = mkfs_erofs(&vfs_orig); let fs_rt = erofs_to_filesystem::(&image).unwrap(); // Verify hardlink sharing via LeafId @@ -2120,7 +2378,7 @@ mod tests { // Verify dumpfile round-trips correctly let mut orig_output = Vec::new(); - write_dumpfile(&mut orig_output, &fs_orig).unwrap(); + write_dumpfile(&mut orig_output, &vfs_orig.0).unwrap(); let orig_str = String::from_utf8(orig_output).unwrap(); let mut rt_output = Vec::new(); @@ -2149,7 +2407,7 @@ mod tests { // Build a minimal valid composefs image (just a root directory). let dumpfile = "/ 0 40755 2 0 0 0 1000.0 - - -\n"; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let base_image = mkfs_erofs(&fs); + let base_image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); // Sanity: the unmodified image passes restrict_to_composefs(). Image::open(&base_image) @@ -2278,7 +2536,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let base_image = mkfs_erofs(&fs); + let base_image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); // Sanity: unmodified image round-trips fine erofs_to_filesystem::(&base_image) @@ -2335,9 +2593,14 @@ mod tests { for case in &cases { let mut image = base_image.clone(); - let offset = inline_offset + case.entry_byte_offset; + let entry_start = inline_offset + case.entry_byte_offset; // Write a bogus nid (0xDEAD) that doesn't match the directory's own nid - image[offset..offset + 8].copy_from_slice(&0xDEADu64.to_le_bytes()); + // Use zerocopy to get a typed &mut DirectoryEntryHeader instead of raw bytes. + let hdr = DirectoryEntryHeader::mut_from_bytes( + &mut image[entry_start..entry_start + size_of::()], + ) + .expect("entry slice must be a valid DirectoryEntryHeader"); + hdr.inode_offset = zerocopy::little_endian::U64::new(0xDEAD); let result = erofs_to_filesystem::(&image); let err = result.expect_err(&format!("{}: should have been rejected", case.name)); @@ -2369,7 +2632,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let base_image = mkfs_erofs(&fs); + let base_image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); // Sanity check erofs_to_filesystem::(&base_image) @@ -2380,22 +2643,26 @@ mod tests { let root_nid = img.sb.root_nid.get() as u64; let file_nid = img.find_child_nid(root_nid, b"file").unwrap().unwrap(); - // Compute byte offset of the file's inode in the image - let block_size = img.block_size; - let meta_start = img.sb.meta_blkaddr.get() as usize * block_size; - let inode_byte_offset = meta_start + file_nid as usize * 32; - let is_extended = base_image[inode_byte_offset] & 1 != 0; + // Use the typed Image API to locate the inode slot without raw byte arithmetic. + let inode = img.inode(file_nid).unwrap(); + let is_extended = matches!(inode, InodeType::Extended(_)); + let inodes_start = img.image.len() - img.inodes.len(); + let inode_slot_start = inodes_start + file_nid as usize * 32; + drop(inode); drop(img); let mut image = base_image.clone(); + let slot = &mut image[inode_slot_start..]; if is_extended { - // ExtendedInodeHeader.nlink is U32 at byte offset 44 - let nlink_offset = inode_byte_offset + 44; - image[nlink_offset..nlink_offset + 4].copy_from_slice(&5u32.to_le_bytes()); + let hdr = + ExtendedInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid ExtendedInodeHeader"); + hdr.nlink = zerocopy::little_endian::U32::new(5); } else { - // CompactInodeHeader.nlink is U16 at byte offset 6 - let nlink_offset = inode_byte_offset + 6; - image[nlink_offset..nlink_offset + 2].copy_from_slice(&5u16.to_le_bytes()); + let hdr = + CompactInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid CompactInodeHeader"); + hdr.nlink = zerocopy::little_endian::U16::new(5); } let result = erofs_to_filesystem::(&image); @@ -2421,7 +2688,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let base_image = mkfs_erofs(&fs); + let base_image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); // Sanity check erofs_to_filesystem::(&base_image) @@ -2432,21 +2699,26 @@ mod tests { let root_nid = img.sb.root_nid.get() as u64; let dir_nid = img.find_child_nid(root_nid, b"dir").unwrap().unwrap(); - let block_size = img.block_size; - let meta_start = img.sb.meta_blkaddr.get() as usize * block_size; - let inode_byte_offset = meta_start + dir_nid as usize * 32; - let is_extended = base_image[inode_byte_offset] & 1 != 0; + // Use the typed Image API to locate the inode slot without raw byte arithmetic. + let inode = img.inode(dir_nid).unwrap(); + let is_extended = matches!(inode, InodeType::Extended(_)); + let inodes_start = img.image.len() - img.inodes.len(); + let inode_slot_start = inodes_start + dir_nid as usize * 32; + drop(inode); drop(img); let mut image = base_image.clone(); + let slot = &mut image[inode_slot_start..]; if is_extended { - // ExtendedInodeHeader.nlink is U32 at byte offset 44 - let nlink_offset = inode_byte_offset + 44; - image[nlink_offset..nlink_offset + 4].copy_from_slice(&99u32.to_le_bytes()); + let hdr = + ExtendedInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid ExtendedInodeHeader"); + hdr.nlink = zerocopy::little_endian::U32::new(99); } else { - // CompactInodeHeader.nlink is U16 at byte offset 6 - let nlink_offset = inode_byte_offset + 6; - image[nlink_offset..nlink_offset + 2].copy_from_slice(&99u16.to_le_bytes()); + let hdr = + CompactInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid CompactInodeHeader"); + hdr.nlink = zerocopy::little_endian::U16::new(99); } let result = erofs_to_filesystem::(&image); @@ -2471,30 +2743,35 @@ mod tests { // stays the same and the inode still parses successfully. let dumpfile = "/ 0 40755 1 0 0 0 0.0 - - -\n"; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let mut image = mkfs_erofs(&fs); + let mut image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); let img = Image::open(&image).unwrap(); - let root_nid = img.sb.root_nid.get() as usize; + let root_nid = img.sb.root_nid.get() as u64; let block_size = img.block_size; - let meta_start = img.sb.meta_blkaddr.get() as usize * block_size; - let inode_offset = meta_start + root_nid * 32; - // Determine inode layout from the first byte - let is_extended = image[inode_offset] & 1 != 0; + + // Use the typed Image API to locate the inode slot without raw byte arithmetic. + let inode = img.inode(root_nid).unwrap(); + let is_extended = matches!(inode, InodeType::Extended(_)); + let inodes_start = img.image.len() - img.inodes.len(); + let inode_slot_start = inodes_start + root_nid as usize * 32; + drop(inode); drop(img); // Use a huge size that is a multiple of block_size (4096) so inline // tail size stays 0 and the inode remains parseable. let huge_size: u64 = (block_size as u64) * 1_000_000_000; + let slot = &mut image[inode_slot_start..]; if is_extended { - // ExtendedInodeHeader.size is a U64 at byte offset 8 - let size_offset = inode_offset + 8; - image[size_offset..size_offset + 8].copy_from_slice(&huge_size.to_le_bytes()); + let hdr = + ExtendedInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid ExtendedInodeHeader"); + hdr.size = zerocopy::little_endian::U64::new(huge_size); } else { - // CompactInodeHeader.size is a U32 at byte offset 8 - let size_offset = inode_offset + 8; - let truncated = huge_size as u32; - image[size_offset..size_offset + 4].copy_from_slice(&truncated.to_le_bytes()); + let hdr = + CompactInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid CompactInodeHeader"); + hdr.size = zerocopy::little_endian::U32::new(huge_size as u32); } let img = Image::open(&image).unwrap(); @@ -2510,43 +2787,558 @@ mod tests { mod proptest_tests { use super::*; + use crate::erofs::{format::FormatVersion, writer::mkfs_erofs_versioned}; use crate::fsverity::Sha512HashValue; - use crate::test::proptest_strategies::{build_filesystem, filesystem_spec}; + use crate::test::proptest_strategies::{ + FsSpec, build_filesystem, build_unusual_filesystem, filesystem_spec, + unusual_filesystem_spec, + }; use proptest::prelude::*; - /// Round-trip a FileSystem through erofs and compare dumpfile output. - fn round_trip_filesystem( - fs_orig: &tree::FileSystem, - ) { - let mut orig_output = Vec::new(); - write_dumpfile(&mut orig_output, fs_orig).unwrap(); - - let image = mkfs_erofs(fs_orig); + /// Round-trip a FileSystem through V2 erofs and compare dumpfile output. + /// + /// V2 EROFS does not store mtime nanoseconds: the on-disk `mtime_nsec` + /// field is always zero. Build the expected dumpfile from a copy of the + /// filesystem with `mtime_nsec` zeroed so the comparison reflects what + /// V2 actually stores, not what the in-memory tree carries. + fn round_trip_filesystem(spec: FsSpec) { + // fs_write → source for the EROFS image. + // fs_expected → reference with mtime_nsec=0, matching V2 on-disk format. + let fs_write = build_filesystem::(spec.clone()); + let mut fs_expected = build_filesystem::(spec); + // V2 EROFS does not store mtime nanoseconds; zero them before comparing. + fs_expected.for_each_stat_mut(|s| s.st_mtim_nsec = 0); + + let mut expected_output = Vec::new(); + write_dumpfile(&mut expected_output, &fs_expected).unwrap(); + + let image = mkfs_erofs(&ValidatedFileSystem::new(fs_write).unwrap()); let fs_rt = erofs_to_filesystem::(&image).unwrap(); let mut rt_output = Vec::new(); write_dumpfile(&mut rt_output, &fs_rt).unwrap(); similar_asserts::assert_eq!( - String::from_utf8_lossy(&orig_output), + String::from_utf8_lossy(&expected_output), String::from_utf8_lossy(&rt_output) ); } + /// Round-trip a FileSystem through V1 erofs and compare dumpfile output. + /// + /// V1 uses compact inodes (when mtime matches the minimum), BFS ordering, + /// and includes overlay whiteout character device entries in the root. + /// The writer adds `trusted.overlay.opaque` to the root; the reader strips + /// internal overlay xattrs. Whiteout char-device entries (00–ff in root) + /// are also stripped, matching C composefs reader behaviour. + fn round_trip_filesystem_v1(spec: FsSpec) { + // Build two separate filesystems from the same spec so we avoid + // Rc::strong_count issues from sharing leaf Rcs. + let mut fs_write = build_filesystem::(spec.clone()); + let fs_expected = build_filesystem::(spec); + + // Only the write side needs whiteouts — the reader strips them + // just like C composefs does. + fs_write.add_overlay_whiteouts(); + + // The writer internally adds trusted.overlay.opaque=y to root, + // but the reader strips all trusted.overlay.* xattrs that aren't + // escaped user xattrs. So the expected filesystem should NOT have it. + + // Generate the V1 image from the write filesystem. + let image = mkfs_erofs_versioned( + &ValidatedFileSystem::new(fs_write).unwrap(), + FormatVersion::V1, + ); + + // Validate the layout invariant: no FlatInline inode should + // trigger EUCLEAN on kernels < 6.12. This catches the + // block-boundary bug even when proptest doesn't generate a + // case large enough to trip it at mount time. + Image::open(&image) + .unwrap() + .fsck_metadata() + .expect("V1 image should have valid inline layout for pre-6.12 kernels"); + + // Read back from the image. + let fs_rt = erofs_to_filesystem::(&image).unwrap(); + + // Compare via dumpfile serialization. + let mut expected_output = Vec::new(); + write_dumpfile(&mut expected_output, &fs_expected).unwrap(); + + let mut rt_output = Vec::new(); + write_dumpfile(&mut rt_output, &fs_rt).unwrap(); + + if expected_output != rt_output { + let expected_str = String::from_utf8_lossy(&expected_output); + let rt_str = String::from_utf8_lossy(&rt_output); + panic!( + "V1 round-trip mismatch:\n--- expected ---\n{expected_str}\n--- got ---\n{rt_str}" + ); + } + } + + /// Verify that C composefs-info can parse an EROFS image we generated, + /// and that its dump output matches our Rust reader's interpretation. + /// + /// This is the critical compatibility test: it proves that EROFS images + /// produced by our writer are consumable by the C implementation. + fn verify_c_composefs_info_reads_image(image: &[u8]) { + use std::io::Write; + + // Validate layout invariant before testing C reader compatibility. + Image::open(image) + .unwrap() + .fsck_metadata() + .expect("image should have valid inline layout for pre-6.12 kernels"); + + // Write image to a tempfile + let mut tmp = tempfile::NamedTempFile::new().unwrap(); + tmp.write_all(image).unwrap(); + tmp.flush().unwrap(); + + // Run C composefs-info dump on the image with a timeout. + let child = std::process::Command::new("composefs-info") + .arg("dump") + .arg(tmp.path()) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .unwrap(); + + let output = { + let (tx, rx) = std::sync::mpsc::channel(); + std::thread::spawn(move || { + let _ = tx.send(child.wait_with_output()); + }); + rx.recv_timeout(std::time::Duration::from_secs(10)) + .expect("composefs-info timed out after 10 seconds") + .unwrap() + }; + + if !output.status.success() { + panic!( + "C composefs-info dump failed (exit {:?}):\nstderr: {}", + output.status.code(), + String::from_utf8_lossy(&output.stderr), + ); + } + + let c_dump = String::from_utf8(output.stdout).expect("C dump should be valid UTF-8"); + + // Get our Rust reader's interpretation of the same image + let fs_rt = erofs_to_filesystem::(image).unwrap(); + let mut rust_dump_bytes = Vec::new(); + write_dumpfile(&mut rust_dump_bytes, &fs_rt).unwrap(); + let rust_dump = String::from_utf8(rust_dump_bytes).unwrap(); + + // Parse both dumps into structured entries, then normalize and + // compare. This avoids fragile string munging and lets the + // dumpfile parser handle escaping, field splitting, etc. + // + // Apply the C reader empty-xattr workaround to the Rust dump as + // well: we are testing C-reader compatibility here, so we strip + // the same entries C would silently drop. Rust-only round-trip + // tests (test_erofs_round_trip_*) compare dumpfiles directly + // without this workaround, catching Rust writer bugs without masking them. + let c_entries = parse_c_dump(&c_dump); + let rust_entries = parse_c_dump(&rust_dump); + + similar_asserts::assert_eq!(c_entries, rust_entries); + } + + /// Parse a dump produced by C composefs-info and normalize for comparison. + /// + /// Applies the empty-xattr workaround for the known C reader bug: the + /// inline-xattr loop uses strict `<` instead of `<=` when checking the + /// end pointer, so it silently skips the last entry whenever it is exactly + /// 4 bytes (header only: name_len=0, value_size=0). This occurs for + /// system.posix_acl_access/default with empty values, where the prefix + /// index encodes the full key leaving a zero-length suffix. + fn parse_c_dump(dump: &str) -> Vec { + normalize_dump(dump, true) + } + + /// Parse a dump produced by our Rust reader and normalize for comparison. + /// + /// Does NOT apply the C reader empty-xattr workaround — Rust output must + /// be left unfiltered so any Rust writer bugs producing empty xattrs are + /// caught rather than silently masked. + /// + /// For C compat tests, use [`parse_c_dump`] on both sides so the + /// comparison accounts for the known C reader limitation. + + fn normalize_dump(dump: &str, strip_empty_xattrs: bool) -> Vec { + use crate::dumpfile_parse::{Entry, Item}; + use std::os::unix::ffi::OsStrExt; + + dump.lines() + .filter(|line| !line.is_empty()) + .filter_map(|line| { + let mut entry = Entry::parse(line).unwrap_or_else(|e| { + panic!("Failed to parse dump line: {e}\n line: {line}") + }); + + // C composefs-info (lcfs_build_node_from_image) unconditionally + // treats any chardev with rdev=0 as a whiteout and skips it, + // returning ENOTSUP regardless of where in the tree it appears: + // + // if (type == S_IFCHR && node->inode.st_rdev == 0) { + // errno = ENOTSUP; + // return NULL; + // } + // + // Our Rust reader preserves chardev(0,0) entries in subdirectories + // (it only strips the root-level 00–ff overlay whiteout stubs). + // Strip all chardev(0,0) entries from both sides of the comparison + // so the test reflects what C actually outputs. + if let Item::Device { rdev: 0, .. } = entry.item { + if (entry.mode & 0o170000) == 0o20000 { + return None; + } + } + + if strip_empty_xattrs { + entry.xattrs.retain(|x| !x.value.is_empty()); + } + // Strip overlay xattrs that the C reader keeps but our Rust reader + // strips as composefs-internal: + // - user.overlay.opaque: OVERLAY_XATTR_USERXATTR_OPAQUE, kept by C + // - trusted.overlay.opaque: the C reader unescapes + // trusted.overlay.overlay.opaque to this; Rust strips the + // escaped form before unescaping so it never appears in Rust + // output. Normalizing both sides makes the comparison test + // semantic content rather than internal overlay state. + entry.xattrs.retain(|x| { + x.key.as_bytes() != b"user.overlay.opaque" + && x.key.as_bytes() != b"trusted.overlay.opaque" + }); + Some(entry.to_string()) + }) + .collect() + } + proptest! { - #![proptest_config(ProptestConfig::with_cases(64))] + #![proptest_config(ProptestConfig::with_cases(200))] #[test] fn test_erofs_round_trip_sha256(spec in filesystem_spec()) { - let fs = build_filesystem::(spec); - round_trip_filesystem(&fs); + round_trip_filesystem::(spec); } #[test] fn test_erofs_round_trip_sha512(spec in filesystem_spec()) { - let fs = build_filesystem::(spec); - round_trip_filesystem(&fs); + round_trip_filesystem::(spec); + } + + #[test] + fn test_erofs_round_trip_v1_sha256(spec in filesystem_spec()) { + round_trip_filesystem_v1::(spec); + } + + #[test] + fn test_erofs_round_trip_v1_sha512(spec in filesystem_spec()) { + round_trip_filesystem_v1::(spec); + } + + } + + /// Verify C composefs-info can parse random V1 (C-compatible) EROFS + /// images generated by our writer, and that its dump output matches + /// our Rust reader's interpretation. + #[test_with::executable(composefs-info)] + #[test] + fn test_c_composefs_info_reads_v1() { + let mut runner = + proptest::test_runner::TestRunner::new(ProptestConfig::with_cases(200)); + runner + .run(&filesystem_spec(), |spec| { + let mut fs = build_filesystem::(spec); + fs.add_overlay_whiteouts(); + let image = mkfs_erofs_versioned( + &ValidatedFileSystem::new(fs).unwrap(), + FormatVersion::V1, + ); + verify_c_composefs_info_reads_image(&image); + Ok(()) + }) + .unwrap(); + } + + /// Verify C composefs-info can parse random V2 (Rust-native) EROFS + /// images generated by our writer. + #[test_with::executable(composefs-info)] + #[test] + fn test_c_composefs_info_reads_v2() { + let mut runner = + proptest::test_runner::TestRunner::new(ProptestConfig::with_cases(200)); + runner + .run(&filesystem_spec(), |spec| { + let fs = build_filesystem::(spec); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); + verify_c_composefs_info_reads_image(&image); + Ok(()) + }) + .unwrap(); + } + + /// Verify C composefs-info can parse random V2 EROFS images generated from + /// unusual content (whiteout escaping, ACLs, multiple overlay xattrs, large + /// external files, cross-type hardlinks), and that its dump output matches + /// our Rust reader's interpretation. + /// + /// Mirrors `test_v1_binary_identical_unusual_content` but for V2 images + /// where byte-for-byte C identity is not the goal (V2 is Rust-native); + /// instead we verify semantic equivalence via normalized dump comparison. + #[test_with::executable(composefs-info)] + #[test] + fn test_c_composefs_info_reads_v2_unusual() { + let mut runner = + proptest::test_runner::TestRunner::new(ProptestConfig::with_cases(200)); + runner + .run(&unusual_filesystem_spec(), |spec| { + let fs = build_unusual_filesystem::(spec); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); + verify_c_composefs_info_reads_image(&image); + Ok(()) + }) + .unwrap(); + } + + /// Run `debug_img` on an image and return the structured dump as a String. + fn debug_dump(image: &[u8]) -> String { + use crate::erofs::debug::debug_img; + let mut out = Vec::new(); + debug_img(&mut out, image).expect("debug_img failed"); + String::from_utf8(out).expect("debug_img produced non-UTF8") + } + + /// Diff two debug dumps, returning a unified-diff-style string of the differences. + fn diff_debug_dumps(label_a: &str, a: &str, label_b: &str, b: &str) -> String { + use std::fmt::Write; + let a_lines: Vec<&str> = a.lines().collect(); + let b_lines: Vec<&str> = b.lines().collect(); + let mut out = String::new(); + let max = a_lines.len().max(b_lines.len()); + let mut diffs = 0usize; + for i in 0..max { + let la = a_lines.get(i).copied().unwrap_or(""); + let lb = b_lines.get(i).copied().unwrap_or(""); + if la != lb { + diffs += 1; + if diffs <= 40 { + writeln!(out, "line {i}:").unwrap(); + writeln!(out, " {label_a}: {la}").unwrap(); + writeln!(out, " {label_b}: {lb}").unwrap(); + } + } + } + if diffs > 40 { + writeln!(out, "... and {} more differing lines", diffs - 40).unwrap(); } + if diffs == 0 { + out.push_str("(no differences)"); + } + out + } + + /// Run C `mkcomposefs --from-file -` on a dumpfile string and return the raw image bytes. + fn c_mkcomposefs_from_dumpfile(dumpfile: &str) -> Vec { + use std::io::{Read, Seek, SeekFrom, Write}; + // Write dumpfile to a tempfile + let mut tf = tempfile::tempfile().unwrap(); + tf.write_all(dumpfile.as_bytes()).unwrap(); + tf.seek(SeekFrom::Start(0)).unwrap(); + // Run mkcomposefs --from-file - - + let out_tf = tempfile::tempfile().unwrap(); + let mut child = std::process::Command::new("mkcomposefs") + .args(["--from-file", "-", "-"]) + .stdin(std::process::Stdio::from(tf)) + .stdout(std::process::Stdio::from(out_tf.try_clone().unwrap())) + .stderr(std::process::Stdio::inherit()) + .spawn() + .expect("failed to spawn mkcomposefs"); + let status = child.wait().unwrap(); + assert!(status.success(), "mkcomposefs failed: {status}"); + let mut out_tf = out_tf; + out_tf.seek(SeekFrom::Start(0)).unwrap(); + let mut bytes = Vec::new(); + out_tf.read_to_end(&mut bytes).unwrap(); + bytes + } + + /// Verify that our Rust V1 writer produces byte-for-byte identical EROFS images + /// to C mkcomposefs for the same user-level input. + /// + /// This is a stronger check than `test_c_composefs_info_reads_v1`: instead of + /// comparing parsed dump output (which won't catch wrong binary layout like the + /// EUCLEAN block-boundary bug), we compare raw image bytes. If our V1 writer + /// disagrees with the C reference even on a single padding byte, this fails. + /// + /// The test mirrors the production flow: C receives a dumpfile of the user-level + /// tree (no whiteout stubs) and adds the 256 stubs internally, while the Rust + /// writer operates on the in-memory tree after `add_overlay_whiteouts()`. + /// + /// On failure the structural diff from `debug_img` is printed to make the + /// divergence immediately obvious without a separate manual step. + #[test_with::executable(mkcomposefs)] + #[test] + fn test_v1_binary_identical_to_c_mkcomposefs() { + let mut runner = + proptest::test_runner::TestRunner::new(ProptestConfig::with_cases(200)); + runner + .run(&filesystem_spec(), |spec| { + // Build two independent filesystems from the same spec: + // fs_c — user entries only, serialized as a dumpfile and fed to + // C mkcomposefs (which adds the 256 whiteout stubs internally) + // fs_rs — user entries + stubs added by add_overlay_whiteouts(), fed + // directly to our Rust V1 writer + // + // This mirrors the production flow: C receives a dumpfile without + // the stubs and adds them itself, while Rust adds them in-process. + // Using the same spec for both ensures the user-level content matches. + let fs_c = build_filesystem::(spec.clone()); + let mut fs_rs = build_filesystem::(spec); + fs_rs.add_overlay_whiteouts(); + + // Serialize the pre-whiteout tree for C (no stubs in dumpfile) + let mut dumpfile_bytes = Vec::new(); + write_dumpfile(&mut dumpfile_bytes, &fs_c).unwrap(); + let dumpfile = String::from_utf8(dumpfile_bytes).unwrap(); + + // Get C mkcomposefs binary output (C adds stubs internally) + let c_image = c_mkcomposefs_from_dumpfile(&dumpfile); + + // Get our Rust V1 writer binary output (stubs already in fs_rs) + let rust_image = mkfs_erofs_versioned( + &ValidatedFileSystem::new(fs_rs).unwrap(), + FormatVersion::V1, + ); + + if c_image != rust_image.as_ref() { + let c_debug = debug_dump(&c_image); + let rust_debug = debug_dump(&rust_image); + similar_asserts::assert_eq!( + c_debug, + rust_debug, + "binary mismatch (c={} bytes, rust={} bytes)\ndumpfile:\n{dumpfile}", + c_image.len(), + rust_image.len(), + ); + } + Ok(()) + }) + .unwrap(); + } + + /// Binary-compatibility test using the unusual-content generator. + /// + /// Covers corner cases in the V1 writer that the ordinary random generator almost + /// never exercises: whiteout escaping, multiple trusted.overlay.* xattrs per inode, + /// system.posix_acl_access (HAS_ACL flag), large external file sizes, and + /// cross-type hardlinks (to symlinks, whiteouts, devices, FIFOs). + /// + /// Runs 64 cases against C mkcomposefs byte-for-byte. + #[test_with::executable(mkcomposefs)] + #[test] + fn test_v1_binary_identical_unusual_content() { + let mut runner = + proptest::test_runner::TestRunner::new(ProptestConfig::with_cases(200)); + runner + .run(&unusual_filesystem_spec(), |spec| { + let fs_c = build_unusual_filesystem::(spec.clone()); + let mut fs_rs = build_unusual_filesystem::(spec); + fs_rs.add_overlay_whiteouts(); + + let mut dumpfile_bytes = Vec::new(); + write_dumpfile(&mut dumpfile_bytes, &fs_c).unwrap(); + let dumpfile = String::from_utf8(dumpfile_bytes).unwrap(); + + let c_image = c_mkcomposefs_from_dumpfile(&dumpfile); + let rust_image = mkfs_erofs_versioned( + &ValidatedFileSystem::new(fs_rs).unwrap(), + FormatVersion::V1, + ); + + if c_image != rust_image.as_ref() { + let c_debug = debug_dump(&c_image); + let rust_debug = debug_dump(&rust_image); + similar_asserts::assert_eq!( + c_debug, + rust_debug, + "binary mismatch (c={} bytes, rust={} bytes)\ndumpfile:\n{dumpfile}", + c_image.len(), + rust_image.len(), + ); + } + Ok(()) + }) + .unwrap(); + } + + /// Diagnostic: dump the structural diff between C mkcomposefs and our Rust V1 + /// writer for a known-failing minimal case (large flat directory, no xattrs). + /// + /// This test is `#[ignore]` — run it manually with: + /// cargo test -p composefs --lib -- erofs::reader::tests::proptest_tests::test_v1_binary_diff_diagnostic --ignored --nocapture + /// + /// It uses `debug_img` (our injective EROFS structure dumper) to show exactly + /// which fields diverge between the two images, making it easy to pinpoint + /// the bug in the writer without manually parsing hex dumps. + #[test_with::executable(mkcomposefs)] + #[test] + #[ignore] + fn test_v1_binary_diff_diagnostic() { + // Known-failing proptest case: use the exact dumpfile from a proptest failure. + // The flow matches the proptest exactly: + // - fs_c is built from spec and serialized to dumpfile (no stubs) for C + // - fs_rs has add_overlay_whiteouts() called on it, fed to Rust writer + let dumpfile = "\ +/ 0 40000 3 0 0 0 0.0 - - -\n\ +/B 0 47123 2 32924 6322 0 334277904.419157028 - - - user.test_3=\\x14\\x11\\xf5\\xbe\\xf0\\x1f\\x15<\\\\\\x84Gu(\\x17T\\xdb\\xca\\xd5\n\ +/B/\\x06\\xc3} 43 102747 1 14780 50024 0 1909128638.32940851 - X\\xb8\\xac\\xf9[\\x8br\\x1a\\x11\\xed\\x96]\\x9c\\xed\\xba\\x8f\\x13\\xcc/i\\x12\\x7fE\\x18\\xf8n\\xaeV_E\\x8bS]x\\x93/g\\x92\\x0f?\\xd8\\xf4\\xf5 - security.capability=r\\x93\\x84\\x18M user.test_3=&+\\xf2\\xee\\x89sz user.test_4=\n\ +/B/\\x1f\\xe3\\x17\\xcb\\xe9\\x81\\x9aT\\xd2\\x13\\x19\\xf2\\xaf\\xee\\x20\\xba\\xb3 43 102274 1 41061 21812 0 446804811.557100600 - <\\x10@Z\\x00\\xc5\\xf9\\xca\\xe1=\\xfc\\xe0\\x81)p\\xa4\\x9f\\xa8\\x18+\\x88\\x0e\\xc3\\xa2\\xdf0\\x82*\\xc2q[x\\x86\\x88\\x80\\xf1]b$\\\\\\x1f]\\xeb - system.posix_acl_access= trusted.test_0=\\x92 trusted.test_2=\\\\\\xec\\x83\\x89\\x85\"\\xf9\\x9b\\xbc\\xa5\\xb0\\xef\\xbcC\\xe8Z\\x88F\\x83\\x17 user.test_1=\\xc4\\xc1\\x08\\xff\\xfa\\xd3\\xed\\xad\\x9bS6f\\tS\\x8d\n\ +/B/#\\xcd\\x17\\xb2\\xf0\\x03g\\xea\\x87iI\\xe3{_\\xe1 7 100554 1 50668 49879 0 1545457558.133147722 - \\xb6\\xa1$?\\xd2:\\xb9 - system.posix_acl_default=\\x97\\xde\\xd1S;,; user.test_4=\\xf7\\x82S\\xa5\\xc3,?\\x98\\x84p\\xbf\\x14&\\x91+\\x8e\\xdb\n\ +/B/3\\xf4\\xf5\\xc2e\\x07\\xb5\\xacC\\xa1 45 106705 1 56683 56444 0 1577642975.579080132 - \\xdf[\\x83j\\x1e\\x99\\xd8\\xc0[\\x8ba\\xc0f\\xec\\xe0\\x8b*\\xee\\x031\\x91\\x0f38\\x0f\\x08\\xc0\\xcd\\xa9\\x1a^\\x90]\\xc9!>\\xa9S*\\x94\\x8c\\x17\\xa8h\\xc3 - security.ima=E\\x04L\\tb@9\\x07!h) trusted.overlay.custom=~\\x16\\x1f-\\xfc\\xa3\\x07\\x17\\xd1\\xa0 trusted.test_2=O\n\ +/B/Eap_z828H.-6-_S 0 14476 1 4557 40071 0 206142614.191638235 - - - security.ima=H\\xfd\\x9e&\\x9a:\\xe5\\x93\\xa4 system.posix_acl_access=N\\x1c|\\xc7$O3\\x198%\\xb4\\xe8 trusted.overlay.origin=Y+\\xa4\\xd1\\x16r\\xdd|\\xfaG user.test_4=\n\ +/B/Gv7O_..._.faB2-_-22dNscP_eGqkxP35_.0l.w.hfrZXl_v4h.MGEE7___GGF221-V-__WgP-h-6Th_NIB_._j.-U.Qj_2_iA.P_3_-_..9.1oxn4_mM_6XEAJ196_.6Z9iR_YM-Wr0L_.kz.icFqb_EzB27-___AC7bGW_.t_rwee8rtQ4_0rD_t1-J__5iR.r1_8cNUQXai5w4.e2_G-.7j.DyiD__Rfv6Lhgfzn-QFr_-J 44 124140 1 29304 30605 0 620161379.796821778 ____SlN/.yp1zAst_-P/5_RO_-cy7O_Z__310L__d2yo - -\n\ +/B/IP-_jBs 1 126270 1 31623 24545 0 1072774021.893731176 \\xcb - -\n\ +/B/KAS.d8m.y6U 16 125603 1 24529 17343 0 340236667.19836524 9\\x14\\xe2{\\xe9[\\x96q\\x08h;\\xc8\\x83\\xa4\\xb3\\xb9 - - trusted.overlay.origin=b\\xec'\\x8c\\x16\\xea\\xcb\\x10\\xc8\\xbe\\x18\\xf7*\\x0c\\x04\\xb8\\xb1 trusted.overlay.overlay.nested= trusted.test_1=e\\x08#\n\ +/B/Mp 27 106753 1 37244 13252 0 91373000.857571176 - OV\\x8e!\\xfdw9I\\xab\\x8f\\x9a;!\\xb4]f\\n]\\xc8\\x7f\\xa5\\x94\\x07\\xd4%\\x97\\x85 -\n\ +/B/Ze.7.-.9_._Ocl1k2_ 46 107670 1 14097 58513 0 488459452.877162371 - \\xc1\\x17\\x1d\\xa7\\x14S)\\xcd}\\xc9/~\\xa4d\\x1cN\\xbeN\\x184\\x90\\xa9A\\x12\\x8bY/(\\x1a,%\"\\xe3\\xb3\\xf2\\x86\\xec\\x20\\xf6\"Ug;\\x84\\\\A - trusted.overlay.origin=\\xfe\\xda7D\\xbf\\xb0\\xe9\\x9ct0Q user.test_4=-\\xdc\n\ +/B/]\\x05\\x19i\\x97\\xeb\\x8c\\xc4k\\x02\\\\jB`j\\x8f\\xb4\\xb6\\xfbw5\\xef\\xf3\\x0fd 0 23230 1 31997 45657 7135 105859383.867998730 - - - system.posix_acl_default=\\xb1p\\x96\\xe45\\xdcC\\x8bI\\x0e\\xfd#\\x8d\n\ +/B/_tvW.__t_l_-jK.4j 554649 106606 1 29300 51208 0 705049404.750293896 e5/39a0e32972ef85332212be14f7b863409d9e4113f80603285d1cd52a852822 - e539a0e32972ef85332212be14f7b863409d9e4113f80603285d1cd52a852822 user.test_4=\\xbf\\xbbL\\xe9\\xbc\\x92$\\xa3\\xf9\\xc6\\x06.\\x3d^\n\ +/B/q._v.T_.Mba__ 32 122305 1 29088 34366 0 881062039.274688283 _C_Kn1_.r_.IK/TGai6_zqLoTt___w_e - - trusted.overlay.overlay.nested=6\\x03\\xee\\xff\\xdbI\\xdcu(\\\\\\xe1\\x9a\\xee\\xd3e\\x06 user.test_2=\\x9a\\xc4$\\xe1\n\ +/B/u 25 105023 2 14652 44878 0 294073763.291036424 - \\x84R\\xd6@\\x0e\\x8b\\x04\\xb4(e\\x93\\xe9\\x86\\xdc\\x03\\xc7\\xbf\\xe1,OmC\\xe9U\\xf1 - trusted.overlay.origin=\\xc4mH\\x9a\n\ +/B/\\x81X\\xef\\r\\xce\\x12\\xf4U(p\\xc3\\xb2\\x19\\xe3r\\xd2v9\\x1c\\x02\\xca 46 121141 1 3272 11859 0 1219611767.718731195 jfsk35_Gz__n4tv4xzFFcj_.Z_AV__IJS_k_1I__FuSb.2 - - security.selinux= trusted.overlay.upper=\\x07\\xe8\\xa1%\\xbe\\xb0\\xc8)\\xcf\\xc2\\xf8\\xbah\\x19\\xae_\\xccH\\x9f\\xf0 trusted.test_1=i\\xe6\\xd9\\xd0 user.test_2=\\xc8\\xa0K\\xb2\\xa0V\\xb0\\xb7\\xd1\\xec(\\x95\\xfe\\xbb`\n\ +/B/\\xc4\\xf8\\x92\\xc2}<4\\xc8\\xec\\xd2\\xa5\\xe6\\x9ee\\xf0\\x95\\xf8(dumpfile).unwrap(); + fs_rs.add_overlay_whiteouts(); + + let c_image = c_mkcomposefs_from_dumpfile(dumpfile); + let rust_image = + mkfs_erofs_versioned(&ValidatedFileSystem::new(fs_rs).unwrap(), FormatVersion::V1); + + let c_debug = debug_dump(&c_image); + let rust_debug = debug_dump(&rust_image); + + println!("=== C mkcomposefs ({} bytes) ===", c_image.len()); + println!("{c_debug}"); + println!("=== Rust V1 writer ({} bytes) ===", rust_image.len()); + println!("{rust_debug}"); + println!("=== Structural diff (c vs rust) ==="); + println!("{}", diff_debug_dumps("c", &c_debug, "rust", &rust_debug)); + + assert_eq!( + c_image, + rust_image.as_ref(), + "images differ — see structural diff above" + ); } } @@ -2561,7 +3353,7 @@ mod tests { /bbb 5 100644 1 0 0 0 1000.0 - world - "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); // Sanity: the unmodified image round-trips fine erofs_to_filesystem::(&image).unwrap(); @@ -2582,4 +3374,515 @@ mod tests { "unexpected error: {msg}" ); } + + /// Regression test for the block-boundary EUCLEAN bug (bug.md). + /// + /// Old kernels (< 6.12) return EFSCORRUPTED from erofs_fill_symlink() when: + /// (inode_offset % block_size) + inode_and_xattr_size + symlink_len > block_size + /// + /// The V1 writer previously used the wrong condition (derived from the + /// non-symlink branch of the C reference) and padded the wrong target + /// (inline_start rather than inode_start), silently producing images that + /// would EUCLEAN on CentOS Stream 9 (kernel 5.14) for symlinks with large + /// SELinux xattrs such as those in /etc/pki/ca-trust/extracted/pem/directory-hash/. + /// + /// This test: + /// 1. Builds a V1 image that forces a symlink inode near a block boundary + /// by packing enough filler inodes before it. + /// 2. Asserts the validator passes (writer fixed the layout). + /// 3. Asserts the symlink round-trips correctly. + /// + /// The construction: inode table starts at offset 1152. We add enough + /// compact filler inodes (FIFOs, 32 bytes each with min mtime) to push + /// the subsequent symlink to a position where the old code would have + /// placed it straddling the 4096-byte boundary. + #[test] + fn test_v1_symlink_block_boundary_euclean_regression() { + use crate::erofs::{format::FormatVersion, writer::mkfs_erofs_versioned}; + + // A realistic SELinux label of the kind found on ca-trust symlinks. + // 76 bytes — enough that header(64) + xattr(~140) + symlink(23) > 4096 + // when the inode starts near offset 3968 within a block. + let selinux_label = "system_u:object_r:cert_t:s0\x00".repeat(2); + // Trim to exactly 56 bytes so xattr body is predictable + let selinux_label = &selinux_label[..selinux_label.len().min(56)]; + + // Build the dumpfile: root + many compact filler FIFOs + the victim symlink. + // + // Filler FIFOs: mtime=0, no xattrs → compact inode (32 bytes each in V1). + // The inode table starts at 1152. We need to fill up to offset ~3968 within + // some 4096-block, which is (3968 - 1152) % 4096 = 2816 bytes = 88 compact inodes + // in the first block. Add a few more to cross into block 1 and land the + // victim at the right position in block 1. + // + // We overshoot slightly and rely on the writer's fix to pad correctly. + // The validator then confirms no inode violates the kernel condition. + let mut dumpfile = String::from("/ 0 40755 2 0 0 0 0.0 - - -\n"); + for i in 0..120usize { + dumpfile.push_str(&format!("/filler{i:03} 0 10644 1 0 0 0 0.0 - - -\n")); + } + // Victim: symlink with a large SELinux xattr. + let target = "/etc/pki/ca-trust/source"; // 24-byte target + let target_len = target.len(); + let xattr_val_hex: String = selinux_label + .bytes() + .map(|b| format!("\\x{b:02x}")) + .collect(); + dumpfile.push_str(&format!( + "/victim {target_len} 120777 1 0 0 0 0.0 {target} - - security.selinux={xattr_val_hex}\n" + )); + + let fs = dumpfile_to_filesystem::(&dumpfile).unwrap(); + let image = mkfs_erofs_versioned(&ValidatedFileSystem::new(fs).unwrap(), FormatVersion::V1); + + // The validator must pass: the writer should have padded the inode + // to a block boundary so the kernel condition is never violated. + Image::open(&image) + .unwrap() + .fsck_metadata() + .expect("V1 writer should produce valid inline layout (block-boundary fix)"); + + // The symlink target must round-trip correctly. + let fs_rt = + erofs_to_filesystem::(&image).expect("image should parse cleanly"); + let victim_id = fs_rt + .root + .leaf_id(std::ffi::OsStr::new("victim")) + .expect("victim symlink not found in round-tripped filesystem"); + let link_target = match &fs_rt.leaves[victim_id.0].content { + crate::tree::LeafContent::Symlink(t) => t.clone(), + other => panic!("victim should be a symlink, got {other:?}"), + }; + assert_eq!( + link_target.as_ref(), + std::ffi::OsStr::new(target), + "symlink target mismatch after V1 round-trip" + ); + } + + /// Tests that `fsck_metadata` catches a V1 image where symlink + /// padding was suppressed, causing the inode+inline data to cross a block + /// boundary. Uses `WriterFaults` to inject the fault rather than raw byte + /// surgery, so the image is otherwise structurally coherent. + #[test] + fn test_v1_inline_layout_validator_catches_bad_layout() { + use crate::erofs::{ + format::FormatVersion, + writer::{WriterFaults, mkfs_erofs_versioned, mkfs_erofs_with_faults}, + }; + + // Layout math (all sizes in bytes, block_size = 4096): + // + // A symlink crosses a block boundary when: + // symlink_pos % 4096 + 32 (inode) + target_len > 4096 + // => symlink_pos % 4096 > 4096 - 32 - target_len + // + // With target_len = SYMLINK_MAX = 1024 (crate::SYMLINK_MAX): + // symlink_pos % 4096 > 3040 (i.e. slot >= 96 within a block) + // + // Inode table layout (V1): + // Bytes 0..1152 : composefs header (32 B) + pad to 1024 + EROFS superblock (128 B) + // = 36 slots (NID 0-35) + // NID 36 : root inode (32 B inode header) + // NID 36 inline : root dir entries (inline, variable) + // + // With 50 filler files named "f00".."f49" (sort before "link"): + // - 51 dirents: 51 * 12 = 612 B + // - names: 50*3 + 4 = 154 B + // - total inline: 766 B + // - root occupies: 32 + ~766 = 798 B (slot-padded) + // - 50 empty files: 50 * 32 = 1600 B + // - symlink (without block-boundary padding): NID 113, pos_in_block=3616 + // 3616 + 32 + 1024 = 4672 > 4096 → crossing condition ✓ + // + // Note: the *good* image places the symlink at pos_in_block == 0 because + // the writer correctly pads it to a block boundary. We verify crossing + // by checking the *bad* image (padding suppressed) instead. + + // filler_count=50 places the symlink at NID 113 (pos_in_block=3616). + // Without the block-boundary padding: 3616 + 32 + 1024 = 4672 > 4096 ✓ + // The assertion below verifies this whenever the test runs. + let filler_count = 50usize; + let mut lines = String::from("/ 0 40755 2 0 0 0 0.0 - - -\n"); + for i in 0..filler_count { + lines.push_str(&format!("/f{i:02} 0 100644 1 0 0 0 0.0 - - -\n")); + } + let target = "a".repeat(crate::SYMLINK_MAX); + lines.push_str(&format!( + "/link {len} 120777 1 0 0 0 0.0 {target} - -\n", + len = target.len(), + target = target, + )); + let fs = dumpfile_to_filesystem::(&lines).unwrap(); + let vfs = ValidatedFileSystem::new(fs).unwrap(); + + // The good image must pass validation. + let good_image = mkfs_erofs_versioned(&vfs, FormatVersion::V1); + Image::open(&good_image) + .unwrap() + .fsck_metadata() + .expect("valid image should pass"); + + // Build the faulted image (symlink pad suppressed). + let mut faults = WriterFaults::new(42); + faults.skip_symlink_pad_rate = 1.0; // always skip padding + let bad_image = mkfs_erofs_with_faults(&vfs, FormatVersion::V1, faults); + + // Confirm the symlink in the bad image actually crosses a block boundary — + // i.e. the fault injection put the symlink at a dangerous slot. + { + let img = Image::open(&bad_image).unwrap(); + let root_nid = img.sb.root_nid.get() as u64; + let link_nid = img + .find_child_nid(root_nid, b"link") + .unwrap() + .expect("link nid not found"); + let link_offset = (link_nid * 32) as usize; + let pos_in_block = link_offset % 4096; + assert!( + pos_in_block + 32 + crate::SYMLINK_MAX > 4096, + "symlink at pos_in_block={pos_in_block} does not cross a block boundary \ + in the bad image (32+{symlink_max}={total} ≤ 4096); \ + increase filler_count (currently {filler_count})", + symlink_max = crate::SYMLINK_MAX, + total = 32 + crate::SYMLINK_MAX, + ); + } + + // The faulted image must fail validation. + let result = Image::open(&bad_image).unwrap().fsck_metadata(); + assert!( + result.is_err(), + "validator should reject image with suppressed symlink padding" + ); + let msg = result.unwrap_err().to_string(); + assert!( + msg.contains("EUCLEAN") || msg.contains("nid"), + "error should mention EUCLEAN or nid, got: {msg}" + ); + } + + /// B2: Files with a negative `st_mtim_sec` (pre-epoch mtime) must not corrupt + /// the V1 superblock `build_time` field. + /// + /// `calculate_min_mtime` casts `st_mtim_sec as u64`. A value of -1 wraps to + /// `u64::MAX`, which is larger than any positive timestamp, so positive mtimes + /// are correctly selected as the minimum. This test verifies that a filesystem + /// containing one inode with mtime = -1 and one with mtime = 1000 produces a + /// V1 image whose superblock `build_time` equals 1000. + #[test] + fn test_negative_mtime_does_not_corrupt_build_time() { + use std::{collections::BTreeMap, ffi::OsStr}; + + use crate::{ + erofs::{format::FormatVersion, writer::mkfs_erofs_versioned}, + fsverity::Sha256HashValue, + generic_tree::{LeafContent, Stat}, + tree::{self, RegularFile}, + }; + + let root_stat = Stat { + st_mode: 0o40755, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 1000, + st_mtim_nsec: 0, + xattrs: BTreeMap::new(), + }; + + let mut fs = tree::FileSystem::::new(root_stat); + + // Inode with negative mtime (-1). As u64 this wraps to u64::MAX, which + // is larger than 1000, so it should NOT win the minimum comparison. + let neg_stat = Stat { + st_mode: 0o100644, + st_uid: 0, + st_gid: 0, + st_mtim_sec: -1, + st_mtim_nsec: 0, + xattrs: BTreeMap::new(), + }; + let leaf_id = fs.push_leaf( + neg_stat, + LeafContent::Regular(RegularFile::Inline(Box::new([]))), + ); + fs.root + .insert(OsStr::new("neg"), tree::Inode::leaf(leaf_id)); + + // add_overlay_whiteouts is required for V1 compatibility + fs.add_overlay_whiteouts(); + + let image = mkfs_erofs_versioned(&ValidatedFileSystem::new(fs).unwrap(), FormatVersion::V1); + let img = Image::open(&image).expect("failed to open V1 image"); + + // The superblock build_time must be 1000 (the root mtime), not u64::MAX or 0. + assert_eq!( + img.sb.build_time.get(), + 1000, + "build_time should be the positive minimum mtime (1000), \ + not the wrapped negative value" + ); + } + + /// B3: Directories with enough entries to span multiple 4096-byte blocks must + /// survive a round-trip through the V2 EROFS writer. + /// + /// Each dirent is 12 bytes (header) + name length bytes. With 50 entries of + /// 90-byte names: 50 × (12 + 90) = 5100 bytes > 4096, which forces + /// `Directory::from_entries` to split across at least two blocks. + /// + /// This test verifies that all entry names survive the round-trip intact. + #[test] + fn test_multiblock_directory_round_trip() { + use std::{collections::BTreeMap, ffi::OsStr}; + + use crate::{ + erofs::writer::mkfs_erofs, + fsverity::Sha256HashValue, + generic_tree::{LeafContent, Stat}, + tree::{self, RegularFile}, + }; + + let root_stat = Stat { + st_mode: 0o40755, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 1000, + st_mtim_nsec: 0, + xattrs: BTreeMap::new(), + }; + + let leaf_stat = Stat { + st_mode: 0o100644, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 1000, + st_mtim_nsec: 0, + xattrs: BTreeMap::new(), + }; + + let mut fs = tree::FileSystem::::new(root_stat.clone()); + + const N: usize = 50; + let mut expected_names: Vec = vec![".".into(), "..".into()]; + + // Build a subdirectory with N entries, each with a 90-byte name. + // N × (12 + 90) = 5100 bytes — forces a multi-block directory. + let mut subdir = tree::Directory::::new(root_stat); + for i in 0..N { + let name = format!("{:0>90}", i); + let leaf_id = fs.push_leaf( + leaf_stat.clone(), + LeafContent::Regular(RegularFile::Inline(Box::new([]))), + ); + subdir.insert(OsStr::new(&name), tree::Inode::leaf(leaf_id)); + expected_names.push(name); + } + + fs.root.insert( + OsStr::new("bigdir"), + tree::Inode::Directory(Box::new(subdir)), + ); + + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); + let img = Image::open(&image).expect("failed to open image"); + + // Locate "bigdir" in root + let root_nid = img.sb.root_nid.get() as u64; + let bigdir_nid = img + .find_child_nid(root_nid, b"bigdir") + .expect("find_child_nid error") + .expect("bigdir not found in root"); + + // Collect all entry names from bigdir (blocks + inline) + let bigdir_inode = img.inode(bigdir_nid).unwrap(); + let mut found_names: Vec = Vec::new(); + if let Some(inline) = bigdir_inode.inline() { + let inline_block = DirectoryBlock::ref_from_bytes(inline).unwrap(); + for entry in inline_block.entries().unwrap() { + let entry = entry.unwrap(); + found_names.push(String::from_utf8(entry.name.to_vec()).unwrap()); + } + } + for blkid in img.inode_blocks(&bigdir_inode).unwrap() { + let block = img.directory_block(blkid).unwrap(); + for entry in block.entries().unwrap() { + let entry = entry.unwrap(); + found_names.push(String::from_utf8(entry.name.to_vec()).unwrap()); + } + } + + found_names.sort(); + expected_names.sort(); + + assert_eq!( + found_names, expected_names, + "multi-block directory lost entries after round-trip" + ); + + // Verify the image is a valid EROFS filesystem that can be round-tripped + let _fs_rt = erofs_to_filesystem::(&image) + .expect("erofs_to_filesystem failed on multi-block directory image"); + + // Sanity: verify the image passes fsck.erofs if available + if let Some(ok) = run_fsck_erofs(&image) { + assert!( + ok, + "fsck.erofs reported errors in multi-block directory image" + ); + } + } + + /// `ValidatedFileSystem::new` must reject a hardlinked whiteout. + /// A whiteout (chardev rdev=0) with nlink > 1 is semantically invalid. + #[test] + fn test_hardlinked_whiteout_writer_rejects() { + use std::ffi::OsStr; + + use crate::{ + erofs::writer::ValidatedFileSystem, + fsverity::Sha256HashValue, + generic_tree::{LeafContent, Stat}, + tree, + }; + + let root_stat = Stat { + st_mode: 0o40755, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 1000, + st_mtim_nsec: 0, + xattrs: Default::default(), + }; + let whiteout_stat = Stat { + st_mode: 0o20000, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 1000, + st_mtim_nsec: 0, + xattrs: Default::default(), + }; + + let mut fs = tree::FileSystem::::new(root_stat); + let leaf_id = fs.push_leaf(whiteout_stat, LeafContent::CharacterDevice(0)); + fs.root + .insert(OsStr::new("whiteout"), tree::Inode::leaf(leaf_id)); + fs.root.insert( + OsStr::new("hardlink_to_whiteout"), + tree::Inode::leaf(leaf_id), + ); + + let result = ValidatedFileSystem::new(fs); + assert!( + result.is_err(), + "ValidatedFileSystem::new should reject hardlinked whiteout" + ); + let err = result.unwrap_err(); + assert!( + err.to_string().contains("whiteout inode has nlink > 1"), + "unexpected error message: {err}" + ); + } + + /// The reader must reject an image with a hardlinked whiteout. + /// + /// We build a valid image with a hardlinked chardev(rdev=1), which the writer + /// accepts. We then patch the inode's `u` field (rdev) from 1 to 0 in the raw + /// image bytes, turning it into a whiteout on-disk while leaving nlink > 1. + /// The reader must detect this and return an error. + #[test] + fn test_hardlinked_whiteout_reader_rejects() { + use std::ffi::OsStr; + + use crate::{ + fsverity::Sha256HashValue, + generic_tree::{LeafContent, Stat}, + tree, + }; + + let root_stat = Stat { + st_mode: 0o40755, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 1000, + st_mtim_nsec: 0, + xattrs: Default::default(), + }; + let chardev_stat = Stat { + st_mode: 0o20000, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 1000, + st_mtim_nsec: 0, + xattrs: Default::default(), + }; + + let mut fs = tree::FileSystem::::new(root_stat); + // Use rdev=1 (not a whiteout) so the writer accepts the hardlink. + let leaf_id = fs.push_leaf(chardev_stat, LeafContent::CharacterDevice(1)); + fs.root + .insert(OsStr::new("chardev"), tree::Inode::leaf(leaf_id)); + fs.root.insert( + OsStr::new("hardlink_to_chardev"), + tree::Inode::leaf(leaf_id), + ); + + use crate::erofs::writer::mkfs_erofs; + let base_image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); + + // Sanity: the unpatched image must be accepted. + erofs_to_filesystem::(&base_image) + .expect("unmodified image with rdev=1 hardlink should be accepted"); + + // Locate the chardev inode in the image using the erofs Image API. + let img = Image::open(&base_image).unwrap(); + let root_nid = img.sb.root_nid.get() as u64; + let chardev_nid = img + .find_child_nid(root_nid, b"chardev") + .unwrap() + .expect("chardev entry must exist"); + + // Parse the inode via the Image API to learn its layout (compact vs + // extended) and locate its slot in the image. We record what we need + // before releasing the shared borrow so we can take `&mut` afterwards. + let inode = img.inode(chardev_nid).unwrap(); + let is_extended = matches!(inode, InodeType::Extended(_)); + // The inode region is the `inodes` sub-slice of `image`; the slot for + // NID n starts at n*32 bytes into that region. + let inodes_start = img.image.len() - img.inodes.len(); + let inode_slot_start = inodes_start + chardev_nid as usize * 32; + drop(inode); + drop(img); + + // Mutate a copy of the image: set the `u` field (rdev) from 1 → 0, + // turning the chardev into a whiteout on-disk while leaving nlink > 1. + // Use zerocopy to reinterpret the slot bytes as the concrete header type + // so we get a typed `&mut` rather than raw byte arithmetic. + let mut image = base_image.to_vec(); + let slot = &mut image[inode_slot_start..]; + if is_extended { + use core::mem::size_of; + let hdr = + ExtendedInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid ExtendedInodeHeader"); + assert_eq!(hdr.u.get(), 1, "expected rdev=1 before patching"); + hdr.u = zerocopy::little_endian::U32::new(0); + } else { + use core::mem::size_of; + let hdr = + CompactInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid CompactInodeHeader"); + assert_eq!(hdr.u.get(), 1, "expected rdev=1 before patching"); + hdr.u = zerocopy::little_endian::U32::new(0); + } + + // The reader must reject the patched image. + let result = erofs_to_filesystem::(&image); + let err = result.expect_err("reader should reject image with hardlinked whiteout"); + let err_msg = format!("{err:#}"); + assert!( + err_msg.contains("nlink"), + "error message should mention nlink, got: {err_msg}" + ); + } } diff --git a/crates/composefs/src/erofs/writer.rs b/crates/composefs/src/erofs/writer.rs index 7fe59e61..e9780106 100644 --- a/crates/composefs/src/erofs/writer.rs +++ b/crates/composefs/src/erofs/writer.rs @@ -5,8 +5,9 @@ //! and metadata serialization. use std::{ - collections::{BTreeMap, HashMap}, + collections::{BTreeMap, HashMap, HashSet}, mem::size_of, + num::NonZeroUsize, os::unix::ffi::OsStrExt, }; @@ -21,42 +22,281 @@ use crate::{ tree, }; -#[derive(Clone, Copy, Debug)] -enum Offset { - Header, - Superblock, - Inode, - XAttr, - Block, - End, +/// A composefs filesystem tree validated for EROFS serialization. +/// +/// Can only be constructed via [`ValidatedFileSystem::new`], which checks +/// that the tree satisfies all EROFS invariants — for example, that no +/// whiteout inode (character device with rdev=0) has `nlink > 1`. +/// +/// Passing a `ValidatedFileSystem` to [`mkfs_erofs`] or +/// [`mkfs_erofs_versioned`] therefore cannot panic. +pub struct ValidatedFileSystem(pub(crate) tree::FileSystem); + +impl std::fmt::Debug + for ValidatedFileSystem +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("ValidatedFileSystem").field(&self.0).finish() + } +} + +impl ValidatedFileSystem { + /// Validate `fs` and wrap it. Returns an error if any invariant is violated. + pub fn new(fs: tree::FileSystem) -> anyhow::Result { + validate_filesystem(&fs)?; + Ok(Self(fs)) + } +} + +impl std::ops::Deref for ValidatedFileSystem { + type Target = tree::FileSystem; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +pub(crate) fn validate_filesystem( + fs: &tree::FileSystem, +) -> anyhow::Result<()> { + // Check structural invariants: leaf ref bounds, no orphaned leaves. + fs.fsck() + .map_err(|e| anyhow::anyhow!("invalid composefs filesystem: {e}"))?; + + // Check EROFS-specific constraint: whiteout inodes (chardev rdev=0) must not be hardlinked. + let nlinks = fs.nlinks(); + for (idx, leaf) in fs.leaves.iter().enumerate() { + if matches!(leaf.content, tree::LeafContent::CharacterDevice(0)) { + let nlink = nlinks[idx]; + if nlink > 1 { + anyhow::bail!("invalid composefs filesystem: whiteout inode has nlink > 1"); + } + } + } + Ok(()) +} + +/// Size of one EROFS inode slot in bytes. All inode offsets must be a multiple of this. +const INODE_SLOT_SIZE: usize = 32; + +/// EROFS xattr values are addressed in 4-byte words; all xattr offsets and counts use this unit. +const XATTR_WORD_SIZE: usize = size_of::(); + +/// Size of the InodeXAttrHeader in bytes, used in xattr_icount calculation. +const INODE_XATTR_HEADER_SIZE: usize = size_of::(); + +/// Returns the byte offset of `pos` within its EROFS block (i.e. `pos % BLOCK_SIZE`). +/// +/// `BLOCK_SIZE` (4096) is a nonzero constant, so this operation never panics. +fn block_offset(pos: u64) -> u64 { + pos % u64::from(format::BLOCK_SIZE) +} + +/// Returns the number of bytes from `pos` to the next EROFS block boundary, +/// or `None` if `pos` is already block-aligned (no padding needed). +/// +/// When `Some`, the result is always in `1..BLOCK_SIZE`. +fn bytes_to_block_boundary(pos: u64) -> Option { + let offset = block_offset(pos); + if offset == 0 { + return None; + } + let block_size = u64::from(format::BLOCK_SIZE); + let padding = block_size + .checked_sub(offset) + .expect("block_offset(pos) < BLOCK_SIZE by construction"); + debug_assert!(padding >= 1 && padding < block_size); + Some(padding) +} + +/// Deterministic fault injector for writer tests. +/// +/// Each field is a probability in [0.0, 1.0]: +/// 0.0 = never inject this fault +/// 1.0 = always inject this fault +/// +/// Construct with `WriterFaults::new(seed)` then set the rates you need. +/// Because `write_erofs` runs twice (layout pass then emit pass), decisions +/// are recorded during the first pass and replayed during the second so that +/// both passes make identical choices and produce a structurally coherent image. +#[cfg(test)] +pub(crate) struct WriterFaults { + rng: rand::rngs::SmallRng, + /// Skip the symlink block-boundary padding (produces a malformed image). + pub skip_symlink_pad_rate: f64, + /// Decisions recorded during the first pass; replayed during the second. + decisions: Vec, + /// Index into `decisions` during replay. + replay_idx: usize, + /// True after `start_replay()` is called. + replaying: bool, +} + +#[cfg(test)] +impl WriterFaults { + pub fn new(seed: u64) -> Self { + use rand::SeedableRng; + Self { + rng: rand::rngs::SmallRng::seed_from_u64(seed), + skip_symlink_pad_rate: 0.0, + decisions: Vec::new(), + replay_idx: 0, + replaying: false, + } + } + + /// Call between first and second pass to switch to replay mode. + pub(crate) fn start_replay(&mut self) { + self.replaying = true; + self.replay_idx = 0; + } + + fn should_skip_symlink_pad(&mut self) -> bool { + if self.replaying { + let decision = self.decisions[self.replay_idx]; + self.replay_idx += 1; + decision + } else { + use rand::RngExt; + let decision = self.rng.random::() < self.skip_symlink_pad_rate; + self.decisions.push(decision); + decision + } + } +} + +/// Bundles the parameters that are constant across a single `write_erofs` call. +struct WriteContext { + version: format::FormatVersion, + min_mtime: (u64, u32), + header_flags: u32, + /// The `composefs_version` value written to the ComposefsHeader. + /// + /// For V2: always 2 (COMPOSEFS_VERSION). + /// For V1: 0 normally, but 1 when the tree contains user-land whiteouts (char + /// devices with rdev=0 that were escaped by the V1 writer). This matches C + /// mkcomposefs, which bumps `options->version` from 0 to 1 when it encounters + /// a whiteout in the input tree (before adding the 256 overlay stubs). + composefs_version: u32, + #[cfg(test)] + faults: Option, } trait Output { - fn note_offset(&mut self, offset_type: Offset); - fn get(&self, offset_type: Offset, idx: usize) -> usize; + // --- Recording (first pass only, no-ops in second pass) --- + fn note_header_emitted(&mut self); + fn note_superblock_emitted(&mut self); + fn note_inode(&mut self); + fn note_inodes_end(&mut self); + fn note_xattr(&mut self); + fn note_block(&mut self); + fn note_end(&mut self); + + // --- Retrieval (None in first pass when offsets not yet known, Some in second pass) --- + fn get_inode_offset(&self, idx: usize) -> Option; + fn get_inodes_end(&self) -> Option; + fn get_xattr_offset(&self, idx: usize) -> Option; + fn get_block_offset(&self, idx: usize) -> Option; + fn get_end(&self) -> Option; + + // --- I/O --- fn write(&mut self, data: &[u8]); fn pad(&mut self, alignment: usize); fn len(&self) -> usize; - fn get_div(&self, offset_type: Offset, idx: usize, div: usize) -> usize { - let offset = self.get(offset_type, idx); - assert_eq!(offset % div, 0); - offset / div + /// Write `n` zero bytes. Default implementation avoids heap allocation. + fn write_zeros(&mut self, n: usize) { + const BUF: [u8; 1024] = [0u8; 1024]; + let mut remaining = n; + while remaining > 0 { + let chunk = remaining.min(BUF.len()); + self.write(&BUF[..chunk]); + remaining -= chunk; + } } - fn get_nid(&self, idx: usize) -> u64 { - self.get_div(Offset::Inode, idx, 32) as u64 + // --- Typed write methods: note + write bundled, removing duplication --- + + /// Write the composefs header and pad to 1024 bytes. + fn write_composefs_header(&mut self, hdr: format::ComposefsHeader) { + self.note_header_emitted(); + self.write(hdr.as_bytes()); + self.pad(1024); } - fn get_xattr(&self, idx: usize) -> u32 { - self.get_div(Offset::XAttr, idx, 4).try_into().unwrap() + /// Write the EROFS superblock. + fn write_superblock(&mut self, sb: format::Superblock) { + self.note_superblock_emitted(); + self.write(sb.as_bytes()); } + // --- Derived helpers --- + fn write_struct(&mut self, st: impl IntoBytes + Immutable) { self.write(st.as_bytes()); } + + /// Node ID for inode `idx`, or 0 as a placeholder in the first pass. + fn get_nid(&self, idx: usize) -> u64 { + let Some(offset) = self.get_inode_offset(idx) else { + return 0; + }; + assert_eq!(offset.get() % INODE_SLOT_SIZE, 0); + (offset.get() / INODE_SLOT_SIZE) as u64 + } + + /// Shared xattr reference value (V1 format), or 0 as a placeholder in the first pass. + fn get_xattr_v1(&self, idx: usize) -> u32 { + let (Some(absolute_offset), Some(inodes_end)) = + (self.get_xattr_offset(idx), self.get_inodes_end()) + else { + return 0; + }; + let (absolute_offset, inodes_end) = (absolute_offset.get(), inodes_end.get()); + let offset_within_block = inodes_end % format::BLOCK_SIZE as usize; + let xattr_offset_from_inodes_end = absolute_offset + .checked_sub(inodes_end) + .expect("shared xattr offset must be >= inode table end"); + let raw_ref = (offset_within_block + xattr_offset_from_inodes_end) / XATTR_WORD_SIZE; + raw_ref + .try_into() + .expect("xattr reference index exceeds u32::MAX") + } + + /// Shared xattr reference value (V2 format), or 0 as a placeholder in the first pass. + fn get_xattr_v2(&self, idx: usize) -> u32 { + let Some(offset) = self.get_xattr_offset(idx) else { + return 0; + }; + assert_eq!(offset.get() % XATTR_WORD_SIZE, 0); + (offset.get() / XATTR_WORD_SIZE) + .try_into() + .expect("xattr reference index exceeds u32::MAX") + } + + /// Byte offset of inode `idx`'s block data, or 0 as a placeholder in the first pass. + fn get_block_start(&self, idx: usize) -> usize { + self.get_block_offset(idx).map_or(0, NonZeroUsize::get) + } + + /// Block index of the V1 xattr region, or 0 as a placeholder in the first pass. + fn get_xattr_blkaddr(&self) -> u32 { + self.get_inodes_end() + .map_or(0, |end| (end.get() / format::BLOCK_SIZE as usize) as u32) + } + + /// Total number of blocks in the image, or 0 as a placeholder in the first pass. + fn get_block_count(&self) -> u32 { + self.get_end() + .map_or(0, |end| (end.get() / format::BLOCK_SIZE as usize) as u32) + } } +/// Extended attribute stored in EROFS format. +/// +/// The derived Ord sorts by (prefix, suffix, value) which is used for V2. +/// For V1, use `cmp_by_full_key` which sorts by full key name (prefix string + suffix) +/// to match C mkcomposefs behavior. #[derive(PartialOrd, PartialEq, Eq, Ord, Clone)] struct XAttr { prefix: u8, @@ -64,6 +304,32 @@ struct XAttr { value: Box<[u8]>, } +impl XAttr { + /// Compare by full key name (prefix string + suffix), then by value. + /// This matches C mkcomposefs `cmp_xattr` which uses `strcmp(na->key, nb->key)`. + /// Uses lazy iterator chaining to avoid heap allocation on every comparison. + /// + /// Value tiebreaker uses length-first comparison to match C `xattrs_ht_sort()`, + /// which compares `value_len` before `memcmp`. This differs from Rust's + /// lexicographic `[u8]::cmp` when values have different lengths (e.g. + /// `\x00\x00` vs `\xee`: lexicographic says `\x00\x00 < \xee`, but + /// length-first says `\xee < \x00\x00` because 1 < 2). + fn cmp_by_full_key(&self, other: &Self) -> std::cmp::Ordering { + let self_key = format::XATTR_PREFIXES[self.prefix as usize] + .iter() + .chain(self.suffix.iter()); + let other_key = format::XATTR_PREFIXES[other.prefix as usize] + .iter() + .chain(other.suffix.iter()); + self_key.cmp(other_key).then_with(|| { + self.value + .len() + .cmp(&other.value.len()) + .then_with(|| self.value.cmp(&other.value)) + }) + } +} + #[derive(Clone, Default)] struct InodeXAttrs { shared: Vec, @@ -71,13 +337,37 @@ struct InodeXAttrs { filter: u32, } +/// Index into [`InodeCollector::inodes`]. This is NOT an EROFS nid; the nid is computed +/// from the byte offset of the inode during the second pass via [`Output::get_nid`]. +type InodeIdx = usize; + +/// Reference to an inode in a directory entry. +/// +/// Used in [`DirEnt`] during BFS in [`InodeCollector::collect_tree`]. When a hardlink's +/// canonical occurrence hasn't been BFS-processed yet, the entry is stored as +/// `Deferred(leaf_id)` and resolved to `Known(nid)` in the post-BFS resolution pass. +#[derive(Debug, Clone, Copy)] +enum InodeRef { + Known(InodeIdx), + Deferred(LeafId), +} + #[derive(Debug)] struct DirEnt<'a> { name: &'a [u8], - inode: usize, + inode: InodeRef, file_type: format::FileType, } +/// Metadata returned by `Inode::inode_meta` used to fill inode header fields. +struct InodeMeta { + layout: format::DataLayout, + /// The `i_u` field: meaning depends on layout (rdev, chunk format, or block offset / BLOCK_SIZE). + i_u: u32, + size: u64, + nlink: usize, +} + #[derive(Debug, Default)] struct Directory<'a> { blocks: Box<[Box<[DirEnt<'a>]>]>, @@ -102,6 +392,9 @@ struct Inode<'a, ObjectID: FsVerityHashValue> { stat: &'a tree::Stat, xattrs: InodeXAttrs, content: InodeContent<'a, ObjectID>, + /// V1 only: this inode was originally a char device with rdev=0 (overlay whiteout) + /// and has been escaped to a regular file per C mkcomposefs v1.0.8 behavior. + escaped_whiteout: bool, } impl XAttr { @@ -113,13 +406,26 @@ impl XAttr { }); output.write(&self.suffix); output.write(&self.value); - output.pad(4); + output.pad(XATTR_WORD_SIZE); } } impl InodeXAttrs { - fn add(&mut self, name: &[u8], value: &[u8]) { + /// Returns the serialized byte size of this xattr block. + fn byte_size(&self, version: format::FormatVersion) -> usize { + let mut counter = FirstPass::default(); + self.write(&mut counter, version); + counter.offset + } + + fn add(&mut self, name: &[u8], value: &[u8], version: format::FormatVersion) { for (idx, prefix) in format::XATTR_PREFIXES.iter().enumerate().rev() { + // V1 compatibility: C mkcomposefs v1.0.8 does not include lustre. (index 5) + // in its prefix table, so lustre.* xattrs use index 0 (raw fallback) in C. + // Skip index 5 for V1 images to match that behavior. + if version == format::FormatVersion::V1 && idx == 5 { + continue; + } if let Some(suffix) = name.strip_prefix(*prefix) { self.filter |= 1 << (xxh32(suffix, format::XATTR_FILTER_SEED + idx as u32) % 32); self.local.push(XAttr { @@ -133,7 +439,7 @@ impl InodeXAttrs { unreachable!("{:?}", std::str::from_utf8(name)); // worst case: we matched the empty prefix (0) } - fn write(&self, output: &mut impl Output) { + fn write(&self, output: &mut impl Output, version: format::FormatVersion) { if self.filter != 0 { trace!(" write xattrs block"); output.write_struct(format::InodeXAttrHeader { @@ -143,7 +449,11 @@ impl InodeXAttrs { }); for idx in &self.shared { trace!(" shared {} @{}", idx, output.len()); - output.write(&output.get_xattr(*idx).to_le_bytes()); + let xattr_ref = match version { + format::FormatVersion::V1 => output.get_xattr_v1(*idx), + format::FormatVersion::V2 => output.get_xattr_v2(*idx), + }; + output.write(&xattr_ref.to_le_bytes()); } for attr in &self.local { trace!(" local @{}", output.len()); @@ -226,9 +536,13 @@ impl<'a> Directory<'a> { nameofs, output.len() ); + let inode_idx = match entry.inode { + InodeRef::Known(idx) => idx, + InodeRef::Deferred(_) => panic!("all inodes must be resolved before writing"), + }; output.write_struct(format::DirectoryEntryHeader { name_offset: (nameofs as u16).into(), - inode_offset: output.get_nid(entry.inode).into(), + inode_offset: output.get_nid(inode_idx).into(), file_type: entry.file_type.into(), ..Default::default() }); @@ -260,21 +574,65 @@ impl<'a> Directory<'a> { } } - fn inode_meta(&self, block_offset: usize) -> (format::DataLayout, u32, u64, usize) { - let (layout, u) = if self.inline.is_empty() { - (format::DataLayout::FlatPlain, block_offset as u32 / 4096) + fn inode_meta(&self, block_offset: usize) -> InodeMeta { + let blkaddr: u32 = (block_offset / 4096) + .try_into() + .expect("block address exceeds u32::MAX"); + let (layout, i_u) = if self.inline.is_empty() { + (format::DataLayout::FlatPlain, blkaddr) } else if !self.blocks.is_empty() { - (format::DataLayout::FlatInline, block_offset as u32 / 4096) + (format::DataLayout::FlatInline, blkaddr) } else { (format::DataLayout::FlatInline, 0) }; - (layout, u, self.size, self.nlink) + InodeMeta { + layout, + i_u, + size: self.size, + nlink: self.nlink, + } } } +/// Calculates the chunk format bits for an external file based on its size. +/// +/// For EROFS chunk-based inodes, the `u` field contains the chunk format +/// which encodes the chunk size as `chunkbits - BLOCK_BITS`. +/// +/// The algorithm matches the C implementation: +/// 1. Calculate chunkbits = ilog2(size - 1) + 1 +/// 2. Clamp to at least BLOCK_BITS (12) +/// 3. Clamp to at most BLOCK_BITS + 31 (max representable) +/// 4. Return chunkbits - BLOCK_BITS +fn compute_chunk_format(file_size: u64) -> u32 { + const BLOCK_BITS: u32 = format::BLOCK_BITS as u32; + const CHUNK_FORMAT_BLKBITS_MASK: u32 = 0x001F; // 31 + + // Compute the chunkbits to use for the file size. + // We want as few chunks as possible, but not an unnecessarily large chunk. + let mut chunkbits = if file_size > 1 { + // ilog2(file_size - 1) + 1 + 64 - (file_size - 1).leading_zeros() + } else { + 1 + }; + + // At least one logical block + if chunkbits < BLOCK_BITS { + chunkbits = BLOCK_BITS; + } + + // Not larger chunks than max possible + if chunkbits - BLOCK_BITS > CHUNK_FORMAT_BLKBITS_MASK { + chunkbits = CHUNK_FORMAT_BLKBITS_MASK + BLOCK_BITS; + } + + chunkbits - BLOCK_BITS +} + impl Leaf<'_, ObjectID> { - fn inode_meta(&self) -> (format::DataLayout, u32, u64, usize) { - let (layout, u, size) = match &self.content { + fn inode_meta(&self, version: format::FormatVersion) -> InodeMeta { + let (layout, i_u, size) = match &self.content { tree::LeafContent::Regular(tree::RegularFile::Inline(data)) => { if data.is_empty() { (format::DataLayout::FlatPlain, 0, data.len() as u64) @@ -283,10 +641,19 @@ impl Leaf<'_, ObjectID> { } } tree::LeafContent::Regular(tree::RegularFile::External(.., size)) => { - (format::DataLayout::ChunkBased, 31, *size) + // V1: compute chunk format from file size + // V2: hardcode 31 (origin/main behavior) + let chunk_format = match version { + format::FormatVersion::V1 => compute_chunk_format(*size), + format::FormatVersion::V2 => 31, + }; + (format::DataLayout::ChunkBased, chunk_format, *size) } tree::LeafContent::CharacterDevice(rdev) | tree::LeafContent::BlockDevice(rdev) => { - (format::DataLayout::FlatPlain, *rdev as u32, 0) + let rdev32: u32 = (*rdev) + .try_into() + .expect("device number exceeds EROFS u32 limit"); + (format::DataLayout::FlatPlain, rdev32, 0) } tree::LeafContent::Fifo | tree::LeafContent::Socket => { (format::DataLayout::FlatPlain, 0, 0) @@ -301,7 +668,12 @@ impl Leaf<'_, ObjectID> { (format::DataLayout::FlatInline, 0, target.len() as u64) } }; - (layout, u, size, self.nlink) + InodeMeta { + layout, + i_u, + size, + nlink: self.nlink, + } } fn write_inline(&self, output: &mut impl Output) { @@ -316,6 +688,11 @@ impl Leaf<'_, ObjectID> { impl Inode<'_, ObjectID> { fn file_type(&self) -> format::FileType { + // V1 whiteout escaping: char device (rdev=0) entries are written as regular files + // to match C mkcomposefs v1.0.8 behavior. + if self.escaped_whiteout { + return format::FileType::RegularFile; + } match &self.content { InodeContent::Directory(..) => format::FileType::Directory, InodeContent::Leaf(leaf) => match &leaf.content { @@ -329,84 +706,248 @@ impl Inode<'_, ObjectID> { } } - fn write_inode(&self, output: &mut impl Output, idx: usize) { - let (layout, u, size, nlink) = match &self.content { - InodeContent::Directory(dir) => dir.inode_meta(output.get(Offset::Block, idx)), - InodeContent::Leaf(leaf) => leaf.inode_meta(), + /// Check if this inode can use compact format (32 bytes instead of 64). + /// + /// Compact format is used when: + /// - mtime matches min_mtime (stored in superblock build_time) + /// - nlink, uid, gid fit in u16 + /// - size fits in u32 + fn fits_in_compact(&self, min_mtime: (u64, u32), size: u64, nlink: usize) -> bool { + // mtime (both sec and nsec) must match the minimum (which will be stored in superblock + // build_time / build_time_nsec). The C implementation requires both to match. + if self.stat.st_mtim_sec as u64 != min_mtime.0 { + return false; + } + if self.stat.st_mtim_nsec != min_mtime.1 { + return false; + } + + // nlink must fit in u16 + if nlink > u16::MAX as usize { + return false; + } + + // uid and gid must fit in u16 + if self.stat.st_uid > u16::MAX as u32 || self.stat.st_gid > u16::MAX as u32 { + return false; + } + + // size must fit in u32 + if size > u32::MAX as u64 { + return false; + } + + true + } + + /// Handle inline tail padding for V1 format. + /// + /// Port of C mkcomposefs `compute_erofs_inode_padding_for_tail()`. + /// + /// Two branches based on file type: + /// - Symlinks: pad the *inode start* to a block boundary whenever the inode + xattrs + + /// symlink target would cross into a new block (prevents EFSCORRUPTED on old kernels). + /// - All other FlatInline types (dirs, inline files): pad the *tail* only if it would + /// cross into yet another block after inline_start. + fn pad_inline_tail_v1( + &self, + output: &mut impl Output, + inode_and_xattr_size: u64, + size: u64, + #[cfg(test)] ctx: &mut WriteContext, + #[cfg(not(test))] _ctx: &mut WriteContext, + ) { + let block_size = u64::from(format::BLOCK_SIZE); + let current_pos: u64 = output.len().try_into().unwrap(); + let inline_size = size % block_size; + + if matches!(self.file_type(), format::FileType::Symlink) { + // Symlink branch: pad *inode start* to a block boundary when + // inode + xattrs + symlink target would cross into a new block. + // Matches C: pos_block != end_block. + // + // Old kernels (< 6.12) return EFSCORRUPTED from erofs_fill_symlink() + // when (inode_offset % block_size) + inode_and_xattr_size + inline_size + // > block_size. Padding the inode start to a block boundary prevents + // this because then inode_offset % block_size == 0. + #[cfg(test)] + let skip_pad = ctx + .faults + .as_mut() + .map(|f| f.should_skip_symlink_pad()) + .unwrap_or(false); + #[cfg(not(test))] + let skip_pad = false; + + if !skip_pad { + let total_size = inode_and_xattr_size + inline_size; + // Does [current_pos, current_pos+total_size) cross a block boundary? + // block_offset tells us how far into the current block we are; + // if adding total_size exceeds block_size, we spill into the next block. + if block_offset(current_pos) + total_size > block_size { + // Align inode start to the next block boundary so the inode + // doesn't straddle a block (prevents EUCLEAN on old kernels). + // block_size (4096) is divisible by 32 (EROFS slot size), + // so slot alignment is preserved after this padding. + // None means current_pos is already block-aligned; no padding needed. + if let Some(pad_size) = bytes_to_block_boundary(current_pos) { + output.write_zeros(pad_size as usize); + } + } + } + } else { + // Non-symlink branch (dirs, inline files): pad the *tail* to fit + // within the block that inline_start lands in. + // Matches C: block_remainder < inline_size, pad = block_remainder + // rounded up to the next 32-byte slot boundary. + let inline_start = current_pos + .checked_add(inode_and_xattr_size) + .expect("image position + inode header size cannot overflow u64"); + // If inline_start is block-aligned, block_remainder would be BLOCK_SIZE which + // always exceeds inline_size (< BLOCK_SIZE), so no padding — None is correct. + if let Some(block_remainder) = bytes_to_block_boundary(inline_start) + && block_remainder < inline_size + { + let pad_size = (block_remainder.div_ceil(INODE_SLOT_SIZE as u64) + * INODE_SLOT_SIZE as u64) as usize; + output.write_zeros(pad_size); + } + } + } + + /// Handle inline tail padding for V2 format (origin/main algorithm). + fn pad_inline_tail_v2(&self, output: &mut impl Output, inode_and_xattr_size: u64, size: u64) { + let block_size = u64::from(format::BLOCK_SIZE); + let inline_start: u64 = output.len().try_into().unwrap(); + let inline_start = inline_start + .checked_add(inode_and_xattr_size) + .expect("image position + inode header size cannot overflow u64"); + // Restore origin/main logic: end_of_metadata is the last byte of the metadata, + // inline_end is the last byte of the inline data. If they land in different + // blocks we must pad so the inline data starts at a fresh block boundary. + let end_of_metadata = inline_start - 1; + let inline_end = inline_start + (size % block_size); + if end_of_metadata / block_size != inline_end / block_size { + let pad_size = (block_size - end_of_metadata % block_size) as usize; + output.write_zeros(pad_size); + output.pad(INODE_SLOT_SIZE); + } + } + + fn write_inode(&self, output: &mut impl Output, idx: usize, ctx: &mut WriteContext) { + let version = ctx.version; + let min_mtime = ctx.min_mtime; + let meta = match &self.content { + InodeContent::Directory(dir) => dir.inode_meta(output.get_block_start(idx)), + InodeContent::Leaf(leaf) => leaf.inode_meta(version), }; + let InodeMeta { + layout, + i_u: u, + size, + nlink, + } = meta; + + let xattr_size = self.xattrs.byte_size(version); - let xattr_size = { - let mut xattr = FirstPass::default(); - self.xattrs.write(&mut xattr); - xattr.offset + // V1: compact inodes when possible; V2: always extended + let use_compact = + version == format::FormatVersion::V1 && self.fits_in_compact(min_mtime, size, nlink); + + let inode_header_size = if use_compact { + size_of::() + } else { + size_of::() }; // We need to make sure the inline part doesn't overlap a block boundary - output.pad(32); + output.pad(INODE_SLOT_SIZE); if matches!(layout, format::DataLayout::FlatInline) { - let block_size = u64::from(format::BLOCK_SIZE); - let inode_and_xattr_size: u64 = (size_of::() + xattr_size) - .try_into() - .unwrap(); - let inline_start: u64 = output.len().try_into().unwrap(); - let inline_start = inline_start + inode_and_xattr_size; - let end_of_metadata = inline_start - 1; - let inline_end = inline_start + (size % block_size); - if end_of_metadata / block_size != inline_end / block_size { - // If we proceed, then we'll violate the rule about crossing block boundaries. - // The easiest thing to do is to add padding so that the inline data starts close - // to the start of a fresh block boundary, while ensuring inode alignment. - // pad_size is always < block_size (4096), so fits in usize - let pad_size = (block_size - end_of_metadata % block_size) as usize; - let pad = vec![0; pad_size]; - trace!("added pad {}", pad.len()); - output.write(&pad); - output.pad(32); + let inode_and_xattr_size: u64 = (inode_header_size + xattr_size).try_into().unwrap(); + + match version { + format::FormatVersion::V1 => { + self.pad_inline_tail_v1(output, inode_and_xattr_size, size, ctx); + } + format::FormatVersion::V2 => { + self.pad_inline_tail_v2(output, inode_and_xattr_size, size); + } } } - let format = format::InodeLayout::Extended | layout; + let xattr_icount: u16 = match xattr_size { + 0 => 0, + n => { + let word_count = n + .checked_sub(INODE_XATTR_HEADER_SIZE) + .expect("non-empty xattr block must be >= header size") + / XATTR_WORD_SIZE; + (1 + word_count) as u16 + } + }; - trace!( - "write inode {idx} nid {} {:?} {:?} xattrsize{xattr_size} icount{} inline{} @{}", - output.len() / 32, - format, - self.file_type(), - match xattr_size { - 0 => 0, - n => (1 + (n - 12) / 4) as u16, - }, - size % 4096, - output.len() - ); + output.note_inode(); + + if use_compact { + let format = format::InodeLayout::Compact | layout; + + // V1: use sequential ino + let ino = idx as u32; + + output.write_struct(format::CompactInodeHeader { + format, + xattr_icount: xattr_icount.into(), + mode: self.file_type() | self.stat.st_mode, + nlink: (nlink as u16).into(), + size: (size as u32).into(), + reserved: 0.into(), + u: u.into(), + ino: ino.into(), + uid: (self.stat.st_uid as u16).into(), + gid: (self.stat.st_gid as u16).into(), + reserved2: [0; 4], + }); + } else { + let format = format::InodeLayout::Extended | layout; - output.note_offset(Offset::Inode); - output.write_struct(format::ExtendedInodeHeader { - format, - xattr_icount: match xattr_size { - 0 => 0, - n => (1 + (n - 12) / 4) as u16, - } - .into(), - mode: self.file_type() | self.stat.st_mode, - size: size.into(), - u: u.into(), - ino: ((output.len() / 32) as u32).into(), - uid: self.stat.st_uid.into(), - gid: self.stat.st_gid.into(), - mtime: (self.stat.st_mtim_sec as u64).into(), - nlink: (nlink as u32).into(), - ..Default::default() - }); + // V1 uses the BFS index as i_ino (matching C mkcomposefs behaviour). + // V2 uses the NID (byte offset / INODE_SLOT_SIZE) for 32-bit stat compatibility. + let ino = match version { + format::FormatVersion::V1 => idx as u32, + format::FormatVersion::V2 => (output.len() / INODE_SLOT_SIZE) as u32, + }; - self.xattrs.write(output); + // V2 does not store sub-second mtime precision (mtime_nsec=0), + // matching origin/main which used ..Default::default() to zero it. + // V1 preserves full nanosecond precision. + let mtime_nsec: u32 = match version { + format::FormatVersion::V1 => self.stat.st_mtim_nsec, + format::FormatVersion::V2 => 0, + }; + output.write_struct(format::ExtendedInodeHeader { + format, + xattr_icount: xattr_icount.into(), + mode: self.file_type() | self.stat.st_mode, + size: size.into(), + u: u.into(), + ino: ino.into(), + uid: self.stat.st_uid.into(), + gid: self.stat.st_gid.into(), + mtime: (self.stat.st_mtim_sec as u64).into(), + mtime_nsec: mtime_nsec.into(), + nlink: (nlink as u32).into(), + ..Default::default() + }); + } + + self.xattrs.write(output, version); match &self.content { InodeContent::Directory(dir) => dir.write_inline(output), InodeContent::Leaf(leaf) => leaf.write_inline(output), }; - output.pad(32); + output.pad(INODE_SLOT_SIZE); } fn write_blocks(&self, output: &mut impl Output) { @@ -418,13 +959,18 @@ impl Inode<'_, ObjectID> { struct InodeCollector<'a, ObjectID: FsVerityHashValue> { inodes: Vec>, - hardlinks: HashMap, + hardlinks: HashMap, fs: &'a tree::FileSystem, - nlink_map: &'a [u32], + nlink_map: Vec, + version: format::FormatVersion, } impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> { - fn push_inode(&mut self, stat: &'a tree::Stat, content: InodeContent<'a, ObjectID>) -> usize { + fn push_inode( + &mut self, + stat: &'a tree::Stat, + content: InodeContent<'a, ObjectID>, + ) -> InodeIdx { let mut xattrs = InodeXAttrs::default(); // We need to record extra xattrs for some files. These come first. @@ -434,23 +980,28 @@ impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> { }) = content { xattrs.add( - b"trusted.overlay.metacopy", + format::XATTR_OVERLAY_METACOPY, OverlayMetacopy::new(id).as_bytes(), + self.version, ); let redirect = format!("/{}", id.to_object_pathname()); - xattrs.add(b"trusted.overlay.redirect", redirect.as_bytes()); + xattrs.add( + format::XATTR_OVERLAY_REDIRECT, + redirect.as_bytes(), + self.version, + ); } // Add the normal xattrs. They're already listed in sorted order. for (name, value) in stat.xattrs.iter() { let name = name.as_bytes(); - if let Some(escapee) = name.strip_prefix(b"trusted.overlay.") { - let escaped = [b"trusted.overlay.overlay.", escapee].concat(); - xattrs.add(&escaped, value); + if let Some(escapee) = name.strip_prefix(format::XATTR_OVERLAY_PREFIX) { + let escaped = [format::XATTR_OVERLAY_ESCAPED_PREFIX, escapee].concat(); + xattrs.add(&escaped, value, self.version); } else { - xattrs.add(name, value); + xattrs.add(name, value, self.version); } } @@ -461,11 +1012,12 @@ impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> { stat, xattrs, content, + escaped_whiteout: false, }); inode } - fn collect_leaf(&mut self, leaf_id: LeafId) -> usize { + fn collect_leaf(&mut self, leaf_id: LeafId) -> InodeIdx { let nlink = self.nlink_map[leaf_id.0] as usize; if nlink > 1 @@ -475,6 +1027,14 @@ impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> { } let leaf = self.fs.leaf(leaf_id); + + // Hardlinked whiteouts are semantically invalid: a whiteout represents the + // absence of a file in an overlay, so having nlink > 1 is meaningless. + // ValidatedFileSystem guarantees this invariant was checked at construction time. + debug_assert!( + !(matches!(leaf.content, tree::LeafContent::CharacterDevice(0)) && nlink > 1), + "ValidatedFileSystem guarantees whiteout nlink == 1" + ); let inode = self.push_inode( &leaf.stat, InodeContent::Leaf(Leaf { @@ -490,27 +1050,24 @@ impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> { inode } - fn insert_sorted( - entries: &mut Vec>, - name: &'a [u8], - inode: usize, - file_type: format::FileType, - ) { - let entry = DirEnt { - name, - inode, - file_type, - }; - let point = entries.partition_point(|e| e.name < entry.name); - entries.insert(point, entry); - } - - fn collect_dir(&mut self, dir: &'a tree::Directory, parent: usize) -> usize { + /// Collect inodes using depth-first traversal (V2 / origin/main behavior). + fn collect_dir(&mut self, dir: &'a tree::Directory, parent: InodeIdx) -> InodeIdx { // The root inode number needs to fit in a u16. That more or less compels us to write the // directory inode before the inode of the children of the directory. Reserve a slot. let me = self.push_inode(&dir.stat, InodeContent::Directory(Directory::default())); - let mut entries = vec![]; + let mut entries = vec![ + DirEnt { + name: b".", + inode: InodeRef::Known(me), + file_type: format::FileType::Directory, + }, + DirEnt { + name: b"..", + inode: InodeRef::Known(parent), + file_type: format::FileType::Directory, + }, + ]; for (name, inode) in dir.sorted_entries() { let child = match inode { @@ -519,34 +1076,359 @@ impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> { }; entries.push(DirEnt { name: name.as_bytes(), - inode: child, + inode: InodeRef::Known(child), file_type: self.inodes[child].file_type(), }); } - // We're expected to add those, too - Self::insert_sorted(&mut entries, b".", me, format::FileType::Directory); - Self::insert_sorted(&mut entries, b"..", parent, format::FileType::Directory); + entries.sort_unstable_by_key(|e| e.name); // Now that we know the actual content, we can write it to our reserved slot self.inodes[me].content = InodeContent::Directory(Directory::from_entries(entries)); me } + /// Returns true if this leaf entry is an overlay whiteout stub generated internally + /// by `add_overlay_whiteouts()`, as opposed to a user-provided whiteout. These stubs + /// must NOT be escaped during V1 whiteout processing. + fn is_overlay_whiteout_stub( + &self, + name: &[u8], + leaf_id: LeafId, + me: InodeIdx, + root_inode: InodeIdx, + ) -> bool { + let root_stat = &self.fs.root.stat; + let leaf_stat = &self.fs.leaf(leaf_id).stat; + let selinux_key = std::ffi::OsStr::new("security.selinux"); + let expected_xattrs = if root_stat.xattrs.contains_key(selinux_key) { + 1 + } else { + 0 + }; + let has_correct_xattrs = leaf_stat.xattrs.len() == expected_xattrs + && (expected_xattrs == 0 + || leaf_stat.xattrs.get(selinux_key) == root_stat.xattrs.get(selinux_key)); + + me == root_inode + && name.len() == 2 + && name + .iter() + .all(|b| b.is_ascii_digit() || matches!(b, b'a'..=b'f')) + && leaf_stat.st_mode == 0o644 + && leaf_stat.st_uid == root_stat.st_uid + && leaf_stat.st_gid == root_stat.st_gid + && leaf_stat.st_mtim_sec == root_stat.st_mtim_sec + && leaf_stat.st_mtim_nsec == root_stat.st_mtim_nsec + && has_correct_xattrs + } + + /// Returns true if a leaf content is a V1 overlay whiteout (char device, rdev=0). + fn is_v1_whiteout(content: &tree::LeafContent) -> bool { + matches!(content, tree::LeafContent::CharacterDevice(0)) + } + + /// Collect all inodes using queue-based breadth-first traversal (V1). + /// + /// This algorithm matches the C mkcomposefs `lcfs_compute_tree()` function which uses + /// a linked-list queue to process directories. All nodes at depth N are assigned inode + /// numbers before any nodes at depth N+1. + /// + /// For V1, char device entries with rdev=0 (overlay whiteouts) are escaped to regular + /// files matching C mkcomposefs v1.0.8 `add_overlayfs_xattrs()` behavior: + /// - Child entry: converted to regular file + gets `trusted.overlay.overlay.whiteout=""` + /// and `user.overlay.whiteout=""` xattrs. + /// - Parent directory: gets `trusted.overlay.overlay.whiteouts=""`, + /// `user.overlay.whiteouts=""`, `trusted.overlay.overlay.opaque=x`, + /// `user.overlay.opaque=x` xattrs (added at most once per directory). + fn collect_tree(&mut self, root: &'a tree::Directory) { + use std::collections::VecDeque; + + // Pre-pass: for each multi-link leaf, find which directory holds the canonical + // (first DFS sorted-order) occurrence. + // + // In C mkcomposefs, when a dumpfile is parsed, the first occurrence of each + // inode (same content / nlink target) is the "original" and subsequent occurrences + // are "hardlinks" (with link_to pointer). During BFS, hardlinks are SKIPPED — only + // originals get inode numbers. Hardlink directory entries use the original's nid. + // + // The dumpfile is written in DFS sorted order (see write_dumpfile). So the canonical + // occurrence is whichever path appears first in that DFS traversal. + // + // We replicate this: when BFS encounters a non-canonical occurrence of a multi-link + // leaf (its canonical directory doesn't match the current directory), we defer the + // nid assignment until the canonical occurrence is processed. + // + // KEY: we record the DIRECTORY POINTER of the canonical occurrence, not just the + // leaf_id, because two occurrences of the same leaf share the same leaf_id — we + // need the directory pointer to distinguish canonical from non-canonical at BFS time. + let canonical_dirs = Self::find_canonical_dirs(root, &self.nlink_map); + + let root_inode = self.push_inode(&root.stat, InodeContent::Directory(Directory::default())); + let mut queue: VecDeque<(&'a tree::Directory, InodeIdx, InodeIdx)> = + VecDeque::new(); + queue.push_back((root, root_inode, root_inode)); + + // dir_entries: accumulates (me, parent, entries) for each directory processed in BFS order. + // Leaf entries whose canonical occurrence hasn't been BFS-processed yet are stored as + // InodeRef::Deferred(leaf_id) and resolved in a single post-BFS pass once all canonical + // inodes have been assigned. + let mut dir_entries: Vec<(InodeIdx, InodeIdx, Vec>)> = vec![]; // (me, parent, entries) + + while let Some((dir, parent, me)) = queue.pop_front() { + let mut entries = vec![ + DirEnt { + name: b".", + inode: InodeRef::Known(me), + file_type: format::FileType::Directory, + }, + DirEnt { + name: b"..", + inode: InodeRef::Known(parent), + file_type: format::FileType::Directory, + }, + ]; + let mut dir_has_whiteout = false; + + for (name, inode) in dir.sorted_entries() { + match inode { + tree::Inode::Directory(subdir) => { + let child = self.push_inode( + &subdir.stat, + InodeContent::Directory(Directory::default()), + ); + queue.push_back((subdir, me, child)); + entries.push(DirEnt { + name: name.as_bytes(), + inode: InodeRef::Known(child), + file_type: format::FileType::Directory, + }); + } + tree::Inode::Leaf(leaf_id, _) => { + // V1 whiteout escaping: char device with rdev=0 → regular file. + // Matches C mkcomposefs v1.0.8 `rewrite_tree_node_for_erofs()`, which + // escapes user-provided char devices. + // + // IMPORTANT: the 256 stubs added by add_overlay_whiteouts() are NOT + // escaped in C — they are added AFTER `rewrite_tree_node_for_erofs()` + // so they never go through escaping. We skip them by detecting root-level + // 2-char hex entries (the names used by add_overlay_whiteouts()) THAT ALSO + // exactly match the metadata applied by add_overlay_whiteouts(). This + // correctly distinguishes them from user-provided whiteouts that happen + // to have a 2-char hex name. + let name_bytes = name.as_bytes(); + let is_stub = + self.is_overlay_whiteout_stub(name_bytes, *leaf_id, me, root_inode); + + // Determine if this occurrence is canonical (first in DFS order). + // + // For multi-link leaves (nlink > 1), the canonical occurrence is the + // one in the directory recorded by find_canonical_dirs(). We compare + // the current directory pointer to identify it precisely. + // + // For single-link leaves (nlink = 1), there is only one occurrence, + // so it is always canonical (no entry in canonical_dirs). + let nlink = self.nlink_map[leaf_id.0]; + let is_canonical = if nlink > 1 { + // Multi-link: canonical iff this is the recorded canonical directory. + // We use pointer identity (std::ptr::eq) to match the current + // directory reference against the one recorded during the DFS + // pre-pass. The pointers are stable borrows from the tree, which + // outlives this entire function. + canonical_dirs + .get(leaf_id) + .is_some_and(|&p| std::ptr::eq(p, dir)) + } else { + // Single-link: always canonical + true + }; + + let child_ref = if is_canonical { + // Canonical occurrence: create nid now. + InodeRef::Known(self.collect_leaf(*leaf_id)) + } else if let Some(&nid) = self.hardlinks.get(leaf_id) { + // Non-canonical, and the canonical has already been processed. + InodeRef::Known(nid) + } else { + // Non-canonical, and canonical hasn't been assigned a nid yet + // (canonical is in a deeper directory, not yet BFS-processed). + // Store as Deferred; resolved in the post-BFS pass. + InodeRef::Deferred(*leaf_id) + }; + + // Apply whiteout escaping on the first canonical occurrence only. + // + // `is_canonical` is true for any entry whose directory pointer matches + // the canonical directory, so if a whiteout leaf and its hardlink both + // live in the same directory, both appear "canonical" by that check. + // We guard with `!escaped_whiteout` to ensure the xattrs are added + // exactly once — on the very first encounter of the inode. + if is_canonical + && matches!(child_ref, InodeRef::Known(_)) + && self.version == format::FormatVersion::V1 + && !is_stub + && Self::is_v1_whiteout(&self.fs.leaf(*leaf_id).content) + { + let InodeRef::Known(child) = child_ref else { + unreachable!() + }; + if !self.inodes[child].escaped_whiteout { + self.inodes[child].escaped_whiteout = true; + // Add per-entry whiteout xattrs (already-escaped names): + // C adds OVERLAY_XATTR_ESCAPED_WHITEOUT and OVERLAY_XATTR_USERXATTR_WHITEOUT. + self.inodes[child].xattrs.add( + format::XATTR_OVERLAY_WHITEOUT, + b"", + self.version, + ); + self.inodes[child].xattrs.add( + format::XATTR_USERXATTR_WHITEOUT, + b"", + self.version, + ); + dir_has_whiteout = true; + } + } + + // file_type for the dir entry: for Deferred entries, use a placeholder; + // it will be corrected in the post-BFS resolution pass. + let file_type = if let InodeRef::Known(child) = child_ref { + // file_type() already returns RegularFile when escaped_whiteout=true + self.inodes[child].file_type() + } else { + // Deferred; file_type will be updated in the resolution pass + format::FileType::RegularFile + }; + + entries.push(DirEnt { + name: name.as_bytes(), + inode: child_ref, + file_type, + }); + } + } + } + + // V1: if this directory had whiteout children, add parent xattrs. + // C adds these once per directory, on first whiteout child found. + // Matches OVERLAY_XATTR_ESCAPED_WHITEOUTS, OVERLAY_XATTR_USERXATTR_WHITEOUTS, + // OVERLAY_XATTR_ESCAPED_OPAQUE (=x), OVERLAY_XATTR_USERXATTR_OPAQUE (=x). + if self.version == format::FormatVersion::V1 && dir_has_whiteout { + self.inodes[me] + .xattrs + .add(format::XATTR_OVERLAY_WHITEOUTS, b"", self.version); + self.inodes[me] + .xattrs + .add(format::XATTR_USERXATTR_WHITEOUTS, b"", self.version); + self.inodes[me] + .xattrs + .add(format::XATTR_OVERLAY_OPAQUE, b"x", self.version); + self.inodes[me] + .xattrs + .add(format::XATTR_USERXATTR_OPAQUE, b"x", self.version); + } + + entries.sort_unstable_by_key(|e| e.name); + + dir_entries.push((me, parent, entries)); + } + + // Post-BFS: resolve all Deferred entries. + // At this point all canonical leaves have been assigned nids and are in self.hardlinks. + for (_me, _parent, entries) in &mut dir_entries { + for entry in entries.iter_mut() { + if let InodeRef::Deferred(leaf_id) = entry.inode { + let nid = *self + .hardlinks + .get(&leaf_id) + .expect("canonical leaf must have been assigned a nid during BFS"); + entry.inode = InodeRef::Known(nid); + entry.file_type = self.inodes[nid].file_type(); + } + } + } + + // Build directory content for each directory inode. + for (me, _parent, entries) in dir_entries { + self.inodes[me].content = InodeContent::Directory(Directory::from_entries(entries)); + } + } + + /// DFS pre-pass: find which directory contains the canonical occurrence of each + /// multi-link leaf (first encounter in DFS sorted order). + /// + /// C mkcomposefs parses dumpfiles in DFS sorted order. The first occurrence of each + /// leaf (by `LeafId`) is the "original"; subsequent occurrences are "hardlinks". + /// Only originals get inode numbers in BFS; hardlinks reuse the original's nid. + /// + /// The dumpfile writer (`write_dumpfile`) uses DFS sorted traversal, so we replicate + /// the same traversal here to determine canonical occurrences. + /// + /// Note: we cannot simplify to "first BFS encounter wins" because DFS and BFS visit + /// directories at different depths in different order (e.g. DFS visits `/a/deep/` + /// before `/b/`, while BFS visits `/b/` first). Changing the canonical ordering + /// would break binary compatibility with C mkcomposefs. + /// + /// Returns a `HashMap` mapping each multi-link leaf to the + /// directory pointer where its canonical (first DFS) occurrence lives. + /// Single-link leaves are NOT in the map (they're trivially canonical anywhere). + /// + /// We use raw pointers for directory identity comparison (`std::ptr::eq`) rather + /// than dereferencing. The pointers are stable `&'a` borrows from the tree which + /// outlives the entire `collect_tree` call. + fn find_canonical_dirs( + root: &'a tree::Directory, + nlink_map: &[u32], + ) -> HashMap> { + let mut seen: HashSet = HashSet::new(); + let mut canonical_dirs: HashMap> = HashMap::new(); + Self::dfs_find_canonical(root, nlink_map, &mut seen, &mut canonical_dirs); + canonical_dirs + } + + fn dfs_find_canonical( + dir: &'a tree::Directory, + nlink_map: &[u32], + seen: &mut HashSet, + canonical_dirs: &mut HashMap>, + ) { + let dir_ptr: *const tree::Directory = dir; + for (_, inode) in dir.sorted_entries() { + match inode { + tree::Inode::Directory(subdir) => { + Self::dfs_find_canonical(subdir, nlink_map, seen, canonical_dirs); + } + tree::Inode::Leaf(leaf_id, _) => { + if nlink_map[leaf_id.0] > 1 && seen.insert(*leaf_id) { + // First DFS encounter → canonical occurrence is in this directory. + canonical_dirs.insert(*leaf_id, dir_ptr); + // Second+ encounter → non-canonical (hardlink), dir not recorded + } + // Single-link leaves are always canonical; no need to record them + } + } + } + } + pub fn collect( fs: &'a tree::FileSystem, - nlink_map: &'a [u32], + version: format::FormatVersion, ) -> Vec> { let mut this = Self { inodes: vec![], hardlinks: HashMap::new(), fs, - nlink_map, + nlink_map: fs.nlinks(), + version, }; - // '..' of the root directory is the root directory again - let root_inode = this.collect_dir(&fs.root, 0); - assert_eq!(root_inode, 0); + match version { + format::FormatVersion::V1 => this.collect_tree(&fs.root), + format::FormatVersion::V2 => { + let root_inode = this.collect_dir(&fs.root, 0); + assert_eq!(root_inode, 0); + } + } this.inodes } @@ -554,9 +1436,23 @@ impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> { /// Takes a list of inodes where each inode contains only local xattr values, determines which /// xattrs (key, value) pairs appear more than once, and shares them. -fn share_xattrs(inodes: &mut [Inode]) -> Vec { +/// +/// For V1: sorts locals by full key, reverses shared table, uses InodesEnd-relative xattr offsets. +/// For V2: uses natural BTreeMap order (derived Ord), ascending shared table. +fn share_xattrs( + inodes: &mut [Inode], + version: format::FormatVersion, +) -> Vec { let mut xattrs: BTreeMap = BTreeMap::new(); + // V1: sort local xattrs by full key to match C behavior + // V2: don't sort (insertion order is fine, BTreeMap handles shared ordering) + if version == format::FormatVersion::V1 { + for inode in inodes.iter_mut() { + inode.xattrs.local.sort_by(|a, b| a.cmp_by_full_key(b)); + } + } + // Collect all xattrs from the inodes for inode in inodes.iter() { for attr in &inode.xattrs.local { @@ -571,88 +1467,158 @@ fn share_xattrs(inodes: &mut [Inode]) -> Vec { // Share only xattrs with more than one user xattrs.retain(|_k, v| *v > 1); - // Repurpose the refcount field as an index lookup - for (idx, value) in xattrs.values_mut().enumerate() { - *value = idx; - } + let (xattrs, shared): (BTreeMap, Vec) = match version { + format::FormatVersion::V1 => { + // C mkcomposefs sorts shared xattrs by full key string (strcmp), then writes + // them in DESCENDING order in the shared xattr block. Our BTreeMap is ordered + // by (prefix_index, suffix, value) which differs from strcmp order when prefix + // indices don't sort the same way as prefix strings (e.g. "security."=6 sorts + // numerically after "trusted."=4, but 'security.' < 'trusted.' lexicographically). + // Collect into a Vec, sort by full key ascending, then reverse = descending. + let mut sorted: Vec<_> = xattrs.into_iter().collect(); + sorted.sort_by(|(a, _), (b, _)| a.cmp_by_full_key(b)); + let n_shared = sorted.len(); + // Assign indices in descending order: first entry on disk gets the highest ref. + // After reversal, sorted[0] (ascending-smallest) ends up last on disk. + // We iterate ascending-sorted and assign index = n-1-i so that the entry + // written LAST (smallest key in ascending order) gets the SMALLEST index. + // Reconstruct a map for the lookup phase below. + let xattrs_map: BTreeMap = sorted + .iter() + .enumerate() + .map(|(i, (k, _))| (k.clone(), n_shared - 1 - i)) + .collect(); + + // Return in descending full-key order (last in ascending = first written) + let mut out = sorted; + out.reverse(); + let shared_vec = out.into_iter().map(|(k, _)| k).collect(); + (xattrs_map, shared_vec) + } + format::FormatVersion::V2 => { + // Ascending order: sequential index assignment + for (idx, value) in xattrs.values_mut().enumerate() { + *value = idx; + } + + // Return in ascending order (natural BTreeMap order) + let shared_vec = xattrs.keys().cloned().collect(); + (xattrs, shared_vec) + } + }; - // Visit each inode and change local xattrs into shared xattrs + // Visit each inode and promote xattrs that are in the shared table. + // This is the same for both V1 and V2: remove from local, push index to shared. for inode in inodes.iter_mut() { inode.xattrs.local.retain(|attr| { if let Some(idx) = xattrs.get(attr) { inode.xattrs.shared.push(*idx); - false // drop the local xattr: we converted it + false } else { - true // retain the local xattr: we didn't convert it + true } }); } - // Return the shared xattrs as a vec - xattrs.into_keys().collect() + shared } fn write_erofs( output: &mut impl Output, inodes: &[Inode], xattrs: &[XAttr], + ctx: &mut WriteContext, ) { - // Write composefs header - output.note_offset(Offset::Header); - output.write_struct(format::ComposefsHeader { + let version = ctx.version; + let min_mtime = ctx.min_mtime; + let header_flags = ctx.header_flags; + let composefs_version: u32 = ctx.composefs_version; + // Determine build_time based on format version + // V1: use minimum mtime across all inodes for reproducibility + // V2: use 0 (not used) + let (build_time, build_time_nsec) = match version { + format::FormatVersion::V1 => min_mtime, + format::FormatVersion::V2 => (0, 0), + }; + + // Write composefs header (pads to 1024 bytes internally) + output.write_composefs_header(format::ComposefsHeader { magic: format::COMPOSEFS_MAGIC, version: format::VERSION, - flags: 0.into(), - composefs_version: format::COMPOSEFS_VERSION, + flags: header_flags.into(), + composefs_version: composefs_version.into(), ..Default::default() }); - output.pad(1024); // Write superblock - output.note_offset(Offset::Superblock); - output.write_struct(format::Superblock { + // V1: set xattr_blkaddr to computed value; V2: leave as 0 + let xattr_blkaddr = match version { + format::FormatVersion::V1 => output.get_xattr_blkaddr(), + format::FormatVersion::V2 => 0, + }; + output.write_superblock(format::Superblock { magic: format::MAGIC_V1, blkszbits: format::BLOCK_BITS, feature_compat: (format::FEATURE_COMPAT_MTIME | format::FEATURE_COMPAT_XATTR_FILTER).into(), root_nid: (output.get_nid(0) as u16).into(), inos: (inodes.len() as u64).into(), - blocks: ((output.get(Offset::End, 0) / usize::from(format::BLOCK_SIZE)) as u32).into(), + blocks: output.get_block_count().into(), + build_time: build_time.into(), + build_time_nsec: build_time_nsec.into(), + xattr_blkaddr: xattr_blkaddr.into(), ..Default::default() }); // Write inode table for (idx, inode) in inodes.iter().enumerate() { // The inode may add padding to itself, so it notes its own offset - inode.write_inode(output, idx); + inode.write_inode(output, idx, ctx); } + // Mark end of inode table (slot-aligned) + output.pad(INODE_SLOT_SIZE); + output.note_inodes_end(); + // Write shared xattr table for xattr in xattrs { - output.note_offset(Offset::XAttr); + output.note_xattr(); xattr.write(output); } // Write blocks from inodes that have them output.pad(4096); for inode in inodes.iter() { - output.note_offset(Offset::Block); + output.note_block(); inode.write_blocks(output); } // That's it - output.note_offset(Offset::End); + output.note_end(); } +/// Offsets recorded during the first pass and consumed by the second pass. +/// Only contains values that are actually retrieved; singletons that are +/// write-only (header, superblock) are tracked as bools in `FirstPass`. #[derive(Default)] struct Layout { - offset_types: Vec, - offsets: Vec, + /// Byte offset of each inode, indexed by InodeIdx. + inodes: Vec, + /// Byte offset immediately after the last inode (slot-aligned). + inodes_end: Option, + /// Byte offset of each shared xattr entry, indexed sequentially. + xattrs: Vec, + /// Byte offset of each inode's block data region, indexed by InodeIdx. + blocks: Vec, + /// Total byte length of the image. + end: Option, } #[derive(Default)] struct FirstPass { offset: usize, layout: Layout, + header_emitted: bool, + superblock_emitted: bool, } struct SecondPass { @@ -660,87 +1626,312 @@ struct SecondPass { layout: Layout, } -impl Output for SecondPass { - fn note_offset(&mut self, _offset_type: Offset) { - /* no-op */ +impl Output for FirstPass { + fn note_header_emitted(&mut self) { + assert!(!self.header_emitted, "composefs header written twice"); + self.header_emitted = true; + } + fn note_superblock_emitted(&mut self) { + assert!(!self.superblock_emitted, "superblock written twice"); + self.superblock_emitted = true; + } + fn note_inode(&mut self) { + self.layout + .inodes + .push(NonZeroUsize::new(self.offset).expect("inode recorded at offset 0")); + } + fn note_inodes_end(&mut self) { + assert!( + self.layout.inodes_end.is_none(), + "inodes_end recorded twice" + ); + self.layout.inodes_end = NonZeroUsize::new(self.offset); + } + fn note_xattr(&mut self) { + self.layout + .xattrs + .push(NonZeroUsize::new(self.offset).expect("xattr recorded at offset 0")); + } + fn note_block(&mut self) { + debug_assert_eq!( + self.offset % format::BLOCK_SIZE as usize, + 0, + "block data must start at a block-aligned offset" + ); + self.layout + .blocks + .push(NonZeroUsize::new(self.offset).expect("block recorded at offset 0")); + } + fn note_end(&mut self) { + assert!(self.layout.end.is_none(), "end recorded twice"); + self.layout.end = NonZeroUsize::new(self.offset); } - fn get(&self, offset_type: Offset, idx: usize) -> usize { - let start = self.layout.offset_types[offset_type as usize]; - self.layout.offsets[start + idx] + fn get_inode_offset(&self, _idx: usize) -> Option { + None + } + fn get_inodes_end(&self) -> Option { + None + } + fn get_xattr_offset(&self, _idx: usize) -> Option { + None + } + fn get_block_offset(&self, _idx: usize) -> Option { + None + } + fn get_end(&self) -> Option { + None } fn write(&mut self, data: &[u8]) { - self.output.extend_from_slice(data); + self.offset += data.len(); } - fn pad(&mut self, alignment: usize) { - self.output - .resize(round_up(self.output.len(), alignment), 0); + self.offset = round_up(self.offset, alignment); } - fn len(&self) -> usize { - self.output.len() + self.offset } } -impl Output for FirstPass { - fn note_offset(&mut self, offset_type: Offset) { - while self.layout.offset_types.len() <= offset_type as usize { - self.layout.offset_types.push(self.layout.offsets.len()); - } - assert_eq!(self.layout.offset_types.len(), offset_type as usize + 1); - - trace!( - "{:?} #{} @{}", - offset_type, - self.layout.offsets.len() - self.layout.offset_types[offset_type as usize], - self.offset +impl Output for SecondPass { + fn note_header_emitted(&mut self) {} + fn note_superblock_emitted(&mut self) {} + fn note_inode(&mut self) {} + fn note_inodes_end(&mut self) { + debug_assert_eq!( + self.output.len(), + self.layout + .inodes_end + .expect("inodes_end not recorded") + .get(), + "second pass diverged from first at inodes_end" + ); + } + fn note_xattr(&mut self) {} + fn note_block(&mut self) {} + fn note_end(&mut self) { + debug_assert_eq!( + self.output.len(), + self.layout.end.expect("end not recorded").get(), + "second pass diverged from first at end" ); - self.layout.offsets.push(self.offset); } - fn get(&self, _: Offset, _: usize) -> usize { - 0 // We don't know offsets in the first pass, so fake it + fn get_inode_offset(&self, idx: usize) -> Option { + Some(self.layout.inodes[idx]) + } + fn get_inodes_end(&self) -> Option { + Some(self.layout.inodes_end.expect("inodes_end not recorded")) + } + fn get_xattr_offset(&self, idx: usize) -> Option { + Some(self.layout.xattrs[idx]) + } + fn get_block_offset(&self, idx: usize) -> Option { + Some(self.layout.blocks[idx]) + } + fn get_end(&self) -> Option { + Some(self.layout.end.expect("end not recorded")) } fn write(&mut self, data: &[u8]) { - self.offset += data.len(); + self.output.extend_from_slice(data); } - fn pad(&mut self, alignment: usize) { - self.offset = round_up(self.offset, alignment); + self.output + .resize(round_up(self.output.len(), alignment), 0); } - fn len(&self) -> usize { - self.offset + self.output.len() } } -/// Creates an EROFS filesystem image from a composefs tree +/// Calculates the minimum mtime across all inodes in the collection. +/// +/// This is used for V1 compatibility where build_time is set to the +/// minimum mtime for reproducibility. Returns `(0, 0)` for an empty slice. +fn calculate_min_mtime(inodes: &[Inode]) -> (u64, u32) { + inodes + .iter() + .map(|inode| (inode.stat.st_mtim_sec as u64, inode.stat.st_mtim_nsec)) + .reduce(|(a_sec, a_nsec), (b_sec, b_nsec)| { + if (b_sec, b_nsec) < (a_sec, a_nsec) { + (b_sec, b_nsec) + } else { + (a_sec, a_nsec) + } + }) + .unwrap_or((0, 0)) +} + +/// Return type of [`prepare_erofs_inodes`]: +/// `(inodes, shared_xattrs, min_mtime, header_flags, composefs_version)`. +type PreparedInodes<'a, ObjectID> = (Vec>, Vec, (u64, u32), u32, u32); + +/// Shared setup for all `mkfs_erofs_*` entry points. +/// +/// Collects inodes from the filesystem, injects the V1 opaque xattr on the +/// root directory, computes `header_flags` and `composefs_version`, promotes +/// repeated xattrs to the shared table, and calculates `min_mtime`. +/// +/// Returns `(inodes, shared_xattrs, min_mtime, header_flags, composefs_version)`. +fn prepare_erofs_inodes<'a, ObjectID: FsVerityHashValue>( + fs: &'a tree::FileSystem, + version: format::FormatVersion, +) -> PreparedInodes<'a, ObjectID> { + let mut inodes = InodeCollector::collect(fs, version); + + // For V1, add trusted.overlay.opaque xattr to root directory. + // This is done after collection (and thus after xattr escaping) to match + // the C implementation behavior. + if version == format::FormatVersion::V1 && !inodes.is_empty() { + inodes[0] + .xattrs + .add(format::XATTR_OVERLAY_OPAQUE_ROOT, b"y", version); + } + + // For V1, compute header flags and composefs_version matching C mkcomposefs behavior. + // This must be checked before share_xattrs(), while all xattrs are still local. + let (header_flags, composefs_version) = if version == format::FormatVersion::V1 { + // COMPOSEFS_FLAGS_HAS_ACL (bit 0) is set when any inode has POSIX ACL xattrs. + let has_acl = inodes.iter().any(|inode| { + inode.xattrs.local.iter().any(|xattr| { + xattr.prefix == format::XATTR_INDEX_POSIX_ACL_ACCESS + || xattr.prefix == format::XATTR_INDEX_POSIX_ACL_DEFAULT + }) + }); + let flags = if has_acl { + format::COMPOSEFS_FLAGS_HAS_ACL.get() + } else { + 0 + }; + + // C mkcomposefs bumps composefs_version from 0 to 1 when any user-provided + // node is a whiteout (char device with rdev=0). In the Rust writer, such + // nodes are detected and marked as `escaped_whiteout` by InodeCollector + // (stubs added by add_overlay_whiteouts() are deliberately excluded from + // this flag via the root-level 2-hex-char name heuristic). + let has_user_whiteout = inodes.iter().any(|inode| inode.escaped_whiteout); + let cfs_ver = if has_user_whiteout { 1u32 } else { 0u32 }; + + (flags, cfs_ver) + } else { + (0u32, format::COMPOSEFS_VERSION.get()) + }; + + let xattrs = share_xattrs(&mut inodes, version); + let min_mtime = calculate_min_mtime(&inodes); + + (inodes, xattrs, min_mtime, header_flags, composefs_version) +} + +/// Creates an EROFS filesystem image from a composefs tree using the default format (V2). /// /// This function performs a two-pass generation: /// 1. First pass determines the layout and sizes of all structures /// 2. Second pass writes the actual image data /// /// Returns the complete EROFS image as a byte array. -pub fn mkfs_erofs(fs: &tree::FileSystem) -> Box<[u8]> { - // Create the intermediate representation: flattened inodes and shared xattrs - let nlink_map = fs.nlinks(); - let mut inodes = InodeCollector::collect(fs, &nlink_map); - let xattrs = share_xattrs(&mut inodes); +pub fn mkfs_erofs(fs: &ValidatedFileSystem) -> Box<[u8]> { + mkfs_erofs_versioned(fs, format::FormatVersion::default()) +} - // Do a first pass with the writer to determine the layout +/// Internal two-pass EROFS image generator shared by all public entry points. +/// +/// Runs a layout pass (first pass) followed by an emit pass (second pass). +/// When `faults` is `Some`, decisions are recorded during the first pass and +/// replayed during the second so both passes make identical choices. +pub(crate) fn mkfs_erofs_inner( + fs: &tree::FileSystem, + version: format::FormatVersion, + #[cfg(test)] faults: Option, +) -> Box<[u8]> { + let (inodes, xattrs, min_mtime, header_flags, composefs_version) = + prepare_erofs_inodes(fs, version); + + let mut ctx = WriteContext { + version, + min_mtime, + header_flags, + composefs_version, + #[cfg(test)] + faults, + }; + + // First pass: determine the layout. let mut first_pass = FirstPass::default(); - write_erofs(&mut first_pass, &inodes, &xattrs); + write_erofs(&mut first_pass, &inodes, &xattrs, &mut ctx); + + // Switch fault injector to replay mode so the second pass makes identical choices. + #[cfg(test)] + if let Some(ref mut f) = ctx.faults { + f.start_replay(); + } - // Do a second pass with the writer to get the actual bytes + // Second pass: emit the actual bytes. let mut second_pass = SecondPass { output: vec![], layout: first_pass.layout, }; - write_erofs(&mut second_pass, &inodes, &xattrs); + write_erofs(&mut second_pass, &inodes, &xattrs, &mut ctx); - // That's it second_pass.output.into_boxed_slice() } + +/// Creates an EROFS filesystem image from a composefs tree with an explicit format version. +/// +/// The `version` parameter controls the format version: +/// - `FormatVersion::V1`: C mkcomposefs compatible (compact inodes, BFS) +/// - `FormatVersion::V2`: Current default (composefs_version=2, extended inodes, DFS) +/// +/// Returns the complete EROFS image as a byte array. +pub fn mkfs_erofs_versioned( + fs: &ValidatedFileSystem, + version: format::FormatVersion, +) -> Box<[u8]> { + mkfs_erofs_inner( + fs, + version, + #[cfg(test)] + None, + ) +} + +/// Test-only: write a versioned EROFS image with fault injection. +/// +/// `faults` controls which writer invariants are intentionally violated. +/// Pass `WriterFaults::new(seed)` with the desired rates set. +#[cfg(test)] +pub(crate) fn mkfs_erofs_with_faults( + fs: &ValidatedFileSystem, + version: format::FormatVersion, + faults: WriterFaults, +) -> Box<[u8]> { + mkfs_erofs_inner(&fs.0, version, Some(faults)) +} + +#[cfg(test)] +mod tests { + use super::compute_chunk_format; + + /// Unit tests for `compute_chunk_format` with boundary values. + /// + /// The function converts a file size into the EROFS chunk-format field: + /// chunkbits = ilog2(size - 1) + 1, clamped to [BLOCK_BITS=12, 43] + /// result = chunkbits - BLOCK_BITS + #[test] + fn test_compute_chunk_format_boundary_values() { + // size=1: file_size <= 1 branch → chunkbits=1 → clamped to 12 → result 0 + assert_eq!(compute_chunk_format(1), 0, "size=1"); + // size=2: ilog2(1)+1=1 → clamped to 12 → result 0 + assert_eq!(compute_chunk_format(2), 0, "size=2"); + // size=4096: ilog2(4095)+1=12 → no clamp → result 0 + assert_eq!(compute_chunk_format(4096), 0, "size=4096"); + // size=4097: ilog2(4096)+1=13 → result 1 + assert_eq!(compute_chunk_format(4097), 1, "size=4097"); + // size=1<<20: ilog2((1<<20)-1)+1=20 → result 8 + assert_eq!(compute_chunk_format(1 << 20), 8, "size=1<<20"); + // size=(1<<20)+1: ilog2(1<<20)+1=21 → result 9 + assert_eq!(compute_chunk_format((1 << 20) + 1), 9, "size=(1<<20)+1"); + } +} diff --git a/crates/composefs/src/filesystem_ops.rs b/crates/composefs/src/filesystem_ops.rs index 31012230..917db518 100644 --- a/crates/composefs/src/filesystem_ops.rs +++ b/crates/composefs/src/filesystem_ops.rs @@ -4,22 +4,68 @@ //! FileSystem objects, including computing image IDs, committing to //! repositories, and generating dumpfiles. +use std::collections::HashMap; + use anyhow::Result; use fn_error_context::context; use crate::{ dumpfile::write_dumpfile, - erofs::writer::mkfs_erofs, + erofs::{ + format::{FormatSet, FormatVersion}, + writer::{mkfs_erofs_inner, validate_filesystem}, + }, fsverity::{FsVerityHashValue, compute_verity}, repository::Repository, tree::FileSystem, }; impl FileSystem { + /// Commits this filesystem as EROFS images for each version in `formats`. + /// + /// Returns a map from [`FormatVersion`] to the fsverity digest of the + /// stored image for that version. + /// + /// The `image_name` named ref (if provided) is assigned to the **first** + /// version yielded by `formats.iter()` (i.e. V1 when the set includes V1). + /// All subsequent versions are stored anonymously (no named ref). This + /// prevents the ref from silently being overwritten and left pointing at the + /// last written version. + /// + /// Note: Callers should ensure root metadata is set before calling this, + /// typically via `copy_root_metadata_from_usr()` or `set_root_stat()`. + #[context("Committing filesystem as EROFS images")] + pub fn commit_images( + &self, + repository: &Repository, + image_name: Option<&str>, + formats: FormatSet, + ) -> Result> { + // Validate once before writing any version. + validate_filesystem(self)?; + let mut result = HashMap::new(); + let mut first = true; + for version in formats.iter() { + // Only the primary (first) version claims the named ref. + let name = if first { image_name } else { None }; + first = false; + let image_data = mkfs_erofs_inner( + self, + version, + #[cfg(test)] + None, + ); + let id = repository.write_image(name, &image_data)?; + result.insert(version, id); + } + Ok(result) + } + /// Commits this filesystem as an EROFS image to the repository. /// - /// Generates an EROFS filesystem image and writes it to the repository - /// with the optional name. Returns the fsverity digest of the committed image. + /// Generates an EROFS filesystem image using the repository's configured + /// EROFS format version and writes it with the optional name. Returns the + /// fsverity digest of the committed image. /// /// Note: Callers should ensure root metadata is set before calling this, /// typically via `copy_root_metadata_from_usr()` or `set_root_stat()`. @@ -29,18 +75,31 @@ impl FileSystem { repository: &Repository, image_name: Option<&str>, ) -> Result { - repository.write_image(image_name, &mkfs_erofs(self)) + let version = repository.erofs_version(); + let formats = FormatSet::from(version); + let mut map = self.commit_images(repository, image_name, formats)?; + Ok(map.remove(&version).expect("format version must be in map")) } /// Computes the fsverity digest for this filesystem as an EROFS image. /// - /// Generates the EROFS image and returns its fsverity digest without - /// writing to a repository. + /// The digest depends on the EROFS format version: V1 and V2 produce + /// different on-disk layouts and therefore different digests. Callers + /// must supply the version explicitly so that the digest matches what is + /// actually stored (or will be stored) in the repository. /// /// Note: Callers should ensure root metadata is set before calling this, /// typically via `copy_root_metadata_from_usr()` or `set_root_stat()`. - pub fn compute_image_id(&self) -> ObjectID { - compute_verity(&mkfs_erofs(self)) + pub fn compute_image_id(&self, version: FormatVersion) -> ObjectID { + // Callers are responsible for ensuring the tree is valid before calling this. + // In practice this is always called on freshly-built trees that don't have + // invalid constructs like hardlinked whiteouts. + compute_verity(&mkfs_erofs_inner( + self, + version, + #[cfg(test)] + None, + )) } /// Prints this filesystem in dumpfile format to stdout. diff --git a/crates/composefs/src/fs.rs b/crates/composefs/src/fs.rs index 76ac2f3a..029f3c5a 100644 --- a/crates/composefs/src/fs.rs +++ b/crates/composefs/src/fs.rs @@ -39,9 +39,201 @@ use crate::{ repository::Repository, shared_internals::IO_BUF_CAPACITY, tree::{Directory, FileSystem, Inode, Leaf, LeafContent, RegularFile, Stat}, - util::proc_self_fd, + util::{create_tmpfile_in, proc_self_fd, reopen_tmpfile_ro}, }; +// --------------------------------------------------------------------------- +// ObjectStore trait +// --------------------------------------------------------------------------- + +/// An abstraction over content-addressed storage for file objects. +/// +/// Both [`Repository`] and the C-compatible [`FlatDigestStore`] implement +/// this trait so that [`read_filesystem`] can write file content to either +/// layout without duplicating the scanning logic. +pub trait ObjectStore: Send + Sync { + /// Store `fd` as an object, returning its verity digest. + /// + /// If an object with the same digest already exists, this is a no-op + /// and the existing digest is returned. + fn ensure_object_from_fd(&self, fd: OwnedFd, size: u64) -> Result; + + /// Return a semaphore that gates concurrent object writes. + fn write_semaphore(&self) -> Arc; +} + +impl ObjectStore for Repository { + fn ensure_object_from_fd(&self, fd: OwnedFd, size: u64) -> Result { + self.ensure_object_from_fd(fd, size) + } + + fn write_semaphore(&self) -> Arc { + self.write_semaphore() + } +} + +/// C-compatible flat digest store (`/XX/DIGEST`). +/// +/// This mirrors the layout written by `mkcomposefs --digest-store` from the C +/// implementation, where file objects live at `//` +/// (e.g. `/ab/abcdef01234...`). This is distinct from the composefs-rs +/// [`Repository`] layout which nests objects under an `objects/` subdirectory. +/// +/// The flat layout makes the digest store interchangeable with the C tooling. +#[derive(Debug)] +pub struct FlatDigestStore { + /// Open directory fd for the store root. + root: Arc, + semaphore: Arc, + /// If true, fall back to userspace hashing when kernel fs-verity is + /// unavailable (e.g. tmpfs, overlayfs). Matches `Repository::insecure`. + insecure: bool, +} + +impl FlatDigestStore { + /// Open or create a flat digest store at `path`. + /// + /// `concurrency` controls how many concurrent object writes are permitted. + /// `insecure` enables userspace-hashing fallback when fs-verity is unavailable + /// (e.g. on tmpfs or overlayfs). Set to `true` for CLI use where the filesystem + /// may not support verity; set to `false` for strict security requirements. + pub fn open(path: &Path, concurrency: usize, insecure: bool) -> Result { + use rustix::fs::{Mode, mkdirat}; + + match mkdirat(CWD, path, Mode::from_raw_mode(0o755)) { + Ok(()) | Err(rustix::io::Errno::EXIST) => {} + Err(e) => { + return Err(e) + .with_context(|| format!("Failed to create flat digest store: {path:?}")); + } + } + + let root = openat( + CWD, + path, + OFlags::RDONLY | OFlags::DIRECTORY | OFlags::CLOEXEC, + Mode::empty(), + ) + .with_context(|| format!("Failed to open flat digest store: {path:?}"))?; + + Ok(Self { + root: Arc::new(root), + semaphore: Arc::new(Semaphore::new(concurrency)), + insecure, + }) + } +} + +impl ObjectStore for FlatDigestStore { + fn ensure_object_from_fd(&self, fd: OwnedFd, size: u64) -> Result { + use crate::fsverity::{EnableVerityError, enable_verity_maybe_copy, measure_verity}; + use std::io::BufRead as _; + + // 1. Create an anonymous O_TMPFILE in the store root. + // No name collision possible; invisible until linked. + let tmpfile_fd = create_tmpfile_in(self.root.as_fd()) + .context("Creating O_TMPFILE in flat digest store")?; + + // 2. Stream from source fd into tmpfile (no in-memory buffering). + let mut src = std::io::BufReader::with_capacity(IO_BUF_CAPACITY, File::from(fd)); + let mut dst = File::from(tmpfile_fd.try_clone().context("Cloning tmpfile fd")?); + let copied = std::io::copy(&mut src, &mut dst).context("Copying object data to tmpfile")?; + ensure!( + copied == size, + "object size mismatch: expected {size}, copied {copied}" + ); + drop(dst); + + // 3. Reopen as read-only (kernel requires no writable fds to enable verity). + let ro_fd = + reopen_tmpfile_ro(File::from(tmpfile_fd)).context("Reopening tmpfile as read-only")?; + + // 4. Enable kernel fs-verity (kernel reads and hashes the file for us). + let (ro_fd, verity_enabled) = + match enable_verity_maybe_copy::(self.root.as_fd(), ro_fd.as_fd()) { + Ok(None) => (ro_fd, true), + Ok(Some(new_fd)) => (new_fd, true), + Err(EnableVerityError::AlreadyEnabled) => (ro_fd, true), + Err(EnableVerityError::FilesystemNotSupported) if self.insecure => (ro_fd, false), + Err(e) => { + return Err(anyhow::anyhow!(e)).context("Enabling verity on object tmpfile"); + } + }; + + // 5. Get the digest — from the kernel (fast) or userspace fallback. + let id: ObjectID = if verity_enabled { + measure_verity(&ro_fd).context("Measuring verity digest after enable")? + } else { + // Insecure fallback: re-read the tmpfile to compute the digest. + let mut reader = std::io::BufReader::with_capacity( + IO_BUF_CAPACITY, + File::from(ro_fd.try_clone().context("Cloning ro_fd for digest")?), + ); + let mut hasher = FsVerityHasher::::new(); + loop { + let buf = reader.fill_buf().context("Reading tmpfile for digest")?; + if buf.is_empty() { + break; + } + let chunk = &buf[..buf.len().min(FsVerityHasher::::BLOCK_SIZE)]; + hasher.add_block(chunk); + let n = chunk.len(); + reader.consume(n); + } + hasher.digest() + }; + + // 6. Derive flat path: XX/rest-of-hex (C-compatible layout). + let obj_path = id.to_object_pathname(); + let slash = obj_path + .find('/') + .expect("to_object_pathname always has '/'"); + let dir_name = &obj_path[..slash]; + let file_name = &obj_path[slash + 1..]; + + // 7. Create XX/ subdirectory if needed. + match mkdirat(self.root.as_fd(), dir_name, Mode::from_raw_mode(0o755)) { + Ok(()) | Err(rustix::io::Errno::EXIST) => {} + Err(e) => { + return Err(e) + .with_context(|| format!("Creating digest store subdirectory {dir_name:?}")); + } + } + + // 8. Open the XX/ subdirectory for use as linkat target. + let subdir = openat( + self.root.as_fd(), + dir_name, + OFlags::RDONLY | OFlags::DIRECTORY | OFlags::CLOEXEC, + Mode::empty(), + ) + .with_context(|| format!("Opening digest store subdirectory {dir_name:?}"))?; + + // 9. Atomically link the tmpfile into its final content-addressed path. + // EEXIST means another writer already stored the same object — fine. + match linkat( + CWD, + proc_self_fd(&ro_fd), + &subdir, + file_name, + AtFlags::SYMLINK_FOLLOW, + ) { + Ok(()) | Err(rustix::io::Errno::EXIST) => {} + Err(e) => { + return Err(e).with_context(|| { + format!("Linking object into flat digest store: {obj_path:?}") + }); + } + } + + Ok(id) + } + + fn write_semaphore(&self) -> Arc { + self.semaphore.clone() + } +} + /// Attempt to use O_TMPFILE + rename to atomically set file contents. /// Will fall back to a non-atomic write if the target doesn't support O_TMPFILE. #[context("Setting file contents for {}", name.to_string_lossy())] @@ -216,6 +408,7 @@ fn stat_fd(fd: &OwnedFd, ifmt: FileType) -> Result<(rustix::fs::Stat, generic_tr st_uid: buf.st_uid, st_gid: buf.st_gid, st_mtim_sec: buf.st_mtime as i64, + st_mtim_nsec: buf.st_mtime_nsec as u32, xattrs: read_xattrs(fd)?, }, )) @@ -533,18 +726,63 @@ pub fn read_file( /// /// If `repo` is `Some`, file objects are stored in the repository. /// If `None`, fsverity digests are computed without writing to disk. +/// +/// An optional `semaphore` can be provided to override the default concurrency +/// control. When `None`, the semaphore is derived from the repository (if any) +/// or from [`available_parallelism`]. pub async fn read_filesystem( dirfd: OwnedFd, path: PathBuf, repo: Option>>, ) -> Result> { - let semaphore = repo - .as_ref() - .map(|r| r.write_semaphore()) - .unwrap_or_else(|| { - let n = available_parallelism().map(|n| n.get()).unwrap_or(4); - Arc::new(Semaphore::new(n)) - }); + let store: Option>> = + repo.map(|r| r as Arc>); + read_filesystem_impl(dirfd, path, store, None).await +} + +/// Like [`read_filesystem`] but accepts any [`ObjectStore`] implementation. +/// +/// This is the preferred entry point when using a custom store (e.g. +/// [`FlatDigestStore`] for C-compatible `--digest-store` behaviour). +pub async fn read_filesystem_with_store( + dirfd: OwnedFd, + path: PathBuf, + store: Option>>, +) -> Result> { + read_filesystem_impl(dirfd, path, store, None).await +} + +/// Like [`read_filesystem`] but with an explicit concurrency limit. +/// +/// The `semaphore`, if provided, overrides the default parallelism derived from +/// the repository or [`available_parallelism`]. This is the recommended way to +/// honour a user-supplied `--threads` argument when no repository is present. +pub async fn read_filesystem_with_semaphore( + dirfd: OwnedFd, + path: PathBuf, + repo: Option>>, + semaphore: Arc, +) -> Result> { + let store: Option>> = + repo.map(|r| r as Arc>); + read_filesystem_impl(dirfd, path, store, Some(semaphore)).await +} + +async fn read_filesystem_impl( + dirfd: OwnedFd, + path: PathBuf, + store: Option>>, + semaphore_override: Option>, +) -> Result> { + let semaphore = semaphore_override.unwrap_or_else(|| { + store + .as_ref() + .map(|s| s.write_semaphore()) + .unwrap_or_else(|| { + let n = available_parallelism().map(|n| n.get()).unwrap_or(4); + Arc::new(Semaphore::new(n)) + }) + }); // Channel for streaming work items from the scan thread to the // async runtime. The scan sends (key, fd, size) as files are @@ -598,11 +836,11 @@ pub async fn read_filesystem( item = items.next(), if items_open => { match item { Some(((key, fd, size), permit)) => { - let repo = repo.clone(); + let store = store.clone(); tasks.spawn_blocking(move || { let _permit = permit; - let id = if let Some(repo) = repo { - repo.ensure_object_from_fd(fd, size)? + let id = if let Some(store) = store { + store.ensure_object_from_fd(fd, size)? } else { compute_verity_from_fd::(fd)? }; @@ -689,6 +927,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: Default::default(), + st_mtim_nsec: Default::default(), xattrs: Default::default(), }; set_file_contents(&td, OsStr::new("testfile"), &st, b"new contents").unwrap(); @@ -696,4 +935,60 @@ mod tests { assert_eq!(std::fs::read(testpath)?, b"new contents"); Ok(()) } + + /// Verify that `FlatDigestStore` stores objects in the C-compatible `XX/DIGEST` layout. + #[test] + fn test_flat_digest_store_layout() -> Result<()> { + use crate::fsverity::Sha256HashValue; + + let td = tempfile::tempdir()?; + let store_path = td.path().join("store"); + let store = FlatDigestStore::open(&store_path, 1, true)?; + + // Store a small piece of content. + let content = b"hello, flat digest store!"; + let src_dir = tempfile::tempdir()?; + let src_path = src_dir.path().join("file"); + std::fs::write(&src_path, content)?; + let src_fd = openat( + CWD, + &src_path, + OFlags::RDONLY | OFlags::CLOEXEC, + Mode::from_raw_mode(0), + )?; + + let id = >::ensure_object_from_fd( + &store, + src_fd, + content.len() as u64, + )?; + + // Verify the layout: store/XX/rest-of-digest + let expected_path = id.to_object_pathname(); // e.g. "ab/cdef0123..." + let full_path = store_path.join(&expected_path); + assert!( + full_path.exists(), + "Expected object at flat path {full_path:?}" + ); + + // Verify content is intact. + let stored = std::fs::read(&full_path)?; + assert_eq!(stored, content); + + // Idempotent: storing the same object again should succeed. + let src_fd2 = openat( + CWD, + &src_path, + OFlags::RDONLY | OFlags::CLOEXEC, + Mode::from_raw_mode(0), + )?; + let id2 = >::ensure_object_from_fd( + &store, + src_fd2, + content.len() as u64, + )?; + assert_eq!(id, id2); + + Ok(()) + } } diff --git a/crates/composefs/src/generic_tree.rs b/crates/composefs/src/generic_tree.rs index 1f296d50..1e06f5fe 100644 --- a/crates/composefs/src/generic_tree.rs +++ b/crates/composefs/src/generic_tree.rs @@ -21,6 +21,8 @@ pub struct Stat { pub st_gid: u32, /// Modification time in seconds since Unix epoch. pub st_mtim_sec: i64, + /// Modification time nanosecond component (0..999_999_999). + pub st_mtim_nsec: u32, /// Extended attributes as key-value pairs. pub xattrs: BTreeMap, Box<[u8]>>, } @@ -46,6 +48,7 @@ impl Stat { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), } } @@ -100,7 +103,7 @@ pub struct Leaf { } /// A directory node containing named entries. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Directory { /// Metadata for this directory. pub stat: Stat, @@ -109,7 +112,7 @@ pub struct Directory { } /// A filesystem inode representing either a directory or a leaf node. -#[derive(Debug)] +#[derive(Debug, Clone)] pub enum Inode { /// A directory inode. Directory(Box>), @@ -496,17 +499,30 @@ impl Directory { self.entries.clear(); } + /// Retains only top-level entries whose names satisfy the predicate. + /// This is used for filtering dump output to specific entries. + pub fn retain_top_level(&mut self, mut f: impl FnMut(&str) -> bool) { + self.entries.retain(|name, _| { + // Convert OsStr to str for comparison; non-UTF8 names never match + name.to_str().is_some_and(&mut f) + }); + } + /// Recursively finds the newest modification time in this directory tree. /// /// Returns the maximum modification time among this directory's metadata - /// and all files and subdirectories it contains. + /// and all files and subdirectories it contains, as a `(sec, nsec)` tuple + /// for full nanosecond precision. /// /// The `leaves` table is needed to resolve leaf mtimes. - pub fn newest_file(&self, leaves: &[Leaf]) -> i64 { - let mut newest = self.stat.st_mtim_sec; + pub fn newest_file(&self, leaves: &[Leaf]) -> (i64, u32) { + let mut newest = (self.stat.st_mtim_sec, self.stat.st_mtim_nsec); for inode in self.entries.values() { let mtime = match inode { - Inode::Leaf(id, _) => leaves[id.0].stat.st_mtim_sec, + Inode::Leaf(id, _) => { + let s = &leaves[id.0].stat; + (s.st_mtim_sec, s.st_mtim_nsec) + } Inode::Directory(dir) => dir.newest_file(leaves), }; if mtime > newest { @@ -583,6 +599,71 @@ pub struct FileSystem { } impl FileSystem { + /// Add 256 overlay whiteout stub entries to the root directory. + /// + /// This is required for Format 1.0 compatibility with the C mkcomposefs. + /// Each whiteout is a character device named "00" through "ff" with rdev=0. + /// They inherit uid/gid/mtime from the root directory but have empty xattrs. + /// + /// These entries allow overlay filesystems to efficiently represent + /// deleted files using device stubs that match the naming convention. + /// + /// Adds the 256 two-character hex-named whiteout stub entries (`00`..`ff`) to + /// the root directory, skipping any that already exist. + /// + /// Matches C mkcomposefs v1.0.8 `add_overlay_whiteouts()`: each stub inherits + /// `uid`, `gid`, and `mtime` from root, gets mode `S_IFCHR|0644` with `rdev=0`, + /// and **only** the `security.selinux` xattr from root (if present). No other + /// xattrs are propagated — copying all root xattrs would make them appear on 257 + /// inodes instead of 1, causing the xattr-sharing pass to turn them into shared + /// references and bloating the inode body in a way C does not. + pub fn add_overlay_whiteouts(&mut self) { + use std::ffi::OsString; + + // C mkcomposefs only inherits security.selinux from root for the stubs. + // Copying all root xattrs would change shared-vs-local xattr storage and + // produce a binary-incompatible image. + let selinux_key = std::ffi::OsStr::new("security.selinux"); + let mut whiteout_xattrs = std::collections::BTreeMap::new(); + if let Some(val) = self.root.stat.xattrs.get(selinux_key) { + whiteout_xattrs.insert(Box::from(selinux_key), val.clone()); + } + + let whiteout_stat = Stat { + st_mode: 0o644, + st_uid: self.root.stat.st_uid, + st_gid: self.root.stat.st_gid, + st_mtim_sec: self.root.stat.st_mtim_sec, + st_mtim_nsec: self.root.stat.st_mtim_nsec, + xattrs: whiteout_xattrs, + }; + + for i in 0..=255u8 { + let name = OsString::from(format!("{:02x}", i)); + + // Skip if entry already exists + if self.root.entries.contains_key(name.as_os_str()) { + continue; + } + + let leaf_id = self.push_leaf(whiteout_stat.clone(), LeafContent::CharacterDevice(0)); + self.root + .entries + .insert(name.into_boxed_os_str(), Inode::leaf(leaf_id)); + } + } + + /// Add trusted.overlay.opaque="y" xattr to root directory. + /// + /// This is required for Format 1.0 when whiteout entries are present, + /// marking the directory as opaque for the overlay filesystem. + pub fn set_overlay_opaque(&mut self) { + self.root.stat.xattrs.insert( + Box::from(std::ffi::OsStr::new("trusted.overlay.opaque")), + Box::from(*b"y"), + ); + } + /// Creates a new filesystem with a root directory having the given metadata. pub fn new(root_stat: Stat) -> Self { Self { @@ -631,6 +712,7 @@ impl FileSystem { let st_uid = usr.stat.st_uid; let st_gid = usr.stat.st_gid; let st_mtim_sec = usr.stat.st_mtim_sec; + let st_mtim_nsec = usr.stat.st_mtim_nsec; let xattrs = usr.stat.xattrs.clone(); // Apply copied metadata to root @@ -638,6 +720,7 @@ impl FileSystem { self.root.stat.st_uid = st_uid; self.root.stat.st_gid = st_gid; self.root.stat.st_mtim_sec = st_mtim_sec; + self.root.stat.st_mtim_nsec = st_mtim_nsec; self.root.stat.xattrs = xattrs; Ok(()) @@ -722,9 +805,10 @@ impl FileSystem { /// Returns an error if `/usr` does not exist (needed to get the mtime). pub fn canonicalize_run(&mut self) -> Result<(), ImageError> { if self.root.get_directory_opt(OsStr::new("run"))?.is_some() { - let usr_mtime = self.root.get_directory(OsStr::new("usr"))?.stat.st_mtim_sec; + let usr = self.root.get_directory(OsStr::new("usr"))?.stat.clone(); let run_dir = self.root.get_directory_mut(OsStr::new("run"))?; - run_dir.stat.st_mtim_sec = usr_mtime; + run_dir.stat.st_mtim_sec = usr.st_mtim_sec; + run_dir.stat.st_mtim_nsec = usr.st_mtim_nsec; run_dir.clear(); } Ok(()) @@ -979,7 +1063,9 @@ impl<'a, T> DirectoryRef<'a, T> { } /// Recursively finds the newest modification time in this directory tree. - pub fn newest_file(&self) -> i64 { + /// + /// Returns a `(sec, nsec)` tuple for full nanosecond precision. + pub fn newest_file(&self) -> (i64, u32) { self.dir.newest_file(self.leaves) } } @@ -1001,6 +1087,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), } } @@ -1012,6 +1099,7 @@ mod tests { st_uid: 1000, st_gid: 1000, st_mtim_sec: mtime, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), } } @@ -1241,27 +1329,27 @@ mod tests { let mut leaves = Vec::new(); let mut root = Directory::new(stat_with_mtime(5)); - assert_eq!(root.newest_file(&leaves), 5); + assert_eq!(root.newest_file(&leaves), (5, 0)); let leaf_id_10 = push_leaf_file(&mut leaves, 10); root.insert(OsStr::new("file1"), Inode::leaf(leaf_id_10)); - assert_eq!(root.newest_file(&leaves), 10); + assert_eq!(root.newest_file(&leaves), (10, 0)); let subdir_stat = stat_with_mtime(15); let mut subdir = Box::new(Directory::new(subdir_stat)); let leaf_id_12 = push_leaf_file(&mut leaves, 12); subdir.insert(OsStr::new("subfile1"), Inode::leaf(leaf_id_12)); root.insert(OsStr::new("subdir"), Inode::Directory(subdir)); - assert_eq!(root.newest_file(&leaves), 15); + assert_eq!(root.newest_file(&leaves), (15, 0)); if let Some(Inode::Directory(sd)) = root.entries.get_mut(OsStr::new("subdir")) { let leaf_id_20 = push_leaf_file(&mut leaves, 20); sd.insert(OsStr::new("subfile2"), Inode::leaf(leaf_id_20)); } - assert_eq!(root.newest_file(&leaves), 20); + assert_eq!(root.newest_file(&leaves), (20, 0)); root.stat.st_mtim_sec = 25; - assert_eq!(root.newest_file(&leaves), 25); + assert_eq!(root.newest_file(&leaves), (25, 0)); } #[test] @@ -1313,6 +1401,7 @@ mod tests { st_uid: 42, st_gid: 43, st_mtim_sec: 1234567890, + st_mtim_nsec: 0, xattrs: BTreeMap::from([( Box::from(OsStr::new("security.selinux")), Box::from(b"system_u:object_r:usr_t:s0".as_slice()), @@ -1358,6 +1447,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::from([ ( Box::from(OsStr::new("security.selinux")), @@ -1610,6 +1700,7 @@ mod tests { st_uid: 100, st_gid: 200, st_mtim_sec: 54321, + st_mtim_nsec: 0, xattrs: BTreeMap::from([( Box::from(OsStr::new("user.test")), Box::from(b"val".as_slice()), @@ -1782,4 +1873,96 @@ mod tests { assert_eq!(fs.root.stat.st_mtim_sec, 200); assert_eq!(fs.leaves[0].stat.st_mtim_sec, 400); } + + #[test] + fn test_add_overlay_whiteouts() { + let root_stat = Stat { + st_mode: 0o755, + st_uid: 1000, + st_gid: 2000, + st_mtim_sec: 12345, + st_mtim_nsec: 0, + xattrs: BTreeMap::from([( + Box::from(OsStr::new("security.selinux")), + Box::from(b"system_u:object_r:root_t:s0".as_slice()), + )]), + }; + let mut fs = FileSystem::::new(root_stat); + + // Add a pre-existing entry that should not be overwritten + let pre_id = fs.push_leaf( + stat_with_mtime(99999), + LeafContent::Regular(FileContents {}), + ); + fs.root.insert(OsStr::new("00"), Inode::leaf(pre_id)); + + fs.add_overlay_whiteouts(); + + // Should have 256 whiteout entries (255 new + 1 pre-existing) + assert_eq!(fs.root.entries.len(), 256); + + // The pre-existing "00" should still have its original mtime + if let Some(Inode::Leaf(id, _)) = fs.root.entries.get(OsStr::new("00")) { + assert_eq!(fs.leaf(*id).stat.st_mtim_sec, 99999); + } else { + panic!("Expected '00' to remain a leaf"); + } + + // Check a newly created whiteout entry + if let Some(Inode::Leaf(id, _)) = fs.root.entries.get(OsStr::new("ff")) { + let leaf = fs.leaf(*id); + // Should be a character device with rdev=0 + assert!(matches!(leaf.content, LeafContent::CharacterDevice(0))); + // Should have mode 0o644 + assert_eq!(leaf.stat.st_mode, 0o644); + // Should inherit uid/gid/mtime from root + assert_eq!(leaf.stat.st_uid, 1000); + assert_eq!(leaf.stat.st_gid, 2000); + assert_eq!(leaf.stat.st_mtim_sec, 12345); + // Should inherit xattrs from root (e.g. SELinux label) — matching + // C mkcomposefs behaviour where whiteout entries copy root metadata. + assert_eq!( + leaf.stat + .xattrs + .get(OsStr::new("security.selinux")) + .map(|v| v.as_ref()), + Some(b"system_u:object_r:root_t:s0".as_slice()) + ); + } else { + panic!("Expected 'ff' to be a leaf"); + } + + // Check some middle entries exist + assert!(fs.root.entries.contains_key(OsStr::new("7f"))); + assert!(fs.root.entries.contains_key(OsStr::new("a0"))); + } + + #[test] + fn test_set_overlay_opaque() { + let mut fs = FileSystem::::new(default_stat()); + + fs.set_overlay_opaque(); + + let opaque = fs + .root + .stat + .xattrs + .get(OsStr::new("trusted.overlay.opaque")); + assert!(opaque.is_some()); + assert_eq!(opaque.unwrap().as_ref(), b"y"); + } + + #[test] + fn test_add_overlay_whiteouts_empty_fs() { + let mut fs = FileSystem::::new(default_stat()); + + fs.add_overlay_whiteouts(); + + // Should have exactly 256 entries + assert_eq!(fs.root.entries.len(), 256); + + // Check first and last entries + assert!(fs.root.entries.contains_key(OsStr::new("00"))); + assert!(fs.root.entries.contains_key(OsStr::new("ff"))); + } } diff --git a/crates/composefs/src/lib.rs b/crates/composefs/src/lib.rs index 38d55e1f..ebd74307 100644 --- a/crates/composefs/src/lib.rs +++ b/crates/composefs/src/lib.rs @@ -14,6 +14,7 @@ pub mod fs; pub mod fsverity; pub mod mount; pub mod mountcompat; +pub mod progress; pub mod repository; pub mod splitstream; pub mod tree; diff --git a/crates/composefs/src/progress.rs b/crates/composefs/src/progress.rs new file mode 100644 index 00000000..4787105b --- /dev/null +++ b/crates/composefs/src/progress.rs @@ -0,0 +1,565 @@ +//! Progress reporting API for pull and download operations. +//! +//! Library crates emit [`ProgressEvent`]s through a [`ProgressReporter`] trait +//! object. The default implementation, [`NullReporter`], discards all events +//! at zero cost. Callers such as `cfsctl` supply their own implementation +//! (e.g. an `indicatif`-backed renderer) via [`PullOptions::progress`]. + +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use tokio::io::{AsyncRead, ReadBuf}; + +/// Identity of a component being tracked, typically an OCI layer diff_id or +/// an HTTP object path. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct ComponentId(String); + +impl ComponentId { + /// Return the underlying string slice. + pub fn as_str(&self) -> &str { + &self.0 + } +} + +impl> From for ComponentId { + fn from(s: S) -> Self { + ComponentId(s.into()) + } +} + +impl std::fmt::Display for ComponentId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(&self.0) + } +} + +/// The unit of measurement for a progress component. +/// +/// Progress events may track either raw bytes (for layer downloads) or an +/// abstract item count (for object fetches where individual sizes are unknown). +/// Renderers should adapt their display accordingly. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub enum ProgressUnit { + /// The `fetched`/`total` fields count bytes. + Bytes, + /// The `fetched`/`total` fields count discrete items (e.g. objects). + Items, +} + +/// Events emitted during a pull or download operation. +#[derive(Debug, Clone, PartialEq)] +#[non_exhaustive] +pub enum ProgressEvent { + /// A new component (layer/object) has started being fetched. + Started { + /// Identifier for this component. + id: ComponentId, + /// Total amount to transfer (bytes or items depending on `unit`), if known. + total: Option, + /// Unit of measurement for `total` and subsequent `Progress` events. + unit: ProgressUnit, + }, + /// Progress update for a component. + Progress { + /// Identifier for this component. + id: ComponentId, + /// Amount transferred so far (bytes or items depending on the `Started` unit). + fetched: u64, + /// Total amount (bytes or items), if known. + total: Option, + }, + /// A component was skipped because it was already present. + /// + /// This event may be emitted without a preceding [`ProgressEvent::Started`] + /// when the component is determined to be cached before any download begins. + /// Renderers must handle this case gracefully. + Skipped { + /// Identifier for the skipped component. + id: ComponentId, + }, + /// A component completed successfully. + Done { + /// Identifier for this component. + id: ComponentId, + /// Amount actually transferred (bytes or items per the `Started` unit). + transferred: u64, + }, + /// A human-readable status message (replaces progress-bar text lines). + Message(String), +} + +/// Receives progress events from a pull or download operation. +/// +/// Implementations must be `Send + Sync` so they can be shared across async +/// tasks. All methods take `&self` so that the reporter can be held behind an +/// `Arc` without requiring interior mutability beyond what the implementation +/// itself manages (typically a `Mutex`). +pub trait ProgressReporter: Send + Sync { + /// Handle a single progress event. + fn report(&self, event: ProgressEvent); +} + +/// A no-op reporter that discards all events. +/// +/// This is the default when no reporter is provided. Because it has no +/// branches or allocations it compiles away entirely in release builds. +#[derive(Debug, Default)] +pub struct NullReporter; + +impl ProgressReporter for NullReporter { + #[inline] + fn report(&self, _event: ProgressEvent) {} +} + +/// Convenience type alias for a shared, type-erased progress reporter. +pub type SharedReporter = Arc; + +/// An [`AsyncRead`] wrapper that tracks bytes read via a `watch` channel. +/// +/// The reader itself is intentionally minimal: it only increments a counter and +/// publishes it through a non-blocking [`tokio::sync::watch`] channel on each +/// successful read. This keeps the hot I/O path free from any reporter logic. +/// +/// Backpressure is handled by the watch channel itself: if the progress +/// renderer is slow, intermediate byte counts are coalesced — the sender +/// never blocks waiting for the receiver to catch up. +/// +/// Use [`ProgressRead::new`] to construct the reader and its companion driver +/// future. The driver must run concurrently with the read (e.g. via +/// `tokio::join!`) to actually emit [`ProgressEvent::Progress`] events. +/// +/// Place this wrapper *before* any decompressor so that the `fetched` counter +/// reflects compressed bytes-over-the-wire, matching the `total` from the +/// preceding [`ProgressEvent::Started`] event. +pub struct ProgressRead { + inner: R, + /// Non-blocking sender; updating it on every read is fine. + tx: tokio::sync::watch::Sender, +} + +impl std::fmt::Debug for ProgressRead { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ProgressRead") + .field("inner", &self.inner) + .field("bytes_read", &*self.tx.borrow()) + .finish_non_exhaustive() + } +} + +impl ProgressRead { + /// Wrap `inner` and return `(reader, driver)`. + /// + /// The driver is a future that translates raw byte counts into + /// [`ProgressEvent::Progress`] events via `reporter`. It completes when + /// the reader is dropped (i.e. the channel closes). Run it concurrently: + /// + /// ```ignore + /// let (reader, driver) = ProgressRead::new(blob, reporter, id, total); + /// let decompressor = decompress_async(reader, media_type)?; + /// let (import_result, ()) = tokio::join!(import_tar_async(repo, decompressor), driver); + /// ``` + /// + /// `total` should match the value passed to the preceding `Started` event + /// so the renderer can compute a meaningful percentage. + pub fn new( + inner: R, + reporter: SharedReporter, + id: ComponentId, + total: Option, + ) -> (Self, impl Future) { + let (tx, mut rx) = tokio::sync::watch::channel(0u64); + let driver = async move { + while rx.changed().await.is_ok() { + let fetched = *rx.borrow_and_update(); + reporter.report(ProgressEvent::Progress { + id: id.clone(), + fetched, + total, + }); + } + }; + (Self { inner, tx }, driver) + } +} + +impl AsyncRead for ProgressRead { + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + let before = buf.filled().len(); + let result = Pin::new(&mut self.inner).poll_read(cx, buf); + if let Poll::Ready(Ok(())) = &result { + let n = (buf.filled().len() - before) as u64; + if n > 0 { + // Overflow-safe: update by adding the delta. Errors are + // ignored — if the driver has already dropped its receiver + // (e.g. the pull was cancelled), we simply stop sending. + self.tx.send_modify(|v| *v += n); + } + } + result + } +} + +// Bring `Future` into scope for the `impl Future` return type. +use std::future::Future; + +#[cfg(any(test, feature = "test"))] +pub mod test_support { + //! Test helpers for verifying progress event sequences. + + use std::sync::Mutex; + + use super::{ProgressEvent, ProgressReporter}; + + /// A [`ProgressReporter`] that records all events for later inspection. + /// + /// Useful in unit tests to assert that the correct sequence of events + /// was emitted during a pull or download operation. + pub struct RecordingReporter { + events: Mutex>, + } + + impl std::fmt::Debug for RecordingReporter { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("RecordingReporter") + .field("events", &self.events.lock().unwrap().len()) + .finish() + } + } + + impl Default for RecordingReporter { + fn default() -> Self { + Self { + events: Mutex::new(Vec::new()), + } + } + } + + impl RecordingReporter { + /// Create a new empty recorder. + pub fn new() -> Self { + Self::default() + } + + /// Return a snapshot of all events recorded so far. + pub fn events(&self) -> Vec { + self.events.lock().unwrap().clone() + } + } + + impl ProgressReporter for RecordingReporter { + fn report(&self, event: ProgressEvent) { + self.events.lock().unwrap().push(event); + } + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::sync::Arc; + + use super::test_support::RecordingReporter; + use super::*; + + // ── NullReporter ──────────────────────────────────────────────────────── + + /// Calling `report` on `NullReporter` with every variant must not panic. + #[test] + fn test_null_reporter_does_not_panic() { + let reporter = NullReporter; + reporter.report(ProgressEvent::Started { + id: "layer1".into(), + total: Some(1024), + unit: ProgressUnit::Bytes, + }); + reporter.report(ProgressEvent::Progress { + id: "layer1".into(), + fetched: 512, + total: Some(1024), + }); + reporter.report(ProgressEvent::Skipped { + id: "layer2".into(), + }); + reporter.report(ProgressEvent::Done { + id: "layer1".into(), + transferred: 1024, + }); + reporter.report(ProgressEvent::Message("done".to_string())); + } + + // ── ComponentId ───────────────────────────────────────────────────────── + + /// `ComponentId` can be constructed from `&str` and `String`, and its + /// `Display` impl round-trips the inner value. + #[test] + fn test_component_id_conversions() { + let cases = [ + "sha256:abc123", + "objects:my-stream", + "", + "docker://quay.io/foo:latest", + ]; + for input in cases { + let from_str: ComponentId = input.into(); + let from_string: ComponentId = input.to_string().into(); + assert_eq!( + from_str.as_str(), + input, + "ComponentId::from(&str) should store value" + ); + assert_eq!( + from_string.as_str(), + input, + "ComponentId::from(String) should store value" + ); + assert_eq!(from_str.to_string(), input, "Display should round-trip"); + assert_eq!(from_str, from_string, "both constructors should be equal"); + } + } + + /// `ComponentId` implements `Hash` + `Eq` correctly, so it works as a + /// `HashMap` key — which `IndicatifReporter` relies on. + #[test] + fn test_component_id_hash_map_key() { + let mut map: HashMap = HashMap::new(); + let id: ComponentId = "layer1".into(); + map.insert(id.clone(), 42); + + assert_eq!( + map.get(&ComponentId::from("layer1")), + Some(&42), + "lookup by equal ComponentId should succeed" + ); + assert_eq!( + map.get(&ComponentId::from("layer2")), + None, + "lookup by different ComponentId should return None" + ); + + // Ensure remove also works (used in IndicatifReporter on Done/Skipped) + let removed = map.remove(&id); + assert_eq!(removed, Some(42)); + assert!(map.is_empty()); + } + + // ── ProgressEvent ──────────────────────────────────────────────────────── + + /// Every `ProgressEvent` variant must implement `Debug` without panicking. + #[test] + fn test_progress_event_debug_all_variants() { + let events = [ + ProgressEvent::Started { + id: "x".into(), + total: Some(100), + unit: ProgressUnit::Bytes, + }, + ProgressEvent::Started { + id: "y".into(), + total: None, + unit: ProgressUnit::Items, + }, + ProgressEvent::Progress { + id: "x".into(), + fetched: 50, + total: Some(100), + }, + ProgressEvent::Skipped { id: "z".into() }, + ProgressEvent::Done { + id: "x".into(), + transferred: 100, + }, + ProgressEvent::Message("status update".into()), + ]; + for event in &events { + let debug = format!("{event:?}"); + assert!(!debug.is_empty(), "Debug output must not be empty"); + } + } + + /// `ProgressEvent` must be `Clone` and the clone must have the same + /// `Debug` representation as the original. + #[test] + fn test_progress_event_clone() { + let event = ProgressEvent::Started { + id: "layer".into(), + total: Some(1000), + unit: ProgressUnit::Bytes, + }; + let cloned = event.clone(); + assert_eq!( + format!("{event:?}"), + format!("{cloned:?}"), + "Clone should produce an identical value" + ); + } + + // ── RecordingReporter ──────────────────────────────────────────────────── + + /// `RecordingReporter` captures events in order and returns them via + /// `events()`. + #[test] + fn test_recording_reporter_captures_events_in_order() { + let reporter = RecordingReporter::new(); + reporter.report(ProgressEvent::Message("hello".into())); + reporter.report(ProgressEvent::Started { + id: "c1".into(), + total: Some(100), + unit: ProgressUnit::Bytes, + }); + reporter.report(ProgressEvent::Done { + id: "c1".into(), + transferred: 100, + }); + + let events = reporter.events(); + assert_eq!(events.len(), 3, "all three events should be recorded"); + assert!( + matches!(&events[0], ProgressEvent::Message(m) if m == "hello"), + "first event should be Message" + ); + assert!( + matches!(&events[1], ProgressEvent::Started { id, .. } if id.as_str() == "c1"), + "second event should be Started for c1" + ); + assert!( + matches!(&events[2], ProgressEvent::Done { id, .. } if id.as_str() == "c1"), + "third event should be Done for c1" + ); + } + + /// `SharedReporter = Arc` must be safely usable + /// from multiple threads simultaneously. + #[test] + fn test_shared_reporter_is_send_sync() { + let inner = Arc::new(RecordingReporter::new()); + let handles: Vec<_> = (0..4u32) + .map(|i| { + let r = Arc::clone(&inner); + std::thread::spawn(move || { + r.report(ProgressEvent::Message(format!("thread {i}"))); + }) + }) + .collect(); + for handle in handles { + handle.join().expect("thread should not panic"); + } + assert_eq!( + inner.events().len(), + 4, + "all four threads should have recorded their event" + ); + } + + // ── ProgressUnit ───────────────────────────────────────────────────────── + + /// Both `ProgressUnit` variants must be accessible and `Debug`-able. + #[test] + fn test_progress_unit_variants() { + let bytes = ProgressUnit::Bytes; + let items = ProgressUnit::Items; + assert_ne!(bytes, items); + assert!(!format!("{bytes:?}").is_empty()); + assert!(!format!("{items:?}").is_empty()); + } + + // ── ProgressRead ───────────────────────────────────────────────────────── + + /// Helper: run `ProgressRead` over `data` with a concurrent driver task, + /// and return all recorded `Progress` events. + async fn run_progress_read( + data: Vec, + id: ComponentId, + total: Option, + ) -> Vec { + use tokio::io::AsyncReadExt; + + let reporter = Arc::new(test_support::RecordingReporter::new()); + let cursor = tokio::io::BufReader::new(std::io::Cursor::new(data)); + let (mut reader, driver) = + ProgressRead::new(cursor, Arc::clone(&reporter) as SharedReporter, id, total); + // Spawn the driver so it runs independently. When the reader is + // dropped (after read_to_end), the watch sender closes and the driver + // task completes on its own. + let driver_handle = tokio::spawn(driver); + let mut buf = Vec::new(); + reader.read_to_end(&mut buf).await.unwrap(); + // Drop the reader explicitly so the watch sender closes, which lets + // the driver task observe channel closure and exit. + drop(reader); + driver_handle.await.unwrap(); + reporter.events() + } + + /// `ProgressRead` emits at least one `Progress` event when non-empty data + /// is read. Every byte goes through the watch channel, so any non-empty + /// read must produce at least one event. + #[tokio::test] + async fn test_progress_read_emits_events() { + let id: ComponentId = "test-layer".into(); + let total: u64 = 1024; + let data = vec![0u8; total as usize]; + let events = run_progress_read(data, id.clone(), Some(total)).await; + + let progress_events: Vec<_> = events + .iter() + .filter(|e| matches!(e, ProgressEvent::Progress { .. })) + .collect(); + + assert!( + !progress_events.is_empty(), + "expected at least one Progress event" + ); + // All events must carry the correct id and total + for event in &progress_events { + if let ProgressEvent::Progress { + id: eid, + total: etot, + .. + } = event + { + assert_eq!(eid, &id); + assert_eq!(*etot, Some(total)); + } + } + // The last Progress event must report fetched == total + if let Some(ProgressEvent::Progress { fetched, .. }) = progress_events.last() { + assert_eq!( + *fetched, total, + "last Progress event should have fetched == total" + ); + } + } + + /// `ProgressRead` with a zero-length source emits no `Progress` events + /// since the watch value never changes from its initial state. + #[tokio::test] + async fn test_progress_read_empty_source_no_events() { + let events = run_progress_read(vec![], "empty".into(), Some(0)).await; + assert!( + events.is_empty(), + "no events should be emitted for an empty source" + ); + } + + /// Every byte is sent through the watch channel, so even a single byte + /// should produce exactly one `Progress` event. + #[tokio::test] + async fn test_progress_read_single_byte_one_event() { + let events = run_progress_read(vec![42u8], "single".into(), Some(1)).await; + let progress_count = events + .iter() + .filter(|e| matches!(e, ProgressEvent::Progress { .. })) + .count(); + assert_eq!( + progress_count, 1, + "single byte should produce exactly one Progress event" + ); + } +} diff --git a/crates/composefs/src/repository.rs b/crates/composefs/src/repository.rs index 73047eed..bc65a511 100644 --- a/crates/composefs/src/repository.rs +++ b/crates/composefs/src/repository.rs @@ -107,6 +107,7 @@ use rustix::{ }; use crate::{ + erofs::format::{FormatSet, FormatVersion}, fsverity::{ Algorithm, CompareVerityError, DEFAULT_LG_BLOCKSIZE, EnableVerityError, FsVerityHashValue, FsVerityHasher, MeasureVerityError, compute_verity, enable_verity_maybe_copy, @@ -192,13 +193,29 @@ pub const REPO_FORMAT_VERSION: u32 = 1; /// but prevent any writes (adding objects, creating images, GC, …). /// - Unknown **incompatible** features cause the repository to be /// rejected entirely. -/// -/// There are currently no defined features. pub mod known_features { + /// The ro-compat feature flag set on V1 EROFS repositories. + /// + /// Old tools that don't recognize this flag will open the repository + /// as read-only, preventing accidental V2 image writes. + pub const CFS_EROFS_VERSION: &str = "cfs_erofs_version"; + + /// The ro-compat feature flag indicating that this repository generates only V1 EROFS images. + /// + /// When present, the repository was initialized with [`FormatSet::V1_ONLY`] and all + /// committed images use the V1 (C-tool compatible) format. When absent, the repository + /// generates both V1 and V2 images ([`FormatSet::BOTH`]). + pub const V1_EROFS: &str = "v1_erofs"; + /// Compatible features understood by this version. pub const COMPAT: &[&str] = &[]; /// Read-only compatible features understood by this version. - pub const RO_COMPAT: &[&str] = &[]; + /// + /// `cfs_erofs_version` is set on V1 repositories so that older tools + /// (which don't know about the EROFS version field) open them read-only + /// rather than writing new images in the wrong format. + /// `v1_erofs` signals that this repository uses the V1-only format set. + pub const RO_COMPAT: &[&str] = &[CFS_EROFS_VERSION, V1_EROFS]; /// Incompatible features understood by this version. pub const INCOMPAT: &[&str] = &[]; } @@ -282,6 +299,10 @@ impl FeatureFlags { /// (ext4, XFS, EROFS): a base version integer for fundamental layout /// changes, plus three tiers of feature flags for finer-grained /// evolution. +/// +/// The EROFS format version is not stored as an explicit field; it is +/// derived from the feature flags: the presence of `"cfs_erofs_version"` +/// in `read_only_compatible` means V1, its absence means V2. #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct RepoMetadata { /// Base repository format version. Tools must refuse to operate @@ -297,21 +318,67 @@ pub struct RepoMetadata { } impl RepoMetadata { - /// Build metadata for a repository using the given hash type. - pub fn for_hash() -> Self { - Self { - version: REPO_FORMAT_VERSION, - algorithm: Algorithm::for_hash::(), - features: FeatureFlags::default(), + /// Derive the default EROFS format version from the feature flags. + /// + /// - `"cfs_erofs_version"` present in `read_only_compatible` → [`FormatVersion::V1`] + /// - absent → [`FormatVersion::V2`] + pub fn erofs_version(&self) -> FormatVersion { + if self + .features + .read_only_compatible + .iter() + .any(|f| f == known_features::CFS_EROFS_VERSION) + { + FormatVersion::V1 + } else { + FormatVersion::V2 } } +} - /// Build metadata from an explicit [`Algorithm`]. +impl RepoMetadata { + /// Build metadata for a repository using the given hash type, with the default (V2) EROFS version. + pub fn for_hash() -> Self { + Self::new_with_formats( + Algorithm::for_hash::(), + FormatVersion::default(), + FormatSet::BOTH, + ) + } + + /// Build metadata from an explicit [`Algorithm`], with the default (V2) EROFS format version. pub fn new(algorithm: Algorithm) -> Self { + Self::new_with_formats(algorithm, FormatVersion::default(), FormatSet::BOTH) + } + + /// Build metadata with the correct feature flags for the given EROFS format version + /// and format set. + /// + /// The EROFS format version is encoded entirely in the feature flags: + /// - V1 repositories add `"cfs_erofs_version"` to `ro_compat` so that older + /// tools open them read-only rather than writing images in the wrong format. + /// - [`FormatSet::V1_ONLY`] repositories additionally add `"v1_erofs"` to `ro_compat`. + /// - [`FormatSet::BOTH`] repositories omit `"v1_erofs"`. + pub fn new_with_formats( + algorithm: Algorithm, + erofs_version: FormatVersion, + erofs_formats: FormatSet, + ) -> Self { + let mut features = FeatureFlags::default(); + if erofs_version == FormatVersion::V1 { + features + .read_only_compatible + .push(known_features::CFS_EROFS_VERSION.to_string()); + } + if erofs_formats == FormatSet::V1_ONLY { + features + .read_only_compatible + .push(known_features::V1_EROFS.to_string()); + } Self { version: REPO_FORMAT_VERSION, algorithm, - features: FeatureFlags::default(), + features, } } @@ -351,6 +418,81 @@ impl RepoMetadata { } } +/// Configuration for initializing a new composefs repository. +/// +/// Passed to [`Repository::init_path`] to specify the algorithm, +/// fs-verity policy, and default EROFS format version. +/// +/// fs-verity is **required by default**. Call [`set_insecure`](Self::set_insecure) +/// to opt out (e.g. on tmpfs or in tests). +/// +/// # Examples +/// +/// ```no_run +/// use composefs::repository::RepositoryConfig; +/// use composefs::fsverity::Algorithm; +/// +/// // Default: SHA-256, fs-verity required, EROFS V2. +/// let config = RepositoryConfig::default(); +/// +/// // SHA-512 with fs-verity required. +/// let config = RepositoryConfig::new(Algorithm::SHA512); +/// +/// // Insecure mode (tmpfs, testing). +/// let config = RepositoryConfig::default().set_insecure(); +/// +/// // Custom algorithm, insecure. +/// let config = RepositoryConfig::new(Algorithm::SHA512).set_insecure(); +/// ``` +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct RepositoryConfig { + /// The fs-verity hash algorithm for content-addressed objects. + pub algorithm: Algorithm, + /// Default EROFS format version for images produced by this repository. + /// V1 is compatible with C `mkcomposefs` 1.0.8; V2 is the composefs-rs native format. + pub erofs_version: FormatVersion, + /// The set of EROFS format versions to generate when committing images. + /// + /// Defaults to [`FormatSet::V1_ONLY`] (C-tool compatible). Set to + /// [`FormatSet::BOTH`] when both V1 and V2 images should be produced + /// (e.g. for bootc workflows). + pub erofs_formats: FormatSet, + /// When `true`, fs-verity is NOT enabled on `meta.json` and is not required + /// on stored objects. Use [`set_insecure`](Self::set_insecure) to set this. + insecure: bool, +} + +impl RepositoryConfig { + /// Create a config with the given algorithm and all other settings at their defaults + /// (fs-verity required, `erofs_version = V2`, `erofs_formats = V1_ONLY`). + pub fn new(algorithm: Algorithm) -> Self { + Self { + algorithm, + ..Self::default() + } + } + + /// Disable fs-verity for this repository. + /// + /// Suitable for use on filesystems that do not support fs-verity (tmpfs, + /// overlayfs) or in test environments. Returns `self` for chaining. + pub fn set_insecure(mut self) -> Self { + self.insecure = true; + self + } +} + +impl Default for RepositoryConfig { + fn default() -> Self { + Self { + algorithm: Algorithm::SHA256, + erofs_version: FormatVersion::default(), + erofs_formats: FormatSet::V1_ONLY, + insecure: false, + } + } +} + /// Read the fs-verity algorithm from a repository's `meta.json`. /// /// This is the public API for determining which algorithm a repository @@ -440,6 +582,23 @@ pub fn system_path() -> PathBuf { PathBuf::from("/sysroot/composefs") } +/// Derive the [`FormatSet`] from a [`RepoMetadata`]. +/// +/// - `"v1_erofs"` present in `ro_compat` → [`FormatSet::V1_ONLY`] +/// - `"v1_erofs"` absent → [`FormatSet::BOTH`] +fn repo_formats_from_meta(meta: &RepoMetadata) -> FormatSet { + if meta + .features + .read_only_compatible + .iter() + .any(|f| f == known_features::V1_EROFS) + { + FormatSet::V1_ONLY + } else { + FormatSet::BOTH + } +} + /// Write `meta.json` into a repository directory fd. /// /// This atomically writes (via O_TMPFILE + linkat) the metadata file. @@ -735,8 +894,15 @@ pub struct Repository { repository: OwnedFd, objects: OnceCell, write_semaphore: OnceCell>, + /// Optional override for the number of concurrent object writes. + /// Set via [`set_write_concurrency`](Self::set_write_concurrency) before the semaphore + /// is first used; if `None`, defaults to [`available_parallelism`]. + write_concurrency: Option, insecure: bool, metadata: RepoMetadata, + /// Per-invocation EROFS version override set by [`set_erofs_version`](Self::set_erofs_version). + /// Does not rewrite `meta.json`; only affects this `Repository` instance. + erofs_version_override: Option, /// When true, SplitStreamWriter::done() writes old-format (pre-repr(C)) /// headers. Used to test backward compatibility with splitstreams /// written before #[repr(C)] was added to SplitstreamHeader. @@ -1028,15 +1194,40 @@ impl Repository { .get_or_try_init(|| ensure_dir_and_openat(&self.repository, "objects", OFlags::PATH)) } + /// Override the maximum number of concurrent object writes. + /// + /// Must be called before the first use of [`write_semaphore`](Self::write_semaphore); + /// has no effect if the semaphore has already been initialized. + pub fn set_write_concurrency(&mut self, n: usize) { + // Guard: the semaphore is lazily initialized on first use. If it's + // already been initialized, this call has no effect. Callers must + // set concurrency before any write operations begin. + debug_assert!( + self.write_semaphore.get().is_none(), + "set_write_concurrency called after write_semaphore was already initialized; \ + call this before any write operations" + ); + if self.write_semaphore.get().is_some() { + log::warn!( + "set_write_concurrency called after semaphore was already initialized; ignoring" + ); + return; + } + self.write_concurrency = Some(n); + } + /// Return a shared semaphore for limiting concurrent object writes. /// - /// This semaphore is lazily initialized with `available_parallelism()` permits, + /// This semaphore is lazily initialized with `available_parallelism()` permits + /// (or the value set via [`set_write_concurrency`](Self::set_write_concurrency)), /// and shared across all operations on this repository. Use this to limit /// concurrent I/O when processing multiple files or layers in parallel. pub fn write_semaphore(&self) -> Arc { self.write_semaphore .get_or_init(|| { - let max_concurrent = available_parallelism().map(|n| n.get()).unwrap_or(4); + let max_concurrent = self + .write_concurrency + .unwrap_or_else(|| available_parallelism().map(|n| n.get()).unwrap_or(4)); Arc::new(Semaphore::new(max_concurrent)) }) .clone() @@ -1045,17 +1236,17 @@ impl Repository { /// Initialize a new repository at the target path and open it. /// /// Creates the directory (mode 0700) if it does not exist, writes - /// `meta.json` for the given `algorithm`, and returns the opened + /// `meta.json` using the parameters from `config`, and returns the opened /// repository together with a flag indicating whether this was a /// fresh initialization (`true`) or an idempotent open of an /// existing repository with the same algorithm (`false`). /// - /// The `algorithm` must be compatible with this repository's + /// The `config.algorithm` must be compatible with this repository's /// `ObjectID` type (e.g. `Algorithm::Sha512` for /// `Repository`). /// - /// If `enable_verity` is true, fs-verity is enabled on `meta.json`, - /// signaling that all objects must also have verity. + /// Unless `config` has been made insecure via [`RepositoryConfig::set_insecure`], + /// fs-verity is enabled on `meta.json`, signaling that all objects must also have verity. /// /// If `meta.json` already exists with a different algorithm, an /// error is returned. @@ -1063,10 +1254,16 @@ impl Repository { pub fn init_path( dirfd: impl AsFd, path: impl AsRef, - algorithm: Algorithm, - enable_verity: bool, + config: RepositoryConfig, ) -> Result<(Self, bool)> { let path = path.as_ref(); + let RepositoryConfig { + algorithm, + erofs_version, + erofs_formats, + insecure, + } = config; + let require_fsverity = !insecure; if !algorithm.is_compatible::() { bail!( @@ -1088,11 +1285,12 @@ impl Repository { ) .with_context(|| format!("opening repository directory {}", path.display()))?; - let meta = RepoMetadata::new(algorithm); + let meta = RepoMetadata::new_with_formats(algorithm, erofs_version, erofs_formats); // Try to write meta.json. If it already exists, check for - // idempotency: same algorithm is fine, different is an error. - if let Err(write_err) = write_repo_metadata(&repo_fd, &meta, enable_verity) { + // idempotency: same config is fine; certain upgrades are allowed; + // incompatible changes are errors. + if let Err(write_err) = write_repo_metadata(&repo_fd, &meta, require_fsverity) { match read_repo_metadata(&repo_fd)? { Some(existing) if existing == meta => { // Idempotent: same config, already initialized. @@ -1100,11 +1298,41 @@ impl Repository { return Ok((repo, false)); } Some(existing) => { + let existing_formats = repo_formats_from_meta(&existing); + // Allow upgrading V1_ONLY → BOTH by rewriting meta.json. + if existing.algorithm == meta.algorithm + && existing.erofs_version() == meta.erofs_version() + && existing_formats == FormatSet::V1_ONLY + && erofs_formats == FormatSet::BOTH + { + // Upgrade: remove old meta.json (unlink) so write succeeds. + unlinkat(&repo_fd, REPO_METADATA_FILENAME, AtFlags::empty()) + .context("removing old meta.json for format-set upgrade")?; + write_repo_metadata(&repo_fd, &meta, require_fsverity) + .context("rewriting meta.json for format-set upgrade")?; + drop(repo_fd); + let repo = Self::open_path(dirfd, path)?; + return Ok((repo, false)); + } + // Downgrading BOTH → V1_ONLY is not allowed. + if existing.algorithm == meta.algorithm + && existing.erofs_version() == meta.erofs_version() + && existing_formats == FormatSet::BOTH + && erofs_formats == FormatSet::V1_ONLY + { + bail!( + "repository already initialized with erofs_formats=BOTH; \ + downgrading to V1_ONLY is not permitted" + ); + } bail!( - "repository already initialized with algorithm '{}'; \ - cannot re-initialize with '{}'", + "repository already initialized with different configuration \ + (algorithm: {}, erofs_version: {:?}); \ + cannot re-initialize with (algorithm: {}, erofs_version: {:?})", existing.algorithm, + existing.erofs_version(), meta.algorithm, + meta.erofs_version(), ); } None => { @@ -1152,8 +1380,10 @@ impl Repository { repository, objects: OnceCell::new(), write_semaphore: OnceCell::new(), + write_concurrency: None, insecure: !has_verity, metadata, + erofs_version_override: None, #[cfg(any(test, feature = "test"))] write_old_splitstream_format: std::sync::atomic::AtomicBool::new(false), _data: std::marker::PhantomData, @@ -1197,6 +1427,10 @@ impl Repository { ); } + // Use `new` (no `cfs_erofs_version` or `v1_erofs` flags) for legacy repos + // that pre-date the format-set feature. No feature flags → V2 + BOTH, which + // is correct: old repos may contain images of any version and should not be + // artificially restricted. let meta = RepoMetadata::new(algorithm); write_repo_metadata(&repo_fd, &meta, has_verity)?; @@ -1850,6 +2084,19 @@ impl Repository { self.insecure } + /// Override the EROFS format version for this repository session. + /// + /// Changes the in-memory default used by [`FileSystem::commit_image`] + /// and [`FileSystem::compute_image_id`] for the lifetime of this + /// Override the EROFS format version for this `Repository` instance only. + /// + /// Does **not** rewrite `meta.json`. Intended for CLI tools that accept a + /// per-invocation `--erofs-version` flag to override the repository's stored default. + pub fn set_erofs_version(&mut self, version: FormatVersion) -> &mut Self { + self.erofs_version_override = Some(version); + self + } + /// Mark this repository as insecure, disabling verification of /// fs-verity digests. This allows operation on filesystems /// without verity support. @@ -3245,6 +3492,25 @@ impl Repository { &self.metadata } + /// Returns the effective EROFS format version for this repository. + /// + /// Returns the per-invocation override set by [`set_erofs_version`](Self::set_erofs_version) + /// if one is active, otherwise derives the version from the `meta.json` feature flags + /// (presence of `"cfs_erofs_version"` in `read_only_compatible` → V1, absent → V2). + pub fn erofs_version(&self) -> FormatVersion { + self.erofs_version_override + .unwrap_or_else(|| self.metadata.erofs_version()) + } + + /// Returns the [`FormatSet`] configured for this repository. + /// + /// Derived from the `"v1_erofs"` ro_compat feature flag in `meta.json`: + /// - flag present → [`FormatSet::V1_ONLY`] + /// - flag absent → [`FormatSet::BOTH`] + pub fn default_format_set(&self) -> FormatSet { + repo_formats_from_meta(&self.metadata) + } + /// Lists all named stream references under a given prefix. /// /// Returns (name, target) pairs where name is relative to the prefix. @@ -3406,7 +3672,11 @@ mod tests { /// Create a test repository in insecure mode (no fs-verity required). fn create_test_repo(path: &Path) -> Result>> { - let (repo, _) = Repository::init_path(CWD, path, Algorithm::SHA512, false)?; + let (repo, _) = Repository::init_path( + CWD, + path, + RepositoryConfig::new(Algorithm::SHA512).set_insecure(), + )?; Ok(Arc::new(repo)) } @@ -3916,6 +4186,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: Default::default(), } } @@ -3929,6 +4200,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: Default::default(), }, LeafContent::Regular(RegularFile::External(obj.clone(), size)), @@ -4091,6 +4363,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: Default::default(), }, LeafContent::Regular(RegularFile::External(obj2.clone(), size2)), @@ -4755,8 +5028,7 @@ mod tests { #[tokio::test] async fn test_fsck_detects_corrupt_erofs_image() -> Result<()> { // Exercises fsck_image: corrupts the erofs image data so that - // parsing fails. The catch_unwind should catch the panic from - // the current erofs reader. + // parsing fails. fsck_image returns an error rather than panicking. let tmp = tempdir(); let repo = create_test_repo(&tmp.path().join("repo"))?; @@ -4793,6 +5065,45 @@ mod tests { Ok(()) } + /// Helper to create a V1 (C-compatible) EROFS image and write it to the repo. + fn commit_v1_image( + repo: &Repository, + obj_id: &Sha512HashValue, + obj_size: u64, + ) -> Result { + use crate::erofs::writer::{ValidatedFileSystem, mkfs_erofs_versioned}; + + let mut fs = make_test_fs(obj_id, obj_size); + fs.add_overlay_whiteouts(); + let image_data = + mkfs_erofs_versioned(&ValidatedFileSystem::new(fs).unwrap(), FormatVersion::V1); + repo.write_image(None, &image_data) + } + + #[tokio::test] + async fn test_fsck_validates_v1_erofs_image() -> Result<()> { + // V1 images (C-compatible format) should pass fsck just like V2. + // This catches regressions where fsck or the reader doesn't handle + // compact inodes, BFS ordering, or the whiteout table. + let tmp = tempdir(); + let repo = create_test_repo(&tmp.path().join("repo"))?; + + let obj_size: u64 = 32 * 1024; + let obj = generate_test_data(obj_size, 0xBB); + let obj_id = repo.ensure_object(&obj)?; + + commit_v1_image(&repo, &obj_id, obj_size)?; + repo.sync()?; + + let result = repo.fsck().await?; + assert!( + result.is_ok(), + "V1 (C-compatible) erofs image should pass fsck: {result}" + ); + assert!(result.images_checked > 0, "should have checked the image"); + Ok(()) + } + // ---- Fsck metadata validation tests ---- #[tokio::test] @@ -4867,7 +5178,12 @@ mod tests { // Open a sha512 repo as sha256 → AlgorithmMismatch. let tmp = tempdir(); let path = tmp.path().join("sha512-repo"); - Repository::::init_path(CWD, &path, Algorithm::SHA512, false).unwrap(); + Repository::::init_path( + CWD, + &path, + RepositoryConfig::new(Algorithm::SHA512).set_insecure(), + ) + .unwrap(); assert!(matches!( Repository::::open_path(CWD, &path), Err(RepositoryOpenError::AlgorithmMismatch { .. }) @@ -4938,6 +5254,155 @@ mod tests { ); } + // ---- erofs_version / cfs_erofs_version feature tests ---- + + #[test] + fn test_init_v1_repo_metadata() { + let meta = RepoMetadata::new_with_formats( + Algorithm::SHA256, + FormatVersion::V1, + FormatSet::V1_ONLY, + ); + assert_eq!(meta.erofs_version(), FormatVersion::V1); + assert!( + meta.features + .read_only_compatible + .contains(&known_features::CFS_EROFS_VERSION.to_string()), + "V1 repo must list cfs_erofs_version in ro_compat, got: {:?}", + meta.features.read_only_compatible + ); + } + + #[test] + fn test_init_v2_repo_metadata() { + let meta = + RepoMetadata::new_with_formats(Algorithm::SHA256, FormatVersion::V2, FormatSet::BOTH); + assert_eq!(meta.erofs_version(), FormatVersion::V2); + assert!( + !meta + .features + .read_only_compatible + .contains(&known_features::CFS_EROFS_VERSION.to_string()), + "V2 repo must NOT list cfs_erofs_version in ro_compat" + ); + } + + #[test] + fn test_init_path_erofs_version_mismatch() -> Result<()> { + let tmp = tempdir(); + let path = tmp.path().join("repo"); + + // First init: V1 + let config_v1 = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_version: FormatVersion::V1, + ..RepositoryConfig::default().set_insecure() + }; + Repository::::init_path(CWD, &path, config_v1)?; + + // Second init: V2 — should fail because meta.json already exists with V1 config + let config_v2 = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_version: FormatVersion::V2, + ..RepositoryConfig::default().set_insecure() + }; + let result = Repository::::init_path(CWD, &path, config_v2); + assert!( + result.is_err(), + "re-initializing with different erofs_version must fail" + ); + let err = result.unwrap_err(); + // Use the full chain representation so we see the inner bail! message, + // not just the outermost fn_error_context wrapper. + let msg = format!("{err:#}"); + assert!( + msg.contains("erofs_version"), + "error message must mention erofs_version, got: {msg}" + ); + Ok(()) + } + + #[test] + fn test_init_path_same_erofs_version_is_idempotent() -> Result<()> { + let tmp = tempdir(); + let path = tmp.path().join("repo"); + + let config = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_version: FormatVersion::V1, + ..RepositoryConfig::default().set_insecure() + }; + let (_, was_new) = Repository::::init_path(CWD, &path, config.clone())?; + assert!(was_new, "first init must be fresh"); + + let (repo, was_new) = Repository::::init_path(CWD, &path, config)?; + assert!(!was_new, "second init with same config must be idempotent"); + assert_eq!(repo.erofs_version(), FormatVersion::V1); + Ok(()) + } + + #[test] + fn test_legacy_repo_defaults_to_v2() { + // A repo with no feature flags → no cfs_erofs_version → derived version is V2. + let json = br#"{"version":1,"algorithm":"fsverity-sha256-12","features":{}}"#; + let meta: RepoMetadata = serde_json::from_slice(json).unwrap(); + assert_eq!( + meta.erofs_version(), + FormatVersion::V2, + "repo with no cfs_erofs_version flag should derive V2" + ); + + // A repo with cfs_erofs_version in ro_compat → derived version is V1. + let json_v1 = br#"{"version":1,"algorithm":"fsverity-sha256-12","features":{"read-only-compatible":["cfs_erofs_version"]}}"#; + let meta_v1: RepoMetadata = serde_json::from_slice(json_v1).unwrap(); + assert_eq!( + meta_v1.erofs_version(), + FormatVersion::V1, + "repo with cfs_erofs_version flag should derive V1" + ); + + // Old JSON that happens to have an erofs_version field (written by a previous + // version of this code) must deserialize successfully — serde ignores unknown fields. + let json_old = + br#"{"version":1,"algorithm":"fsverity-sha256-12","features":{},"erofs_version":2}"#; + let meta_old: RepoMetadata = serde_json::from_slice(json_old).unwrap(); + assert_eq!( + meta_old.erofs_version(), + FormatVersion::V2, + "old JSON with explicit erofs_version field should still derive V2 from flags" + ); + } + + #[test] + fn test_old_tool_blocked_on_v1_repo() { + // Simulate an old tool that does not know about "cfs_erofs_version". + // A V1 repo places "cfs_erofs_version" in ro_compat, so any tool that + // does not recognise that feature must open the repo read-only. + // We model this by constructing the FeatureFlags directly and filtering + // against an empty ro_compat allowlist. + let features = FeatureFlags { + compatible: vec![], + read_only_compatible: vec![known_features::CFS_EROFS_VERSION.to_string()], + incompatible: vec![], + }; + + // An unknown ro_compat feature must not prevent opening, but must + // signal read-only access. + let unknown_ro: Vec = features + .read_only_compatible + .iter() + .filter(|f| ![].contains(&f.as_str())) // empty old-tool allowlist + .cloned() + .collect(); + assert_eq!( + unknown_ro, + vec![known_features::CFS_EROFS_VERSION.to_string()], + "old tool should see cfs_erofs_version as an unknown ro_compat feature" + ); + // And the current tool knows about it, so check() returns ReadWrite. + assert_eq!(features.check().unwrap(), FeatureCheck::ReadWrite); + } + #[test] fn test_object_store_method_variants() { // Verify all variants exist and are distinct @@ -4971,9 +5436,12 @@ mod tests { // Create a repo, store an object, then remove meta.json to // simulate an old-format repository. - let (repo, _) = - Repository::::init_path(CWD, &repo_path, Algorithm::SHA256, false) - .unwrap(); + let (repo, _) = Repository::::init_path( + CWD, + &repo_path, + RepositoryConfig::default().set_insecure(), + ) + .unwrap(); let data = b"hello world"; let obj_id = repo.ensure_object(data).unwrap(); drop(repo); @@ -5022,9 +5490,12 @@ mod tests { let tmp = tempdir(); let repo_path = tmp.path().join("repo"); - let (repo, _) = - Repository::::init_path(CWD, &repo_path, Algorithm::SHA512, false) - .unwrap(); + let (repo, _) = Repository::::init_path( + CWD, + &repo_path, + RepositoryConfig::new(Algorithm::SHA512).set_insecure(), + ) + .unwrap(); let data = b"sha512 test data"; let obj_id = repo.ensure_object(data).unwrap(); drop(repo); @@ -5059,9 +5530,12 @@ mod tests { let tmp = tempdir(); let repo_path = tmp.path().join("repo"); - let (repo, _) = - Repository::::init_path(CWD, &repo_path, Algorithm::SHA512, false) - .unwrap(); + let (repo, _) = Repository::::init_path( + CWD, + &repo_path, + RepositoryConfig::new(Algorithm::SHA512).set_insecure(), + ) + .unwrap(); repo.ensure_object(b"some data").unwrap(); drop(repo); @@ -5099,11 +5573,337 @@ mod tests { let tmp = tempdir(); let repo_path = tmp.path().join("repo"); - Repository::::init_path(CWD, &repo_path, Algorithm::SHA256, false) - .unwrap(); + Repository::::init_path( + CWD, + &repo_path, + RepositoryConfig::default().set_insecure(), + ) + .unwrap(); let (_repo, upgraded) = Repository::::open_upgrade(CWD, &repo_path).unwrap(); assert!(!upgraded); } + + #[tokio::test] + async fn test_fsck_v1_image_detects_missing_object() -> Result<()> { + // Same as test_fsck_validates_erofs_image_objects but with a V1 image, + // ensuring fsck correctly parses V1 images to find object references. + let tmp = tempdir(); + let repo = create_test_repo(&tmp.path().join("repo"))?; + + let obj_size: u64 = 32 * 1024; + let obj = generate_test_data(obj_size, 0xBC); + let obj_id = repo.ensure_object(&obj)?; + + commit_v1_image(&repo, &obj_id, obj_size)?; + repo.sync()?; + + // Sanity: passes before we break it + let result = repo.fsck().await?; + assert!( + result.is_ok(), + "healthy V1 image should pass fsck: {result}" + ); + + // Delete the referenced object + let hex = obj_id.to_hex(); + let (prefix, rest) = hex.split_at(2); + let dir = open_test_repo_dir(&tmp); + dir.remove_file(format!("objects/{prefix}/{rest}"))?; + + let result = repo.fsck().await?; + assert!( + !result.is_ok(), + "fsck should detect missing object in V1 erofs image: {result}" + ); + assert!( + result.missing_objects > 0, + "should report missing objects: {result}" + ); + Ok(()) + } + + // ---- FormatSet / v1_erofs feature flag tests ---- + + #[test] + fn test_format_set_v1_only_has_v1_erofs_flag() { + let meta = RepoMetadata::new_with_formats( + Algorithm::SHA256, + FormatVersion::V2, + FormatSet::V1_ONLY, + ); + assert!( + meta.features + .read_only_compatible + .contains(&known_features::V1_EROFS.to_string()), + "V1_ONLY format set must add v1_erofs to ro_compat, got: {:?}", + meta.features.read_only_compatible + ); + } + + #[test] + fn test_format_set_both_omits_v1_erofs_flag() { + let meta = + RepoMetadata::new_with_formats(Algorithm::SHA256, FormatVersion::V2, FormatSet::BOTH); + assert!( + !meta + .features + .read_only_compatible + .contains(&known_features::V1_EROFS.to_string()), + "BOTH format set must NOT add v1_erofs to ro_compat, got: {:?}", + meta.features.read_only_compatible + ); + } + + #[test] + fn test_default_format_set_from_v1_erofs_flag() { + // v1_erofs present → V1_ONLY + let meta_v1_only = RepoMetadata::new_with_formats( + Algorithm::SHA256, + FormatVersion::V2, + FormatSet::V1_ONLY, + ); + assert_eq!( + repo_formats_from_meta(&meta_v1_only), + FormatSet::V1_ONLY, + "v1_erofs flag present must decode to V1_ONLY" + ); + + // v1_erofs absent → BOTH + let meta_both = + RepoMetadata::new_with_formats(Algorithm::SHA256, FormatVersion::V2, FormatSet::BOTH); + assert_eq!( + repo_formats_from_meta(&meta_both), + FormatSet::BOTH, + "v1_erofs flag absent must decode to BOTH" + ); + } + + #[test] + fn test_init_path_v1_only_format_set() -> Result<()> { + let tmp = tempdir(); + let path = tmp.path().join("repo"); + + let config = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_version: FormatVersion::V2, + erofs_formats: FormatSet::V1_ONLY, + ..RepositoryConfig::default().set_insecure() + }; + let (repo, was_new) = Repository::::init_path(CWD, &path, config)?; + assert!(was_new); + assert_eq!(repo.default_format_set(), FormatSet::V1_ONLY); + assert!( + repo.metadata() + .features + .read_only_compatible + .contains(&known_features::V1_EROFS.to_string()), + "v1_erofs must be in ro_compat for V1_ONLY repos" + ); + Ok(()) + } + + #[test] + fn test_init_path_both_format_set() -> Result<()> { + let tmp = tempdir(); + let path = tmp.path().join("repo"); + + let config = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_version: FormatVersion::V2, + erofs_formats: FormatSet::BOTH, + ..RepositoryConfig::default().set_insecure() + }; + let (repo, was_new) = Repository::::init_path(CWD, &path, config)?; + assert!(was_new); + assert_eq!(repo.default_format_set(), FormatSet::BOTH); + assert!( + !repo + .metadata() + .features + .read_only_compatible + .contains(&known_features::V1_EROFS.to_string()), + "v1_erofs must NOT be in ro_compat for BOTH repos" + ); + Ok(()) + } + + #[test] + fn test_format_set_upgrade_v1_only_to_both() -> Result<()> { + let tmp = tempdir(); + let path = tmp.path().join("repo"); + + // Init as V1_ONLY + let config_v1_only = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_version: FormatVersion::V2, + erofs_formats: FormatSet::V1_ONLY, + ..RepositoryConfig::default().set_insecure() + }; + let (repo, was_new) = Repository::::init_path(CWD, &path, config_v1_only)?; + assert!(was_new); + assert_eq!(repo.default_format_set(), FormatSet::V1_ONLY); + drop(repo); + + // Upgrade to BOTH — should succeed + let config_both = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_version: FormatVersion::V2, + erofs_formats: FormatSet::BOTH, + ..RepositoryConfig::default().set_insecure() + }; + let (repo2, was_new2) = Repository::::init_path(CWD, &path, config_both)?; + assert!(!was_new2, "upgrade must report not-new (existing repo)"); + assert_eq!( + repo2.default_format_set(), + FormatSet::BOTH, + "after upgrade, format set must be BOTH" + ); + Ok(()) + } + + #[test] + fn test_format_set_downgrade_both_to_v1_only_fails() -> Result<()> { + let tmp = tempdir(); + let path = tmp.path().join("repo"); + + // Init as BOTH + let config_both = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_version: FormatVersion::V2, + erofs_formats: FormatSet::BOTH, + ..RepositoryConfig::default().set_insecure() + }; + Repository::::init_path(CWD, &path, config_both)?; + + // Attempt downgrade to V1_ONLY — must fail + let config_v1_only = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_version: FormatVersion::V2, + erofs_formats: FormatSet::V1_ONLY, + ..RepositoryConfig::default().set_insecure() + }; + let result = Repository::::init_path(CWD, &path, config_v1_only); + assert!(result.is_err(), "downgrade from BOTH to V1_ONLY must fail"); + let msg = format!("{:#}", result.unwrap_err()); + assert!( + msg.contains("downgrad") || msg.contains("V1_ONLY"), + "error must mention downgrade, got: {msg}" + ); + Ok(()) + } + + /// Verify that `default_format_set()` on the repo handle returned by + /// `init_path` (which re-opens the repo internally after an upgrade) + /// already reflects the new format set. This documents that callers do + /// NOT need to re-open the handle after an upgrade. + #[test] + fn test_default_format_set_reflects_upgrade_on_returned_handle() -> Result<()> { + let tmp = tempdir(); + let path = tmp.path().join("repo"); + + // Phase 1: init as V1_ONLY; old handle is dropped. + let config_v1_only = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_version: FormatVersion::V2, + erofs_formats: FormatSet::V1_ONLY, + ..RepositoryConfig::default().set_insecure() + }; + let (repo_old, _) = Repository::::init_path(CWD, &path, config_v1_only)?; + assert_eq!(repo_old.default_format_set(), FormatSet::V1_ONLY); + drop(repo_old); + + // Phase 2: upgrade to BOTH. The returned handle is a fresh open. + let config_both = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_version: FormatVersion::V2, + erofs_formats: FormatSet::BOTH, + ..RepositoryConfig::default().set_insecure() + }; + let (repo_upgraded, was_new) = + Repository::::init_path(CWD, &path, config_both)?; + assert!(!was_new, "upgrade returns was_new=false"); + assert_eq!( + repo_upgraded.default_format_set(), + FormatSet::BOTH, + "handle returned by init_path after upgrade must already report BOTH \ + without a separate re-open" + ); + Ok(()) + } + + /// Verify `commit_images` with `BOTH` and a named ref: + /// - both ObjectIDs are in the returned map, + /// - both image symlinks exist in `images/`, + /// - the named ref points to the V1 image (the primary / first version). + #[test] + fn test_commit_images_both_named_ref_points_to_v1() -> Result<()> { + use crate::tree::{FileSystem, Stat}; + + let tmp = tempdir(); + let repo_path = tmp.path().join("repo"); + let config = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_version: FormatVersion::V2, + erofs_formats: FormatSet::BOTH, + ..RepositoryConfig::default().set_insecure() + }; + let (repo, _) = Repository::::init_path(CWD, &repo_path, config)?; + + // Build a minimal filesystem (empty root dir is enough). + let root_stat = Stat { + st_mode: 0o755, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 0, + st_mtim_nsec: 0, + xattrs: Default::default(), + }; + let fs: FileSystem = FileSystem::new(root_stat); + + let map = fs.commit_images(&repo, Some("myref"), FormatSet::BOTH)?; + repo.sync()?; + + // Both versions must be in the result. + let v1_id = map + .get(&FormatVersion::V1) + .expect("V1 must be in result map"); + let v2_id = map + .get(&FormatVersion::V2) + .expect("V2 must be in result map"); + + // Both image symlinks must exist under images/. + let v1_image_path = format!("images/{}", v1_id.to_hex()); + let v2_image_path = format!("images/{}", v2_id.to_hex()); + assert!( + test_path_exists_in_repo(&tmp, &v1_image_path)?, + "V1 image symlink must exist: {v1_image_path}" + ); + assert!( + test_path_exists_in_repo(&tmp, &v2_image_path)?, + "V2 image symlink must exist: {v2_image_path}" + ); + + // The named ref must exist and must point to the V1 image (primary). + let ref_path = "images/refs/myref"; + assert!( + test_path_exists_in_repo(&tmp, ref_path)?, + "named ref images/refs/myref must exist" + ); + // The ref symlink target should contain the V1 image hex, not V2. + let ref_full = tmp.path().join("repo").join(ref_path); + let target = readlinkat(CWD, &ref_full, Vec::new())?; + let target_str = target.to_str()?; + assert!( + target_str.contains(&v1_id.to_hex()), + "named ref must point to V1 image ({}), but points to: {target_str}", + v1_id.to_hex() + ); + assert!( + !target_str.contains(&v2_id.to_hex()), + "named ref must NOT point to V2 image, but points to: {target_str}" + ); + Ok(()) + } } diff --git a/crates/composefs/src/splitstream.rs b/crates/composefs/src/splitstream.rs index 6b338dee..8fd45f0b 100644 --- a/crates/composefs/src/splitstream.rs +++ b/crates/composefs/src/splitstream.rs @@ -1168,8 +1168,11 @@ mod tests { /// Create a test repository in insecure mode (no fs-verity required). fn create_test_repo(path: &Path) -> Result>> { - let (repo, _) = - Repository::init_path(CWD, path, crate::fsverity::Algorithm::SHA256, false)?; + let (repo, _) = Repository::init_path( + CWD, + path, + crate::repository::RepositoryConfig::default().set_insecure(), + )?; Ok(Arc::new(repo)) } diff --git a/crates/composefs/src/test.rs b/crates/composefs/src/test.rs index b5674e9c..194cf98c 100644 --- a/crates/composefs/src/test.rs +++ b/crates/composefs/src/test.rs @@ -9,7 +9,10 @@ use once_cell::sync::Lazy; use rustix::fs::CWD; use tempfile::TempDir; -use crate::{fsverity::FsVerityHashValue, repository::Repository}; +use crate::{ + fsverity::FsVerityHashValue, + repository::{Repository, RepositoryConfig}, +}; static TMPDIR: Lazy = Lazy::new(|| { if let Some(path) = std::env::var_os("CFS_TEST_TMPDIR") { @@ -63,8 +66,12 @@ impl TestRepo { pub fn new() -> Self { let dir = tempdir(); let repo_path = dir.path().join("repo"); - let (repo, _) = Repository::init_path(CWD, &repo_path, ObjectID::ALGORITHM, false) - .expect("initializing test repo"); + let (repo, _) = Repository::init_path( + CWD, + &repo_path, + RepositoryConfig::new(ObjectID::ALGORITHM).set_insecure(), + ) + .expect("initializing test repo"); Self { repo: Arc::new(repo), repo_path, @@ -139,19 +146,37 @@ pub(crate) mod proptest_strategies { /// /// Linux filenames are arbitrary bytes except `/` (0x2F) and `\0` (0x00), /// with a max length of [`NAME_MAX`] (255) bytes. We generate a mix of - /// ASCII names and binary names, occasionally long, to exercise directory - /// entry layout edge cases. + /// lengths to exercise directory entry layout edge cases: + /// + /// - Short ASCII (common case) + /// - Binary bytes (no NUL or `/`) + /// - Long ASCII (crosses xattr/inode inline-data boundaries) + /// - Near-NAME_MAX: lengths 252–255 exercise all four 4-byte padding + /// residues in the erofs directory entry format (names are padded to the + /// next 4-byte boundary, so a 255-byte name has 1 pad byte, 254 has 2, + /// 253 has 3, 252 has 0) + /// - Exactly NAME_MAX (255 bytes): the hard limit pub fn filename() -> impl Strategy { prop_oneof![ // Short ASCII names (common case) - 6 => proptest::string::string_regex("[a-zA-Z0-9._-]{1,20}") + 5 => proptest::string::string_regex("[a-zA-Z0-9._-]{1,20}") .expect("valid regex") .prop_map(OsString::from), // Binary names with arbitrary bytes (no NUL or /) - 3 => prop::collection::vec(1..=0xFEu8, 1..=30) + 2 => prop::collection::vec(1..=0xFEu8, 1..=30) .prop_map(|mut v| { v.iter_mut().for_each(|b| if *b == b'/' { *b = b'_' }); OsString::from_vec(v) }), - // Long ASCII names (up to NAME_MAX) - 1 => proptest::string::string_regex(&format!("[a-zA-Z0-9._-]{{100,{NAME_MAX}}}")) + // Long ASCII names (100..=251) — crosses inline-data boundaries + 1 => proptest::string::string_regex("[a-zA-Z0-9._-]{100,251}") + .expect("valid regex") + .prop_map(OsString::from), + // Near-NAME_MAX (252–254): all four mod-4 padding residues in erofs dirents + 1 => (252usize..=254).prop_flat_map(|len| { + proptest::string::string_regex(&format!("[a-zA-Z0-9._-]{{{len}}}")) + .expect("valid regex") + .prop_map(OsString::from) + }), + // Exactly NAME_MAX (255): the hard limit + 1 => proptest::string::string_regex(&format!("[a-zA-Z0-9._-]{{{NAME_MAX}}}")) .expect("valid regex") .prop_map(OsString::from), ] @@ -162,29 +187,38 @@ pub(crate) mod proptest_strategies { pub fn stat() -> impl Strategy { ( 0..=0o7777u32, // permission bits - 0..=65535u32, // uid - 0..=65535u32, // gid - 0..=2_000_000_000i64, // mtime + 0..=131071u32, // uid — crosses u16::MAX to exercise extended inodes + 0..=131071u32, // gid — crosses u16::MAX to exercise extended inodes + 0..=2_000_000_000i64, // mtime sec + 0..1_000_000_000u32, // mtime nsec xattrs(), ) - .prop_map(|(mode, uid, gid, mtime, xattrs)| tree::Stat { - st_mode: mode, - st_uid: uid, - st_gid: gid, - st_mtim_sec: mtime, - xattrs, - }) + .prop_map( + |(mode, uid, gid, mtime_sec, mtime_nsec, xattrs)| tree::Stat { + st_mode: mode, + st_uid: uid, + st_gid: gid, + st_mtim_sec: mtime_sec, + st_mtim_nsec: mtime_nsec, + xattrs, + }, + ) } /// Strategy for xattr keys covering all erofs prefix namespaces. /// /// The erofs format uses prefix indices to compress xattr names: - /// 0 = "" (fallback), 1 = "user.", 2 = "system.posix_acl_access", + /// 0 = "" (fallback, for unrecognized prefixes like com.example.*), + /// 1 = "user.", 2 = "system.posix_acl_access", /// 3 = "system.posix_acl_default", 4 = "trusted.", 5 = "lustre.", /// 6 = "security." /// /// The writer also escapes `trusted.overlay.*` → `trusted.overlay.overlay.*`, /// so we must test that path too. + /// + /// `lustre.*` keys are included here. For V1 images the writer skips index 5 during + /// prefix matching, so lustre.* xattrs fall through to prefix index 0 (raw fallback), + /// matching C mkcomposefs v1.0.8 behavior. fn xattr_key() -> impl Strategy { prop_oneof![ // user.* namespace (index 1) — most common @@ -210,6 +244,16 @@ pub(crate) mod proptest_strategies { 1 => Just("system.posix_acl_access".to_string()), // system.posix_acl_default (index 3) — exact name, no suffix 1 => Just("system.posix_acl_default".to_string()), + // Fallback prefix (index 0) — unrecognized prefix, full key stored as suffix. + // Both Rust and C agree on index 0 for these keys. + 1 => (0..3u32).prop_map(|n| format!("com.example.test_{n}")), + // lustre.* (index 5 in EROFS spec, but index 0 in C mkcomposefs v1.0.8). + // For V1 images, the writer skips index 5 so lustre.* falls through to index 0, + // matching C behavior for binary compatibility. + 1 => prop_oneof![ + Just("lustre.lov".to_string()), + Just("lustre.lma".to_string()), + ], ] } @@ -228,6 +272,58 @@ pub(crate) mod proptest_strategies { }) } + /// Strategy for xattr keys that stress corner cases in the V1 writer: + /// - Multiple `trusted.overlay.*` keys → all get escaped on disk + /// - `trusted.overlay.overlay.X` → double-escaped to `trusted.overlay.overlay.overlay.X` + /// - `security.selinux` + `security.ima` combinations + /// - `system.posix_acl_access` → triggers LCFS_EROFS_FLAGS_HAS_ACL header bit + fn xattr_key_unusual() -> impl Strategy { + prop_oneof![ + // trusted.overlay.* — each gets escaped to trusted.overlay.overlay.* on disk + 4 => prop_oneof![ + Just("trusted.overlay.custom".to_string()), + Just("trusted.overlay.origin".to_string()), + Just("trusted.overlay.upper".to_string()), + Just("trusted.overlay.redirect".to_string()), + Just("trusted.overlay.nfs_fh".to_string()), + ], + // Already-escaped key: trusted.overlay.overlay.X → double-escape on disk + 2 => Just("trusted.overlay.overlay.nested".to_string()), + // security.* — two labels on same inode + 3 => prop_oneof![ + Just("security.selinux".to_string()), + Just("security.ima".to_string()), + Just("security.capability".to_string()), + ], + // ACL — triggers LCFS_EROFS_FLAGS_HAS_ACL + 2 => Just("system.posix_acl_access".to_string()), + // user.* — filler + 1 => proptest::string::string_regex("user\\.[a-z]{1,10}") + .expect("valid regex"), + ] + } + + /// Xattr strategy for the unusual generator: 2–8 xattr pairs (key collisions are silently deduplicated by BTreeMap) with long values allowed. + fn xattrs_unusual() -> impl Strategy, Box<[u8]>>> { + prop::collection::vec( + ( + xattr_key_unusual(), + // Mix of short and long values — long values stress xattr dedup/block layout + prop_oneof![ + 3 => prop::collection::vec(any::(), 0..=20), + 1 => prop::collection::vec(any::(), 64..=512), + ], + ), + 2..=8, + ) + .prop_map(|pairs| { + pairs + .into_iter() + .map(|(k, v)| (OsStr::new(&k).into(), v.into_boxed_slice())) + .collect() + }) + } + /// Strategy for symlink targets as OsString. /// /// Symlink targets on Linux are arbitrary bytes except `\0`, up to @@ -252,7 +348,7 @@ pub(crate) mod proptest_strategies { /// /// External file references store raw hash bytes rather than a concrete /// `ObjectID` type, so the same spec works with any hash algorithm. - #[derive(Debug)] + #[derive(Debug, Clone)] pub enum LeafContentSpec { Inline(Vec), /// External file: random hash bytes (truncated to hash size at build time) and size. @@ -261,6 +357,10 @@ pub(crate) mod proptest_strategies { BlockDevice(u64), CharacterDevice(u64), Fifo, + Socket, + /// Overlay whiteout: char device with rdev=0. Always maps to CharacterDevice(0). + /// Distinct from CharacterDevice(rdev) to allow weighted generation. + Whiteout, } /// Strategy for hash-type-agnostic leaf content. @@ -270,7 +370,7 @@ pub(crate) mod proptest_strategies { // Inline file data is capped at INLINE_CONTENT_MAX_V0 (64 bytes) to match // the composefs invariant: larger files must be external (ChunkBased). ( - 0..10u8, + 0..11u8, prop::collection::vec(any::(), 0..=INLINE_CONTENT_MAX_V0), symlink_target(), prop::collection::vec(any::(), 64..=64), @@ -284,13 +384,14 @@ pub(crate) mod proptest_strategies { 5..=6 => LeafContentSpec::Symlink(symlink_target), 7 => LeafContentSpec::BlockDevice(rdev), 8 => LeafContentSpec::CharacterDevice(rdev), - _ => LeafContentSpec::Fifo, + 9 => LeafContentSpec::Fifo, + _ => LeafContentSpec::Socket, }, ) } /// A hash-type-agnostic leaf node specification. - #[derive(Debug)] + #[derive(Debug, Clone)] pub struct LeafSpec { pub stat: tree::Stat, pub content: LeafContentSpec, @@ -301,8 +402,17 @@ pub(crate) mod proptest_strategies { } /// Strategy for a list of uniquely-named leaf specs. - fn named_leaf_specs(max_entries: usize) -> impl Strategy> { - prop::collection::vec((filename(), leaf_spec()), 0..=max_entries).prop_map(|entries| { + /// Strategy for a list of uniquely-named leaf specs with a given entry count range. + /// + /// The `min..=max` range controls how many entries are attempted before + /// deduplication. Use `named_leaf_specs(0, 30)` for a small directory and + /// `named_leaf_specs(150, 300)` to reliably cross a 4 KiB directory block + /// boundary (~170 entries with typical short names × ~20 bytes each). + fn named_leaf_specs( + min: usize, + max: usize, + ) -> impl Strategy> { + prop::collection::vec((filename(), leaf_spec()), min..=max).prop_map(|entries| { let mut seen = std::collections::HashSet::new(); entries .into_iter() @@ -312,7 +422,7 @@ pub(crate) mod proptest_strategies { } /// Description of a directory to be built, including potential hardlinks. - #[derive(Debug)] + #[derive(Debug, Clone)] pub struct DirSpec { /// Stat metadata for this directory. pub stat: tree::Stat, @@ -323,7 +433,7 @@ pub(crate) mod proptest_strategies { } /// Description of a filesystem to be built, with hardlink info. - #[derive(Debug)] + #[derive(Debug, Clone)] pub struct FsSpec { /// Root directory specification. pub root: DirSpec, @@ -340,9 +450,35 @@ pub(crate) mod proptest_strategies { pub link_name: OsString, } + /// Hardlink spec for the unusual generator: places hardlink in root or a named subdir. + /// `target_dir_index: None` → root; `Some(i)` → subdirs[i % subdirs.len()]`. + #[derive(Debug, Clone)] + pub struct UnusualHardlinkSpec { + /// Index into the flat all-leaves list (root leaves first, then subdir leaves in order). + pub source_leaf_index: usize, + /// Name for the hardlink entry. + pub link_name: OsString, + /// Which directory receives the hardlink entry. + pub target_dir_index: Option, + } + + /// Filesystem description for the unusual generator. + #[derive(Debug, Clone)] + pub struct UnusualFsSpec { + pub root: DirSpec, + pub hardlinks: Vec, + } + /// Strategy for a subdirectory (no further nesting). + /// + /// Usually small (0–20 entries), but 1-in-4 times generates a large + /// directory (150–300 entries) to exercise multi-block directory layout. fn subdir_spec() -> impl Strategy { - (filename(), stat(), named_leaf_specs(10)).prop_map(|(name, stat, leaves)| { + let leaves_strat = prop_oneof![ + 3 => named_leaf_specs(0, 20), + 1 => named_leaf_specs(150, 300), + ]; + (filename(), stat(), leaves_strat).prop_map(|(name, stat, leaves)| { ( name, DirSpec { @@ -366,14 +502,19 @@ pub(crate) mod proptest_strategies { /// Strategy for generating a complete `FsSpec`. /// - /// Generates a root directory with up to 15 file entries and up to 5 - /// subdirectories (each with up to 10 entries, max depth 2). Then - /// optionally generates 0-3 hardlinks that reference existing leaves. + /// Root directory entry count is weighted: usually small (0–30), but + /// 1-in-4 times large (150–300) to reliably cross the 4 KiB directory + /// block boundary. Subdirectories use the same weighted split inside + /// `subdir_spec`. pub fn filesystem_spec() -> impl Strategy { + let root_leaves_strat = prop_oneof![ + 3 => named_leaf_specs(0, 30), + 1 => named_leaf_specs(150, 300), + ]; ( stat(), - named_leaf_specs(15), - unique_subdirs(5), + root_leaves_strat, + unique_subdirs(10), // Hardlink candidates: (source index placeholder, link name) prop::collection::vec((any::(), filename()), 0..=3), ) @@ -416,6 +557,153 @@ pub(crate) mod proptest_strategies { ) } + /// Strategy for the "unusual content" proptest generator. + /// + /// Explicitly constructs filesystem trees that stress corner cases in the V1 writer: + /// - Whiteout files (rdev=0 char devices) at root and in subdirs + /// - Multiple trusted.overlay.* xattrs per inode (escape path) + /// - Large external file sizes (up to 30 GB) + /// - Hardlinks across all leaf types and directories (post-generation pass) + pub fn unusual_filesystem_spec() -> impl Strategy { + fn unusual_stat() -> impl Strategy { + ( + 0u32..=0o7777u32, + 0u32..=131071u32, + 0u32..=131071u32, + 0u64..=u32::MAX as u64, + 0u32..=999_999_999u32, + xattrs_unusual(), + ) + .prop_map(|(mode, uid, gid, mtime_sec, mtime_nsec, xattrs)| { + tree::Stat { + st_mode: mode, + st_uid: uid, + st_gid: gid, + st_mtim_sec: mtime_sec as i64, + st_mtim_nsec: mtime_nsec, + xattrs, + } + }) + } + + fn unusual_leaf_content_spec() -> impl Strategy { + let hash_bytes = prop::collection::vec(any::(), 64..=64); + let ext_size = prop_oneof![ + 5 => 1u64..=1_000_000u64, + 3 => 1_000_001u64..=100_000_000u64, + 2 => 100_000_001u64..=30_000_000_000u64, + ]; + ( + 0u8..=10u8, + prop::collection::vec(any::(), 0..=INLINE_CONTENT_MAX_V0), + symlink_target(), + hash_bytes, + ext_size, + 1u64..=65535u64, + ) + .prop_map( + |(tag, file_data, symlink_target, hash_bytes, ext_size, rdev)| match tag { + 0..=1 => LeafContentSpec::Inline(file_data), + 2..=3 => LeafContentSpec::External(hash_bytes, ext_size), + 4..=5 => LeafContentSpec::Symlink(symlink_target), + 6..=7 => LeafContentSpec::Whiteout, + 8 => LeafContentSpec::BlockDevice(rdev), + 9 => LeafContentSpec::Fifo, + _ => LeafContentSpec::Socket, + }, + ) + } + + fn unusual_leaf_spec() -> impl Strategy { + (unusual_stat(), unusual_leaf_content_spec()) + .prop_map(|(stat, content)| LeafSpec { stat, content }) + } + + fn unusual_named_leaves(max: usize) -> impl Strategy> { + prop::collection::vec((filename(), unusual_leaf_spec()), 0..=max).prop_map(|entries| { + let mut seen = std::collections::HashSet::new(); + entries + .into_iter() + .filter(|(name, _)| seen.insert(name.clone())) + .collect() + }) + } + + fn unusual_subdir_spec() -> impl Strategy { + (filename(), unusual_stat(), unusual_named_leaves(10)).prop_map( + |(name, stat, leaves)| { + ( + name, + DirSpec { + stat, + leaves, + subdirs: vec![], + }, + ) + }, + ) + } + + fn unusual_unique_subdirs(max: usize) -> impl Strategy> { + prop::collection::vec(unusual_subdir_spec(), 0..=max).prop_map(|dirs| { + let mut seen = std::collections::HashSet::new(); + dirs.into_iter() + .filter(|(name, _)| seen.insert(name.clone())) + .collect() + }) + } + + ( + unusual_stat(), + unusual_named_leaves(15), + unusual_unique_subdirs(5), + prop::collection::vec((any::(), filename(), any::()), 0..=5), + ) + .prop_map( + |(root_stat, mut root_leaves, mut root_subdirs, hl_candidates)| { + let mut seen: std::collections::HashSet = + std::collections::HashSet::new(); + root_subdirs.retain(|(name, _)| seen.insert(name.clone())); + root_leaves.retain(|(name, _)| seen.insert(name.clone())); + + let root_leaf_count = root_leaves.len(); + let total_leaves: usize = root_leaf_count + + root_subdirs + .iter() + .map(|(_, d)| d.leaves.len()) + .sum::(); + + let hardlinks = if total_leaves > 0 { + hl_candidates + .into_iter() + .map(|(src_idx, name, dir_idx)| UnusualHardlinkSpec { + source_leaf_index: src_idx % total_leaves, + link_name: name, + target_dir_index: if root_subdirs.is_empty() { + None + } else if dir_idx % 2 == 0 { + None + } else { + Some(dir_idx % root_subdirs.len()) + }, + }) + .collect() + } else { + vec![] + }; + + UnusualFsSpec { + root: DirSpec { + stat: root_stat, + leaves: root_leaves, + subdirs: root_subdirs, + }, + hardlinks, + } + }, + ) + } + /// Convert a `LeafContentSpec` into a concrete `tree::LeafContent`. fn build_leaf_content( spec: LeafContentSpec, @@ -436,6 +724,8 @@ pub(crate) mod proptest_strategies { LeafContentSpec::BlockDevice(rdev) => tree::LeafContent::BlockDevice(rdev), LeafContentSpec::CharacterDevice(rdev) => tree::LeafContent::CharacterDevice(rdev), LeafContentSpec::Fifo => tree::LeafContent::Fifo, + LeafContentSpec::Socket => tree::LeafContent::Socket, + LeafContentSpec::Whiteout => tree::LeafContent::CharacterDevice(0), } } @@ -485,4 +775,79 @@ pub(crate) mod proptest_strategies { fs } + + /// Build a `tree::FileSystem` from an `UnusualFsSpec`. + /// + /// Handles post-generation hardlink injection: hardlinks can target any leaf type + /// (symlinks, whiteouts, devices, FIFOs) and can be placed in root or any subdir. + pub fn build_unusual_filesystem( + spec: UnusualFsSpec, + ) -> tree::FileSystem { + let mut fs = tree::FileSystem::new(spec.root.stat); + + let mut all_leaf_ids: Vec = Vec::new(); + let mut root_used_names: std::collections::HashSet = + std::collections::HashSet::new(); + + // Insert root leaves + for (name, leaf_spec) in spec.root.leaves { + let leaf_id = fs.push_leaf(leaf_spec.stat, build_leaf_content(leaf_spec.content)); + all_leaf_ids.push(leaf_id); + root_used_names.insert(name.clone()); + fs.root.insert(&name, tree::Inode::leaf(leaf_id)); + } + + // Remember subdir names and per-subdir used-name sets for hardlink dedup + let mut subdir_names: Vec = Vec::new(); + let mut subdir_used_names: Vec> = Vec::new(); + + for (dir_name, dir_spec) in spec.root.subdirs { + subdir_names.push(dir_name.clone()); + let mut used: std::collections::HashSet = std::collections::HashSet::new(); + let mut subdir = tree::Directory::new(dir_spec.stat); + for (name, leaf_spec) in dir_spec.leaves { + let leaf_id = fs.push_leaf(leaf_spec.stat, build_leaf_content(leaf_spec.content)); + all_leaf_ids.push(leaf_id); + used.insert(name.clone()); + subdir.insert(&name, tree::Inode::leaf(leaf_id)); + } + subdir_used_names.push(used); + root_used_names.insert(dir_name.clone()); + fs.root + .insert(&dir_name, tree::Inode::Directory(Box::new(subdir))); + } + + // Post-generation hardlink pass: inject hardlinks to any leaf type, any dir. + // Whiteouts (chardev rdev=0) are excluded: hardlinked whiteouts are invalid. + let non_whiteout_leaf_ids: Vec = all_leaf_ids + .iter() + .copied() + .filter(|&id| !matches!(fs.leaf(id).content, tree::LeafContent::CharacterDevice(0))) + .collect(); + if !non_whiteout_leaf_ids.is_empty() { + for hl in spec.hardlinks { + let leaf_id = + non_whiteout_leaf_ids[hl.source_leaf_index % non_whiteout_leaf_ids.len()]; + match hl.target_dir_index { + None => { + if root_used_names.insert(hl.link_name.clone()) { + fs.root.insert(&hl.link_name, tree::Inode::leaf(leaf_id)); + } + } + Some(raw_idx) => { + let idx = raw_idx % subdir_names.len(); + if subdir_used_names[idx].insert(hl.link_name.clone()) { + if let Ok(subdir) = + fs.root.get_directory_mut(subdir_names[idx].as_os_str()) + { + subdir.insert(&hl.link_name, tree::Inode::leaf(leaf_id)); + } + } + } + } + } + } + + fs + } } diff --git a/crates/composefs/src/tree.rs b/crates/composefs/src/tree.rs index dd8865d4..ddfc61bf 100644 --- a/crates/composefs/src/tree.rs +++ b/crates/composefs/src/tree.rs @@ -57,6 +57,7 @@ mod tests { st_uid: 1000, st_gid: 1000, st_mtim_sec: mtime, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), } } @@ -76,6 +77,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), } } diff --git a/crates/composefs/src/util.rs b/crates/composefs/src/util.rs index 7a5ac23e..bdc43485 100644 --- a/crates/composefs/src/util.rs +++ b/crates/composefs/src/util.rs @@ -72,6 +72,23 @@ pub(crate) fn reopen_tmpfile_ro(file: std::fs::File) -> std::io::Result rustix::io::Result { + rustix::fs::openat( + dirfd, + ".", + rustix::fs::OFlags::RDWR | rustix::fs::OFlags::TMPFILE | rustix::fs::OFlags::CLOEXEC, + rustix::fs::Mode::from_raw_mode(0o644), + ) +} + /// This function reads the exact amount of bytes required to fill the buffer, possibly performing /// multiple reads to do so (and also retrying if required to deal with EINTR). /// diff --git a/crates/composefs/tests/mkfs.rs b/crates/composefs/tests/mkfs.rs index b2896c69..45eb974e 100644 --- a/crates/composefs/tests/mkfs.rs +++ b/crates/composefs/tests/mkfs.rs @@ -12,7 +12,11 @@ use tempfile::NamedTempFile; use composefs::{ dumpfile::write_dumpfile, - erofs::{debug::debug_img, writer::mkfs_erofs}, + erofs::{ + debug::debug_img, + format::FormatVersion, + writer::{ValidatedFileSystem, mkfs_erofs, mkfs_erofs_versioned}, + }, fsverity::{FsVerityHashValue, Sha256HashValue}, tree::{FileSystem, Inode, LeafContent, RegularFile, Stat}, }; @@ -23,12 +27,13 @@ fn default_stat() -> Stat { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), } } fn debug_fs(fs: FileSystem) -> String { - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); let mut output = vec![]; debug_img(&mut output, &image).unwrap(); String::from_utf8(output).unwrap() @@ -54,6 +59,7 @@ fn add_leaf( st_uid: 0, st_mode: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), }, content, @@ -94,22 +100,80 @@ fn test_simple() { insta::assert_snapshot!(debug_fs(fs)); } -fn foreach_case(f: fn(&FileSystem)) { +fn foreach_case(f: fn(FileSystem)) { for case in [empty, simple] { let mut fs = FileSystem::new(default_stat()); case(&mut fs); - f(&fs); + f(fs); } } #[test_with::executable(fsck.erofs)] fn test_fsck() { foreach_case(|fs| { + // V2 (default) let mut tmp = NamedTempFile::new().unwrap(); - tmp.write_all(&mkfs_erofs(fs)).unwrap(); + tmp.write_all(&mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap())) + .unwrap(); let mut fsck = Command::new("fsck.erofs").arg(tmp.path()).spawn().unwrap(); assert!(fsck.wait().unwrap().success()); }); + + // V1 — needs its own filesystem instances for add_overlay_whiteouts + for case in [empty, simple] { + let mut fs = FileSystem::::new(default_stat()); + case(&mut fs); + fs.add_overlay_whiteouts(); + let image = mkfs_erofs_versioned(&ValidatedFileSystem::new(fs).unwrap(), FormatVersion::V1); + let mut tmp = NamedTempFile::new().unwrap(); + tmp.write_all(&image).unwrap(); + let mut fsck = Command::new("fsck.erofs").arg(tmp.path()).spawn().unwrap(); + assert!(fsck.wait().unwrap().success()); + } +} + +/// Verify byte-for-byte identity with C mkcomposefs for the pinned test cases. +/// +/// These fixed cases (`empty`, `simple`) complement the proptest binary-compat +/// tests in reader.rs which cover random trees. Keeping them pinned here means +/// a regression on these canonical shapes is immediately visible without proptest +/// shrinking, and is also validated by the digest stability tests above. +#[test_with::executable(mkcomposefs)] +fn test_vs_mkcomposefs() { + for case in [empty, simple] { + let mut fs_rust = FileSystem::new(default_stat()); + case(&mut fs_rust); + let mut fs_c = FileSystem::new(default_stat()); + case(&mut fs_c); + + fs_rust.add_overlay_whiteouts(); + let image = mkfs_erofs_versioned( + &ValidatedFileSystem::new(fs_rust).unwrap(), + FormatVersion::V1, + ); + + let mut mkcomposefs = Command::new("mkcomposefs") + .args(["--from-file", "-", "-"]) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .spawn() + .unwrap(); + + let mut stdin = mkcomposefs.stdin.take().unwrap(); + write_dumpfile(&mut stdin, &fs_c).unwrap(); + drop(stdin); + + let output = mkcomposefs.wait_with_output().unwrap(); + assert!(output.status.success()); + let mkcomposefs_image = output.stdout.into_boxed_slice(); + + if image != mkcomposefs_image { + let dump = dump_image(&image); + let mkcomposefs_dump = dump_image(&mkcomposefs_image); + assert_eq!(mkcomposefs_dump, dump, "structural diff (rust vs C)"); + } + assert_eq!(image, mkcomposefs_image); + } } fn dump_image(img: &[u8]) -> String { @@ -139,7 +203,7 @@ fn test_erofs_digest_stability() { for (name, case, expected_digest) in cases { let mut fs = FileSystem::::new(default_stat()); case(&mut fs); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); let digest = composefs::fsverity::compute_verity::(&image); let hex = digest.to_hex(); assert_eq!( @@ -149,32 +213,33 @@ fn test_erofs_digest_stability() { } } -#[should_panic] -#[test_with::executable(mkcomposefs)] -fn test_vs_mkcomposefs() { - foreach_case(|fs| { - let image = mkfs_erofs(fs); - - let mut mkcomposefs = Command::new("mkcomposefs") - .args(["--min-version=3", "--from-file", "-", "-"]) - .stdin(Stdio::piped()) - .stdout(Stdio::piped()) - .spawn() - .unwrap(); - - let mut stdin = mkcomposefs.stdin.take().unwrap(); - write_dumpfile(&mut stdin, fs).unwrap(); - drop(stdin); - - let output = mkcomposefs.wait_with_output().unwrap(); - assert!(output.status.success()); - let mkcomposefs_image = output.stdout.into_boxed_slice(); +#[test] +fn test_erofs_v1_digest_stability() { + // Same as test_erofs_digest_stability but for V1 (C-compatible) format. + // V1 output must be byte-stable since it needs to match C mkcomposefs. + let cases: &[(&str, fn(&mut FileSystem), &str)] = &[ + ( + "empty_v1", + empty, + "8f589e8f57ecb88823736b0d857ddca1e1068a23e264fad164b28f7038eb3682", + ), + ( + "simple_v1", + simple, + "9f3f5620ee0c54708516467d0d58741e7963047c7106b245d94c298259d0fa01", + ), + ]; - if image != mkcomposefs_image { - let dump = dump_image(&image); - let mkcomposefs_dump = dump_image(&mkcomposefs_image); - assert_eq!(mkcomposefs_dump, dump); - } - assert_eq!(image, mkcomposefs_image); // fallback if the dump is somehow the same - }); + for (name, case, expected_digest) in cases { + let mut fs = FileSystem::::new(default_stat()); + case(&mut fs); + fs.add_overlay_whiteouts(); + let image = mkfs_erofs_versioned(&ValidatedFileSystem::new(fs).unwrap(), FormatVersion::V1); + let digest = composefs::fsverity::compute_verity::(&image); + let hex = digest.to_hex(); + assert_eq!( + &hex, expected_digest, + "{name}: V1 EROFS digest changed — if this is intentional, update the pinned value" + ); + } } diff --git a/crates/composefs/tests/test.sh b/crates/composefs/tests/test.sh index 036279d8..4e935ad9 100755 --- a/crates/composefs/tests/test.sh +++ b/crates/composefs/tests/test.sh @@ -54,7 +54,7 @@ mount -o bind "${blkdev}" "${sysroot}" composefs-setup-root \ --config "${config}" \ - --cmdline "composefs=${imageid}" \ + --cmdline "composefs.digest=${imageid}" \ --root-fs "${root}" \ --sysroot "${sysroot}" \ ${null} diff --git a/doc/repository.md b/doc/repository.md index e3188305..ab36f347 100644 --- a/doc/repository.md +++ b/doc/repository.md @@ -65,11 +65,52 @@ created by `cfsctl init` and contains: - `read-only-compatible` — old tools may read but must not write. - `incompatible` — old tools must refuse the repository entirely. + The currently defined feature flags are: + - `cfs_erofs_version` (read-only-compatible) — present on repositories + whose default EROFS image format is V1. The EROFS format version is + derived entirely from this flag: present → V1, absent → V2. Old + tools that do not recognise this flag open the repository read-only + rather than accidentally writing images in the wrong format. + - `v1_erofs` (read-only-compatible) — controls how many EROFS format + versions are generated when committing images. When **present**, the + repository generates only V1 EROFS (C-tool compatible mode, the + default for new repositories initialized with `cfsctl init` or + `cfsctl init --erofs v1`). When **absent**, both V1 and V2 EROFS are + generated for each image (dual mode, used by bootc and other + multi-format consumers; enabled with `cfsctl init --erofs dual`). + Old tools that do not understand this flag treat the repository as + read-only rather than accidentally committing images in the wrong set + of formats. `v1_erofs` and `cfs_erofs_version` are independent and + may both be present. + When `meta.json` is present, `cfsctl` auto-detects the hash algorithm and errors if `--hash` is explicitly passed with a conflicting value. When the file is absent (for repositories created before this feature), `--hash` is honored as before and defaults to `sha512`. +### `cfsctl init --erofs` + +The `--erofs` flag controls which EROFS format versions are generated when +images are committed to the repository. It maps to the `v1_erofs` feature +flag in `meta.json`: + +``` +cfsctl init --erofs v1 # default: generate only V1 EROFS (C-tool compatible) +cfsctl init --erofs dual # generate both V1 and V2 EROFS (bootc mode) +``` + +Omitting `--erofs` is equivalent to `--erofs v1`. The `dual` mode is +intended for consumers such as bootc that need to serve both the C-tool +compatible V1 format and the composefs-rs native V2 format from the same +repository. + +Upgrading an existing repository from `v1` to `dual` by re-running +`cfsctl init --erofs dual` is explicitly supported: `meta.json` is +rewritten in place to clear the `v1_erofs` flag. Downgrading in the +reverse direction (from `dual` back to `v1`) is rejected with an error, +because silently stopping V2 generation could leave sealed UKIs or other +consumers without the EROFS image format they depend on. + ## `objects/` This is where the content-addressed data is stored. The immediate children of diff --git a/examples/bls/extra/usr/lib/dracut/modules.d/37composefs/composefs-setup-root.service b/examples/bls/extra/usr/lib/dracut/modules.d/37composefs/composefs-setup-root.service index ffc404d6..ad9b5532 100644 --- a/examples/bls/extra/usr/lib/dracut/modules.d/37composefs/composefs-setup-root.service +++ b/examples/bls/extra/usr/lib/dracut/modules.d/37composefs/composefs-setup-root.service @@ -15,7 +15,8 @@ [Unit] DefaultDependencies=no -ConditionKernelCommandLine=composefs +ConditionKernelCommandLine=|composefs +ConditionKernelCommandLine=|composefs.digest ConditionPathExists=/etc/initrd-release After=sysroot.mount Requires=sysroot.mount diff --git a/examples/bls/extra/usr/lib/initcpio/hooks/composefs b/examples/bls/extra/usr/lib/initcpio/hooks/composefs index 775ea403..21f04379 100644 --- a/examples/bls/extra/usr/lib/initcpio/hooks/composefs +++ b/examples/bls/extra/usr/lib/initcpio/hooks/composefs @@ -3,7 +3,11 @@ run_latehook() { local composefs - composefs="$(getarg composefs)" + # composefs.digest= is the V1 EROFS karg; composefs= is the legacy V2 karg. + composefs="$(getarg composefs.digest)" + if [ -z "$composefs" ]; then + composefs="$(getarg composefs)" + fi if [ -z "$composefs" ]; then return 0 fi diff --git a/examples/common/fix-verity/dracut-hook.sh b/examples/common/fix-verity/dracut-hook.sh index 44d01532..3110198b 100644 --- a/examples/common/fix-verity/dracut-hook.sh +++ b/examples/common/fix-verity/dracut-hook.sh @@ -1,11 +1,13 @@ # dracut hook for fixing fs-verity on composefs sysroot mount -o remount,rw /sysroot ( - cd /sysroot/composefs/objects + cd /sysroot/composefs echo >&2 'Enabling fsverity on composefs objects' - for i in */*; do - fsverity enable $i; + for i in objects/*/*; do + fsverity enable "$i" done + echo >&2 'Enabling fsverity on meta.json' + fsverity enable meta.json echo >&2 'done!' ) umount /sysroot diff --git a/examples/common/fix-verity/fix-verity b/examples/common/fix-verity/fix-verity index 788c9796..e9672bd7 100755 --- a/examples/common/fix-verity/fix-verity +++ b/examples/common/fix-verity/fix-verity @@ -18,10 +18,29 @@ if [ ! -f ${fix_verity_efi} ]; then mv "${fix_verity_efi}.tmp" "${fix_verity_efi}" fi +ovmf_code="" +ovmf_vars="" +for d in /usr/share/edk2/ovmf /usr/share/OVMF /usr/share/ovmf /usr/share/edk2/x64; do + if [ -f "$d/OVMF_CODE.fd" ] && [ -f "$d/OVMF_VARS.fd" ]; then + ovmf_code="$d/OVMF_CODE.fd" + ovmf_vars="$d/OVMF_VARS.fd" + break + fi +done + +ovmf_vars_tmp="" +if [ -n "$ovmf_code" ]; then + ovmf_vars_tmp="$(mktemp --suffix=.fd)" + cp "$ovmf_vars" "$ovmf_vars_tmp" + trap 'rm -f "$ovmf_vars_tmp"' EXIT +fi + qemu-system-x86_64 \ -nographic \ -m 4096 \ -enable-kvm \ - -bios /usr/share/edk2/ovmf/OVMF_CODE.fd \ + -machine q35 \ + ${ovmf_code:+-drive if=pflash,format=raw,readonly=on,file="$ovmf_code"} \ + ${ovmf_vars_tmp:+-drive if=pflash,format=raw,file="$ovmf_vars_tmp"} \ -drive file="$1",format=raw,if=virtio,media=disk \ -kernel "${fix_verity_efi}" diff --git a/examples/testthing.py b/examples/testthing.py index 9ec191d0..2631c4e9 100644 --- a/examples/testthing.py +++ b/examples/testthing.py @@ -184,19 +184,36 @@ def _find_qemu() -> Path: raise FileNotFoundError("Unable to find qemu-kvm") -def _find_ovmf() -> tuple[str, Path]: - candidates = [ - # path for Fedora/RHEL (our tasks container) - "/usr/share/OVMF/OVMF_CODE.fd", +def _find_ovmf() -> tuple[str | tuple[str, str], ...]: + # Prefer split CODE+VARS pflash files (required on RHEL10/CentOS10 QEMU + # where -bios with the combined file hangs). Fall back to -bios with a + # combined image for Ubuntu CI and Arch. + split_candidates = [ + ("/usr/share/edk2/ovmf/OVMF_CODE.fd", "/usr/share/edk2/ovmf/OVMF_VARS.fd"), + ("/usr/share/OVMF/OVMF_CODE.fd", "/usr/share/OVMF/OVMF_VARS.fd"), + ] + for code, varst in split_candidates: + if Path(code).exists() and Path(varst).exists(): + # Copy VARS so UEFI can write to it without modifying the original. + import tempfile, shutil, atexit + tmp = tempfile.NamedTemporaryFile(suffix=".fd", delete=False) + shutil.copy2(varst, tmp.name) + atexit.register(lambda p=tmp.name: Path(p).unlink(missing_ok=True)) + return ( + ("-machine", "q35"), + ("-drive", f"if=pflash,format=raw,readonly=on,file={code}"), + ("-drive", f"if=pflash,format=raw,file={tmp.name}"), + ) + + bios_candidates = [ # path for Ubuntu (GitHub Actions runners) "/usr/share/ovmf/OVMF.fd", # path for Arch "/usr/share/edk2/x64/OVMF.4m.fd", ] - - for path in map(Path, candidates): + for path in map(Path, bios_candidates): if path.exists(): - return "-bios", path + return (("-bios", str(path)),) raise FileNotFoundError("Unable to find OVMF UEFI BIOS") @@ -391,7 +408,7 @@ def __init__( sit: bool = False, snapshot: bool = True, status_messages: bool = False, - timeout: float = 30.0, + timeout: float = 60.0, verbose: bool = False, ) -> None: """Construct a VM. @@ -618,7 +635,7 @@ async def _qemu( args = ( _find_qemu(), "-nodefaults", - _find_ovmf(), + *_find_ovmf(), ("-cpu", "host"), ("-smp", f"{self._cpus}"), ("-m", f"{self._memory}"), @@ -1060,7 +1077,7 @@ def _main() -> None: "--ssh-key", "-i", type=Path, help="Path to SSH private key (default: generate)" ) parser.add_argument( - "--timeout", type=float, help="For startup, in seconds, or 'inf' (default: 30)" + "--timeout", type=float, help="For startup, in seconds, or 'inf' (default: 60)" ) parser.add_argument("image", type=Path, help="The path to a qcow2 VM image to run") args = parser.parse_args() diff --git a/examples/uki/Containerfile b/examples/uki/Containerfile index 3f31bf73..1f30a072 100644 --- a/examples/uki/Containerfile +++ b/examples/uki/Containerfile @@ -9,9 +9,9 @@ # changes may be made vs. the base image. This is best-accomplished with a # multi-stage build. # -# - during the build stages following 'base', the `COMPOSEFS_FSVERITY` build -# arg will be set to the fsverity digest of the container image. This should -# be baked into the UKI. +# - during the build stages following 'base', the `COMPOSEFS_KARG` build +# arg will be set to the composefs kernel argument string (e.g. +# composefs.digest=). This should be baked into the UKI. FROM fedora:43 AS base RUN --mount=type=cache,target=/var/cache/libdnf5 < /etc/kernel/cmdline + echo "${COMPOSEFS_KARG} rw" > /etc/kernel/cmdline kernel-install add-all EOF diff --git a/examples/uki/Containerfile.arch b/examples/uki/Containerfile.arch index 2283c539..acfbb89e 100644 --- a/examples/uki/Containerfile.arch +++ b/examples/uki/Containerfile.arch @@ -30,10 +30,10 @@ RUN < /etc/kernel/cmdline + echo "root=/dev/vda2 ${COMPOSEFS_KARG} rw" > /etc/kernel/cmdline mkinitcpio -p linux EOF diff --git a/examples/uki/build b/examples/uki/build index dfc3eb30..bf8b84a8 100755 --- a/examples/uki/build +++ b/examples/uki/build @@ -41,13 +41,13 @@ ${PODMAN_BUILD} \ BASE_ID="$(cat tmp/base.iid)" ${CFSCTL} oci pull containers-storage:"${BASE_ID}" -BASE_IMAGE_FSVERITY="$(${CFSCTL} oci compute-id --bootable "@${BASE_ID}")" +BASE_KARG="$(${CFSCTL} oci composefs-digest-karg "@${BASE_ID}")" ${PODMAN_BUILD} \ --iidfile=tmp/final.iid \ --build-context=base="container-image://${BASE_ID}" \ - --build-arg=COMPOSEFS_FSVERITY="${BASE_IMAGE_FSVERITY}" \ - --label=containers.composefs.fsverity="${BASE_IMAGE_FSVERITY}" \ + --build-arg=COMPOSEFS_KARG="${BASE_KARG}" \ + --label=containers.composefs.karg="${BASE_KARG}" \ -f "${containerfile}" \ . diff --git a/examples/uki/extra/usr/lib/dracut/modules.d/37composefs/composefs-setup-root.service b/examples/uki/extra/usr/lib/dracut/modules.d/37composefs/composefs-setup-root.service index ffc404d6..ad9b5532 100644 --- a/examples/uki/extra/usr/lib/dracut/modules.d/37composefs/composefs-setup-root.service +++ b/examples/uki/extra/usr/lib/dracut/modules.d/37composefs/composefs-setup-root.service @@ -15,7 +15,8 @@ [Unit] DefaultDependencies=no -ConditionKernelCommandLine=composefs +ConditionKernelCommandLine=|composefs +ConditionKernelCommandLine=|composefs.digest ConditionPathExists=/etc/initrd-release After=sysroot.mount Requires=sysroot.mount diff --git a/examples/uki/extra/usr/lib/initcpio/hooks/composefs b/examples/uki/extra/usr/lib/initcpio/hooks/composefs index 775ea403..21f04379 100644 --- a/examples/uki/extra/usr/lib/initcpio/hooks/composefs +++ b/examples/uki/extra/usr/lib/initcpio/hooks/composefs @@ -3,7 +3,11 @@ run_latehook() { local composefs - composefs="$(getarg composefs)" + # composefs.digest= is the V1 EROFS karg; composefs= is the legacy V2 karg. + composefs="$(getarg composefs.digest)" + if [ -z "$composefs" ]; then + composefs="$(getarg composefs)" + fi if [ -z "$composefs" ]; then return 0 fi diff --git a/examples/unified-secureboot/Containerfile b/examples/unified-secureboot/Containerfile index 8472d046..945e5e4d 100644 --- a/examples/unified-secureboot/Containerfile +++ b/examples/unified-secureboot/Containerfile @@ -45,10 +45,10 @@ RUN --mount=type=bind,from=base,target=/mnt/base < /etc/kernel/cmdline + echo "${COMPOSEFS_KARG} rw" > /etc/kernel/cmdline EOF RUN --mount=type=cache,target=/var/cache/libdnf5 \ --mount=type=secret,id=key \ diff --git a/examples/unified-secureboot/extra/usr/lib/dracut/modules.d/37composefs/composefs-setup-root.service b/examples/unified-secureboot/extra/usr/lib/dracut/modules.d/37composefs/composefs-setup-root.service index ffc404d6..ad9b5532 100644 --- a/examples/unified-secureboot/extra/usr/lib/dracut/modules.d/37composefs/composefs-setup-root.service +++ b/examples/unified-secureboot/extra/usr/lib/dracut/modules.d/37composefs/composefs-setup-root.service @@ -15,7 +15,8 @@ [Unit] DefaultDependencies=no -ConditionKernelCommandLine=composefs +ConditionKernelCommandLine=|composefs +ConditionKernelCommandLine=|composefs.digest ConditionPathExists=/etc/initrd-release After=sysroot.mount Requires=sysroot.mount diff --git a/examples/unified/Containerfile b/examples/unified/Containerfile index da113a2b..51986a13 100644 --- a/examples/unified/Containerfile +++ b/examples/unified/Containerfile @@ -43,10 +43,10 @@ RUN --mount=type=bind,from=base,target=/mnt/base < /etc/kernel/cmdline + echo "${COMPOSEFS_KARG} rw" > /etc/kernel/cmdline kernel-install add-all EOF diff --git a/examples/unified/extra/usr/lib/dracut/modules.d/37composefs/composefs-setup-root.service b/examples/unified/extra/usr/lib/dracut/modules.d/37composefs/composefs-setup-root.service index ffc404d6..ad9b5532 100644 --- a/examples/unified/extra/usr/lib/dracut/modules.d/37composefs/composefs-setup-root.service +++ b/examples/unified/extra/usr/lib/dracut/modules.d/37composefs/composefs-setup-root.service @@ -15,7 +15,8 @@ [Unit] DefaultDependencies=no -ConditionKernelCommandLine=composefs +ConditionKernelCommandLine=|composefs +ConditionKernelCommandLine=|composefs.digest ConditionPathExists=/etc/initrd-release After=sysroot.mount Requires=sysroot.mount