From 0106df58aea1a7035c9260eabfdd789982bba814 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Sun, 3 May 2026 10:55:28 -0400 Subject: [PATCH 1/9] fs,repository: Add write concurrency control and semaphore override Add set_write_concurrency() to Repository for overriding the default parallelism. Add read_filesystem_with_semaphore() as a public entry point that accepts an explicit Semaphore, and refactor the internal read_filesystem_impl() to centralize semaphore selection. Prep for wiring up --threads in mkcomposefs. Assisted-by: OpenCode (Claude Sonnet 4.6) Signed-off-by: Colin Walters --- crates/composefs/src/fs.rs | 42 +++++++++++++++++++++++++----- crates/composefs/src/repository.rs | 34 ++++++++++++++++++++++-- 2 files changed, 67 insertions(+), 9 deletions(-) diff --git a/crates/composefs/src/fs.rs b/crates/composefs/src/fs.rs index 76ac2f3a..f4380f93 100644 --- a/crates/composefs/src/fs.rs +++ b/crates/composefs/src/fs.rs @@ -533,18 +533,46 @@ pub fn read_file( /// /// If `repo` is `Some`, file objects are stored in the repository. /// If `None`, fsverity digests are computed without writing to disk. +/// +/// Concurrency is bounded by a semaphore derived from the repository (if any) +/// or from [`available_parallelism`]. To supply an explicit semaphore, use +/// [`read_filesystem_with_semaphore`]. pub async fn read_filesystem( dirfd: OwnedFd, path: PathBuf, repo: Option>>, ) -> Result> { - let semaphore = repo - .as_ref() - .map(|r| r.write_semaphore()) - .unwrap_or_else(|| { - let n = available_parallelism().map(|n| n.get()).unwrap_or(4); - Arc::new(Semaphore::new(n)) - }); + read_filesystem_impl(dirfd, path, repo, None).await +} + +/// Like [`read_filesystem`] but with an explicit concurrency limit. +/// +/// The given `semaphore` overrides the default parallelism derived from +/// the repository or [`available_parallelism`]. This is the recommended way to +/// honour a user-supplied `--threads` argument when no repository is present. 
+pub async fn read_filesystem_with_semaphore( + dirfd: OwnedFd, + path: PathBuf, + repo: Option>>, + semaphore: Arc, +) -> Result> { + read_filesystem_impl(dirfd, path, repo, Some(semaphore)).await +} + +async fn read_filesystem_impl( + dirfd: OwnedFd, + path: PathBuf, + repo: Option>>, + semaphore_override: Option>, +) -> Result> { + let semaphore = semaphore_override.unwrap_or_else(|| { + repo.as_ref() + .map(|r| r.write_semaphore()) + .unwrap_or_else(|| { + let n = available_parallelism().map(|n| n.get()).unwrap_or(4); + Arc::new(Semaphore::new(n)) + }) + }); // Channel for streaming work items from the scan thread to the // async runtime. The scan sends (key, fd, size) as files are diff --git a/crates/composefs/src/repository.rs b/crates/composefs/src/repository.rs index 73047eed..a569dd06 100644 --- a/crates/composefs/src/repository.rs +++ b/crates/composefs/src/repository.rs @@ -735,6 +735,10 @@ pub struct Repository { repository: OwnedFd, objects: OnceCell, write_semaphore: OnceCell>, + /// Optional override for the number of concurrent object writes. + /// Set via [`set_write_concurrency`](Self::set_write_concurrency) before the semaphore + /// is first used; if `None`, defaults to [`available_parallelism`]. + write_concurrency: Option, insecure: bool, metadata: RepoMetadata, /// When true, SplitStreamWriter::done() writes old-format (pre-repr(C)) @@ -1028,15 +1032,40 @@ impl Repository { .get_or_try_init(|| ensure_dir_and_openat(&self.repository, "objects", OFlags::PATH)) } + /// Override the maximum number of concurrent object writes. + /// + /// Must be called before the first use of [`write_semaphore`](Self::write_semaphore); + /// a later call panics via `debug_assert!` when debug assertions are enabled, and is otherwise ignored with a logged warning. + pub fn set_write_concurrency(&mut self, n: usize) { + // Guard: the semaphore is lazily initialized on first use. If it's + // already been initialized, this call has no effect. Callers must + // set concurrency before any write operations begin. 
+ debug_assert!( + self.write_semaphore.get().is_none(), + "set_write_concurrency called after write_semaphore was already initialized; \ + call this before any write operations" + ); + if self.write_semaphore.get().is_some() { + log::warn!( + "set_write_concurrency called after semaphore was already initialized; ignoring" + ); + return; + } + self.write_concurrency = Some(n); + } + /// Return a shared semaphore for limiting concurrent object writes. /// - /// This semaphore is lazily initialized with `available_parallelism()` permits, + /// This semaphore is lazily initialized with `available_parallelism()` permits + /// (or the value set via [`set_write_concurrency`](Self::set_write_concurrency)), /// and shared across all operations on this repository. Use this to limit /// concurrent I/O when processing multiple files or layers in parallel. pub fn write_semaphore(&self) -> Arc { self.write_semaphore .get_or_init(|| { - let max_concurrent = available_parallelism().map(|n| n.get()).unwrap_or(4); + let max_concurrent = self + .write_concurrency + .unwrap_or_else(|| available_parallelism().map(|n| n.get()).unwrap_or(4)); Arc::new(Semaphore::new(max_concurrent)) }) .clone() @@ -1152,6 +1181,7 @@ impl Repository { repository, objects: OnceCell::new(), write_semaphore: OnceCell::new(), + write_concurrency: None, insecure: !has_verity, metadata, #[cfg(any(test, feature = "test"))] From a05a54d3d2e1cdea3c91c9300cc000a0b7d962f5 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Sun, 17 May 2026 18:08:45 -0400 Subject: [PATCH 2/9] just: Fix bootc::patch recipe crate path The patch recipe referenced crates/cfsctl which was never a valid path; the crate has always been named composefs-ctl. Also relax the clean-tree check to allow untracked files (only committed changes need to match the pinned revision). 
Assisted-by: OpenCode (Claude Sonnet 4.6) Signed-off-by: Colin Walters --- bootc/Justfile | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/bootc/Justfile b/bootc/Justfile index 8ad4a840..7856f8fb 100644 --- a/bootc/Justfile +++ b/bootc/Justfile @@ -51,23 +51,25 @@ patch: clone #!/bin/bash set -euo pipefail - # Require a clean composefs-rs working tree so we test a real commit + # Require a clean composefs-rs working tree so we test a real commit. + # Only tracked files matter; untracked files are allowed. + # git diff HEAD already excludes untracked files. if ! git -C "$_COMPOSEFS_SRC" diff --quiet HEAD 2>/dev/null; then echo "error: composefs-rs has uncommitted changes — commit or stash first" >&2 - git -C "$_COMPOSEFS_SRC" status --short >&2 + git -C "$_COMPOSEFS_SRC" diff --stat HEAD >&2 exit 1 fi - cfs_path="$_COMPOSEFS_SRC/crates/cfsctl" + cfs_path="$_COMPOSEFS_SRC/crates/composefs-ctl" cd "$COMPOSEFS_BOOTC_PATH" # Add or update the [patch] section with a path override - patch_value="cfsctl = { path = \"${cfs_path}\" } # Patched by composefs-rs" + patch_value="composefs-ctl = { path = \"${cfs_path}\" } # Patched by composefs-rs" if grep -q '^[[:space:]]*\[patch\."https://github.com/composefs/composefs-rs"\]' Cargo.toml; then - # Patch section already exists (uncommented) — replace the cfsctl line + # Patch section already exists (uncommented) — replace the composefs-ctl line sed -i '/^[[:space:]]*\[patch\."https:\/\/github.com\/composefs\/composefs-rs"\]/,/^$\|^\[/{ - s|^cfsctl = .*|'"$patch_value"'| + s|^composefs-ctl = .*|'"$patch_value"'| }' Cargo.toml else # No patch section yet — append one From 9e1d3776a57b60dcbf13a16c417d98c92589b4ab Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Sun, 17 May 2026 09:30:11 -0400 Subject: [PATCH 3/9] oci: Check repo writability before opening OCI layout source import_oci_layout() was opening the layout directory before calling ensure_writable(), so pulling into a read-only repo produced 
a misleading ENOENT error instead of a clear 'not writable' message. Move the write check to the top of the function, matching the existing skopeo pull path. Fixes privileged_pull_readonly_repo integration test. Signed-off-by: Colin Walters --- crates/composefs-oci/src/oci_layout.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crates/composefs-oci/src/oci_layout.rs b/crates/composefs-oci/src/oci_layout.rs index 08bb72f6..b857c7cc 100644 --- a/crates/composefs-oci/src/oci_layout.rs +++ b/crates/composefs-oci/src/oci_layout.rs @@ -78,6 +78,10 @@ pub async fn import_oci_layout( layout_path: &Path, layout_tag: Option<&str>, ) -> Result<(PullResult, ImportStats)> { + // Check writability before touching the source, so a read-only repo gives + // a clear "not writable" error rather than a misleading source-open error. + repo.ensure_writable()?; + // Open the OCI layout directory let dir = cap_std::fs::Dir::open_ambient_dir(layout_path, cap_std::ambient_authority()) .with_context(|| format!("Opening OCI layout directory {}", layout_path.display()))?; From e82e720116432603e8781f986907fdd8da96772d Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Sun, 3 May 2026 10:55:47 -0400 Subject: [PATCH 4/9] fs: Add ObjectStore trait and FlatDigestStore For compatibility with the C composefs, we need to support writing directly to a flat XX/DIGEST path, without a leading `objects/`. 
Assisted-by: OpenCode (Claude Sonnet 4.6) Signed-off-by: Colin Walters --- crates/composefs/src/fs.rs | 283 +++++++++++++++++++++++++++++++++-- crates/composefs/src/util.rs | 17 +++ 2 files changed, 291 insertions(+), 9 deletions(-) diff --git a/crates/composefs/src/fs.rs b/crates/composefs/src/fs.rs index f4380f93..790537e0 100644 --- a/crates/composefs/src/fs.rs +++ b/crates/composefs/src/fs.rs @@ -39,9 +39,201 @@ use crate::{ repository::Repository, shared_internals::IO_BUF_CAPACITY, tree::{Directory, FileSystem, Inode, Leaf, LeafContent, RegularFile, Stat}, - util::proc_self_fd, + util::{create_tmpfile_in, proc_self_fd, reopen_tmpfile_ro}, }; +// --------------------------------------------------------------------------- +// ObjectStore trait +// --------------------------------------------------------------------------- + +/// An abstraction over content-addressed storage for file objects. +/// +/// Both [`Repository`] and the C-compatible [`FlatDigestStore`] implement +/// this trait so that [`read_filesystem`] can write file content to either +/// layout without duplicating the scanning logic. +pub trait ObjectStore: Send + Sync { + /// Store `fd` as an object, returning its verity digest. + /// + /// If an object with the same digest already exists, this is a no-op + /// and the existing digest is returned. + fn ensure_object_from_fd(&self, fd: OwnedFd, size: u64) -> Result; + + /// Return a semaphore that gates concurrent object writes. + fn write_semaphore(&self) -> Arc; +} + +impl ObjectStore for Repository { + fn ensure_object_from_fd(&self, fd: OwnedFd, size: u64) -> Result { + self.ensure_object_from_fd(fd, size) + } + + fn write_semaphore(&self) -> Arc { + self.write_semaphore() + } +} + +/// C-compatible flat digest store (`/XX/DIGEST`). +/// +/// This mirrors the layout written by `mkcomposefs --digest-store` from the C +/// implementation, where file objects live at `//` +/// (e.g. `/ab/abcdef01234...`). 
This is distinct from the composefs-rs +/// [`Repository`] layout which nests objects under an `objects/` subdirectory. +/// +/// The flat layout makes the digest store interchangeable with the C tooling. +#[derive(Debug)] +pub struct FlatDigestStore { + /// Open directory fd for the store root. + root: Arc, + semaphore: Arc, + /// If true, fall back to userspace hashing when kernel fs-verity is + /// unavailable (e.g. tmpfs, overlayfs). Matches `Repository::insecure`. + insecure: bool, +} + +impl FlatDigestStore { + /// Open or create a flat digest store at `path`. + /// + /// `concurrency` controls how many concurrent object writes are permitted. + /// `insecure` enables userspace-hashing fallback when fs-verity is unavailable + /// (e.g. on tmpfs or overlayfs). Set to `true` for CLI use where the filesystem + /// may not support verity; set to `false` for strict security requirements. + pub fn open(path: &Path, concurrency: usize, insecure: bool) -> Result { + use rustix::fs::{Mode, mkdirat}; + + match mkdirat(CWD, path, Mode::from_raw_mode(0o755)) { + Ok(()) | Err(rustix::io::Errno::EXIST) => {} + Err(e) => { + return Err(e) + .with_context(|| format!("Failed to create flat digest store: {path:?}")); + } + } + + let root = openat( + CWD, + path, + OFlags::RDONLY | OFlags::DIRECTORY | OFlags::CLOEXEC, + Mode::empty(), + ) + .with_context(|| format!("Failed to open flat digest store: {path:?}"))?; + + Ok(Self { + root: Arc::new(root), + semaphore: Arc::new(Semaphore::new(concurrency)), + insecure, + }) + } +} + +impl ObjectStore for FlatDigestStore { + fn ensure_object_from_fd(&self, fd: OwnedFd, size: u64) -> Result { + use crate::fsverity::{EnableVerityError, enable_verity_maybe_copy, measure_verity}; + use std::io::BufRead as _; + + // 1. Create an anonymous O_TMPFILE in the store root. + // No name collision possible; invisible until linked. 
+ let tmpfile_fd = create_tmpfile_in(self.root.as_fd()) + .context("Creating O_TMPFILE in flat digest store")?; + + // 2. Stream from source fd into tmpfile (no in-memory buffering). + let mut src = std::io::BufReader::with_capacity(IO_BUF_CAPACITY, File::from(fd)); + let mut dst = File::from(tmpfile_fd.try_clone().context("Cloning tmpfile fd")?); + let copied = std::io::copy(&mut src, &mut dst).context("Copying object data to tmpfile")?; + ensure!( + copied == size, + "object size mismatch: expected {size}, copied {copied}" + ); + drop(dst); + + // 3. Reopen as read-only (kernel requires no writable fds to enable verity). + let ro_fd = + reopen_tmpfile_ro(File::from(tmpfile_fd)).context("Reopening tmpfile as read-only")?; + + // 4. Enable kernel fs-verity (kernel reads and hashes the file for us). + let (ro_fd, verity_enabled) = + match enable_verity_maybe_copy::(self.root.as_fd(), ro_fd.as_fd()) { + Ok(None) => (ro_fd, true), + Ok(Some(new_fd)) => (new_fd, true), + Err(EnableVerityError::AlreadyEnabled) => (ro_fd, true), + Err(EnableVerityError::FilesystemNotSupported) if self.insecure => (ro_fd, false), + Err(e) => { + return Err(anyhow::anyhow!(e)).context("Enabling verity on object tmpfile"); + } + }; + + // 5. Get the digest — from the kernel (fast) or userspace fallback. + let id: ObjectID = if verity_enabled { + measure_verity(&ro_fd).context("Measuring verity digest after enable")? + } else { + // Insecure fallback: re-read the tmpfile to compute the digest. + let mut reader = std::io::BufReader::with_capacity( + IO_BUF_CAPACITY, + File::from(ro_fd.try_clone().context("Cloning ro_fd for digest")?), + ); + let mut hasher = FsVerityHasher::::new(); + loop { + let buf = reader.fill_buf().context("Reading tmpfile for digest")?; + if buf.is_empty() { + break; + } + let chunk = &buf[..buf.len().min(FsVerityHasher::::BLOCK_SIZE)]; + hasher.add_block(chunk); + let n = chunk.len(); + reader.consume(n); + } + hasher.digest() + }; + + // 6. 
Derive flat path: XX/rest-of-hex (C-compatible layout). + let obj_path = id.to_object_pathname(); + let slash = obj_path + .find('/') + .expect("to_object_pathname always has '/'"); + let dir_name = &obj_path[..slash]; + let file_name = &obj_path[slash + 1..]; + + // 7. Create XX/ subdirectory if needed. + match mkdirat(self.root.as_fd(), dir_name, Mode::from_raw_mode(0o755)) { + Ok(()) | Err(rustix::io::Errno::EXIST) => {} + Err(e) => { + return Err(e) + .with_context(|| format!("Creating digest store subdirectory {dir_name:?}")); + } + } + + // 8. Open the XX/ subdirectory for use as linkat target. + let subdir = openat( + self.root.as_fd(), + dir_name, + OFlags::RDONLY | OFlags::DIRECTORY | OFlags::CLOEXEC, + Mode::empty(), + ) + .with_context(|| format!("Opening digest store subdirectory {dir_name:?}"))?; + + // 9. Atomically link the tmpfile into its final content-addressed path. + // EEXIST means another writer already stored the same object — fine. + match linkat( + CWD, + proc_self_fd(&ro_fd), + &subdir, + file_name, + AtFlags::SYMLINK_FOLLOW, + ) { + Ok(()) | Err(rustix::io::Errno::EXIST) => {} + Err(e) => { + return Err(e).with_context(|| { + format!("Linking object into flat digest store: {obj_path:?}") + }); + } + } + + Ok(id) + } + + fn write_semaphore(&self) -> Arc { + self.semaphore.clone() + } +} + /// Attempt to use O_TMPFILE + rename to atomically set file contents. /// Will fall back to a non-atomic write if the target doesn't support O_TMPFILE. #[context("Setting file contents for {}", name.to_string_lossy())] @@ -542,7 +734,21 @@ pub async fn read_filesystem( path: PathBuf, repo: Option>>, ) -> Result> { - read_filesystem_impl(dirfd, path, repo, None).await + let store: Option>> = + repo.map(|r| r as Arc>); + read_filesystem_impl(dirfd, path, store, None).await +} + +/// Like [`read_filesystem`] but accepts any [`ObjectStore`] implementation. +/// +/// This is the preferred entry point when using a custom store (e.g. 
+/// [`FlatDigestStore`] for C-compatible `--digest-store` behaviour). +pub async fn read_filesystem_with_store( + dirfd: OwnedFd, + path: PathBuf, + store: Option>>, +) -> Result> { + read_filesystem_impl(dirfd, path, store, None).await } /// Like [`read_filesystem`] but with an explicit concurrency limit. @@ -556,18 +762,21 @@ pub async fn read_filesystem_with_semaphore( repo: Option>>, semaphore: Arc, ) -> Result> { - read_filesystem_impl(dirfd, path, repo, Some(semaphore)).await + let store: Option>> = + repo.map(|r| r as Arc>); + read_filesystem_impl(dirfd, path, store, Some(semaphore)).await } async fn read_filesystem_impl( dirfd: OwnedFd, path: PathBuf, - repo: Option>>, + store: Option>>, semaphore_override: Option>, ) -> Result> { let semaphore = semaphore_override.unwrap_or_else(|| { - repo.as_ref() - .map(|r| r.write_semaphore()) + store + .as_ref() + .map(|s| s.write_semaphore()) .unwrap_or_else(|| { let n = available_parallelism().map(|n| n.get()).unwrap_or(4); Arc::new(Semaphore::new(n)) @@ -626,11 +835,11 @@ async fn read_filesystem_impl( item = items.next(), if items_open => { match item { Some(((key, fd, size), permit)) => { - let repo = repo.clone(); + let store = store.clone(); tasks.spawn_blocking(move || { let _permit = permit; - let id = if let Some(repo) = repo { - repo.ensure_object_from_fd(fd, size)? + let id = if let Some(store) = store { + store.ensure_object_from_fd(fd, size)? } else { compute_verity_from_fd::(fd)? }; @@ -724,4 +933,60 @@ mod tests { assert_eq!(std::fs::read(testpath)?, b"new contents"); Ok(()) } + + /// Verify that `FlatDigestStore` stores objects in the C-compatible `XX/DIGEST` layout. + #[test] + fn test_flat_digest_store_layout() -> Result<()> { + use crate::fsverity::Sha256HashValue; + + let td = tempfile::tempdir()?; + let store_path = td.path().join("store"); + let store = FlatDigestStore::open(&store_path, 1, true)?; + + // Store a small piece of content. 
+ let content = b"hello, flat digest store!"; + let src_dir = tempfile::tempdir()?; + let src_path = src_dir.path().join("file"); + std::fs::write(&src_path, content)?; + let src_fd = openat( + CWD, + &src_path, + OFlags::RDONLY | OFlags::CLOEXEC, + Mode::from_raw_mode(0), + )?; + + let id = >::ensure_object_from_fd( + &store, + src_fd, + content.len() as u64, + )?; + + // Verify the layout: store/XX/rest-of-digest + let expected_path = id.to_object_pathname(); // e.g. "ab/cdef0123..." + let full_path = store_path.join(&expected_path); + assert!( + full_path.exists(), + "Expected object at flat path {full_path:?}" + ); + + // Verify content is intact. + let stored = std::fs::read(&full_path)?; + assert_eq!(stored, content); + + // Idempotent: storing the same object again should succeed. + let src_fd2 = openat( + CWD, + &src_path, + OFlags::RDONLY | OFlags::CLOEXEC, + Mode::from_raw_mode(0), + )?; + let id2 = >::ensure_object_from_fd( + &store, + src_fd2, + content.len() as u64, + )?; + assert_eq!(id, id2); + + Ok(()) + } } diff --git a/crates/composefs/src/util.rs b/crates/composefs/src/util.rs index 7a5ac23e..bdc43485 100644 --- a/crates/composefs/src/util.rs +++ b/crates/composefs/src/util.rs @@ -72,6 +72,23 @@ pub(crate) fn reopen_tmpfile_ro(file: std::fs::File) -> std::io::Result rustix::io::Result { + rustix::fs::openat( + dirfd, + ".", + rustix::fs::OFlags::RDWR | rustix::fs::OFlags::TMPFILE | rustix::fs::OFlags::CLOEXEC, + rustix::fs::Mode::from_raw_mode(0o644), + ) +} + /// This function reads the exact amount of bytes required to fill the buffer, possibly performing /// multiple reads to do so (and also retrying if required to deal with EINTR). /// From 3d4f53c2d872ad879bc31b5492489eb158fdfb56 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Wed, 20 May 2026 10:15:47 -0400 Subject: [PATCH 5/9] examples: fix-verity: probe OVMF firmware paths, add q35 machine type The script hardcoded /usr/share/edk2/ovmf/OVMF_CODE.fd which is only present on Fedora. 
Probe a list of common paths (Ubuntu's ovmf package uses /usr/share/ovmf/OVMF.fd, Arch uses /usr/share/edk2/x64/OVMF.4m.fd) so the script works across distros without manual adjustment. Also add -machine q35, required on newer QEMU builds (e.g. RHEL10/CentOS Stream 10) where the default pc-i440fx machine type doesn't pair well with OVMF for EFI boot. Assisted-by: OpenCode (claude-sonnet-4-6@default) Signed-off-by: Colin Walters --- examples/common/fix-verity/fix-verity | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/examples/common/fix-verity/fix-verity b/examples/common/fix-verity/fix-verity index 788c9796..c6689c93 100755 --- a/examples/common/fix-verity/fix-verity +++ b/examples/common/fix-verity/fix-verity @@ -18,10 +18,17 @@ if [ ! -f ${fix_verity_efi} ]; then mv "${fix_verity_efi}.tmp" "${fix_verity_efi}" fi +ovmf="" +for p in /usr/share/OVMF/OVMF_CODE.fd /usr/share/ovmf/OVMF.fd /usr/share/edk2/ovmf/OVMF_CODE.fd /usr/share/edk2/x64/OVMF.4m.fd; do + if [ -f "$p" ]; then ovmf="$p"; break; fi +done +[ -n "$ovmf" ] || { echo "Cannot find OVMF firmware"; exit 1; } + qemu-system-x86_64 \ -nographic \ -m 4096 \ -enable-kvm \ - -bios /usr/share/edk2/ovmf/OVMF_CODE.fd \ + -machine q35 \ + -bios "${ovmf}" \ -drive file="$1",format=raw,if=virtio,media=disk \ -kernel "${fix_verity_efi}" From 1d71809867b165cd35ade09bf92f5ac53695c9d7 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Wed, 20 May 2026 10:54:22 -0400 Subject: [PATCH 6/9] examples: use OVMF pflash + q35 for local RHEL/CentOS QEMU compatibility The combined OVMF.qemuvars.fd with -bios hangs indefinitely on RHEL10/ CentOS Stream 10 QEMU (qemu-kvm 9.x). Use the split OVMF_CODE.fd + OVMF_VARS.fd files with -drive if=pflash and -machine q35 instead, which works correctly. In testthing.py, fall back to -bios with the combined image on distros that only ship the combined file (Ubuntu, Arch); the fix-verity script has no such fallback and passes no firmware option when the split files are absent. 
Updated both testthing.py (which drives the example integration tests) and the fix-verity helper script (which runs the in-VM verity fixup pass). A temporary copy of OVMF_VARS.fd is made so UEFI can write to it without modifying the original system file. Assisted-by: OpenCode (claude-sonnet-4-6@default) Signed-off-by: Colin Walters --- examples/common/fix-verity/fix-verity | 22 ++++++++++++++---- examples/testthing.py | 33 ++++++++++++++++++++------- 2 files changed, 42 insertions(+), 13 deletions(-) diff --git a/examples/common/fix-verity/fix-verity b/examples/common/fix-verity/fix-verity index c6689c93..e9672bd7 100755 --- a/examples/common/fix-verity/fix-verity +++ b/examples/common/fix-verity/fix-verity @@ -18,17 +18,29 @@ if [ ! -f ${fix_verity_efi} ]; then mv "${fix_verity_efi}.tmp" "${fix_verity_efi}" fi -ovmf="" -for p in /usr/share/OVMF/OVMF_CODE.fd /usr/share/ovmf/OVMF.fd /usr/share/edk2/ovmf/OVMF_CODE.fd /usr/share/edk2/x64/OVMF.4m.fd; do - if [ -f "$p" ]; then ovmf="$p"; break; fi +ovmf_code="" +ovmf_vars="" +for d in /usr/share/edk2/ovmf /usr/share/OVMF /usr/share/ovmf /usr/share/edk2/x64; do + if [ -f "$d/OVMF_CODE.fd" ] && [ -f "$d/OVMF_VARS.fd" ]; then + ovmf_code="$d/OVMF_CODE.fd" + ovmf_vars="$d/OVMF_VARS.fd" + break + fi done -[ -n "$ovmf" ] || { echo "Cannot find OVMF firmware"; exit 1; } + +ovmf_vars_tmp="" +if [ -n "$ovmf_code" ]; then + ovmf_vars_tmp="$(mktemp --suffix=.fd)" + cp "$ovmf_vars" "$ovmf_vars_tmp" + trap 'rm -f "$ovmf_vars_tmp"' EXIT +fi qemu-system-x86_64 \ -nographic \ -m 4096 \ -enable-kvm \ -machine q35 \ - -bios "${ovmf}" \ + ${ovmf_code:+-drive if=pflash,format=raw,readonly=on,file="$ovmf_code"} \ + ${ovmf_vars_tmp:+-drive if=pflash,format=raw,file="$ovmf_vars_tmp"} \ -drive file="$1",format=raw,if=virtio,media=disk \ -kernel "${fix_verity_efi}" diff --git a/examples/testthing.py b/examples/testthing.py index 9ec191d0..81e11c90 100644 --- a/examples/testthing.py +++ b/examples/testthing.py @@ -184,19 +184,36 @@ def 
_find_qemu() -> Path: raise FileNotFoundError("Unable to find qemu-kvm") -def _find_ovmf() -> tuple[str, Path]: - candidates = [ - # path for Fedora/RHEL (our tasks container) - "/usr/share/OVMF/OVMF_CODE.fd", +def _find_ovmf() -> tuple[str | tuple[str, str], ...]: + # Prefer split CODE+VARS pflash files (required on RHEL10/CentOS10 QEMU + # where -bios with the combined file hangs). Fall back to -bios with a + # combined image for Ubuntu CI and Arch. + split_candidates = [ + ("/usr/share/edk2/ovmf/OVMF_CODE.fd", "/usr/share/edk2/ovmf/OVMF_VARS.fd"), + ("/usr/share/OVMF/OVMF_CODE.fd", "/usr/share/OVMF/OVMF_VARS.fd"), + ] + for code, varst in split_candidates: + if Path(code).exists() and Path(varst).exists(): + # Copy VARS so UEFI can write to it without modifying the original. + import tempfile, shutil, atexit + tmp = tempfile.NamedTemporaryFile(suffix=".fd", delete=False) + shutil.copy2(varst, tmp.name) + atexit.register(lambda p=tmp.name: Path(p).unlink(missing_ok=True)) + return ( + ("-machine", "q35"), + ("-drive", f"if=pflash,format=raw,readonly=on,file={code}"), + ("-drive", f"if=pflash,format=raw,file={tmp.name}"), + ) + + bios_candidates = [ # path for Ubuntu (GitHub Actions runners) "/usr/share/ovmf/OVMF.fd", # path for Arch "/usr/share/edk2/x64/OVMF.4m.fd", ] - - for path in map(Path, candidates): + for path in map(Path, bios_candidates): if path.exists(): - return "-bios", path + return (("-bios", str(path)),) raise FileNotFoundError("Unable to find OVMF UEFI BIOS") @@ -618,7 +635,7 @@ async def _qemu( args = ( _find_qemu(), "-nodefaults", - _find_ovmf(), + *_find_ovmf(), ("-cpu", "host"), ("-smp", f"{self._cpus}"), ("-m", f"{self._memory}"), From 4113cc42850b05b5e4b7283de8c69e6cfc8bb237 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Wed, 20 May 2026 11:02:41 -0400 Subject: [PATCH 7/9] examples: fix-verity: enable fs-verity on meta.json composefs-setup-root validates that the repo's meta.json has fs-verity enabled before trusting the repo. 
The dracut hook was only enabling verity on the content objects, so setup-root would see the repo as insecure and refuse to proceed. Switch the working directory to /sysroot/composefs (instead of the objects subdirectory) so we can enable verity on meta.json in addition to all the content objects. Also quote the loop variable and use the full relative path for clarity. Assisted-by: OpenCode (claude-sonnet-4-6@default) Signed-off-by: Colin Walters --- examples/common/fix-verity/dracut-hook.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/common/fix-verity/dracut-hook.sh b/examples/common/fix-verity/dracut-hook.sh index 44d01532..3110198b 100644 --- a/examples/common/fix-verity/dracut-hook.sh +++ b/examples/common/fix-verity/dracut-hook.sh @@ -1,11 +1,13 @@ # dracut hook for fixing fs-verity on composefs sysroot mount -o remount,rw /sysroot ( - cd /sysroot/composefs/objects + cd /sysroot/composefs echo >&2 'Enabling fsverity on composefs objects' - for i in */*; do - fsverity enable $i; + for i in objects/*/*; do + fsverity enable "$i" done + echo >&2 'Enabling fsverity on meta.json' + fsverity enable meta.json echo >&2 'done!' ) umount /sysroot From a8d377aac9e0f340972301790160a3ee9bcd73fc Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Wed, 20 May 2026 11:03:16 -0400 Subject: [PATCH 8/9] examples: increase default VM startup timeout from 30s to 60s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 30s default is tight on slower hardware (e.g. CentOS Stream 10 with OVMF pflash init overhead) — the VM boots successfully but just barely misses the window. 60s gives enough headroom while still being short enough to catch genuinely broken VMs. CI on Ubuntu with KVM acceleration boots well under 30s so the extra budget costs nothing. 
Assisted-by: OpenCode (claude-sonnet-4-6@default) Signed-off-by: Colin Walters --- examples/testthing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/testthing.py b/examples/testthing.py index 81e11c90..2631c4e9 100644 --- a/examples/testthing.py +++ b/examples/testthing.py @@ -408,7 +408,7 @@ def __init__( sit: bool = False, snapshot: bool = True, status_messages: bool = False, - timeout: float = 30.0, + timeout: float = 60.0, verbose: bool = False, ) -> None: """Construct a VM. @@ -1077,7 +1077,7 @@ def _main() -> None: "--ssh-key", "-i", type=Path, help="Path to SSH private key (default: generate)" ) parser.add_argument( - "--timeout", type=float, help="For startup, in seconds, or 'inf' (default: 30)" + "--timeout", type=float, help="For startup, in seconds, or 'inf' (default: 60)" ) parser.add_argument("image", type=Path, help="The path to a qcow2 VM image to run") args = parser.parse_args() From 123ea7a094ec546366a175de46b254148973bf54 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Thu, 21 May 2026 18:27:42 -0400 Subject: [PATCH 9/9] composefs: Add V1 EROFS format with compat mkcomposefs and composefs-info CLI Add support for generating V1 EROFS images compatible with the C composefs tools (mkcomposefs/composefs-info 1.0.8+). V1 uses compact inodes, BFS layout, and a simpler on-disk structure. Adds --erofs-version flag to cfsctl, new mkcomposefs and composefs-info compatibility subcommands, and RepositoryConfig for cleaner repo initialization. Note: this commit does not compile with --features oci (the default) until the following commit migrates OCI crate callers. 
Assisted-by: OpenCode (Claude Sonnet 4.5) Signed-off-by: Colin Walters --- .gitignore | 2 + Justfile | 15 + crates/composefs-boot/src/lib.rs | 6 + crates/composefs-boot/src/selabel.rs | 2 + crates/composefs-ctl/src/composefs_info.rs | 331 ++++ crates/composefs-ctl/src/lib.rs | 108 +- crates/composefs-ctl/src/main.rs | 77 +- crates/composefs-ctl/src/mkcomposefs.rs | 409 ++++ crates/composefs-integration-tests/src/lib.rs | 12 +- .../src/tests/cli.rs | 10 +- .../src/tests/privileged.rs | 9 +- crates/composefs-oci/src/boot.rs | 3 +- crates/composefs-oci/src/image.rs | 6 +- crates/composefs-oci/src/lib.rs | 16 +- crates/composefs-oci/src/oci_layout.rs | 3 +- crates/composefs-oci/src/tar.rs | 8 +- crates/composefs-oci/src/test_util.rs | 14 +- crates/composefs/Cargo.toml | 1 + crates/composefs/fuzz/Cargo.lock | 16 +- crates/composefs/fuzz/generate_corpus.rs | 142 +- .../proptest-regressions/erofs/reader.txt | 8 + crates/composefs/src/dumpfile.rs | 83 +- crates/composefs/src/dumpfile_parse.rs | 11 +- crates/composefs/src/erofs/composefs.rs | 15 +- crates/composefs/src/erofs/debug.rs | 123 +- crates/composefs/src/erofs/format.rs | 175 +- crates/composefs/src/erofs/reader.rs | 1485 ++++++++++++++- crates/composefs/src/erofs/writer.rs | 1668 +++++++++++++++-- crates/composefs/src/filesystem_ops.rs | 77 +- crates/composefs/src/fs.rs | 2 + crates/composefs/src/generic_tree.rs | 211 ++- crates/composefs/src/repository.rs | 699 ++++++- crates/composefs/src/splitstream.rs | 7 +- crates/composefs/src/test.rs | 433 ++++- crates/composefs/src/tree.rs | 2 + crates/composefs/tests/mkfs.rs | 187 +- crates/composefs/tests/special_v1.dump | 7 + doc/repository.md | 31 + docs/booting.md | 77 + docs/erofs.md | 82 + 40 files changed, 5941 insertions(+), 632 deletions(-) create mode 100644 crates/composefs-ctl/src/composefs_info.rs create mode 100644 crates/composefs-ctl/src/mkcomposefs.rs create mode 100644 crates/composefs/proptest-regressions/erofs/reader.txt create mode 100644 
crates/composefs/tests/special_v1.dump create mode 100644 docs/booting.md create mode 100644 docs/erofs.md diff --git a/.gitignore b/.gitignore index 69660095..3536a549 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ **/fuzz/target/ **/fuzz/corpus/ **/fuzz/artifacts/ +.worktrees +*.rpm diff --git a/Justfile b/Justfile index 7e8b7fb1..1a57c824 100644 --- a/Justfile +++ b/Justfile @@ -90,6 +90,21 @@ test-integration-vm *ARGS: build _integration-container-build install-nextest: @which cargo-nextest > /dev/null 2>&1 || cargo install cargo-nextest --locked +# Build and run a bls example locally. +# Usage: just test-example-local bls arch +# just test-example-local bls arch fsfmt=ext4 verity=none +# 'fsfmt' defaults to ext4, 'verity' defaults to none (no fs-verity enforcement). +# Requires: qemu-kvm, OVMF, skopeo, mtools, fsverity, mkfs.erofs, systemd-repart, podman. +test-example-local example os fsfmt="ext4" verity="none": build + #!/usr/bin/env bash + set -euo pipefail + export FS_FORMAT={{ fsfmt }} + export FS_VERITY_MODE={{ verity }} + export CFSCTL_PATH=$(pwd)/target/debug/cfsctl + cd examples + {{ example }}/build {{ os }} + TEST_IMAGE="{{ example }}/{{ os }}-{{ example }}-efi.qcow2" pytest test -v + # Run everything: checks + full integration tests including VM ci: check test-integration-vm diff --git a/crates/composefs-boot/src/lib.rs b/crates/composefs-boot/src/lib.rs index 40ff0335..11e5cc33 100644 --- a/crates/composefs-boot/src/lib.rs +++ b/crates/composefs-boot/src/lib.rs @@ -101,6 +101,10 @@ impl BootOps for FileSystem { ) -> Result>> { let boot_entries = get_boot_resources(self, repo)?; empty_toplevel_dirs(self)?; + // Compact the leaves table after clearing directories, so that leaves + // which were only referenced by /boot or /sysroot are removed and + // don't appear as orphans when the filesystem is validated. 
+ self.compact(); selabel::selabel(self, repo)?; Ok(boot_entries) @@ -108,6 +112,8 @@ impl BootOps for FileSystem { fn transform_for_boot_from_dir(&mut self, rootfs: impl AsFd) -> Result<()> { empty_toplevel_dirs(self)?; + // Same as above: compact to remove leaves orphaned by clearing dirs. + self.compact(); selabel::selabel_from_dir(self, rootfs)?; Ok(()) } diff --git a/crates/composefs-boot/src/selabel.rs b/crates/composefs-boot/src/selabel.rs index 58842ecd..3400bca7 100644 --- a/crates/composefs-boot/src/selabel.rs +++ b/crates/composefs-boot/src/selabel.rs @@ -581,6 +581,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: Default::default(), }; @@ -595,6 +596,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: Default::default(), }, LeafContent::Regular(RegularFile::Inline(data.to_vec().into_boxed_slice())), diff --git a/crates/composefs-ctl/src/composefs_info.rs b/crates/composefs-ctl/src/composefs_info.rs new file mode 100644 index 00000000..c1301e58 --- /dev/null +++ b/crates/composefs-ctl/src/composefs_info.rs @@ -0,0 +1,331 @@ +//! composefs-info - Query information from composefs images. +//! +//! This is a Rust reimplementation of the C composefs-info tool, providing +//! commands to inspect EROFS images, list objects, and compute fs-verity digests. +//! +//! ## Compatibility status +//! +//! Implemented subcommands: +//! - `ls` — lists files with type suffixes, skips whiteout entries +//! - `dump` — outputs composefs-dump(5) text format (image → tree → dumpfile) +//! - `objects` — lists all backing file object paths (XX/XXXX...) +//! - `missing-objects` — lists objects not present in `--basedir` +//! - `measure-file` — computes fs-verity digest of files +//! +//! Known gaps vs C composefs-info: +//! - TODO(compat): `measure-file` uses userspace fs-verity computation instead +//! of the kernel `FS_IOC_MEASURE_VERITY` ioctl. This works on files without +//! 
verity enabled (computing what the digest *would* be), while the C version +//! fails on non-verity files. + +use std::collections::HashSet; +use std::io::Write; +use std::path::Path; +use std::{fs::File, io::Read, path::PathBuf}; + +use anyhow::{Context, Result}; +use clap::{Parser, Subcommand}; + +use composefs::{ + dumpfile::write_dumpfile, + erofs::reader::erofs_to_filesystem, + fsverity::{FsVerityHashValue, FsVerityHasher, Sha256HashValue}, + generic_tree::{Inode, LeafContent, LeafId}, + tree::{FileSystem, RegularFile}, +}; + +/// Query information from composefs images. +#[derive(Parser, Debug)] +#[command( + name = "composefs-info", + version, + about = "Query information from composefs images" +)] +struct Cli { + /// The subcommand to run. + #[command(subcommand)] + command: Command, +} + +/// Available subcommands. +#[derive(Subcommand, Debug)] +enum Command { + /// Simple listing of files and directories in the image. + Ls { + /// Filter entries at the root level by name (can be specified multiple times). + #[arg(long = "filter", action = clap::ArgAction::Append)] + filter: Vec, + /// Composefs image files to inspect. + images: Vec, + }, + + /// Full dump in composefs-dump(5) format. + Dump { + /// Filter entries at the root level by name (can be specified multiple times). + #[arg(long = "filter", action = clap::ArgAction::Append)] + filter: Vec, + /// Composefs image files to dump. + images: Vec, + }, + + /// List all backing file object paths. + Objects { + /// Composefs image files to inspect. + images: Vec, + }, + + /// List backing files not present in basedir. + MissingObjects { + /// Base directory for object lookups. + #[arg(long = "basedir", required = true)] + basedir: PathBuf, + /// Composefs image files to inspect. + images: Vec, + }, + + /// Print the fs-verity digest of files. + MeasureFile { + /// Files to measure. + files: Vec, + }, +} + +/// Entry point for the composefs-info multi-call mode. 
+pub(crate) fn run() -> Result<()> {
+    let cli = Cli::parse();
+
+    // Every subcommand borrows its arguments from the parsed command line.
+    match &cli.command {
+        Command::Ls { filter, images } => cmd_ls(filter, images),
+        Command::Dump { filter, images } => cmd_dump(filter, images),
+        Command::Objects { images } => cmd_objects(images),
+        Command::MissingObjects { basedir, images } => cmd_missing_objects(basedir, images),
+        Command::MeasureFile { files } => cmd_measure_file(files),
+    }
+}
+
+/// Print escaped path (matches C implementation behavior).
+///
+/// Backslash, newline, carriage return and tab are written as `\\`, `\n`,
+/// `\r` and `\t`. Any other byte that is neither ASCII-graphic nor a space
+/// is written as `\xNN` with two lowercase hex digits. All remaining bytes
+/// are copied through unchanged.
+///
+/// # Errors
+///
+/// Returns any I/O error reported by `out`.
+fn print_escaped<W: Write>(out: &mut W, s: &[u8]) -> std::io::Result<()> {
+    for &c in s {
+        match c {
+            b'\\' => write!(out, "\\\\")?,
+            b'\n' => write!(out, "\\n")?,
+            b'\r' => write!(out, "\\r")?,
+            b'\t' => write!(out, "\\t")?,
+            // Non-printable or non-ASCII characters are hex-escaped
+            c if !c.is_ascii_graphic() && c != b' ' => write!(out, "\\x{c:02x}")?,
+            c => out.write_all(&[c])?,
+        }
+    }
+    Ok(())
+}
+
+/// Walk and print entries: directory line first, then recurse into children.
+///
+/// * `path` is the already-built path of `dir` (empty for the root); each
+///   child is printed as `path` + `/` + name.
+/// * `seen_leaf_ids` records which leaves have been printed so that only the
+///   first path referring to an external regular file shows its `@ object`
+///   suffix; later hard links print the bare path.
+/// * `filter` is applied only when `is_root` is true; an empty filter keeps
+///   every entry.
+fn ls_print<W: Write>(
+    out: &mut W,
+    fs: &FileSystem<Sha256HashValue>,
+    dir: &composefs::tree::Directory<Sha256HashValue>,
+    path: &[u8],
+    seen_leaf_ids: &mut HashSet<LeafId>,
+    filter: &[String],
+    is_root: bool,
+) -> Result<()> {
+    for (name, child) in dir.sorted_entries() {
+        let name_bytes = name.as_encoded_bytes();
+
+        // At the root level, apply name filters if any were given. Names are
+        // compared byte-for-byte so that a non-UTF-8 entry can never match a
+        // filter by way of a lossy conversion.
+        if is_root
+            && !filter.is_empty()
+            && !filter.iter().any(|f| f.as_bytes() == name_bytes)
+        {
+            continue;
+        }
+
+        let mut child_path = path.to_vec();
+        child_path.push(b'/');
+        child_path.extend_from_slice(name_bytes);
+
+        match child {
+            Inode::Directory(child_dir) => {
+                // Print the directory entry with trailing slash.
+                print_escaped(out, &child_path)?;
+                write!(out, "/\t")?;
+                writeln!(out)?;
+                // Recurse into the directory.
+                ls_print(
+                    out,
+                    fs,
+                    child_dir,
+                    &child_path,
+                    seen_leaf_ids,
+                    filter,
+                    false,
+                )?;
+            }
+            Inode::Leaf(leaf_id, _) => {
+                let leaf = fs.leaf(*leaf_id);
+
+                print_escaped(out, &child_path)?;
+
+                match &leaf.content {
+                    LeafContent::Regular(regular) => {
+                        // `insert` returns false when the leaf was already
+                        // printed under another name, i.e. this is a hard link.
+                        let is_hardlink = !seen_leaf_ids.insert(*leaf_id);
+                        if !is_hardlink && let RegularFile::External(id, _) = regular {
+                            write!(out, "\t@ ")?;
+                            print_escaped(out, id.to_object_pathname().as_bytes())?;
+                        }
+                    }
+                    LeafContent::Symlink(target) => {
+                        write!(out, "\t-> ")?;
+                        print_escaped(out, target.as_encoded_bytes())?;
+                    }
+                    _ => {}
+                }
+
+                writeln!(out)?;
+            }
+        }
+    }
+    Ok(())
+}
+
+/// List files and directories in the image.
+///
+/// Each image is read fully into memory, parsed, and listed to stdout.
+/// Hard-link tracking is reset per image.
+fn cmd_ls(filter: &[String], images: &[PathBuf]) -> Result<()> {
+    let stdout = std::io::stdout();
+    let mut out = stdout.lock();
+
+    for image_path in images {
+        let image_data = read_image(image_path)?;
+        let fs = erofs_to_filesystem::<Sha256HashValue>(&image_data)
+            .with_context(|| format!("Failed to parse image: {image_path:?}"))?;
+
+        let mut seen_leaf_ids = HashSet::new();
+        ls_print(
+            &mut out,
+            &fs,
+            &fs.root,
+            b"",
+            &mut seen_leaf_ids,
+            filter,
+            true,
+        )?;
+    }
+
+    Ok(())
+}
+
+/// Dump the image in composefs-dump(5) text format.
+///
+/// This matches the C composefs-info dump output: the EROFS image is parsed
+/// back into a filesystem tree which is then serialized as a dumpfile.
+///
+/// TODO(compat): `_filter` is accepted but currently ignored, so `dump`
+/// always emits the whole tree even when `--filter` is given.
+fn cmd_dump(_filter: &[String], images: &[PathBuf]) -> Result<()> {
+    let stdout = std::io::stdout();
+    let mut out = stdout.lock();
+
+    for image_path in images {
+        let image_data = read_image(image_path)?;
+        let fs = erofs_to_filesystem::<Sha256HashValue>(&image_data)
+            .with_context(|| format!("Failed to parse image: {image_path:?}"))?;
+        write_dumpfile(&mut out, &fs)
+            .with_context(|| format!("Failed to dump image: {image_path:?}"))?;
+    }
+
+    Ok(())
+}
+
+/// Collect all external object IDs from a parsed filesystem.
+///
+/// Iterates the leaves table directly — each `RegularFile::External` entry
+/// is a unique content-addressed object. Because `erofs_to_filesystem`
+/// deduplicates hard-linked inodes into a single leaf, each object appears
+/// exactly once even if it is referenced by multiple paths.
+fn collect_objects_from_fs(fs: &FileSystem<Sha256HashValue>) -> HashSet<Sha256HashValue> {
+    fs.leaves
+        .iter()
+        .filter_map(|leaf| match &leaf.content {
+            LeafContent::Regular(RegularFile::External(id, _)) => Some(id.clone()),
+            _ => None,
+        })
+        .collect()
+}
+
+/// List all object paths from the images.
+///
+/// Objects are printed per image, one object pathname per line, ordered by
+/// the hex form of their digest.
+fn cmd_objects(images: &[PathBuf]) -> Result<()> {
+    // Lock stdout once instead of once per printed line.
+    let stdout = std::io::stdout();
+    let mut out = stdout.lock();
+
+    for image_path in images {
+        let image_data = read_image(image_path)?;
+        let fs = erofs_to_filesystem::<Sha256HashValue>(&image_data)
+            .with_context(|| format!("Failed to parse image: {image_path:?}"))?;
+
+        let mut objects: Vec<Sha256HashValue> = collect_objects_from_fs(&fs).into_iter().collect();
+        // The key allocates a hex string, so compute it once per element
+        // rather than once per comparison.
+        objects.sort_by_cached_key(|id| id.to_hex());
+
+        for obj in objects {
+            writeln!(out, "{}", obj.to_object_pathname())?;
+        }
+    }
+    Ok(())
+}
+
+/// List objects not present in basedir.
+///
+/// The objects of all `images` are merged first, so an object referenced by
+/// several images is reported at most once. Output is one object pathname
+/// per line, ordered by the hex form of the digest.
+fn cmd_missing_objects(basedir: &Path, images: &[PathBuf]) -> Result<()> {
+    let mut all_objects: HashSet<Sha256HashValue> = HashSet::new();
+
+    for image_path in images {
+        let image_data = read_image(image_path)?;
+        let fs = erofs_to_filesystem::<Sha256HashValue>(&image_data)
+            .with_context(|| format!("Failed to parse image: {image_path:?}"))?;
+        all_objects.extend(collect_objects_from_fs(&fs));
+    }
+
+    let mut missing: Vec<Sha256HashValue> = all_objects
+        .into_iter()
+        .filter(|obj| !basedir.join(obj.to_object_pathname()).exists())
+        .collect();
+
+    missing.sort_by_cached_key(|obj| obj.to_hex());
+
+    let stdout = std::io::stdout();
+    let mut out = stdout.lock();
+    for obj in missing {
+        writeln!(out, "{}", obj.to_object_pathname())?;
+    }
+
+    Ok(())
+}
+
+/// Compute and print the fs-verity digest of each file.
+fn cmd_measure_file(files: &[PathBuf]) -> Result<()> {
+    let block_size = FsVerityHasher::<Sha256HashValue>::BLOCK_SIZE;
+    // Scratch space for exactly one hash block, reused for every file.
+    let mut block = vec![0u8; block_size];
+
+    for path in files {
+        let file = File::open(path).with_context(|| format!("Failed to open file: {path:?}"))?;
+
+        let mut hasher = FsVerityHasher::<Sha256HashValue>::new();
+        let mut reader = std::io::BufReader::with_capacity(block_size * 2, file);
+
+        loop {
+            // Only the final block of a file may be shorter than `block_size`.
+            // A single `read` is allowed to return fewer bytes than requested
+            // well before EOF, so keep reading until the block is full or the
+            // input is exhausted; hashing a short block mid-stream would
+            // produce a wrong digest.
+            let mut filled = 0;
+            while filled < block_size {
+                match reader.read(&mut block[filled..]) {
+                    Ok(0) => break,
+                    Ok(n) => filled += n,
+                    Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {}
+                    Err(e) => {
+                        return Err(e).with_context(|| format!("Failed to read file: {path:?}"));
+                    }
+                }
+            }
+
+            if filled == 0 {
+                break;
+            }
+            hasher.add_block(&block[..filled]);
+            if filled < block_size {
+                // A partial block can only have been produced by hitting EOF.
+                break;
+            }
+        }
+
+        let digest = hasher.digest();
+        println!("{}", digest.to_hex());
+    }
+    Ok(())
+}
+
+/// Read an entire image file into memory.
+///
+/// # Errors
+///
+/// Fails if the file cannot be opened or read; the error context names the
+/// offending path.
+fn read_image(path: &Path) -> Result<Vec<u8>> {
+    let mut file = File::open(path).with_context(|| format!("Failed to open image: {path:?}"))?;
+    let mut data = Vec::new();
+    file.read_to_end(&mut data)
+        .with_context(|| format!("Failed to read image: {path:?}"))?;
+    Ok(data)
+}
diff --git a/crates/composefs-ctl/src/lib.rs b/crates/composefs-ctl/src/lib.rs index 91a73efc..59b61917 100644 --- a/crates/composefs-ctl/src/lib.rs +++ b/crates/composefs-ctl/src/lib.rs @@ -46,10 +46,13 @@ use composefs_boot::write_boot; use composefs::shared_internals::IO_BUF_CAPACITY; use composefs::{ dumpfile::{dump_single_dir, dump_single_file}, - erofs::reader::erofs_to_filesystem, + erofs::{format::FormatSet, reader::erofs_to_filesystem}, fsverity::{Algorithm, FsVerityHashValue, Sha256HashValue, Sha512HashValue}, generic_tree::{FileSystem, Inode}, - repository::{REPO_METADATA_FILENAME, Repository, read_repo_algorithm, system_path, user_path}, + repository::{ + REPO_METADATA_FILENAME, Repository, RepositoryConfig, read_repo_algorithm, system_path, + user_path, + }, tree::RegularFile, }; @@ -89,6 +92,11 @@ pub struct App { #[clap(long, value_enum)] pub hash: Option, + /// The EROFS format version to use when generating images.
+ /// If omitted, the library default (V2) is used. + #[clap(long, value_enum)] + pub erofs_version: Option, + /// Deprecated: security mode is now auto-detected from meta.json. /// Use `cfsctl init --insecure` to create a repo without verity. /// Kept for backward compatibility. @@ -123,6 +131,44 @@ pub enum HashType { Sha512, } +/// The EROFS format version used when generating images. +#[derive(Debug, Copy, Clone, PartialEq, Eq, ValueEnum)] +pub enum ErofsVersion { + /// Format V1: compact inodes, BFS, C-compatible. + #[clap(name = "1")] + V1, + /// Format V2: extended inodes, DFS, current default. + #[clap(name = "2")] + V2, +} + +impl From for composefs::erofs::format::FormatVersion { + fn from(v: ErofsVersion) -> Self { + match v { + ErofsVersion::V1 => Self::V1, + ErofsVersion::V2 => Self::V2, + } + } +} + +/// EROFS format generation mode for `cfsctl init --erofs`. +#[derive(Debug, Copy, Clone, PartialEq, Eq, ValueEnum)] +pub enum ErofsMode { + /// Generate only V1 EROFS (default; compatible with C `mkcomposefs`/`composefs-info` 1.0.8). + V1, + /// Generate both V1 and V2 EROFS (dual mode, used by bootc and other multi-format consumers). + Dual, +} + +impl From for FormatSet { + fn from(m: ErofsMode) -> Self { + match m { + ErofsMode::V1 => FormatSet::V1_ONLY, + ErofsMode::Dual => FormatSet::BOTH, + } + } +} + /// A reference to an OCI image: either a content digest or a named ref. /// /// Digests are prefixed with `@` (e.g. `@sha256:abc123…`), while bare @@ -405,6 +451,20 @@ enum Command { /// re-imported after migration. #[clap(long)] reset_metadata: bool, + /// Default EROFS format version for images in this repository. + /// V1 is compatible with C `mkcomposefs` 1.0.8. + /// If omitted, falls back to the global `--erofs-version` flag, then defaults to V2. + #[clap(long)] + erofs_version: Option, + /// EROFS format generation mode. 
+ /// + /// Controls which EROFS format versions are produced when committing images: + /// v1 Generate only V1 EROFS (default; C-tool compatible) + /// dual Generate both V1 and V2 EROFS (used by bootc) + /// + /// If omitted, defaults to `v1`. + #[clap(long, value_enum)] + erofs: Option, }, /// Take a transaction lock on the repository. /// This prevents garbage collection from occurring. @@ -612,13 +672,27 @@ pub async fn run_app(args: App) -> Result<()> { ref path, insecure, reset_metadata, + erofs_version: ref init_erofs_version, + erofs: init_erofs, } = args.cmd { + // --erofs controls the FormatSet (which versions to generate); default V2-only. + let erofs_formats = init_erofs + .map(FormatSet::from) + .unwrap_or(FormatSet::from(composefs::erofs::format::FormatVersion::V2)); + // Prefer the subcommand-level --erofs-version; fall back to global flag. + // If neither is given, default to V2. + let erofs_version = init_erofs_version + .or(args.erofs_version) + .map(composefs::erofs::format::FormatVersion::from) + .unwrap_or(composefs::erofs::format::FormatVersion::V2); return run_init( algorithm, path.as_deref(), insecure || args.insecure, reset_metadata, + erofs_version, + erofs_formats, &args, ); } @@ -666,6 +740,8 @@ fn run_init( path: Option<&Path>, insecure: bool, reset_metadata: bool, + erofs_version: composefs::erofs::format::FormatVersion, + erofs_formats: FormatSet, args: &App, ) -> Result<()> { let repo_path = if let Some(p) = path { @@ -686,12 +762,18 @@ fn run_init( // init_path handles idempotency: same algorithm is a no-op, // different algorithm is an error. + let config = { + let mut c = RepositoryConfig::new(*algorithm); + c.erofs_version = erofs_version; + c.erofs_formats = erofs_formats; + if insecure { c.set_insecure() } else { c } + }; let created = match algorithm { Algorithm::Sha256 { .. 
} => { - Repository::::init_path(CWD, &repo_path, *algorithm, !insecure)?.1 + Repository::::init_path(CWD, &repo_path, config)?.1 } Algorithm::Sha512 { .. } => { - Repository::::init_path(CWD, &repo_path, *algorithm, !insecure)?.1 + Repository::::init_path(CWD, &repo_path, config)?.1 } }; @@ -734,6 +816,11 @@ where if args.require_verity { repo.require_verity()?; } + // If the user explicitly passed --erofs-version, override the stored + // repo setting for this invocation only (does not rewrite meta.json). + if let Some(version) = args.erofs_version { + repo.set_erofs_version(version.into()); + } Ok(repo) } @@ -883,10 +970,19 @@ fn dump_file_impl( /// Run commands that don't require a repository. pub async fn run_cmd_without_repo(args: App) -> Result<()> { + let erofs_version = args + .erofs_version + .map(composefs::erofs::format::FormatVersion::from); match args.cmd { Command::ComputeId { fs_opts } => { let fs = load_filesystem_from_ondisk_fs::(&fs_opts, None).await?; - let id = fs.compute_image_id(); + let version = erofs_version.unwrap_or_default(); + let id = composefs::fsverity::compute_verity::( + &composefs::erofs::writer::mkfs_erofs_versioned( + &composefs::erofs::writer::ValidatedFileSystem::new(fs)?, + version, + ), + ); println!("{}", id.to_hex()); } Command::CreateDumpfile { fs_opts } => { @@ -974,7 +1070,7 @@ where } OciCommand::ComputeId { config_opts } => { let fs = load_filesystem_from_oci_image(&repo, config_opts)?; - let id = fs.compute_image_id(); + let id = fs.compute_image_id(repo.erofs_version()); println!("{}", id.to_hex()); } OciCommand::Pull { diff --git a/crates/composefs-ctl/src/main.rs b/crates/composefs-ctl/src/main.rs index a7ae17a5..ff57703f 100644 --- a/crates/composefs-ctl/src/main.rs +++ b/crates/composefs-ctl/src/main.rs @@ -1,30 +1,67 @@ //! Command-line control utility for composefs repositories and images. //! -//! `cfsctl` provides a comprehensive interface for managing composefs repositories, -//! 
creating and mounting filesystem images, handling OCI containers, and performing -//! repository maintenance operations like garbage collection. +//! `cfsctl` is a multi-call binary: when invoked as `mkcomposefs` or +//! `composefs-info` (via symlink or hardlink), it dispatches to the +//! corresponding tool. Otherwise it runs the normal `cfsctl` interface. +//! +//! ## C composefs compatibility roadmap +//! +//! This work aims to provide a Rust implementation that is a drop-in for the +//! C composefs tools and library. See: +//! +//! +//! Status: +//! 1. **CLI interfaces** (`mkcomposefs`, `composefs-info`): Substantially +//! implemented. V1 EROFS output is byte-for-byte identical to C mkcomposefs. +//! See individual module docs for remaining gaps. +//! 2. **EROFS output format**: V1 (C-compatible) writer with compact inodes, +//! BFS ordering, whiteout table, and overlay xattr escaping is complete and +//! tested. V2 (Rust-native) is the default for the composefs-rs repository. +//! 3. **C shared library (`libcomposefs`)**: TODO(compat): Not yet started. +//! This is the next major milestone — providing a C-ABI compatible shared +//! library so that existing C consumers (e.g. ostree, bootc) can link +//! against the Rust implementation. Will require `#[no_mangle]` exports, +//! a `cdylib` crate, and C header generation (e.g. via cbindgen). -use composefs_ctl::App; +use std::path::Path; use anyhow::Result; -use clap::Parser; -fn main() -> Result<()> { - // If we were spawned as a userns helper process, handle that and exit. - // This MUST be called before the tokio runtime is created. - #[cfg(feature = "containers-storage")] - cstorage::init_if_helper(); - - // Now we can create the tokio runtime for the main application - tokio::runtime::Builder::new_multi_thread() - .enable_all() - .build()? - .block_on(async_main()) +mod composefs_info; +mod mkcomposefs; + +/// Extract the binary name from argv[0], stripping any directory prefix. 
+fn binary_name() -> Option { + std::env::args_os().next().and_then(|arg0| { + Path::new(&arg0) + .file_name() + .map(|f| f.to_string_lossy().into_owned()) + }) } -async fn async_main() -> Result<()> { - env_logger::init(); +fn main() -> Result<()> { + match binary_name().as_deref() { + Some("mkcomposefs") => mkcomposefs::run(), + Some("composefs-info") => composefs_info::run(), + _ => { + use clap::Parser; + use composefs_ctl::App; + + // If we were spawned as a userns helper process, handle that and exit. + // This MUST be called before the tokio runtime is created. + #[cfg(feature = "containers-storage")] + cstorage::init_if_helper(); + + env_logger::init(); - let args = App::parse(); - composefs_ctl::run_app(args).await + // Now we can create the tokio runtime for the main application + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()? + .block_on(async { + let args = App::parse(); + composefs_ctl::run_app(args).await + }) + } + } } diff --git a/crates/composefs-ctl/src/mkcomposefs.rs b/crates/composefs-ctl/src/mkcomposefs.rs new file mode 100644 index 00000000..c4a49df9 --- /dev/null +++ b/crates/composefs-ctl/src/mkcomposefs.rs @@ -0,0 +1,409 @@ +//! mkcomposefs - Create composefs images from directories or dumpfiles. +//! +//! This is a Rust reimplementation of the C mkcomposefs tool, providing +//! compatible command-line interface and output format. +//! +//! ## Compatibility status +//! +//! See for context. +//! +//! Implemented and tested (byte-for-byte match with C mkcomposefs): +//! - `--from-file`, `--print-digest`, `--print-digest-only` +//! - `--skip-devices`, `--skip-xattrs`, `--user-xattrs` +//! - `--min-version` / `--max-version` (V1 compact inodes, BFS ordering, whiteout table) +//! - `--digest-store` (C-compatible flat `XX/digest` layout via [`FlatDigestStore`]) +//! - `--threads` (controls tokio worker threads and verity-computation concurrency) +//! - Source from directory or dumpfile, output to file or stdout +//! +//! 
All known compatibility gaps have been resolved. + +use std::{ + ffi::OsString, + fs::File, + io::{self, BufReader, IsTerminal, Read, Write}, + path::{Path, PathBuf}, + sync::Arc, + thread::available_parallelism, +}; + +use anyhow::{Context, Result, bail}; +use clap::Parser; +use rustix::fs::CWD; +use tokio::sync::Semaphore; + +use composefs::{ + dumpfile::dumpfile_to_filesystem, + erofs::writer::{ValidatedFileSystem, mkfs_erofs_v1_min_version}, + fs::{ + FlatDigestStore, ObjectStore, read_filesystem_with_semaphore, read_filesystem_with_store, + }, + fsverity::{FsVerityHashValue, Sha256HashValue, compute_verity}, + tree::FileSystem, +}; + +/// Create a composefs image from a source directory or dumpfile. +/// +/// Composefs uses EROFS image files for metadata and separate content-addressed +/// backing directories for regular file data. +#[derive(Parser, Debug)] +#[command(name = "mkcomposefs", version, about)] +struct Args { + /// Treat SOURCE as a dumpfile in composefs-dump(5) format. + /// + /// If SOURCE is `-`, reads from stdin. + #[arg(long)] + from_file: bool, + + /// Print the fsverity digest of the image after writing. + #[arg(long)] + print_digest: bool, + + /// Print the fsverity digest without writing the image. + /// + /// When set, IMAGE must be omitted. + #[arg(long)] + print_digest_only: bool, + + /// Set modification time to zero (Unix epoch) for all files. + #[arg(long)] + use_epoch: bool, + + /// Exclude device nodes from the image. + #[arg(long)] + skip_devices: bool, + + /// Exclude all extended attributes. + #[arg(long)] + skip_xattrs: bool, + + /// Only include xattrs with the `user.` prefix. + #[arg(long)] + user_xattrs: bool, + + /// Minimum image format version to use (0 or 1). + #[arg(long, default_value = "0")] + min_version: u32, + + /// Maximum image format version (for auto-upgrade). + #[arg(long, default_value = "1")] + max_version: u32, + + /// Copy regular file content to the given object store directory. 
+ /// + /// Files are stored by their fsverity digest using the same flat layout + /// as C mkcomposefs: `XX/DIGEST` where XX is the first byte of the digest. + /// The directory is created if it doesn't exist. The layout is compatible + /// with digest stores written by the C mkcomposefs tool. + #[arg(long)] + digest_store: Option, + + /// Number of threads to use for digest calculation and file copying. + #[arg(long)] + threads: Option, + + /// The source directory or dumpfile. + source: PathBuf, + + /// The output image path (use `-` for stdout). + /// + /// Must be omitted when using --print-digest-only. + image: Option, +} + +/// Entry point for the mkcomposefs multi-call mode. +pub(crate) fn run() -> Result<()> { + let args = Args::parse(); + + // Validate arguments + if args.print_digest_only && args.image.is_some() { + bail!("IMAGE must be omitted when using --print-digest-only"); + } + + if !args.print_digest_only && args.image.is_none() { + bail!("IMAGE is required (or use --print-digest-only)"); + } + + if args.min_version > args.max_version { + bail!( + "Invalid version range: --min-version ({}) must not exceed --max-version ({})", + args.min_version, + args.max_version + ); + } + + // The C composefs tool supports only versions 0 and 1 (LCFS_VERSION_MAX=1). + // Both use the same C-compatible on-disk format (compact inodes, BFS, whiteout + // table). The difference is in the `composefs_version` EROFS header field: + // - version 0 (default): starts at 0, auto-upgrades to 1 if a user-visible + // whiteout device (chr 0,0) is encountered. + // - version 1: always writes composefs_version=1 regardless of content. + // Versions > 1 are not defined by the C tool and are rejected below. + if args.min_version > 1 { + bail!( + "--min-version {} is not supported; the maximum C-compatible version is 1", + args.min_version + ); + } + // min_composefs_version mirrors C's `options->version` start value. 
+ let min_composefs_version = args.min_version; + + // Open or create the digest store if specified. + // Always uses the C-compatible flat layout (XX/DIGEST) so that the store + // is interchangeable with the one written by C mkcomposefs. + let store: Option>> = + if let Some(store_path) = &args.digest_store { + let n = args + .threads + .unwrap_or_else(|| available_parallelism().map(|n| n.get()).unwrap_or(4)); + Some(Arc::new(FlatDigestStore::open(store_path, n, true)?)) + } else { + None + }; + + // Warn if --digest-store is combined with --from-file (store is unused in that case) + if args.from_file && args.digest_store.is_some() { + eprintln!("warning: --digest-store is ignored when --from-file is specified"); + } + + // Read input + let mut fs = if args.from_file { + read_dumpfile(&args)? + } else { + read_directory(&args.source, store, args.threads)? + }; + + // Apply transformations based on flags + apply_transformations(&mut fs, &args)?; + + // Generate EROFS image + let image = mkfs_erofs_v1_min_version(&ValidatedFileSystem::new(fs)?, min_composefs_version); + + // Handle output + if args.print_digest_only { + let digest = compute_fsverity_digest(&image); + println!("{digest}"); + return Ok(()); + } + + // Write image + let image_path = args.image.as_ref().unwrap(); + write_image(image_path, &image)?; + + // Optionally print digest + if args.print_digest { + let digest = compute_fsverity_digest(&image); + println!("{digest}"); + } + + Ok(()) +} + +/// Read and parse a dumpfile from the given source. 
+fn read_dumpfile(args: &Args) -> Result<FileSystem<Sha256HashValue>> {
+    let content = if args.source.as_os_str() == "-" {
+        // Read from stdin
+        let stdin = io::stdin();
+        let mut content = String::new();
+        stdin
+            .lock()
+            .read_to_string(&mut content)
+            .context("Failed to read dumpfile from stdin")?;
+        content
+    } else {
+        // Read from file
+        let file = File::open(&args.source)
+            .with_context(|| format!("Failed to open dumpfile: {:?}", args.source))?;
+        let mut reader = BufReader::new(file);
+        let mut content = String::new();
+        reader
+            .read_to_string(&mut content)
+            .with_context(|| format!("Failed to read dumpfile: {:?}", args.source))?;
+        content
+    };
+
+    dumpfile_to_filesystem(&content).context("Failed to parse dumpfile")
+}
+
+/// Read a filesystem tree from a directory path.
+///
+/// If a store is provided, large file contents are copied there and
+/// referenced by digest. The store must implement [`ObjectStore`].
+///
+/// The `threads` argument controls both the tokio worker thread count and the
+/// semaphore used to limit concurrent verity computations. `Some(1)` uses a
+/// single-threaded runtime; `None` or `Some(n > 1)` uses the multi-threaded
+/// scheduler.
+///
+/// # Errors
+///
+/// Fails if `threads` is `Some(0)`, if `path` is not an accessible directory,
+/// if the tokio runtime cannot be built, or if reading the tree fails.
+fn read_directory(
+    path: &Path,
+    store: Option<Arc<dyn ObjectStore<Sha256HashValue>>>,
+    threads: Option<usize>,
+) -> Result<FileSystem<Sha256HashValue>> {
+    use rustix::fs::{Mode, OFlags};
+
+    // tokio's `worker_threads` panics when given 0, and a semaphore with zero
+    // permits would never admit any work, so reject the value up front.
+    if threads == Some(0) {
+        bail!("--threads must be at least 1");
+    }
+
+    // Verify the path exists and is a directory
+    let metadata = std::fs::metadata(path)
+        .with_context(|| format!("Failed to access source directory: {path:?}"))?;
+
+    if !metadata.is_dir() {
+        bail!("Source path is not a directory: {path:?}");
+    }
+
+    // Open a dirfd for the current directory (required by the async API)
+    let dirfd = rustix::fs::openat(
+        CWD,
+        ".",
+        OFlags::RDONLY | OFlags::DIRECTORY | OFlags::CLOEXEC,
+        Mode::empty(),
+    )
+    .context("Failed to open current directory")?;
+
+    // Build a tokio runtime appropriate for the requested thread count.
+    // --threads 1 → current_thread (no extra OS threads, minimal overhead).
+    // --threads N → multi_thread with exactly N worker threads.
+    // (default)   → multi_thread with the tokio default (one per logical CPU).
+    let rt = match threads {
+        Some(1) => tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .context("Failed to create single-threaded tokio runtime")?,
+        Some(n) => tokio::runtime::Builder::new_multi_thread()
+            .worker_threads(n)
+            .enable_all()
+            .build()
+            .context("Failed to create multi-threaded tokio runtime")?,
+        None => tokio::runtime::Builder::new_multi_thread()
+            .enable_all()
+            .build()
+            .context("Failed to create multi-threaded tokio runtime")?,
+    };
+
+    let path = path.to_path_buf();
+
+    // When a store is present its semaphore is already configured;
+    // delegate entirely to read_filesystem_with_store.
+    // When there is no store we build the semaphore ourselves so the
+    // requested thread count is honoured.
+    if store.is_some() {
+        rt.block_on(read_filesystem_with_store(dirfd, path, store))
+            .context("Failed to read directory tree")
+    } else {
+        let n = threads.unwrap_or_else(|| available_parallelism().map(|n| n.get()).unwrap_or(4));
+        let semaphore = Arc::new(Semaphore::new(n));
+        rt.block_on(read_filesystem_with_semaphore(dirfd, path, None, semaphore))
+            .context("Failed to read directory tree")
+    }
+}
+
+/// Write the image to the specified path (or stdout if `-`).
+///
+/// # Errors
+///
+/// Refuses to write to stdout when it is a terminal, and reports any failure
+/// to create or write the output.
+fn write_image(path: &Path, image: &[u8]) -> Result<()> {
+    if path.as_os_str() == "-" {
+        let stdout = io::stdout();
+        if stdout.is_terminal() {
+            bail!(
+                "Refusing to write binary image to terminal. Redirect stdout or use a file path."
+            );
+        }
+        stdout
+            .lock()
+            .write_all(image)
+            .context("Failed to write image to stdout")?;
+    } else {
+        let mut file =
+            File::create(path).with_context(|| format!("Failed to create image file: {path:?}"))?;
+        file.write_all(image)
+            .with_context(|| format!("Failed to write image file: {path:?}"))?;
+    }
+    Ok(())
+}
+
+/// Compute the fsverity digest of the image.
+///
+/// Returns the SHA-256 fs-verity digest of `image` as a hex string.
+fn compute_fsverity_digest(image: &[u8]) -> String {
+    let digest: Sha256HashValue = compute_verity(image);
+    digest.to_hex()
+}
+
+/// Apply filesystem transformations based on command-line flags.
+fn apply_transformations(fs: &mut FileSystem, args: &Args) -> Result<()> { + // Handle xattr filtering + if args.skip_xattrs { + // Remove all xattrs + fs.filter_xattrs(|_| false); + } else if args.user_xattrs { + // Keep only user.* xattrs + fs.filter_xattrs(|name| name.as_encoded_bytes().starts_with(b"user.")); + } + + // Handle --use-epoch (set all mtimes to 0) + if args.use_epoch { + set_all_mtimes_to_epoch(fs); + } + + // Handle --skip-devices (remove device nodes) + if args.skip_devices { + remove_device_nodes(fs); + } + + // Always add overlay whiteout stubs (we always produce V1/C-compatible format). + // Note: The overlay.opaque xattr is added by the writer (not here) to ensure + // it's not escaped by the trusted.overlay.* escaping logic. + fs.add_overlay_whiteouts(); + + Ok(()) +} + +/// Set all modification times in the filesystem to Unix epoch (0). +fn set_all_mtimes_to_epoch(fs: &mut FileSystem) { + fs.for_each_stat_mut(|stat| { + stat.st_mtim_sec = 0; + stat.st_mtim_nsec = 0; + }); +} + +/// Remove all device nodes (block and character devices) from the filesystem. 
+fn remove_device_nodes(fs: &mut FileSystem) { + use composefs::generic_tree::{Inode, LeafContent}; + + type Leaf = composefs::generic_tree::Leaf>; + type Dir = composefs::generic_tree::Directory>; + + fn process_dir(dir: &mut Dir, leaves: &[Leaf]) { + // First, collect names of subdirectories to process + let subdir_names: Vec = dir + .entries() + .filter_map(|(name, inode)| { + if matches!(inode, Inode::Directory(_)) { + Some(name.to_os_string()) + } else { + None + } + }) + .collect(); + + // Recursively process subdirectories + for name in subdir_names { + if let Ok(subdir) = dir.get_directory_mut(&name) { + process_dir(subdir, leaves); + } + } + + // Collect names of device nodes to remove + let devices_to_remove: Vec = dir + .entries() + .filter_map(|(name, inode)| { + if let Inode::Leaf(leaf_id, _) = inode + && matches!( + leaves[leaf_id.0].content, + LeafContent::BlockDevice(_) | LeafContent::CharacterDevice(_) + ) + { + return Some(name.to_os_string()); + } + None + }) + .collect(); + + // Remove device nodes + for name in devices_to_remove { + dir.remove(&name); + } + } + + // Split struct field borrows: Rust allows borrowing different fields simultaneously. + let FileSystem { root, leaves, .. } = fs; + process_dir(root, leaves); + + // Compact the leaves table to remove entries now unreferenced after + // device-node removal. Without this, fs.fsck() would report orphaned leaves. 
+ fs.compact(); +} diff --git a/crates/composefs-integration-tests/src/lib.rs b/crates/composefs-integration-tests/src/lib.rs index c86ebfb1..b9decb88 100644 --- a/crates/composefs-integration-tests/src/lib.rs +++ b/crates/composefs-integration-tests/src/lib.rs @@ -11,8 +11,8 @@ use std::process::Command; use std::sync::Arc; use anyhow::Result; -use composefs_oci::composefs::fsverity::{Algorithm, Sha256HashValue}; -use composefs_oci::composefs::repository::Repository; +use composefs_oci::composefs::fsverity::Sha256HashValue; +use composefs_oci::composefs::repository::{Repository, RepositoryConfig}; use tempfile::TempDir; /// A test function that returns a Result. @@ -110,9 +110,11 @@ pub fn create_test_repository(tempdir: &TempDir) -> Result::init_path(&fd, ".", Algorithm::SHA256, false)?; - repo.set_insecure(); + let (repo, _created) = Repository::::init_path( + &fd, + ".", + RepositoryConfig::default().set_insecure(), + )?; Ok(Arc::new(repo)) } diff --git a/crates/composefs-integration-tests/src/tests/cli.rs b/crates/composefs-integration-tests/src/tests/cli.rs index a3dda2e9..3250a878 100644 --- a/crates/composefs-integration-tests/src/tests/cli.rs +++ b/crates/composefs-integration-tests/src/tests/cli.rs @@ -19,10 +19,18 @@ const OCI_LAYOUT_COMPOSEFS_ID: &str = "f26c6eb439749b82f0d1520e83455bb21766572fb /// Create a fresh initialized insecure repository in a tempdir. /// /// Returns the tempdir (for lifetime) and the path to the repo. +/// +/// Creates a V2 (legacy) EROFS repo explicitly so that tests which compare +/// against pinned V2 digests (e.g. `OCI_LAYOUT_COMPOSEFS_ID`) continue to +/// work correctly now that `cfsctl init` defaults to V1. 
fn init_insecure_repo(sh: &Shell, cfsctl: &std::path::Path) -> Result { let repo_dir = tempfile::tempdir()?; let repo = repo_dir.path(); - cmd!(sh, "{cfsctl} --repo {repo} init --insecure").read()?; + cmd!( + sh, + "{cfsctl} --repo {repo} init --insecure --erofs-version 2" + ) + .read()?; Ok(repo_dir) } diff --git a/crates/composefs-integration-tests/src/tests/privileged.rs b/crates/composefs-integration-tests/src/tests/privileged.rs index de15ba63..8609a57d 100644 --- a/crates/composefs-integration-tests/src/tests/privileged.rs +++ b/crates/composefs-integration-tests/src/tests/privileged.rs @@ -15,7 +15,7 @@ use anyhow::{Context, Result, bail, ensure}; use xshell::{Shell, cmd}; use composefs_oci::composefs::fsverity::{FsVerityHashValue, Sha256HashValue, Sha512HashValue}; -use composefs_oci::composefs::repository::Repository; +use composefs_oci::composefs::repository::{Repository, RepositoryConfig}; use crate::{cfsctl, integration_test}; @@ -657,8 +657,11 @@ fn init_insecure_repo_at( rustix::fs::OFlags::CLOEXEC | rustix::fs::OFlags::RDONLY, 0.into(), )?; - let (mut repo, _created) = Repository::::init_path(&fd, ".", algorithm, false)?; - repo.set_insecure(); + let (repo, _created) = Repository::::init_path( + &fd, + ".", + RepositoryConfig::new(algorithm).set_insecure(), + )?; Ok(Arc::new(repo)) } diff --git a/crates/composefs-oci/src/boot.rs b/crates/composefs-oci/src/boot.rs index 726ded80..ec67cb18 100644 --- a/crates/composefs-oci/src/boot.rs +++ b/crates/composefs-oci/src/boot.rs @@ -86,6 +86,7 @@ pub fn remove_boot_image( #[cfg(all(test, feature = "boot"))] mod test { use super::*; + use composefs::erofs::format::FormatVersion; use composefs::fsverity::Sha256HashValue; use composefs::test::TestRepo; use composefs_boot::bootloader::get_boot_resources; @@ -121,7 +122,7 @@ mod test { assert_eq!(oci.boot_image_ref(), Some(&image_verity)); let plain_image = crate::image::create_filesystem(repo, &img.config_digest, None).unwrap(); - let plain_verity = 
plain_image.compute_image_id(); + let plain_verity = plain_image.compute_image_id(FormatVersion::V2); assert_ne!( image_verity, plain_verity, "boot-transformed image should differ from non-transformed image" diff --git a/crates/composefs-oci/src/image.rs b/crates/composefs-oci/src/image.rs index 14a8ae2f..44c05189 100644 --- a/crates/composefs-oci/src/image.rs +++ b/crates/composefs-oci/src/image.rs @@ -157,6 +157,7 @@ mod test { use composefs::{ dumpfile::write_dumpfile, fsverity::Sha256HashValue, + repository::RepositoryConfig, tree::{LeafContent, RegularFile, Stat}, }; use std::{collections::BTreeMap, io::BufRead, path::PathBuf}; @@ -171,6 +172,7 @@ mod test { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), }, item: TarItem::Leaf(LeafContent::Regular(RegularFile::Inline([].into()))), @@ -185,6 +187,7 @@ mod test { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), }, item: TarItem::Directory, @@ -344,8 +347,7 @@ mod test { let (repo, _) = Repository::::init_path( CWD, &repo_path, - composefs::fsverity::Algorithm::SHA256, - false, + RepositoryConfig::default().set_insecure(), )?; let repo = Arc::new(repo); let (verity, _stats) = diff --git a/crates/composefs-oci/src/lib.rs b/crates/composefs-oci/src/lib.rs index f91cfedd..349eea71 100644 --- a/crates/composefs-oci/src/lib.rs +++ b/crates/composefs-oci/src/lib.rs @@ -806,7 +806,11 @@ mod test { use rustix::fs::CWD; - use composefs::{fsverity::Sha256HashValue, repository::Repository, test::tempdir}; + use composefs::{ + fsverity::Sha256HashValue, + repository::{Repository, RepositoryConfig}, + test::tempdir, + }; use super::*; @@ -843,13 +847,9 @@ mod test { fn create_test_repo() -> (tempfile::TempDir, Arc>) { let dir = tempdir(); let repo_path = dir.path().join("repo"); - let (repo, _) = Repository::init_path( - CWD, - &repo_path, - composefs::fsverity::Algorithm::SHA256, - false, - ) - .expect("initializing test repo"); + let (repo, _) = + 
Repository::init_path(CWD, &repo_path, RepositoryConfig::default().set_insecure()) + .expect("initializing test repo"); (dir, Arc::new(repo)) } diff --git a/crates/composefs-oci/src/oci_layout.rs b/crates/composefs-oci/src/oci_layout.rs index b857c7cc..b8d51701 100644 --- a/crates/composefs-oci/src/oci_layout.rs +++ b/crates/composefs-oci/src/oci_layout.rs @@ -415,8 +415,7 @@ mod tests { let (repo, _) = composefs::repository::Repository::::init_path( rustix::fs::CWD, &repo_path, - composefs::fsverity::Algorithm::SHA256, - false, + composefs::repository::RepositoryConfig::default().set_insecure(), ) .unwrap(); let repo = std::sync::Arc::new(repo); diff --git a/crates/composefs-oci/src/tar.rs b/crates/composefs-oci/src/tar.rs index 1fde0246..1e2662cc 100644 --- a/crates/composefs-oci/src/tar.rs +++ b/crates/composefs-oci/src/tar.rs @@ -456,6 +456,7 @@ pub fn get_entry( st_gid: entry.gid as u32, st_mode: entry.mode, st_mtim_sec: entry.mtime as i64, + st_mtim_nsec: 0, xattrs, }, item, @@ -475,7 +476,9 @@ mod tests { use super::*; use composefs::{ - fsverity::Sha256HashValue, generic_tree::LeafContent, repository::Repository, + fsverity::Sha256HashValue, + generic_tree::LeafContent, + repository::{Repository, RepositoryConfig}, splitstream::SplitStreamReader, }; use std::{io::Read, path::Path, sync::Arc}; @@ -493,8 +496,7 @@ mod tests { let (repo, _) = Repository::init_path( rustix::fs::CWD, &repo_path, - composefs::fsverity::Algorithm::SHA256, - false, + RepositoryConfig::default().set_insecure(), )?; // Store tempdir in static to keep it alive diff --git a/crates/composefs-oci/src/test_util.rs b/crates/composefs-oci/src/test_util.rs index 73829c4a..2df99a5c 100644 --- a/crates/composefs-oci/src/test_util.rs +++ b/crates/composefs-oci/src/test_util.rs @@ -24,7 +24,7 @@ use crate::oci_image::write_manifest; use crate::skopeo::OCI_CONFIG_CONTENT_TYPE; use composefs::dumpfile_parse::{Entry, Item}; use composefs::fsverity::Sha256HashValue; -use 
composefs::repository::Repository; +use composefs::repository::{Repository, RepositoryConfig}; use containers_image_proxy::oci_spec::image::{ ConfigBuilder, DescriptorBuilder, Digest as OciDigest, ImageConfigurationBuilder, ImageManifestBuilder, MediaType, RootFsBuilder, @@ -639,13 +639,11 @@ pub async fn create_bootable_image( /// paths rather than `Repository` handles. Opens the repo, creates the /// image with `create_base_image`, generates the EROFS, and returns. pub fn create_test_oci_image(repo_path: &std::path::Path, tag: &str) -> anyhow::Result<()> { - let (mut repo, _) = Repository::::init_path( + let (repo, _) = Repository::::init_path( rustix::fs::CWD, repo_path, - composefs::fsverity::Algorithm::SHA256, - false, + RepositoryConfig::default().set_insecure(), )?; - repo.set_insecure(); let repo = Arc::new(repo); let rt = tokio::runtime::Runtime::new()?; rt.block_on(create_base_image(&repo, Some(tag))); @@ -663,13 +661,11 @@ pub fn create_test_bootable_oci_image( repo_path: &std::path::Path, tag: &str, ) -> anyhow::Result<()> { - let (mut repo, _) = Repository::::init_path( + let (repo, _) = Repository::::init_path( rustix::fs::CWD, repo_path, - composefs::fsverity::Algorithm::SHA256, - false, + RepositoryConfig::default().set_insecure(), )?; - repo.set_insecure(); let repo = Arc::new(repo); let rt = tokio::runtime::Runtime::new()?; let img = rt.block_on(create_bootable_image(&repo, Some(tag), 1)); diff --git a/crates/composefs/Cargo.toml b/crates/composefs/Cargo.toml index d8c78357..9a365956 100644 --- a/crates/composefs/Cargo.toml +++ b/crates/composefs/Cargo.toml @@ -19,6 +19,7 @@ test = ["tempfile"] anyhow = { version = "1.0.87", default-features = false } composefs-ioctls = { workspace = true } serde = { version = "1.0", default-features = false, features = ["derive", "alloc"] } +serde_repr = "0.1" fn-error-context = "0.2" hex = { version = "0.4.0", default-features = false, features = ["std"] } log = { version = "0.4.8", default-features = false } 
diff --git a/crates/composefs/fuzz/Cargo.lock b/crates/composefs/fuzz/Cargo.lock index e8640e0a..98909985 100644 --- a/crates/composefs/fuzz/Cargo.lock +++ b/crates/composefs/fuzz/Cargo.lock @@ -66,7 +66,7 @@ dependencies = [ [[package]] name = "composefs" -version = "0.3.0" +version = "0.4.0" dependencies = [ "anyhow", "composefs-ioctls", @@ -78,6 +78,7 @@ dependencies = [ "rustix", "serde", "serde_json", + "serde_repr", "sha2", "thiserror", "tokio", @@ -97,7 +98,7 @@ dependencies = [ [[package]] name = "composefs-ioctls" -version = "0.3.0" +version = "0.4.0" dependencies = [ "rustix", "thiserror", @@ -460,6 +461,17 @@ dependencies = [ "zmij", ] +[[package]] +name = "serde_repr" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "sha2" version = "0.11.0" diff --git a/crates/composefs/fuzz/generate_corpus.rs b/crates/composefs/fuzz/generate_corpus.rs index dc179f76..b8904290 100644 --- a/crates/composefs/fuzz/generate_corpus.rs +++ b/crates/composefs/fuzz/generate_corpus.rs @@ -12,7 +12,8 @@ use std::ffi::{OsStr, OsString}; use std::fs; use std::path::Path; -use composefs::erofs::writer::mkfs_erofs; +use composefs::erofs::format::FormatVersion; +use composefs::erofs::writer::{ValidatedFileSystem, mkfs_erofs, mkfs_erofs_versioned}; use composefs::fsverity::{FsVerityHashValue, Sha256HashValue}; use composefs::generic_tree::{self, LeafContent, Stat}; use composefs::tree::{self, FileSystem, RegularFile}; @@ -27,6 +28,7 @@ fn stat(mode: u32, uid: u32, gid: u32, mtime: i64) -> Stat { st_uid: uid, st_gid: gid, st_mtim_sec: mtime, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), } } @@ -55,20 +57,37 @@ fn insert_dir<'a>(parent: &'a mut Dir, name: &str, s: Stat) -> &'a mut Dir { parent.get_directory_mut(OsStr::new(name)).unwrap() } +/// Generate both V1 and V2 images for a filesystem, 
pushing them into seeds. +/// +/// The V2 image uses the name as-is. The V1 image appends "_v1" to the name. +/// For V1, overlay whiteouts are added before writing (required for C compat). +fn push_both_versions( + seeds: &mut Vec<(String, Vec)>, + name: &str, + build_fs: impl Fn() -> FileSystem, +) { + // V2 (default) + let fs = build_fs(); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); + seeds.push((name.to_string(), image.into())); + + // V1 (C-compatible) + let mut fs = build_fs(); + fs.add_overlay_whiteouts(); + let image = mkfs_erofs_versioned(&ValidatedFileSystem::new(fs).unwrap(), FormatVersion::V1); + seeds.push((format!("{name}_v1"), image.into())); +} + fn main() { let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); - let mut seeds: Vec<(&str, Vec)> = Vec::new(); + let mut seeds: Vec<(String, Vec)> = Vec::new(); // 1. Empty root - { - let fs = empty_root(); - let image = mkfs_erofs(&fs); - seeds.push(("empty_root", image.into())); - } + push_both_versions(&mut seeds, "empty_root", empty_root); // 2. Single inline file (small content stored in inode) - { + push_both_versions(&mut seeds, "single_inline_file", || { let mut fs = empty_root(); let id = fs.push_leaf( file_stat(), @@ -77,12 +96,11 @@ fn main() { )), ); fs.root.insert(OsStr::new("hello.txt"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("single_inline_file", image.into())); - } + fs + }); // 3. Single external (chunk-based) regular file - { + push_both_versions(&mut seeds, "single_external_file", || { let mut fs = empty_root(); let hash = Sha256HashValue::EMPTY; let id = fs.push_leaf( @@ -90,66 +108,60 @@ fn main() { LeafContent::Regular(RegularFile::External(hash, 65536)), ); fs.root.insert(OsStr::new("data.bin"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("single_external_file", image.into())); - } + fs + }); // 4. 
Symlink - { + push_both_versions(&mut seeds, "symlink", || { let mut fs = empty_root(); let id = fs.push_leaf( stat(0o777, 0, 0, 0), LeafContent::Symlink(OsString::from("/target/path").into_boxed_os_str()), ); fs.root.insert(OsStr::new("link"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("symlink", image.into())); - } + fs + }); // 5. FIFO - { + push_both_versions(&mut seeds, "fifo", || { let mut fs = empty_root(); let id = fs.push_leaf(file_stat(), LeafContent::Fifo); fs.root.insert(OsStr::new("mypipe"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("fifo", image.into())); - } + fs + }); // 6. Character device - { + push_both_versions(&mut seeds, "chardev", || { let mut fs = empty_root(); let id = fs.push_leaf( stat(0o666, 0, 0, 0), LeafContent::CharacterDevice(makedev(1, 3)), ); fs.root.insert(OsStr::new("null"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("chardev", image.into())); - } + fs + }); // 7. Block device - { + push_both_versions(&mut seeds, "blockdev", || { let mut fs = empty_root(); let id = fs.push_leaf( stat(0o660, 0, 6, 0), LeafContent::BlockDevice(makedev(8, 0)), ); fs.root.insert(OsStr::new("sda"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("blockdev", image.into())); - } + fs + }); // 8. Socket - { + push_both_versions(&mut seeds, "socket", || { let mut fs = empty_root(); let id = fs.push_leaf(file_stat(), LeafContent::Socket); fs.root.insert(OsStr::new("mysock"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("socket", image.into())); - } + fs + }); // 9. 
Nested directories: /a/b/c/file - { + push_both_versions(&mut seeds, "nested_dirs", || { let mut fs = empty_root(); let id = fs.push_leaf( file_stat(), @@ -161,12 +173,11 @@ fn main() { let b = insert_dir(a, "b", dir_stat()); let c = insert_dir(b, "c", dir_stat()); c.insert(OsStr::new("file"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("nested_dirs", image.into())); - } + fs + }); // 10. Many entries (20+ files to exercise multi-block directories) - { + push_both_versions(&mut seeds, "many_entries", || { let mut fs = empty_root(); for i in 0..25 { let name = format!("file_{i:03}"); @@ -179,12 +190,11 @@ fn main() { ); fs.root.insert(OsStr::new(&name), Inode::leaf(id)); } - let image = mkfs_erofs(&fs); - seeds.push(("many_entries", image.into())); - } + fs + }); // 11. Extended attributes - { + push_both_versions(&mut seeds, "xattrs", || { let mut fs = empty_root(); let mut xattrs = BTreeMap::new(); xattrs.insert( @@ -206,12 +216,11 @@ fn main() { )), ); fs.root.insert(OsStr::new("xattr_file"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("xattrs", image.into())); - } + fs + }); // 12. Mixed types — one of every file type in a single directory - { + push_both_versions(&mut seeds, "mixed_types", || { let mut fs = empty_root(); let ids = [ fs.push_leaf( @@ -246,9 +255,8 @@ fn main() { LeafContent::Regular(RegularFile::External(hash, 4096)), ); fs.root.insert(OsStr::new("external"), Inode::leaf(ext_id)); - let image = mkfs_erofs(&fs); - seeds.push(("mixed_types", image.into())); - } + fs + }); // 13. Hardlink — two entries sharing the same LeafId (nlink > 1) { @@ -263,12 +271,12 @@ fn main() { .insert(OsStr::new("original"), Inode::leaf(shared_id)); fs.root .insert(OsStr::new("hardlink"), Inode::leaf(shared_id)); - let image = mkfs_erofs(&fs); - seeds.push(("hardlink", image.into())); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); + seeds.push(("hardlink".to_string(), image.into())); } // 14. 
Large inline — file with maximum inline content (just under 4096 bytes) - { + push_both_versions(&mut seeds, "large_inline", || { let mut fs = empty_root(); let content = vec![0xABu8; 4000]; // just under block size let id = fs.push_leaf( @@ -277,12 +285,11 @@ fn main() { ); fs.root .insert(OsStr::new("large_inline.bin"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("large_inline", image.into())); - } + fs + }); // 15. Deep nesting — 8 levels of directories - { + push_both_versions(&mut seeds, "deep_nesting", || { let mut fs = empty_root(); let id = fs.push_leaf( file_stat(), @@ -296,12 +303,11 @@ fn main() { current = insert_dir(current, name, dir_stat()); } current.insert(OsStr::new("deep_file"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("deep_nesting", image.into())); - } + fs + }); // 16. Nonzero mtime - { + push_both_versions(&mut seeds, "nonzero_mtime", || { let mut fs = FileSystem::new(stat(0o755, 0, 0, 1000000)); let id1 = fs.push_leaf( stat(0o644, 0, 0, 500000), @@ -317,12 +323,11 @@ fn main() { ); fs.root.insert(OsStr::new("old"), Inode::leaf(id1)); fs.root.insert(OsStr::new("new"), Inode::leaf(id2)); - let image = mkfs_erofs(&fs); - seeds.push(("nonzero_mtime", image.into())); - } + fs + }); // 17. 
Large uid/gid — forces extended inodes - { + push_both_versions(&mut seeds, "large_uid_gid", || { let big_id = u16::MAX as u32 + 1; // 65536, won't fit in u16 let mut fs = FileSystem::new(stat(0o755, big_id, big_id, 0)); let id = fs.push_leaf( @@ -332,9 +337,8 @@ fn main() { )), ); fs.root.insert(OsStr::new("bigids.txt"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("large_uid_gid", image.into())); - } + fs + }); // Write seeds to corpus directories for both fuzz targets let targets = ["read_image", "debug_image"]; diff --git a/crates/composefs/proptest-regressions/erofs/reader.txt b/crates/composefs/proptest-regressions/erofs/reader.txt new file mode 100644 index 00000000..40e5b6ca --- /dev/null +++ b/crates/composefs/proptest-regressions/erofs/reader.txt @@ -0,0 +1,8 @@ +# Seeds for failure cases proptest has generated in the past. It is +# automatically read and these particular cases re-run before any +# novel cases are generated. +# +# It is recommended to check this file in to source control so that +# everyone who runs the test benefits from these saved cases. 
+cc 6938de6542fd6c10b28ba78a0b5c0a8754da1fa13340f4952df34bf43c913f6b # shrinks to spec = FsSpec { root: DirSpec { stat: Stat { st_mode: 0, st_uid: 0, st_gid: 0, st_mtim_sec: 0, st_mtim_nsec: 0, xattrs: {} }, leaves: [], subdirs: [("A", DirSpec { stat: Stat { st_mode: 0, st_uid: 0, st_gid: 0, st_mtim_sec: 0, st_mtim_nsec: 0, xattrs: {} }, leaves: [("a", LeafSpec { stat: Stat { st_mode: 0, st_uid: 0, st_gid: 0, st_mtim_sec: 0, st_mtim_nsec: 1, xattrs: {} }, content: Inline([]) })], subdirs: [] })] }, hardlinks: [HardlinkSpec { source_index: 0, link_name: "G4._s_z6._cbp" }, HardlinkSpec { source_index: 0, link_name: "C1-1Pgx_Cg2g" }, HardlinkSpec { source_index: 0, link_name: "îA\xEB\xCE$\xE6Z\x90K^\u{1d}\xC8\u{18}s\u{10}\x81\u{3}E\xAA" }] } +cc 0ddc52acd61b4976d1e5e21694863a9a3dc6fd2a0af6b620b379c6dcb5603c48 # shrinks to spec = FsSpec { root: DirSpec { stat: Stat { st_mode: 0, st_uid: 0, st_gid: 0, st_mtim_sec: 0, st_mtim_nsec: 0, xattrs: {} }, leaves: [], subdirs: [("A", DirSpec { stat: Stat { st_mode: 0, st_uid: 0, st_gid: 0, st_mtim_sec: 0, st_mtim_nsec: 0, xattrs: {} }, leaves: [("0_AA-aA-00-a0Aa_AA-0aA-aA000A-0_0_A__aAaa0a-_-__Aa0_a0A_aaa--aA0-A_aa-A0-0a0-aAaA0aAa__0AAA0A0-0aaa-a_aA000AA_-A00-a000-0aaaA--0AaAa_a0AaaA_-0Aa-_A0aaA0aA-0A0aaaaA_a0a00__A-aA__aA-A0A-00-00a0a-_A0A0A0_a0A-00aa0AAN5.n.bQHGB_-7GJbF-RrX0alT.t-KOi-S_B-_....Td", LeafSpec { stat: Stat { st_mode: 2755, st_uid: 37448, st_gid: 1536, st_mtim_sec: 341456497, st_mtim_nsec: 880834887, xattrs: {"lustre.lov": [164, 134, 7, 253, 237, 177, 226, 6, 175, 72, 217, 116], "system.posix_acl_default": [178, 49, 193, 209, 177, 17, 102, 91, 120, 161, 152], "user.test_1": [60, 197, 53], "user.test_4": [175, 169, 100, 201, 234, 81, 68, 205, 62, 158, 13]} }, content: Symlink("\x9B\u{1f}\x88\xB5K\xFC\x89uy\\\xD9\xC6\u{c}\u{7}\xA8") })], subdirs: [] })] }, hardlinks: [] } diff --git a/crates/composefs/src/dumpfile.rs b/crates/composefs/src/dumpfile.rs index 7143715d..8b3d6253 100644 --- 
a/crates/composefs/src/dumpfile.rs +++ b/crates/composefs/src/dumpfile.rs @@ -114,11 +114,12 @@ fn write_entry( let uid = stat.st_uid; let gid = stat.st_gid; let mtim_sec = stat.st_mtim_sec; + let mtim_nsec = stat.st_mtim_nsec; write_escaped(writer, path.as_os_str().as_bytes())?; write!( writer, - " {size} {mode:o} {nlink} {uid} {gid} {rdev} {mtim_sec}.0 " + " {size} {mode:o} {nlink} {uid} {gid} {rdev} {mtim_sec}.{mtim_nsec} " )?; write_escaped(writer, payload.as_ref().as_bytes())?; write!(writer, " ")?; @@ -422,7 +423,7 @@ pub fn add_entry_to_filesystem( // Handle root directory specially if path == Path::new("/") { - let stat = entry_to_stat(&entry); + let stat = entry_to_stat(&entry)?; fs.set_root_stat(stat); return Ok(()); } @@ -439,7 +440,7 @@ pub fn add_entry_to_filesystem( // Convert the entry to an inode let inode = match entry.item { Item::Directory { .. } => { - let stat = entry_to_stat(&entry); + let stat = entry_to_stat(&entry)?; Inode::Directory(Box::new(Directory::new(stat))) } Item::Hardlink { ref target } => { @@ -450,7 +451,7 @@ pub fn add_entry_to_filesystem( Inode::leaf(existing_id) } Item::RegularInline { ref content, .. } => { - let stat = entry_to_stat(&entry); + let stat = entry_to_stat(&entry)?; let data: Box<[u8]> = match content { std::borrow::Cow::Borrowed(d) => Box::from(*d), std::borrow::Cow::Owned(d) => d.clone().into_boxed_slice(), @@ -464,7 +465,7 @@ pub fn add_entry_to_filesystem( ref fsverity_digest, .. } => { - let stat = entry_to_stat(&entry); + let stat = entry_to_stat(&entry)?; let digest = fsverity_digest .as_ref() .ok_or_else(|| anyhow::anyhow!("External file missing fsverity digest"))?; @@ -473,10 +474,19 @@ pub fn add_entry_to_filesystem( let id = push_leaf(fs, stat, content); Inode::leaf(id) } - Item::Device { rdev, .. 
} => { - let stat = entry_to_stat(&entry); + Item::Device { rdev, nlink } => { // S_IFMT = 0o170000, S_IFBLK = 0o60000, S_IFCHR = 0o20000 - let content = if entry.mode & 0o170000 == 0o60000 { + let is_chardev = entry.mode & 0o170000 != 0o60000; + // A whiteout is a character device with rdev=0; hardlinked whiteouts + // are invalid because composefs cannot represent them correctly. + if is_chardev && rdev == 0 && nlink > 1 { + anyhow::bail!( + "invalid dumpfile: whiteout entry {:?} has nlink > 1", + entry.path + ); + } + let stat = entry_to_stat(&entry)?; + let content = if !is_chardev { LeafContent::BlockDevice(rdev) } else { LeafContent::CharacterDevice(rdev) @@ -485,7 +495,7 @@ pub fn add_entry_to_filesystem( Inode::leaf(id) } Item::Symlink { ref target, .. } => { - let stat = entry_to_stat(&entry); + let stat = entry_to_stat(&entry)?; let target_os: Box = match target { std::borrow::Cow::Borrowed(t) => Box::from(t.as_os_str()), std::borrow::Cow::Owned(t) => Box::from(t.as_os_str()), @@ -495,11 +505,17 @@ pub fn add_entry_to_filesystem( Inode::leaf(id) } Item::Fifo { .. } => { - let stat = entry_to_stat(&entry); + let stat = entry_to_stat(&entry)?; let content = LeafContent::Fifo; let id = push_leaf(fs, stat, content); Inode::leaf(id) } + Item::Socket { .. } => { + let stat = entry_to_stat(&entry)?; + let content = LeafContent::Socket; + let id = push_leaf(fs, stat, content); + Inode::leaf(id) + } }; // Store LeafIds in the hardlinks map for future hardlink lookups @@ -521,7 +537,7 @@ pub fn add_entry_to_filesystem( } /// Convert a dumpfile Entry's metadata into a tree Stat structure. 
-fn entry_to_stat(entry: &Entry<'_>) -> Stat { +fn entry_to_stat(entry: &Entry<'_>) -> Result { let mut xattrs = BTreeMap::new(); for xattr in &entry.xattrs { let key: Box = match &xattr.key { @@ -535,13 +551,19 @@ fn entry_to_stat(entry: &Entry<'_>) -> Stat { xattrs.insert(key, value); } - Stat { + let nsec = entry.mtime.nsec; + if nsec >= 1_000_000_000 { + anyhow::bail!("Invalid mtime nanoseconds: {nsec} (must be < 1_000_000_000)"); + } + + Ok(Stat { st_mode: entry.mode & 0o7777, // Keep only permission bits st_uid: entry.uid, st_gid: entry.gid, st_mtim_sec: entry.mtime.sec as i64, + st_mtim_nsec: nsec as u32, xattrs, - } + }) } /// Parse a dumpfile string and build a complete FileSystem. @@ -566,7 +588,7 @@ pub fn dumpfile_to_filesystem( "Dumpfile must start with root directory entry, found: {:?}", entry.path ); - break entry_to_stat(&entry); + break entry_to_stat(&entry)?; } None => anyhow::bail!("Dumpfile is empty, expected root directory entry"), } @@ -591,6 +613,19 @@ pub fn dumpfile_to_filesystem( Ok(fs) } +/// Parse a composefs dumpfile string and validate the resulting filesystem +/// for EROFS serialization. +/// +/// Combines [`dumpfile_to_filesystem`] with [`ValidatedFileSystem::new`]. +/// Returns an error if the dumpfile is malformed or if the resulting +/// filesystem violates EROFS invariants (e.g. hardlinked whiteouts). 
+pub fn dumpfile_to_validated_filesystem( + dumpfile: &str, +) -> anyhow::Result> { + let fs = dumpfile_to_filesystem(dumpfile)?; + crate::erofs::writer::ValidatedFileSystem::new(fs) +} + #[cfg(test)] mod tests { use super::*; @@ -724,6 +759,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), }); let leaf_id = fs.push_leaf( @@ -732,6 +768,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs, }, LeafContent::Regular(RegularFile::Inline(b"test".to_vec().into())), @@ -757,6 +794,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), }; @@ -793,6 +831,23 @@ mod tests { Ok(()) } + /// A whiteout (chardev, rdev=0) with nlink > 1 must be rejected. + #[test] + fn test_hardlinked_whiteout_rejected() { + // /foo 0 20000 2 0 0 0 0.0 - - - + // ^size ^mode ^nlink ^uid ^gid ^rdev ^mtime ^payload ^digest ^xattrs + // mode 20000 = S_IFCHR (character device), rdev=0 → whiteout, nlink=2 + let dumpfile = "/ 0 40755 2 0 0 0 0.0 - - -\n\ + /foo 0 20000 2 0 0 0 0.0 - - -\n"; + let result = dumpfile_to_filesystem::(dumpfile); + let err = result.expect_err("hardlinked whiteout must be rejected"); + let msg = format!("{err:#}"); + assert!( + msg.contains("nlink"), + "error should mention nlink, got: {msg}" + ); + } + /// Helper to escape bytes through write_escaped and return the result. 
fn escaped(bytes: &[u8]) -> String { let mut out = String::new(); diff --git a/crates/composefs/src/dumpfile_parse.rs b/crates/composefs/src/dumpfile_parse.rs index f8cccdd8..f01a28a3 100644 --- a/crates/composefs/src/dumpfile_parse.rs +++ b/crates/composefs/src/dumpfile_parse.rs @@ -121,6 +121,11 @@ pub enum Item<'p> { /// Number of links nlink: u32, }, + /// A Unix domain socket + Socket { + /// Number of links + nlink: u32, + }, /// A directory Directory { /// Number of links @@ -482,7 +487,10 @@ impl<'p> Entry<'p> { Item::Directory { nlink } } FileType::Socket => { - anyhow::bail!("sockets are not supported"); + Self::check_nonregfile(content, fsverity_digest)?; + Self::check_rdev(rdev)?; + + Item::Socket { nlink } } FileType::Unknown => { anyhow::bail!("Unhandled file type from raw mode: {mode}") @@ -532,6 +540,7 @@ impl Item<'_> { Item::Symlink { nlink, .. } => *nlink, Item::Directory { nlink, .. } => *nlink, Item::Fifo { nlink, .. } => *nlink, + Item::Socket { nlink, .. } => *nlink, _ => 0, } } diff --git a/crates/composefs/src/erofs/composefs.rs b/crates/composefs/src/erofs/composefs.rs index 3acf1844..960064fa 100644 --- a/crates/composefs/src/erofs/composefs.rs +++ b/crates/composefs/src/erofs/composefs.rs @@ -7,19 +7,23 @@ use zerocopy::{FromBytes, Immutable, IntoBytes, KnownLayout}; use crate::fsverity::FsVerityHashValue; -/* From linux/fs/overlayfs/overlayfs.h struct ovl_metacopy */ +/// Overlay metacopy xattr structure for fs-verity digest storage. +/// +/// From linux/fs/overlayfs/overlayfs.h struct ovl_metacopy #[derive(Debug, FromBytes, Immutable, KnownLayout, IntoBytes)] #[repr(C)] -pub(super) struct OverlayMetacopy { +pub struct OverlayMetacopy { version: u8, len: u8, flags: u8, digest_algo: u8, - pub(super) digest: H, + /// The fs-verity digest value. + pub digest: H, } impl OverlayMetacopy { - pub(super) fn new(digest: &H) -> Self { + /// Creates a new overlay metacopy entry with the given digest. 
+ pub fn new(digest: &H) -> Self { Self { version: 0, len: size_of::() as u8, @@ -29,7 +33,8 @@ impl OverlayMetacopy { } } - pub(super) fn valid(&self) -> bool { + /// Checks whether this metacopy entry is valid. + pub fn valid(&self) -> bool { self.version == 0 && self.len == size_of::() as u8 && self.flags == 0 diff --git a/crates/composefs/src/erofs/debug.rs b/crates/composefs/src/erofs/debug.rs index d384d7de..e4905cf7 100644 --- a/crates/composefs/src/erofs/debug.rs +++ b/crates/composefs/src/erofs/debug.rs @@ -341,64 +341,6 @@ impl<'img> ImageVisitor<'img> { } } - fn visit_directory_block(&mut self, block: &DirectoryBlock, path: &Path) -> Result<()> { - for entry in block.entries()? { - let entry = entry?; - if entry.name == b"." || entry.name == b".." { - // TODO: maybe we want to follow those and let deduplication happen - continue; - } - self.visit_inode( - entry.header.inode_offset.get(), - &path.join(OsStr::from_bytes(entry.name)), - )?; - } - Ok(()) - } - - fn visit_inode(&mut self, id: u64, path: &Path) -> Result<()> { - let inode = self.image.inode(id)?; - let segment = match inode { - InodeType::Compact(inode) => SegmentType::CompactInode(inode), - InodeType::Extended(inode) => SegmentType::ExtendedInode(inode), - }; - if self.note(segment, Some(path))? { - // TODO: maybe we want to throw an error if we detect loops - /* already processed */ - return Ok(()); - } - - if let Some(xattrs) = inode.xattrs()? { - for id in xattrs.shared()? { - self.note( - SegmentType::XAttr(self.image.shared_xattr(id.get())?), - Some(path), - )?; - } - } - - if inode.mode().is_dir() { - if let Some(inline) = inode.inline() { - let inline_block = DirectoryBlock::ref_from_bytes(inline) - .map_err(|_| anyhow::anyhow!("invalid inline directory block"))?; - self.visit_directory_block(inline_block, path)?; - } - - for id in self.image.inode_blocks(&inode)? 
{ - let block = self.image.directory_block(id)?; - self.visit_directory_block(block, path)?; - self.note(SegmentType::DirectoryBlock(block), Some(path))?; - } - } else { - for id in self.image.inode_blocks(&inode)? { - let block = self.image.data_block(id)?; - self.note(SegmentType::DataBlock(block), Some(path))?; - } - } - - Ok(()) - } - #[allow(clippy::type_complexity)] fn visit_image( image: &'img Image<'img>, @@ -409,7 +351,70 @@ impl<'img> ImageVisitor<'img> { }; this.note(SegmentType::Header(image.header), None)?; this.note(SegmentType::Superblock(image.sb), None)?; - this.visit_inode(image.sb.root_nid.get() as u64, &PathBuf::from("/"))?; + + // Iterative traversal: push (nid, path) pairs rather than recursing. + // The previous mutual recursion (visit_inode ↔ visit_directory_block) + // had no depth limit and would stack-overflow on deeply nested images. + // Deduplication is by byte offset via note(), so cycles and hardlinks + // are safe: note() returns true on a second visit and we skip children. + let mut stack: Vec<(u64, PathBuf)> = + vec![(image.sb.root_nid.get() as u64, PathBuf::from("/"))]; + + while let Some((id, path)) = stack.pop() { + let inode = this.image.inode(id)?; + let segment = match inode { + InodeType::Compact(inode) => SegmentType::CompactInode(inode), + InodeType::Extended(inode) => SegmentType::ExtendedInode(inode), + }; + if this.note(segment, Some(&path))? { + // Already visited this byte offset — additional path recorded, skip children. + continue; + } + + if let Some(xattrs) = inode.xattrs()? { + for xid in xattrs.shared()? { + this.note( + SegmentType::XAttr(this.image.shared_xattr(xid.get())?), + Some(&path), + )?; + } + } + + if inode.mode().is_dir() { + if let Some(inline) = inode.inline() { + let inline_block = DirectoryBlock::ref_from_bytes(inline) + .map_err(|_| anyhow::anyhow!("invalid inline directory block"))?; + for entry in inline_block.entries()? { + let entry = entry?; + if entry.name != b"." && entry.name != b".." 
{ + stack.push(( + entry.header.inode_offset.get(), + path.join(OsStr::from_bytes(entry.name)), + )); + } + } + } + for blkid in this.image.inode_blocks(&inode)? { + let block = this.image.directory_block(blkid)?; + for entry in block.entries()? { + let entry = entry?; + if entry.name != b"." && entry.name != b".." { + stack.push(( + entry.header.inode_offset.get(), + path.join(OsStr::from_bytes(entry.name)), + )); + } + } + this.note(SegmentType::DirectoryBlock(block), Some(&path))?; + } + } else { + for blkid in this.image.inode_blocks(&inode)? { + let block = this.image.data_block(blkid)?; + this.note(SegmentType::DataBlock(block), Some(&path))?; + } + } + } + Ok(this.visited) } } diff --git a/crates/composefs/src/erofs/format.rs b/crates/composefs/src/erofs/format.rs index cc5a40a2..0b0e4241 100644 --- a/crates/composefs/src/erofs/format.rs +++ b/crates/composefs/src/erofs/format.rs @@ -81,7 +81,7 @@ const INODE_DATALAYOUT_FLAT_INLINE: u16 = 4; const INODE_DATALAYOUT_CHUNK_BASED: u16 = 8; /// Data layout method for file content storage -#[derive(Debug)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] #[repr(u16)] pub enum DataLayout { /// File data stored in separate blocks @@ -271,11 +271,103 @@ impl std::ops::BitOr for FileType { /// EROFS format version number pub const VERSION: U32 = U32::new(1); -/// Composefs-specific version number +/// Composefs-specific version number (V2, Rust-native format) pub const COMPOSEFS_VERSION: U32 = U32::new(2); +/// Composefs-specific version number for V1 (C-compatible format: compact inodes, whiteout table) +pub const COMPOSEFS_VERSION_V1: U32 = U32::new(0); /// Magic number identifying composefs images pub const COMPOSEFS_MAGIC: U32 = U32::new(0xd078629a); +/// Format version for composefs images +/// +/// This enum represents the different format versions supported by composefs. +/// The format version affects the composefs header version field and build time handling. 
+/// +/// Serialized as an integer: V1 → `1`, V2 → `2`. +#[repr(u32)] +#[derive( + Clone, + Copy, + Debug, + Default, + Hash, + PartialEq, + Eq, + serde_repr::Serialize_repr, + serde_repr::Deserialize_repr, +)] +pub enum FormatVersion { + /// Format V1: compact inodes, whiteout table. + /// + /// This is the original format used by older versions of composefs. + /// Build time is set to the minimum mtime across all inodes. + /// The `composefs_version` header field is 0 normally, but 1 when + /// user-land whiteout files are present (matching C mkcomposefs behavior). + V1 = 1, + /// Format V2: extended inodes, no whiteout table, composefs_version=2 + /// + /// This is the current default format. + #[default] + V2 = 2, +} + +impl FormatVersion { + /// Returns the composefs_version value for this format version + pub fn composefs_version(self) -> U32 { + match self { + FormatVersion::V1 => COMPOSEFS_VERSION_V1, + FormatVersion::V2 => COMPOSEFS_VERSION, + } + } +} + +/// The set of EROFS format versions to generate when committing images. +/// +/// Stored in `meta.json` via the `"v1_erofs"` ro_compat feature flag: +/// flag present → [`V1_ONLY`](Self::V1_ONLY), flag absent → [`BOTH`](Self::BOTH). +/// +/// A `FormatSet` is a small bitset (bit 0 = V1, bit 1 = V2) so it can be +/// cheaply copied and tested without heap allocation. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct FormatSet(u8); + +impl FormatSet { + /// Generate only V1 EROFS (default for new repos; C-tool compatible). + pub const V1_ONLY: FormatSet = FormatSet(0b01); + /// Generate both V1 and V2 EROFS (used by bootc and other multi-format consumers). + pub const BOTH: FormatSet = FormatSet(0b11); + + /// Map a [`FormatVersion`] to its bit position in the `FormatSet` bitset. + /// + /// V1 → bit 0 (`0b01`), V2 → bit 1 (`0b10`). Adding a V3 only requires + /// updating this one function. 
+ fn version_bit(v: FormatVersion) -> u8 { + match v { + FormatVersion::V1 => 0b01, + FormatVersion::V2 => 0b10, + } + } + + /// Returns `true` if this set includes the given format version. + pub fn contains(self, v: FormatVersion) -> bool { + self.0 & Self::version_bit(v) != 0 + } + + /// Iterate over the format versions in this set, in ascending order (V1 before V2). + pub fn iter(self) -> impl Iterator { + [FormatVersion::V1, FormatVersion::V2] + .into_iter() + .filter(move |&v| self.contains(v)) + } +} + +impl From for FormatSet { + /// Create a single-version `FormatSet` from a [`FormatVersion`]. + fn from(v: FormatVersion) -> Self { + FormatSet(FormatSet::version_bit(v)) + } +} + /// Flag indicating the presence of ACL data pub const COMPOSEFS_FLAGS_HAS_ACL: U32 = U32::new(1 << 0); @@ -493,7 +585,52 @@ pub struct XAttrHeader { pub value_size: U16, } -/// Standard xattr name prefixes indexed by name_index +/// EROFS xattr prefix index for `system.posix_acl_access` (index 2). +pub const XATTR_INDEX_POSIX_ACL_ACCESS: u8 = 2; +/// EROFS xattr prefix index for `system.posix_acl_default` (index 3). +pub const XATTR_INDEX_POSIX_ACL_DEFAULT: u8 = 3; +/// EROFS xattr prefix index for `lustre.` (index 5). +/// Absent from C mkcomposefs v1.0.8's prefix table; V1 writer skips it. +pub const XATTR_INDEX_LUSTRE: u8 = 5; + +// Overlay xattr keys used by composefs V1 whiteout escaping. +// Named to match the C mkcomposefs OVERLAY_XATTR_* constants. +/// `trusted.overlay.overlay.whiteout` — V1 escaped whiteout marker. +pub const XATTR_OVERLAY_WHITEOUT: &[u8] = b"trusted.overlay.overlay.whiteout"; +/// `user.overlay.whiteout` — userxattr escaped whiteout marker. +pub const XATTR_USERXATTR_WHITEOUT: &[u8] = b"user.overlay.whiteout"; +/// `trusted.overlay.overlay.whiteouts` — escaped whiteouts directory marker. +pub const XATTR_OVERLAY_WHITEOUTS: &[u8] = b"trusted.overlay.overlay.whiteouts"; +/// `user.overlay.whiteouts` — userxattr whiteouts directory marker. 
+pub const XATTR_USERXATTR_WHITEOUTS: &[u8] = b"user.overlay.whiteouts"; +/// `trusted.overlay.overlay.opaque` — escaped opaque directory marker. +pub const XATTR_OVERLAY_OPAQUE: &[u8] = b"trusted.overlay.overlay.opaque"; +/// `user.overlay.opaque` — userxattr opaque directory marker. +pub const XATTR_USERXATTR_OPAQUE: &[u8] = b"user.overlay.opaque"; +/// `trusted.overlay.opaque` — root opaque marker written by V1 writer. +pub const XATTR_OVERLAY_OPAQUE_ROOT: &[u8] = b"trusted.overlay.opaque"; +/// `trusted.overlay.metacopy` — metacopy marker (C adds redirect xattr too). +pub const XATTR_OVERLAY_METACOPY: &[u8] = b"trusted.overlay.metacopy"; +/// `trusted.overlay.redirect` — redirect target xattr. +pub const XATTR_OVERLAY_REDIRECT: &[u8] = b"trusted.overlay.redirect"; +/// `trusted.overlay.` prefix — all xattrs with this prefix are escaped in V1. +pub const XATTR_OVERLAY_PREFIX: &[u8] = b"trusted.overlay."; +/// `trusted.overlay.overlay.` prefix — escaped overlay xattr prefix. +pub const XATTR_OVERLAY_ESCAPED_PREFIX: &[u8] = b"trusted.overlay.overlay."; +/// `security.selinux` — SELinux label, copied to overlay whiteout stubs. +pub const XATTR_SECURITY_SELINUX: &[u8] = b"security.selinux"; + +/// Standard xattr name prefixes indexed by EROFS name_index. +/// +/// Index 0 is the fallback (empty prefix, full name stored as suffix). +/// Indices 1–6 map to the well-known EROFS prefix constants: +/// EROFS_XATTR_INDEX_USER=1, POSIX_ACL_ACCESS=2, POSIX_ACL_DEFAULT=3, +/// EROFS_XATTR_INDEX_TRUSTED=4, EROFS_XATTR_INDEX_LUSTRE=5, EROFS_XATTR_INDEX_SECURITY=6. +/// +/// **V1 compatibility note:** C mkcomposefs v1.0.8 does NOT include `lustre.` (index 5) +/// in its prefix table. Any `lustre.*` xattr is therefore encoded with prefix index 0 +/// (raw fallback) by C. For V1 images the writer must skip index 5 during prefix +/// matching so that `lustre.*` xattrs fall through to the empty-string fallback. 
pub const XATTR_PREFIXES: [&[u8]; 7] = [ b"", b"user.", @@ -519,3 +656,35 @@ pub struct DirectoryEntryHeader { /// Reserved field pub reserved: u8, } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_format_set_contains() { + assert!(FormatSet::BOTH.contains(FormatVersion::V1)); + assert!(FormatSet::BOTH.contains(FormatVersion::V2)); + assert!(FormatSet::V1_ONLY.contains(FormatVersion::V1)); + assert!(!FormatSet::V1_ONLY.contains(FormatVersion::V2)); + } + + #[test] + fn test_format_set_from_version() { + assert_eq!(FormatSet::from(FormatVersion::V1), FormatSet::V1_ONLY); + // V2 alone is a single-version set (neither V1_ONLY nor BOTH). + let v2_only = FormatSet::from(FormatVersion::V2); + assert!(!v2_only.contains(FormatVersion::V1)); + assert!(v2_only.contains(FormatVersion::V2)); + } + + #[test] + fn test_format_set_iter_order() { + // iter() must yield V1 before V2. + let versions: Vec<_> = FormatSet::BOTH.iter().collect(); + assert_eq!(versions, vec![FormatVersion::V1, FormatVersion::V2]); + + let v1_only: Vec<_> = FormatSet::V1_ONLY.iter().collect(); + assert_eq!(v1_only, vec![FormatVersion::V1]); + } +} diff --git a/crates/composefs/src/erofs/reader.rs b/crates/composefs/src/erofs/reader.rs index 06f09932..65568ec1 100644 --- a/crates/composefs/src/erofs/reader.rs +++ b/crates/composefs/src/erofs/reader.rs @@ -17,10 +17,10 @@ use zerocopy::{FromBytes, Immutable, KnownLayout, little_endian::U32}; use super::{ composefs::OverlayMetacopy, format::{ - self, BLOCK_BITS, COMPOSEFS_MAGIC, CompactInodeHeader, ComposefsHeader, DataLayout, - DirectoryEntryHeader, ExtendedInodeHeader, InodeXAttrHeader, MAGIC_V1, ModeField, S_IFBLK, - S_IFCHR, S_IFIFO, S_IFLNK, S_IFMT, S_IFREG, S_IFSOCK, Superblock, VERSION, XATTR_PREFIXES, - XAttrHeader, + self, BLOCK_BITS, COMPOSEFS_MAGIC, COMPOSEFS_VERSION, COMPOSEFS_VERSION_V1, + CompactInodeHeader, ComposefsHeader, DataLayout, DirectoryEntryHeader, ExtendedInodeHeader, + InodeXAttrHeader, MAGIC_V1, ModeField, 
S_IFBLK, S_IFCHR, S_IFIFO, S_IFLNK, S_IFMT, S_IFREG, + S_IFSOCK, Superblock, VERSION, XATTR_PREFIXES, XAttrHeader, }, }; use crate::MAX_INLINE_CONTENT; @@ -494,8 +494,18 @@ impl<'img> Image<'img> { self.header.version.get(), ))); } - // Note: we don't enforce composefs_version here because C mkcomposefs - // writes version 0 while the Rust writer uses version 2. Both are valid. + // Reject unknown composefs versions. + // 0 = V1 (C-compatible, no user whiteouts) + // 1 = V1 (C-compatible, user whiteouts present — C bumps version when it + // encounters a char-device-rdev-0 entry in the input tree) + // 2 = V2 (Rust-native format) + let cv = self.header.composefs_version.get(); + if cv != COMPOSEFS_VERSION.get() && cv != COMPOSEFS_VERSION_V1.get() && cv != 1 { + return Err(ErofsReaderError::InvalidImage(format!( + "unknown composefs_version {cv} (expected 0, 1, or {})", + COMPOSEFS_VERSION.get(), + ))); + } // Validate EROFS superblock magic if self.sb.magic != MAGIC_V1 { @@ -649,17 +659,29 @@ impl<'img> Image<'img> { } /// Returns a data block by its ID + /// Returns a byte slice of the image at `[offset, offset+len)`, validating + /// that both the offset and the range lie within the image. + /// + /// This is the single choke point for all raw byte accesses derived from + /// image fields (block addresses, xattr offsets, etc.). All callers that + /// compute `blkaddr * block_size + delta` should go through here rather + /// than slicing `self.image` directly. + pub fn image_slice(&self, offset: usize, len: usize) -> Result<&[u8], ErofsReaderError> { + let end = offset + .checked_add(len) + .ok_or(ErofsReaderError::OutOfBounds)?; + self.image + .get(offset..end) + .ok_or(ErofsReaderError::OutOfBounds) + } + + /// Returns a block by its ID as a raw byte slice, validated against the image size. 
pub fn block(&self, id: u64) -> Result<&[u8], ErofsReaderError> { let start = usize::try_from(id) .ok() .and_then(|id| id.checked_mul(self.block_size)) .ok_or(ErofsReaderError::OutOfBounds)?; - let end = start - .checked_add(self.block_size) - .ok_or(ErofsReaderError::OutOfBounds)?; - self.image - .get(start..end) - .ok_or(ErofsReaderError::OutOfBounds) + self.image_slice(start, self.block_size) } /// Returns a data block by its ID as a DataBlock reference @@ -711,6 +733,150 @@ impl<'img> Image<'img> { Ok(range) } + /// Performs a full structural fsck of the image metadata by traversing the + /// entire inode tree. + /// + /// This is separate from [`Self::restrict_to_composefs`], which only checks + /// superblock and header fields without any traversal. Call this when you + /// want a thorough integrity check (e.g. during repository fsck) rather than + /// just the cheap open-time validation. + /// + /// Currently checks: + /// - V1 images: no FlatInline symlink inode has a block-boundary layout that + /// old Linux kernels (< 6.12) would reject with `EFSCORRUPTED` (`EUCLEAN`). + pub fn fsck_metadata(&self) -> Result<(), ErofsReaderError> { + self.validate_v1_inline_layout() + } + + /// Validates that the image does not contain FlatInline inodes with a layout + /// that old Linux kernels (< 6.12) would reject with `EFSCORRUPTED` (`EUCLEAN`). + /// + /// Only V1 (C-compatible, `composefs_version` = 0 or 1) images are expected to be + /// mounted on kernels that may predate the 6.12 fix; V2 images use a different + /// block-boundary strategy that is frozen for digest stability, so this check + /// is deliberately restricted to V1. + /// + /// The kernel's pre-6.12 fast-symlink path checks: + /// ```text + /// (inode_offset % block_size) + inode_and_xattr_size + inline_size > block_size + /// ``` + /// and returns `-EFSCORRUPTED` if true. This method returns an error for any + /// inode where that condition holds. 
+ fn validate_v1_inline_layout(&self) -> Result<(), ErofsReaderError> { + // Only applies to V1 (C-compatible) images: composefs_version 0 (no user + // whiteouts) or 1 (user whiteouts present). V2 images (composefs_version=2) + // use a frozen layout strategy and are never mounted on pre-6.12 kernels. + let cv = self.header.composefs_version.get(); + if cv >= format::COMPOSEFS_VERSION.get() { + return Ok(()); + } + + let block_size = self.block_size as u64; + + // Walk all reachable inodes from the root rather than iterating raw nid slots. + // The inode table is not densely packed — gaps arise from padding — so + // iterating 0..sb.inos by slot can hit mid-inode bytes that accidentally + // parse as valid-looking headers with garbage xattr_icount values. + let mut stack = vec![self.sb.root_nid.get() as u64]; + let mut visited = std::collections::HashSet::new(); + + while let Some(nid) = stack.pop() { + if !visited.insert(nid) { + continue; + } + let inode = match self.inode(nid) { + Ok(i) => i, + Err(_) => continue, + }; + + // Recurse into directories to find all symlink inodes. + if inode.mode().is_dir() { + // Collect child nids from both inline and block directory data. + let mut child_nids: Vec = Vec::new(); + if let Some(inline) = inode.inline() + && let Ok(block) = DirectoryBlock::ref_from_bytes(inline) + && let Ok(entries) = block.entries() + { + for entry in entries.flatten() { + let name = entry.name; + if name == b"." || name == b".." { + continue; + } + child_nids.push(entry.nid()); + } + } + if let Ok(range) = self.inode_blocks(&inode) { + for blkid in range { + if let Ok(block) = self.directory_block(blkid) + && let Ok(entries) = block.entries() + { + for entry in entries.flatten() { + let name = entry.name; + if name == b"." || name == b".." { + continue; + } + child_nids.push(entry.nid()); + } + } + } + } + stack.extend(child_nids); + continue; + } + + // Only the pre-6.12 symlink fast-path checks the block boundary. 
+ let mode = inode.mode().0.get(); + if mode & S_IFMT != S_IFLNK { + continue; + } + + let layout = match inode.data_layout() { + Ok(l) => l, + Err(_) => continue, + }; + if !matches!(layout, DataLayout::FlatInline) { + continue; // symlink stored out-of-band (long target > block_size) + } + + let inline_size = inode.size() % block_size; + if inline_size == 0 { + continue; + } + + // nid * 32 is the byte offset from meta_start (which is 0 for composefs). + let inode_offset = nid + .checked_mul(32) + .ok_or_else(|| ErofsReaderError::InvalidImage("nid overflow".into()))?; + let inode_pos_in_block = inode_offset % block_size; + + let header_size: u64 = match &inode { + InodeType::Compact(_) => size_of::() as u64, + InodeType::Extended(_) => size_of::() as u64, + }; + let xattr_size = inode.xattr_size() as u64; + let inode_and_xattr_size = header_size.checked_add(xattr_size).ok_or_else(|| { + ErofsReaderError::InvalidImage("inode+xattr size overflow".into()) + })?; + + let total = inode_pos_in_block + .checked_add(inode_and_xattr_size) + .and_then(|t| t.checked_add(inline_size)) + .ok_or_else(|| { + ErofsReaderError::InvalidImage("inline layout size overflow".into()) + })?; + if total > block_size { + return Err(ErofsReaderError::InvalidImage(format!( + "inode at nid {nid} (FlatInline symlink, inode_pos_in_block={inode_pos_in_block}, \ + inode_and_xattr_size={inode_and_xattr_size}, inline_size={inline_size}) \ + would trigger EUCLEAN on kernels older than 6.12: \ + {inode_pos_in_block} + {inode_and_xattr_size} + {inline_size} = {total} > {block_size}" + ))); + } + } + + Ok(()) + } + /// Finds a child directory entry by name within a directory inode. /// /// Returns the nid (inode number) of the child if found. @@ -743,6 +909,41 @@ impl<'img> Image<'img> { } } +/// Check if an inode is a V1 escaped whiteout (a regular file carrying the +/// `trusted.overlay.overlay.whiteout` xattr added by the V1 writer). 
+/// +/// C composefs v1.0.8 converts char-device-rdev-0 entries to regular files +/// on write (whiteout escaping). The reader must reverse this. +fn is_escaped_v1_whiteout(img: &Image, inode: &InodeType) -> anyhow::Result { + // Only relevant for regular files + let mode = inode.mode().0.get(); + if mode & S_IFMT != S_IFREG { + return Ok(false); + } + + let Some(xattrs_section) = inode.xattrs()? else { + return Ok(false); + }; + + // Check shared xattrs + for id in xattrs_section.shared()? { + let xattr = img.shared_xattr(id.get())?; + let full_name = construct_xattr_name(xattr)?; + if full_name == format::XATTR_OVERLAY_WHITEOUT { + return Ok(true); + } + } + // Check local xattrs + for xattr in xattrs_section.local()? { + let xattr = xattr?; + let full_name = construct_xattr_name(xattr)?; + if full_name == format::XATTR_OVERLAY_WHITEOUT { + return Ok(true); + } + } + Ok(false) +} + // TODO: there must be an easier way... #[derive(FromBytes, Immutable, KnownLayout)] #[repr(C)] @@ -1041,6 +1242,7 @@ impl ObjectCollector { /// Returns a set of all referenced object IDs. 
pub fn collect_objects(image: &[u8]) -> ReadResult> { let img = Image::open(image)?.restrict_to_composefs()?; + img.fsck_metadata()?; let mut this = ObjectCollector { visited_nids: HashSet::new(), nids_to_visit: BTreeSet::new(), @@ -1078,21 +1280,23 @@ fn construct_xattr_name(xattr: &XAttr) -> Result, ErofsReaderError> { /// - Strips `trusted.overlay.metacopy` and `trusted.overlay.redirect` /// - Unescapes `trusted.overlay.overlay.X` back to `trusted.overlay.X` fn stat_from_inode_for_tree(img: &Image, inode: &InodeType) -> anyhow::Result { - let (st_mode, st_uid, st_gid, st_mtim_sec) = match inode { + let (st_mode, st_uid, st_gid, st_mtim_sec, st_mtim_nsec) = match inode { InodeType::Compact(inode) => ( inode.header.mode.0.get() as u32 & 0o7777, inode.header.uid.get() as u32, inode.header.gid.get() as u32, - // Compact inodes don't store mtime; the writer uses build_time - // but for round-trip purposes, 0 matches what was written for - // compact headers (the writer always uses ExtendedInodeHeader) - 0i64, + // Compact inodes don't store mtime; use superblock build_time + // (the writer sets build_time = min mtime across all inodes) + img.sb.build_time.get() as i64, + // and build_time_nsec for the nanosecond component + img.sb.build_time_nsec.get(), ), InodeType::Extended(inode) => ( inode.header.mode.0.get() as u32 & 0o7777, inode.header.uid.get(), inode.header.gid.get(), inode.header.mtime.get() as i64, + inode.header.mtime_nsec.get(), ), }; @@ -1120,6 +1324,7 @@ fn stat_from_inode_for_tree(img: &Image, inode: &InodeType) -> anyhow::Result anyhow::Result anyhow::Result, Box<[u8]>)>> { let full_name = construct_xattr_name(xattr)?; - // Skip internal overlay xattrs added by the writer - if full_name == b"trusted.overlay.metacopy" || full_name == b"trusted.overlay.redirect" { + // Skip internal overlay xattrs added by the writer (metacopy/redirect + // are composefs-internal and should not be exposed to readers). 
+ if full_name == format::XATTR_OVERLAY_METACOPY || full_name == format::XATTR_OVERLAY_REDIRECT { + return Ok(None); + } + + // V1 whiteout escaping artifacts: strip these internal xattrs. + // XATTR_OVERLAY_WHITEOUT signals the inode is a whiteout (handled separately). + // The *_WHITEOUTS, *_OPAQUE, and user-namespace variants are parent-dir markers + // added by the V1 writer that are composefs-internal. + // Note: XATTR_OVERLAY_OPAQUE must be listed explicitly here because the general + // unescape handler below would otherwise expose it as trusted.overlay.opaque. + if full_name == format::XATTR_OVERLAY_WHITEOUT + || full_name == format::XATTR_OVERLAY_WHITEOUTS + || full_name == format::XATTR_OVERLAY_OPAQUE + || full_name == format::XATTR_USERXATTR_WHITEOUT + || full_name == format::XATTR_USERXATTR_WHITEOUTS + || full_name == format::XATTR_USERXATTR_OPAQUE + { return Ok(None); } // Unescape: trusted.overlay.overlay.X -> trusted.overlay.X - if let Some(rest) = full_name.strip_prefix(b"trusted.overlay.overlay.") { - let mut unescaped = b"trusted.overlay.".to_vec(); + if let Some(rest) = full_name.strip_prefix(format::XATTR_OVERLAY_ESCAPED_PREFIX) { + let mut unescaped = format::XATTR_OVERLAY_PREFIX.to_vec(); unescaped.extend_from_slice(rest); let name = Box::from(OsStr::from_bytes(&unescaped)); let value = Box::from(xattr.value()?); return Ok(Some((name, value))); } // Skip all other trusted.overlay.* xattrs (internal to composefs) - if full_name.starts_with(b"trusted.overlay.") { + if full_name.starts_with(format::XATTR_OVERLAY_PREFIX) { return Ok(None); } @@ -1393,6 +1615,25 @@ fn populate_directory( let name = OsStr::from_bytes(name_bytes); let child_inode = img.inode(nid)?; + // Skip overlay whiteout entries — but only in the root directory. + // C composefs only skips hex-named (00–ff) chardev(0,0) entries in root + // (lcfs-writer-erofs.c: "Skip real whiteouts (00-ff)"). + // A chardev(0,0) in a subdirectory is a legitimate device node. 
+ // + // In V1 images the writer escapes whiteouts to regular files with + // trusted.overlay.overlay.whiteout xattr, so we must check both + // the native chardev form and the escaped regular-file form. + let is_root_dir = dir_nid == img.sb.root_nid.get() as u64; + let is_escaped_whiteout = is_escaped_v1_whiteout(img, &child_inode)?; + let is_native_whiteout = child_inode.is_whiteout(); + if is_root_dir + && (is_native_whiteout || is_escaped_whiteout) + && name_bytes.len() == 2 + && name_bytes.iter().all(|b| b.is_ascii_hexdigit()) + { + continue; + } + if child_inode.mode().is_dir() { n_subdirs = n_subdirs .checked_add(1) @@ -1427,7 +1668,14 @@ fn populate_directory( let content = match file_type { S_IFREG => { - if let Some(digest) = extract_metacopy_digest::(img, &child_inode)? { + // V1 images escape whiteouts (char dev rdev=0) to regular files. + // The is_escaped_whiteout flag was computed above (before the + // root-dir skip check), so reuse it here. + if is_escaped_whiteout { + tree::LeafContent::CharacterDevice(0) + } else if let Some(digest) = + extract_metacopy_digest::(img, &child_inode)? + { tree::LeafContent::Regular(tree::RegularFile::External( digest, child_inode.size(), @@ -1468,10 +1716,19 @@ fn populate_directory( _ => anyhow::bail!("unknown file type {:#o} for {:?}", file_type, name), }; + // Hardlinked whiteouts are semantically invalid: a whiteout represents the + // absence of a file in an overlay, so nlink > 1 is meaningless. 
+ let on_disk_nlink = child_inode.nlink(); + if matches!(content, tree::LeafContent::CharacterDevice(0)) && on_disk_nlink > 1 { + anyhow::bail!( + "invalid composefs image: whiteout inode {:?} has nlink > 1", + name + ); + } + let leaf_id = builder.push_leaf(stat, content); // Track for hardlink detection if nlink > 1 - let on_disk_nlink = child_inode.nlink(); if on_disk_nlink > 1 { builder.hardlinks.insert(nid, leaf_id); } @@ -1572,7 +1829,7 @@ mod tests { use super::*; use crate::{ dumpfile::{dumpfile_to_filesystem, write_dumpfile}, - erofs::writer::mkfs_erofs, + erofs::writer::{ValidatedFileSystem, mkfs_erofs}, fsverity::Sha256HashValue, }; use std::collections::HashMap; @@ -1653,7 +1910,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); let img = Image::open(&image).unwrap(); // Root should have . and .. and empty_dir @@ -1698,7 +1955,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); let img = Image::open(&image).unwrap(); // Find dir1 @@ -1743,7 +2000,7 @@ mod tests { } let fs = dumpfile_to_filesystem::(&dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); let img = Image::open(&image).unwrap(); // Find bigdir @@ -1793,7 +2050,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); let img = Image::open(&image).unwrap(); // Navigate through the structure @@ -1831,7 +2088,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); let img = Image::open(&image).unwrap(); let root_inode = img.root().unwrap(); @@ -1877,7 +2134,7 @@ mod 
tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); // This should traverse all directories without error let result = collect_objects::(&image); @@ -1953,7 +2210,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); let img = Image::open(&image).unwrap(); // Verify root entries @@ -2000,7 +2257,7 @@ mod tests { write_dumpfile(&mut orig_output, &fs_orig).unwrap(); let orig_str = String::from_utf8(orig_output).unwrap(); - let image = mkfs_erofs(&fs_orig); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs_orig).unwrap()); let fs_rt = erofs_to_filesystem::(&image).unwrap(); let mut rt_output = Vec::new(); @@ -2105,7 +2362,8 @@ mod tests { "#; let fs_orig = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs_orig); + let vfs_orig = ValidatedFileSystem::new(fs_orig).unwrap(); + let image = mkfs_erofs(&vfs_orig); let fs_rt = erofs_to_filesystem::(&image).unwrap(); // Verify hardlink sharing via LeafId @@ -2120,7 +2378,7 @@ mod tests { // Verify dumpfile round-trips correctly let mut orig_output = Vec::new(); - write_dumpfile(&mut orig_output, &fs_orig).unwrap(); + write_dumpfile(&mut orig_output, &vfs_orig.0).unwrap(); let orig_str = String::from_utf8(orig_output).unwrap(); let mut rt_output = Vec::new(); @@ -2149,7 +2407,7 @@ mod tests { // Build a minimal valid composefs image (just a root directory). let dumpfile = "/ 0 40755 2 0 0 0 1000.0 - - -\n"; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let base_image = mkfs_erofs(&fs); + let base_image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); // Sanity: the unmodified image passes restrict_to_composefs(). 
Image::open(&base_image) @@ -2278,7 +2536,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let base_image = mkfs_erofs(&fs); + let base_image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); // Sanity: unmodified image round-trips fine erofs_to_filesystem::(&base_image) @@ -2335,9 +2593,14 @@ mod tests { for case in &cases { let mut image = base_image.clone(); - let offset = inline_offset + case.entry_byte_offset; + let entry_start = inline_offset + case.entry_byte_offset; // Write a bogus nid (0xDEAD) that doesn't match the directory's own nid - image[offset..offset + 8].copy_from_slice(&0xDEADu64.to_le_bytes()); + // Use zerocopy to get a typed &mut DirectoryEntryHeader instead of raw bytes. + let hdr = DirectoryEntryHeader::mut_from_bytes( + &mut image[entry_start..entry_start + size_of::()], + ) + .expect("entry slice must be a valid DirectoryEntryHeader"); + hdr.inode_offset = zerocopy::little_endian::U64::new(0xDEAD); let result = erofs_to_filesystem::(&image); let err = result.expect_err(&format!("{}: should have been rejected", case.name)); @@ -2369,7 +2632,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let base_image = mkfs_erofs(&fs); + let base_image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); // Sanity check erofs_to_filesystem::(&base_image) @@ -2380,22 +2643,26 @@ mod tests { let root_nid = img.sb.root_nid.get() as u64; let file_nid = img.find_child_nid(root_nid, b"file").unwrap().unwrap(); - // Compute byte offset of the file's inode in the image - let block_size = img.block_size; - let meta_start = img.sb.meta_blkaddr.get() as usize * block_size; - let inode_byte_offset = meta_start + file_nid as usize * 32; - let is_extended = base_image[inode_byte_offset] & 1 != 0; + // Use the typed Image API to locate the inode slot without raw byte arithmetic. 
+ let inode = img.inode(file_nid).unwrap(); + let is_extended = matches!(inode, InodeType::Extended(_)); + let inodes_start = img.image.len() - img.inodes.len(); + let inode_slot_start = inodes_start + file_nid as usize * 32; + drop(inode); drop(img); let mut image = base_image.clone(); + let slot = &mut image[inode_slot_start..]; if is_extended { - // ExtendedInodeHeader.nlink is U32 at byte offset 44 - let nlink_offset = inode_byte_offset + 44; - image[nlink_offset..nlink_offset + 4].copy_from_slice(&5u32.to_le_bytes()); + let hdr = + ExtendedInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid ExtendedInodeHeader"); + hdr.nlink = zerocopy::little_endian::U32::new(5); } else { - // CompactInodeHeader.nlink is U16 at byte offset 6 - let nlink_offset = inode_byte_offset + 6; - image[nlink_offset..nlink_offset + 2].copy_from_slice(&5u16.to_le_bytes()); + let hdr = + CompactInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid CompactInodeHeader"); + hdr.nlink = zerocopy::little_endian::U16::new(5); } let result = erofs_to_filesystem::(&image); @@ -2421,7 +2688,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let base_image = mkfs_erofs(&fs); + let base_image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); // Sanity check erofs_to_filesystem::(&base_image) @@ -2432,21 +2699,26 @@ mod tests { let root_nid = img.sb.root_nid.get() as u64; let dir_nid = img.find_child_nid(root_nid, b"dir").unwrap().unwrap(); - let block_size = img.block_size; - let meta_start = img.sb.meta_blkaddr.get() as usize * block_size; - let inode_byte_offset = meta_start + dir_nid as usize * 32; - let is_extended = base_image[inode_byte_offset] & 1 != 0; + // Use the typed Image API to locate the inode slot without raw byte arithmetic. 
+ let inode = img.inode(dir_nid).unwrap(); + let is_extended = matches!(inode, InodeType::Extended(_)); + let inodes_start = img.image.len() - img.inodes.len(); + let inode_slot_start = inodes_start + dir_nid as usize * 32; + drop(inode); drop(img); let mut image = base_image.clone(); + let slot = &mut image[inode_slot_start..]; if is_extended { - // ExtendedInodeHeader.nlink is U32 at byte offset 44 - let nlink_offset = inode_byte_offset + 44; - image[nlink_offset..nlink_offset + 4].copy_from_slice(&99u32.to_le_bytes()); + let hdr = + ExtendedInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid ExtendedInodeHeader"); + hdr.nlink = zerocopy::little_endian::U32::new(99); } else { - // CompactInodeHeader.nlink is U16 at byte offset 6 - let nlink_offset = inode_byte_offset + 6; - image[nlink_offset..nlink_offset + 2].copy_from_slice(&99u16.to_le_bytes()); + let hdr = + CompactInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid CompactInodeHeader"); + hdr.nlink = zerocopy::little_endian::U16::new(99); } let result = erofs_to_filesystem::(&image); @@ -2471,30 +2743,35 @@ mod tests { // stays the same and the inode still parses successfully. let dumpfile = "/ 0 40755 1 0 0 0 0.0 - - -\n"; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let mut image = mkfs_erofs(&fs); + let mut image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); let img = Image::open(&image).unwrap(); - let root_nid = img.sb.root_nid.get() as usize; + let root_nid = img.sb.root_nid.get() as u64; let block_size = img.block_size; - let meta_start = img.sb.meta_blkaddr.get() as usize * block_size; - let inode_offset = meta_start + root_nid * 32; - // Determine inode layout from the first byte - let is_extended = image[inode_offset] & 1 != 0; + + // Use the typed Image API to locate the inode slot without raw byte arithmetic. 
+ let inode = img.inode(root_nid).unwrap(); + let is_extended = matches!(inode, InodeType::Extended(_)); + let inodes_start = img.image.len() - img.inodes.len(); + let inode_slot_start = inodes_start + root_nid as usize * 32; + drop(inode); drop(img); // Use a huge size that is a multiple of block_size (4096) so inline // tail size stays 0 and the inode remains parseable. let huge_size: u64 = (block_size as u64) * 1_000_000_000; + let slot = &mut image[inode_slot_start..]; if is_extended { - // ExtendedInodeHeader.size is a U64 at byte offset 8 - let size_offset = inode_offset + 8; - image[size_offset..size_offset + 8].copy_from_slice(&huge_size.to_le_bytes()); + let hdr = + ExtendedInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid ExtendedInodeHeader"); + hdr.size = zerocopy::little_endian::U64::new(huge_size); } else { - // CompactInodeHeader.size is a U32 at byte offset 8 - let size_offset = inode_offset + 8; - let truncated = huge_size as u32; - image[size_offset..size_offset + 4].copy_from_slice(&truncated.to_le_bytes()); + let hdr = + CompactInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid CompactInodeHeader"); + hdr.size = zerocopy::little_endian::U32::new(huge_size as u32); } let img = Image::open(&image).unwrap(); @@ -2510,43 +2787,558 @@ mod tests { mod proptest_tests { use super::*; + use crate::erofs::{format::FormatVersion, writer::mkfs_erofs_versioned}; use crate::fsverity::Sha512HashValue; - use crate::test::proptest_strategies::{build_filesystem, filesystem_spec}; + use crate::test::proptest_strategies::{ + FsSpec, build_filesystem, build_unusual_filesystem, filesystem_spec, + unusual_filesystem_spec, + }; use proptest::prelude::*; - /// Round-trip a FileSystem through erofs and compare dumpfile output. 
- fn round_trip_filesystem( - fs_orig: &tree::FileSystem, - ) { - let mut orig_output = Vec::new(); - write_dumpfile(&mut orig_output, fs_orig).unwrap(); - - let image = mkfs_erofs(fs_orig); + /// Round-trip a FileSystem through V2 erofs and compare dumpfile output. + /// + /// V2 EROFS does not store mtime nanoseconds: the on-disk `mtime_nsec` + /// field is always zero. Build the expected dumpfile from a copy of the + /// filesystem with `mtime_nsec` zeroed so the comparison reflects what + /// V2 actually stores, not what the in-memory tree carries. + fn round_trip_filesystem(spec: FsSpec) { + // fs_write → source for the EROFS image. + // fs_expected → reference with mtime_nsec=0, matching V2 on-disk format. + let fs_write = build_filesystem::(spec.clone()); + let mut fs_expected = build_filesystem::(spec); + // V2 EROFS does not store mtime nanoseconds; zero them before comparing. + fs_expected.for_each_stat_mut(|s| s.st_mtim_nsec = 0); + + let mut expected_output = Vec::new(); + write_dumpfile(&mut expected_output, &fs_expected).unwrap(); + + let image = mkfs_erofs(&ValidatedFileSystem::new(fs_write).unwrap()); let fs_rt = erofs_to_filesystem::(&image).unwrap(); let mut rt_output = Vec::new(); write_dumpfile(&mut rt_output, &fs_rt).unwrap(); similar_asserts::assert_eq!( - String::from_utf8_lossy(&orig_output), + String::from_utf8_lossy(&expected_output), String::from_utf8_lossy(&rt_output) ); } + /// Round-trip a FileSystem through V1 erofs and compare dumpfile output. + /// + /// V1 uses compact inodes (when mtime matches the minimum), BFS ordering, + /// and includes overlay whiteout character device entries in the root. + /// The writer adds `trusted.overlay.opaque` to the root; the reader strips + /// internal overlay xattrs. Whiteout char-device entries (00–ff in root) + /// are also stripped, matching C composefs reader behaviour. 
+ fn round_trip_filesystem_v1(spec: FsSpec) { + // Build two separate filesystems from the same spec so we avoid + // Rc::strong_count issues from sharing leaf Rcs. + let mut fs_write = build_filesystem::(spec.clone()); + let fs_expected = build_filesystem::(spec); + + // Only the write side needs whiteouts — the reader strips them + // just like C composefs does. + fs_write.add_overlay_whiteouts(); + + // The writer internally adds trusted.overlay.opaque=y to root, + // but the reader strips all trusted.overlay.* xattrs that aren't + // escaped user xattrs. So the expected filesystem should NOT have it. + + // Generate the V1 image from the write filesystem. + let image = mkfs_erofs_versioned( + &ValidatedFileSystem::new(fs_write).unwrap(), + FormatVersion::V1, + ); + + // Validate the layout invariant: no FlatInline inode should + // trigger EUCLEAN on kernels < 6.12. This catches the + // block-boundary bug even when proptest doesn't generate a + // case large enough to trip it at mount time. + Image::open(&image) + .unwrap() + .fsck_metadata() + .expect("V1 image should have valid inline layout for pre-6.12 kernels"); + + // Read back from the image. + let fs_rt = erofs_to_filesystem::(&image).unwrap(); + + // Compare via dumpfile serialization. + let mut expected_output = Vec::new(); + write_dumpfile(&mut expected_output, &fs_expected).unwrap(); + + let mut rt_output = Vec::new(); + write_dumpfile(&mut rt_output, &fs_rt).unwrap(); + + if expected_output != rt_output { + let expected_str = String::from_utf8_lossy(&expected_output); + let rt_str = String::from_utf8_lossy(&rt_output); + panic!( + "V1 round-trip mismatch:\n--- expected ---\n{expected_str}\n--- got ---\n{rt_str}" + ); + } + } + + /// Verify that C composefs-info can parse an EROFS image we generated, + /// and that its dump output matches our Rust reader's interpretation. 
+ /// + /// This is the critical compatibility test: it proves that EROFS images + /// produced by our writer are consumable by the C implementation. + fn verify_c_composefs_info_reads_image(image: &[u8]) { + use std::io::Write; + + // Validate layout invariant before testing C reader compatibility. + Image::open(image) + .unwrap() + .fsck_metadata() + .expect("image should have valid inline layout for pre-6.12 kernels"); + + // Write image to a tempfile + let mut tmp = tempfile::NamedTempFile::new().unwrap(); + tmp.write_all(image).unwrap(); + tmp.flush().unwrap(); + + // Run C composefs-info dump on the image with a timeout. + let child = std::process::Command::new("composefs-info") + .arg("dump") + .arg(tmp.path()) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .unwrap(); + + let output = { + let (tx, rx) = std::sync::mpsc::channel(); + std::thread::spawn(move || { + let _ = tx.send(child.wait_with_output()); + }); + rx.recv_timeout(std::time::Duration::from_secs(10)) + .expect("composefs-info timed out after 10 seconds") + .unwrap() + }; + + if !output.status.success() { + panic!( + "C composefs-info dump failed (exit {:?}):\nstderr: {}", + output.status.code(), + String::from_utf8_lossy(&output.stderr), + ); + } + + let c_dump = String::from_utf8(output.stdout).expect("C dump should be valid UTF-8"); + + // Get our Rust reader's interpretation of the same image + let fs_rt = erofs_to_filesystem::(image).unwrap(); + let mut rust_dump_bytes = Vec::new(); + write_dumpfile(&mut rust_dump_bytes, &fs_rt).unwrap(); + let rust_dump = String::from_utf8(rust_dump_bytes).unwrap(); + + // Parse both dumps into structured entries, then normalize and + // compare. This avoids fragile string munging and lets the + // dumpfile parser handle escaping, field splitting, etc. 
+ // + // Apply the C reader empty-xattr workaround to the Rust dump as + // well: we are testing C-reader compatibility here, so we strip + // the same entries C would silently drop. Rust-only round-trip + // tests (test_erofs_round_trip_*) compare dumpfiles directly + // without this workaround, catching Rust writer bugs without masking them. + let c_entries = parse_c_dump(&c_dump); + let rust_entries = parse_c_dump(&rust_dump); + + similar_asserts::assert_eq!(c_entries, rust_entries); + } + + /// Parse a dump produced by C composefs-info and normalize for comparison. + /// + /// Applies the empty-xattr workaround for the known C reader bug: the + /// inline-xattr loop uses strict `<` instead of `<=` when checking the + /// end pointer, so it silently skips the last entry whenever it is exactly + /// 4 bytes (header only: name_len=0, value_size=0). This occurs for + /// system.posix_acl_access/default with empty values, where the prefix + /// index encodes the full key leaving a zero-length suffix. + fn parse_c_dump(dump: &str) -> Vec { + normalize_dump(dump, true) + } + + /// Parse a dump produced by our Rust reader and normalize for comparison. + /// + /// Does NOT apply the C reader empty-xattr workaround — Rust output must + /// be left unfiltered so any Rust writer bugs producing empty xattrs are + /// caught rather than silently masked. + /// + /// For C compat tests, use [`parse_c_dump`] on both sides so the + /// comparison accounts for the known C reader limitation. 
+ + fn normalize_dump(dump: &str, strip_empty_xattrs: bool) -> Vec { + use crate::dumpfile_parse::{Entry, Item}; + use std::os::unix::ffi::OsStrExt; + + dump.lines() + .filter(|line| !line.is_empty()) + .filter_map(|line| { + let mut entry = Entry::parse(line).unwrap_or_else(|e| { + panic!("Failed to parse dump line: {e}\n line: {line}") + }); + + // C composefs-info (lcfs_build_node_from_image) unconditionally + // treats any chardev with rdev=0 as a whiteout and skips it, + // returning ENOTSUP regardless of where in the tree it appears: + // + // if (type == S_IFCHR && node->inode.st_rdev == 0) { + // errno = ENOTSUP; + // return NULL; + // } + // + // Our Rust reader preserves chardev(0,0) entries in subdirectories + // (it only strips the root-level 00–ff overlay whiteout stubs). + // Strip all chardev(0,0) entries from both sides of the comparison + // so the test reflects what C actually outputs. + if let Item::Device { rdev: 0, .. } = entry.item { + if (entry.mode & 0o170000) == 0o20000 { + return None; + } + } + + if strip_empty_xattrs { + entry.xattrs.retain(|x| !x.value.is_empty()); + } + // Strip overlay xattrs that the C reader keeps but our Rust reader + // strips as composefs-internal: + // - user.overlay.opaque: OVERLAY_XATTR_USERXATTR_OPAQUE, kept by C + // - trusted.overlay.opaque: the C reader unescapes + // trusted.overlay.overlay.opaque to this; Rust strips the + // escaped form before unescaping so it never appears in Rust + // output. Normalizing both sides makes the comparison test + // semantic content rather than internal overlay state. + entry.xattrs.retain(|x| { + x.key.as_bytes() != b"user.overlay.opaque" + && x.key.as_bytes() != b"trusted.overlay.opaque" + }); + Some(entry.to_string()) + }) + .collect() + } + proptest! 
{ - #![proptest_config(ProptestConfig::with_cases(64))] + #![proptest_config(ProptestConfig::with_cases(200))] #[test] fn test_erofs_round_trip_sha256(spec in filesystem_spec()) { - let fs = build_filesystem::(spec); - round_trip_filesystem(&fs); + round_trip_filesystem::(spec); } #[test] fn test_erofs_round_trip_sha512(spec in filesystem_spec()) { - let fs = build_filesystem::(spec); - round_trip_filesystem(&fs); + round_trip_filesystem::(spec); + } + + #[test] + fn test_erofs_round_trip_v1_sha256(spec in filesystem_spec()) { + round_trip_filesystem_v1::(spec); + } + + #[test] + fn test_erofs_round_trip_v1_sha512(spec in filesystem_spec()) { + round_trip_filesystem_v1::(spec); + } + + } + + /// Verify C composefs-info can parse random V1 (C-compatible) EROFS + /// images generated by our writer, and that its dump output matches + /// our Rust reader's interpretation. + #[test_with::executable(composefs-info)] + #[test] + fn test_c_composefs_info_reads_v1() { + let mut runner = + proptest::test_runner::TestRunner::new(ProptestConfig::with_cases(200)); + runner + .run(&filesystem_spec(), |spec| { + let mut fs = build_filesystem::(spec); + fs.add_overlay_whiteouts(); + let image = mkfs_erofs_versioned( + &ValidatedFileSystem::new(fs).unwrap(), + FormatVersion::V1, + ); + verify_c_composefs_info_reads_image(&image); + Ok(()) + }) + .unwrap(); + } + + /// Verify C composefs-info can parse random V2 (Rust-native) EROFS + /// images generated by our writer. 
+ #[test_with::executable(composefs-info)] + #[test] + fn test_c_composefs_info_reads_v2() { + let mut runner = + proptest::test_runner::TestRunner::new(ProptestConfig::with_cases(200)); + runner + .run(&filesystem_spec(), |spec| { + let fs = build_filesystem::(spec); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); + verify_c_composefs_info_reads_image(&image); + Ok(()) + }) + .unwrap(); + } + + /// Verify C composefs-info can parse random V2 EROFS images generated from + /// unusual content (whiteout escaping, ACLs, multiple overlay xattrs, large + /// external files, cross-type hardlinks), and that its dump output matches + /// our Rust reader's interpretation. + /// + /// Mirrors `test_v1_binary_identical_unusual_content` but for V2 images + /// where byte-for-byte C identity is not the goal (V2 is Rust-native); + /// instead we verify semantic equivalence via normalized dump comparison. + #[test_with::executable(composefs-info)] + #[test] + fn test_c_composefs_info_reads_v2_unusual() { + let mut runner = + proptest::test_runner::TestRunner::new(ProptestConfig::with_cases(200)); + runner + .run(&unusual_filesystem_spec(), |spec| { + let fs = build_unusual_filesystem::(spec); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); + verify_c_composefs_info_reads_image(&image); + Ok(()) + }) + .unwrap(); + } + + /// Run `debug_img` on an image and return the structured dump as a String. + fn debug_dump(image: &[u8]) -> String { + use crate::erofs::debug::debug_img; + let mut out = Vec::new(); + debug_img(&mut out, image).expect("debug_img failed"); + String::from_utf8(out).expect("debug_img produced non-UTF8") + } + + /// Diff two debug dumps, returning a unified-diff-style string of the differences. 
+ fn diff_debug_dumps(label_a: &str, a: &str, label_b: &str, b: &str) -> String { + use std::fmt::Write; + let a_lines: Vec<&str> = a.lines().collect(); + let b_lines: Vec<&str> = b.lines().collect(); + let mut out = String::new(); + let max = a_lines.len().max(b_lines.len()); + let mut diffs = 0usize; + for i in 0..max { + let la = a_lines.get(i).copied().unwrap_or(""); + let lb = b_lines.get(i).copied().unwrap_or(""); + if la != lb { + diffs += 1; + if diffs <= 40 { + writeln!(out, "line {i}:").unwrap(); + writeln!(out, " {label_a}: {la}").unwrap(); + writeln!(out, " {label_b}: {lb}").unwrap(); + } + } + } + if diffs > 40 { + writeln!(out, "... and {} more differing lines", diffs - 40).unwrap(); } + if diffs == 0 { + out.push_str("(no differences)"); + } + out + } + + /// Run C `mkcomposefs --from-file -` on a dumpfile string and return the raw image bytes. + fn c_mkcomposefs_from_dumpfile(dumpfile: &str) -> Vec { + use std::io::{Read, Seek, SeekFrom, Write}; + // Write dumpfile to a tempfile + let mut tf = tempfile::tempfile().unwrap(); + tf.write_all(dumpfile.as_bytes()).unwrap(); + tf.seek(SeekFrom::Start(0)).unwrap(); + // Run mkcomposefs --from-file - - + let out_tf = tempfile::tempfile().unwrap(); + let mut child = std::process::Command::new("mkcomposefs") + .args(["--from-file", "-", "-"]) + .stdin(std::process::Stdio::from(tf)) + .stdout(std::process::Stdio::from(out_tf.try_clone().unwrap())) + .stderr(std::process::Stdio::inherit()) + .spawn() + .expect("failed to spawn mkcomposefs"); + let status = child.wait().unwrap(); + assert!(status.success(), "mkcomposefs failed: {status}"); + let mut out_tf = out_tf; + out_tf.seek(SeekFrom::Start(0)).unwrap(); + let mut bytes = Vec::new(); + out_tf.read_to_end(&mut bytes).unwrap(); + bytes + } + + /// Verify that our Rust V1 writer produces byte-for-byte identical EROFS images + /// to C mkcomposefs for the same user-level input. 
+ /// + /// This is a stronger check than `test_c_composefs_info_reads_v1`: instead of + /// comparing parsed dump output (which won't catch wrong binary layout like the + /// EUCLEAN block-boundary bug), we compare raw image bytes. If our V1 writer + /// disagrees with the C reference even on a single padding byte, this fails. + /// + /// The test mirrors the production flow: C receives a dumpfile of the user-level + /// tree (no whiteout stubs) and adds the 256 stubs internally, while the Rust + /// writer operates on the in-memory tree after `add_overlay_whiteouts()`. + /// + /// On failure the structural diff from `debug_img` is printed to make the + /// divergence immediately obvious without a separate manual step. + #[test_with::executable(mkcomposefs)] + #[test] + fn test_v1_binary_identical_to_c_mkcomposefs() { + let mut runner = + proptest::test_runner::TestRunner::new(ProptestConfig::with_cases(200)); + runner + .run(&filesystem_spec(), |spec| { + // Build two independent filesystems from the same spec: + // fs_c — user entries only, serialized as a dumpfile and fed to + // C mkcomposefs (which adds the 256 whiteout stubs internally) + // fs_rs — user entries + stubs added by add_overlay_whiteouts(), fed + // directly to our Rust V1 writer + // + // This mirrors the production flow: C receives a dumpfile without + // the stubs and adds them itself, while Rust adds them in-process. + // Using the same spec for both ensures the user-level content matches. 
+ let fs_c = build_filesystem::(spec.clone()); + let mut fs_rs = build_filesystem::(spec); + fs_rs.add_overlay_whiteouts(); + + // Serialize the pre-whiteout tree for C (no stubs in dumpfile) + let mut dumpfile_bytes = Vec::new(); + write_dumpfile(&mut dumpfile_bytes, &fs_c).unwrap(); + let dumpfile = String::from_utf8(dumpfile_bytes).unwrap(); + + // Get C mkcomposefs binary output (C adds stubs internally) + let c_image = c_mkcomposefs_from_dumpfile(&dumpfile); + + // Get our Rust V1 writer binary output (stubs already in fs_rs) + let rust_image = mkfs_erofs_versioned( + &ValidatedFileSystem::new(fs_rs).unwrap(), + FormatVersion::V1, + ); + + if c_image != rust_image.as_ref() { + let c_debug = debug_dump(&c_image); + let rust_debug = debug_dump(&rust_image); + similar_asserts::assert_eq!( + c_debug, + rust_debug, + "binary mismatch (c={} bytes, rust={} bytes)\ndumpfile:\n{dumpfile}", + c_image.len(), + rust_image.len(), + ); + } + Ok(()) + }) + .unwrap(); + } + + /// Binary-compatibility test using the unusual-content generator. + /// + /// Covers corner cases in the V1 writer that the ordinary random generator almost + /// never exercises: whiteout escaping, multiple trusted.overlay.* xattrs per inode, + /// system.posix_acl_access (HAS_ACL flag), large external file sizes, and + /// cross-type hardlinks (to symlinks, whiteouts, devices, FIFOs). + /// + /// Runs 200 cases against C mkcomposefs byte-for-byte.
+ #[test_with::executable(mkcomposefs)] + #[test] + fn test_v1_binary_identical_unusual_content() { + let mut runner = + proptest::test_runner::TestRunner::new(ProptestConfig::with_cases(200)); + runner + .run(&unusual_filesystem_spec(), |spec| { + let fs_c = build_unusual_filesystem::(spec.clone()); + let mut fs_rs = build_unusual_filesystem::(spec); + fs_rs.add_overlay_whiteouts(); + + let mut dumpfile_bytes = Vec::new(); + write_dumpfile(&mut dumpfile_bytes, &fs_c).unwrap(); + let dumpfile = String::from_utf8(dumpfile_bytes).unwrap(); + + let c_image = c_mkcomposefs_from_dumpfile(&dumpfile); + let rust_image = mkfs_erofs_versioned( + &ValidatedFileSystem::new(fs_rs).unwrap(), + FormatVersion::V1, + ); + + if c_image != rust_image.as_ref() { + let c_debug = debug_dump(&c_image); + let rust_debug = debug_dump(&rust_image); + similar_asserts::assert_eq!( + c_debug, + rust_debug, + "binary mismatch (c={} bytes, rust={} bytes)\ndumpfile:\n{dumpfile}", + c_image.len(), + rust_image.len(), + ); + } + Ok(()) + }) + .unwrap(); + } + + /// Diagnostic: dump the structural diff between C mkcomposefs and our Rust V1 + /// writer for a known-failing minimal case (large flat directory, no xattrs). + /// + /// This test is `#[ignore]` — run it manually with: + /// cargo test -p composefs --lib -- erofs::reader::tests::proptest_tests::test_v1_binary_diff_diagnostic --ignored --nocapture + /// + /// It uses `debug_img` (our injective EROFS structure dumper) to show exactly + /// which fields diverge between the two images, making it easy to pinpoint + /// the bug in the writer without manually parsing hex dumps. + #[test_with::executable(mkcomposefs)] + #[test] + #[ignore] + fn test_v1_binary_diff_diagnostic() { + // Known-failing proptest case: use the exact dumpfile from a proptest failure. 
+ // The flow matches the proptest exactly: + // - fs_c is built from spec and serialized to dumpfile (no stubs) for C + // - fs_rs has add_overlay_whiteouts() called on it, fed to Rust writer + let dumpfile = "\ +/ 0 40000 3 0 0 0 0.0 - - -\n\ +/B 0 47123 2 32924 6322 0 334277904.419157028 - - - user.test_3=\\x14\\x11\\xf5\\xbe\\xf0\\x1f\\x15<\\\\\\x84Gu(\\x17T\\xdb\\xca\\xd5\n\ +/B/\\x06\\xc3} 43 102747 1 14780 50024 0 1909128638.32940851 - X\\xb8\\xac\\xf9[\\x8br\\x1a\\x11\\xed\\x96]\\x9c\\xed\\xba\\x8f\\x13\\xcc/i\\x12\\x7fE\\x18\\xf8n\\xaeV_E\\x8bS]x\\x93/g\\x92\\x0f?\\xd8\\xf4\\xf5 - security.capability=r\\x93\\x84\\x18M user.test_3=&+\\xf2\\xee\\x89sz user.test_4=\n\ +/B/\\x1f\\xe3\\x17\\xcb\\xe9\\x81\\x9aT\\xd2\\x13\\x19\\xf2\\xaf\\xee\\x20\\xba\\xb3 43 102274 1 41061 21812 0 446804811.557100600 - <\\x10@Z\\x00\\xc5\\xf9\\xca\\xe1=\\xfc\\xe0\\x81)p\\xa4\\x9f\\xa8\\x18+\\x88\\x0e\\xc3\\xa2\\xdf0\\x82*\\xc2q[x\\x86\\x88\\x80\\xf1]b$\\\\\\x1f]\\xeb - system.posix_acl_access= trusted.test_0=\\x92 trusted.test_2=\\\\\\xec\\x83\\x89\\x85\"\\xf9\\x9b\\xbc\\xa5\\xb0\\xef\\xbcC\\xe8Z\\x88F\\x83\\x17 user.test_1=\\xc4\\xc1\\x08\\xff\\xfa\\xd3\\xed\\xad\\x9bS6f\\tS\\x8d\n\ +/B/#\\xcd\\x17\\xb2\\xf0\\x03g\\xea\\x87iI\\xe3{_\\xe1 7 100554 1 50668 49879 0 1545457558.133147722 - \\xb6\\xa1$?\\xd2:\\xb9 - system.posix_acl_default=\\x97\\xde\\xd1S;,; user.test_4=\\xf7\\x82S\\xa5\\xc3,?\\x98\\x84p\\xbf\\x14&\\x91+\\x8e\\xdb\n\ +/B/3\\xf4\\xf5\\xc2e\\x07\\xb5\\xacC\\xa1 45 106705 1 56683 56444 0 1577642975.579080132 - \\xdf[\\x83j\\x1e\\x99\\xd8\\xc0[\\x8ba\\xc0f\\xec\\xe0\\x8b*\\xee\\x031\\x91\\x0f38\\x0f\\x08\\xc0\\xcd\\xa9\\x1a^\\x90]\\xc9!>\\xa9S*\\x94\\x8c\\x17\\xa8h\\xc3 - security.ima=E\\x04L\\tb@9\\x07!h) trusted.overlay.custom=~\\x16\\x1f-\\xfc\\xa3\\x07\\x17\\xd1\\xa0 trusted.test_2=O\n\ +/B/Eap_z828H.-6-_S 0 14476 1 4557 40071 0 206142614.191638235 - - - security.ima=H\\xfd\\x9e&\\x9a:\\xe5\\x93\\xa4 system.posix_acl_access=N\\x1c|\\xc7$O3\\x198%\\xb4\\xe8 
trusted.overlay.origin=Y+\\xa4\\xd1\\x16r\\xdd|\\xfaG user.test_4=\n\ +/B/Gv7O_..._.faB2-_-22dNscP_eGqkxP35_.0l.w.hfrZXl_v4h.MGEE7___GGF221-V-__WgP-h-6Th_NIB_._j.-U.Qj_2_iA.P_3_-_..9.1oxn4_mM_6XEAJ196_.6Z9iR_YM-Wr0L_.kz.icFqb_EzB27-___AC7bGW_.t_rwee8rtQ4_0rD_t1-J__5iR.r1_8cNUQXai5w4.e2_G-.7j.DyiD__Rfv6Lhgfzn-QFr_-J 44 124140 1 29304 30605 0 620161379.796821778 ____SlN/.yp1zAst_-P/5_RO_-cy7O_Z__310L__d2yo - -\n\ +/B/IP-_jBs 1 126270 1 31623 24545 0 1072774021.893731176 \\xcb - -\n\ +/B/KAS.d8m.y6U 16 125603 1 24529 17343 0 340236667.19836524 9\\x14\\xe2{\\xe9[\\x96q\\x08h;\\xc8\\x83\\xa4\\xb3\\xb9 - - trusted.overlay.origin=b\\xec'\\x8c\\x16\\xea\\xcb\\x10\\xc8\\xbe\\x18\\xf7*\\x0c\\x04\\xb8\\xb1 trusted.overlay.overlay.nested= trusted.test_1=e\\x08#\n\ +/B/Mp 27 106753 1 37244 13252 0 91373000.857571176 - OV\\x8e!\\xfdw9I\\xab\\x8f\\x9a;!\\xb4]f\\n]\\xc8\\x7f\\xa5\\x94\\x07\\xd4%\\x97\\x85 -\n\ +/B/Ze.7.-.9_._Ocl1k2_ 46 107670 1 14097 58513 0 488459452.877162371 - \\xc1\\x17\\x1d\\xa7\\x14S)\\xcd}\\xc9/~\\xa4d\\x1cN\\xbeN\\x184\\x90\\xa9A\\x12\\x8bY/(\\x1a,%\"\\xe3\\xb3\\xf2\\x86\\xec\\x20\\xf6\"Ug;\\x84\\\\A - trusted.overlay.origin=\\xfe\\xda7D\\xbf\\xb0\\xe9\\x9ct0Q user.test_4=-\\xdc\n\ +/B/]\\x05\\x19i\\x97\\xeb\\x8c\\xc4k\\x02\\\\jB`j\\x8f\\xb4\\xb6\\xfbw5\\xef\\xf3\\x0fd 0 23230 1 31997 45657 7135 105859383.867998730 - - - system.posix_acl_default=\\xb1p\\x96\\xe45\\xdcC\\x8bI\\x0e\\xfd#\\x8d\n\ +/B/_tvW.__t_l_-jK.4j 554649 106606 1 29300 51208 0 705049404.750293896 e5/39a0e32972ef85332212be14f7b863409d9e4113f80603285d1cd52a852822 - e539a0e32972ef85332212be14f7b863409d9e4113f80603285d1cd52a852822 user.test_4=\\xbf\\xbbL\\xe9\\xbc\\x92$\\xa3\\xf9\\xc6\\x06.\\x3d^\n\ +/B/q._v.T_.Mba__ 32 122305 1 29088 34366 0 881062039.274688283 _C_Kn1_.r_.IK/TGai6_zqLoTt___w_e - - trusted.overlay.overlay.nested=6\\x03\\xee\\xff\\xdbI\\xdcu(\\\\\\xe1\\x9a\\xee\\xd3e\\x06 user.test_2=\\x9a\\xc4$\\xe1\n\ +/B/u 25 105023 2 14652 44878 0 294073763.291036424 - 
\\x84R\\xd6@\\x0e\\x8b\\x04\\xb4(e\\x93\\xe9\\x86\\xdc\\x03\\xc7\\xbf\\xe1,OmC\\xe9U\\xf1 - trusted.overlay.origin=\\xc4mH\\x9a\n\ +/B/\\x81X\\xef\\r\\xce\\x12\\xf4U(p\\xc3\\xb2\\x19\\xe3r\\xd2v9\\x1c\\x02\\xca 46 121141 1 3272 11859 0 1219611767.718731195 jfsk35_Gz__n4tv4xzFFcj_.Z_AV__IJS_k_1I__FuSb.2 - - security.selinux= trusted.overlay.upper=\\x07\\xe8\\xa1%\\xbe\\xb0\\xc8)\\xcf\\xc2\\xf8\\xbah\\x19\\xae_\\xccH\\x9f\\xf0 trusted.test_1=i\\xe6\\xd9\\xd0 user.test_2=\\xc8\\xa0K\\xb2\\xa0V\\xb0\\xb7\\xd1\\xec(\\x95\\xfe\\xbb`\n\ +/B/\\xc4\\xf8\\x92\\xc2}<4\\xc8\\xec\\xd2\\xa5\\xe6\\x9ee\\xf0\\x95\\xf8(dumpfile).unwrap(); + fs_rs.add_overlay_whiteouts(); + + let c_image = c_mkcomposefs_from_dumpfile(dumpfile); + let rust_image = + mkfs_erofs_versioned(&ValidatedFileSystem::new(fs_rs).unwrap(), FormatVersion::V1); + + let c_debug = debug_dump(&c_image); + let rust_debug = debug_dump(&rust_image); + + println!("=== C mkcomposefs ({} bytes) ===", c_image.len()); + println!("{c_debug}"); + println!("=== Rust V1 writer ({} bytes) ===", rust_image.len()); + println!("{rust_debug}"); + println!("=== Structural diff (c vs rust) ==="); + println!("{}", diff_debug_dumps("c", &c_debug, "rust", &rust_debug)); + + assert_eq!( + c_image, + rust_image.as_ref(), + "images differ — see structural diff above" + ); } } @@ -2561,7 +3353,7 @@ mod tests { /bbb 5 100644 1 0 0 0 1000.0 - world - "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); // Sanity: the unmodified image round-trips fine erofs_to_filesystem::(&image).unwrap(); @@ -2582,4 +3374,515 @@ mod tests { "unexpected error: {msg}" ); } + + /// Regression test for the block-boundary EUCLEAN bug (bug.md). 
+ /// + /// Old kernels (< 6.12) return EFSCORRUPTED from erofs_fill_symlink() when: + /// (inode_offset % block_size) + inode_and_xattr_size + symlink_len > block_size + /// + /// The V1 writer previously used the wrong condition (derived from the + /// non-symlink branch of the C reference) and padded the wrong target + /// (inline_start rather than inode_start), silently producing images that + /// would EUCLEAN on CentOS Stream 9 (kernel 5.14) for symlinks with large + /// SELinux xattrs such as those in /etc/pki/ca-trust/extracted/pem/directory-hash/. + /// + /// This test: + /// 1. Builds a V1 image that forces a symlink inode near a block boundary + /// by packing enough filler inodes before it. + /// 2. Asserts the validator passes (writer fixed the layout). + /// 3. Asserts the symlink round-trips correctly. + /// + /// The construction: inode table starts at offset 1152. We add enough + /// compact filler inodes (FIFOs, 32 bytes each with min mtime) to push + /// the subsequent symlink to a position where the old code would have + /// placed it straddling the 4096-byte boundary. + #[test] + fn test_v1_symlink_block_boundary_euclean_regression() { + use crate::erofs::{format::FormatVersion, writer::mkfs_erofs_versioned}; + + // A realistic SELinux label of the kind found on ca-trust symlinks. + // 56 bytes — enough that header(64) + xattr(~140) + symlink(24) > 4096 + // when the inode starts near offset 3968 within a block. + let selinux_label = "system_u:object_r:cert_t:s0\x00".repeat(2); + // Trim to exactly 56 bytes so xattr body is predictable + let selinux_label = &selinux_label[..selinux_label.len().min(56)]; + + // Build the dumpfile: root + many compact filler FIFOs + the victim symlink. + // + // Filler FIFOs: mtime=0, no xattrs → compact inode (32 bytes each in V1). + // The inode table starts at 1152.
We need to fill up to offset ~3968 within + // some 4096-block, which is (3968 - 1152) % 4096 = 2816 bytes = 88 compact inodes + // in the first block. Add a few more to cross into block 1 and land the + // victim at the right position in block 1. + // + // We overshoot slightly and rely on the writer's fix to pad correctly. + // The validator then confirms no inode violates the kernel condition. + let mut dumpfile = String::from("/ 0 40755 2 0 0 0 0.0 - - -\n"); + for i in 0..120usize { + dumpfile.push_str(&format!("/filler{i:03} 0 10644 1 0 0 0 0.0 - - -\n")); + } + // Victim: symlink with a large SELinux xattr. + let target = "/etc/pki/ca-trust/source"; // 24-byte target + let target_len = target.len(); + let xattr_val_hex: String = selinux_label + .bytes() + .map(|b| format!("\\x{b:02x}")) + .collect(); + dumpfile.push_str(&format!( + "/victim {target_len} 120777 1 0 0 0 0.0 {target} - - security.selinux={xattr_val_hex}\n" + )); + + let fs = dumpfile_to_filesystem::(&dumpfile).unwrap(); + let image = mkfs_erofs_versioned(&ValidatedFileSystem::new(fs).unwrap(), FormatVersion::V1); + + // The validator must pass: the writer should have padded the inode + // to a block boundary so the kernel condition is never violated. + Image::open(&image) + .unwrap() + .fsck_metadata() + .expect("V1 writer should produce valid inline layout (block-boundary fix)"); + + // The symlink target must round-trip correctly. 
+ let fs_rt = + erofs_to_filesystem::(&image).expect("image should parse cleanly"); + let victim_id = fs_rt + .root + .leaf_id(std::ffi::OsStr::new("victim")) + .expect("victim symlink not found in round-tripped filesystem"); + let link_target = match &fs_rt.leaves[victim_id.0].content { + crate::tree::LeafContent::Symlink(t) => t.clone(), + other => panic!("victim should be a symlink, got {other:?}"), + }; + assert_eq!( + link_target.as_ref(), + std::ffi::OsStr::new(target), + "symlink target mismatch after V1 round-trip" + ); + } + + /// Tests that `fsck_metadata` catches a V1 image where symlink + /// padding was suppressed, causing the inode+inline data to cross a block + /// boundary. Uses `WriterFaults` to inject the fault rather than raw byte + /// surgery, so the image is otherwise structurally coherent. + #[test] + fn test_v1_inline_layout_validator_catches_bad_layout() { + use crate::erofs::{ + format::FormatVersion, + writer::{WriterFaults, mkfs_erofs_versioned, mkfs_erofs_with_faults}, + }; + + // Layout math (all sizes in bytes, block_size = 4096): + // + // A symlink crosses a block boundary when: + // symlink_pos % 4096 + 32 (inode) + target_len > 4096 + // => symlink_pos % 4096 > 4096 - 32 - target_len + // + // With target_len = SYMLINK_MAX = 1024 (crate::SYMLINK_MAX): + // symlink_pos % 4096 > 3040 (i.e. 
slot >= 96 within a block) + // + // Inode table layout (V1): + // Bytes 0..1152 : composefs header (32 B) + pad to 1024 + EROFS superblock (128 B) + // = 36 slots (NID 0-35) + // NID 36 : root inode (32 B inode header) + // NID 36 inline : root dir entries (inline, variable) + // + // With 50 filler files named "f00".."f49" (sort before "link"): + // - 51 dirents: 51 * 12 = 612 B + // - names: 50*3 + 4 = 154 B + // - total inline: 766 B + // - root occupies: 32 + ~766 = 798 B (slot-padded) + // - 50 empty files: 50 * 32 = 1600 B + // - symlink (without block-boundary padding): NID 113, pos_in_block=3616 + // 3616 + 32 + 1024 = 4672 > 4096 → crossing condition ✓ + // + // Note: the *good* image places the symlink at pos_in_block == 0 because + // the writer correctly pads it to a block boundary. We verify crossing + // by checking the *bad* image (padding suppressed) instead. + + // filler_count=50 places the symlink at NID 113 (pos_in_block=3616). + // Without the block-boundary padding: 3616 + 32 + 1024 = 4672 > 4096 ✓ + // The assertion below verifies this whenever the test runs. + let filler_count = 50usize; + let mut lines = String::from("/ 0 40755 2 0 0 0 0.0 - - -\n"); + for i in 0..filler_count { + lines.push_str(&format!("/f{i:02} 0 100644 1 0 0 0 0.0 - - -\n")); + } + let target = "a".repeat(crate::SYMLINK_MAX); + lines.push_str(&format!( + "/link {len} 120777 1 0 0 0 0.0 {target} - -\n", + len = target.len(), + target = target, + )); + let fs = dumpfile_to_filesystem::(&lines).unwrap(); + let vfs = ValidatedFileSystem::new(fs).unwrap(); + + // The good image must pass validation. + let good_image = mkfs_erofs_versioned(&vfs, FormatVersion::V1); + Image::open(&good_image) + .unwrap() + .fsck_metadata() + .expect("valid image should pass"); + + // Build the faulted image (symlink pad suppressed). 
+ let mut faults = WriterFaults::new(42); + faults.skip_symlink_pad_rate = 1.0; // always skip padding + let bad_image = mkfs_erofs_with_faults(&vfs, FormatVersion::V1, faults); + + // Confirm the symlink in the bad image actually crosses a block boundary — + // i.e. the fault injection put the symlink at a dangerous slot. + { + let img = Image::open(&bad_image).unwrap(); + let root_nid = img.sb.root_nid.get() as u64; + let link_nid = img + .find_child_nid(root_nid, b"link") + .unwrap() + .expect("link nid not found"); + let link_offset = (link_nid * 32) as usize; + let pos_in_block = link_offset % 4096; + assert!( + pos_in_block + 32 + crate::SYMLINK_MAX > 4096, + "symlink at pos_in_block={pos_in_block} does not cross a block boundary \ + in the bad image (32+{symlink_max}={total} ≤ 4096); \ + increase filler_count (currently {filler_count})", + symlink_max = crate::SYMLINK_MAX, + total = 32 + crate::SYMLINK_MAX, + ); + } + + // The faulted image must fail validation. + let result = Image::open(&bad_image).unwrap().fsck_metadata(); + assert!( + result.is_err(), + "validator should reject image with suppressed symlink padding" + ); + let msg = result.unwrap_err().to_string(); + assert!( + msg.contains("EUCLEAN") || msg.contains("nid"), + "error should mention EUCLEAN or nid, got: {msg}" + ); + } + + /// B2: Files with a negative `st_mtim_sec` (pre-epoch mtime) must not corrupt + /// the V1 superblock `build_time` field. + /// + /// `calculate_min_mtime` casts `st_mtim_sec as u64`. A value of -1 wraps to + /// `u64::MAX`, which is larger than any positive timestamp, so positive mtimes + /// are correctly selected as the minimum. This test verifies that a filesystem + /// containing one inode with mtime = -1 and one with mtime = 1000 produces a + /// V1 image whose superblock `build_time` equals 1000. 
+ #[test] + fn test_negative_mtime_does_not_corrupt_build_time() { + use std::{collections::BTreeMap, ffi::OsStr}; + + use crate::{ + erofs::{format::FormatVersion, writer::mkfs_erofs_versioned}, + fsverity::Sha256HashValue, + generic_tree::{LeafContent, Stat}, + tree::{self, RegularFile}, + }; + + let root_stat = Stat { + st_mode: 0o40755, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 1000, + st_mtim_nsec: 0, + xattrs: BTreeMap::new(), + }; + + let mut fs = tree::FileSystem::::new(root_stat); + + // Inode with negative mtime (-1). As u64 this wraps to u64::MAX, which + // is larger than 1000, so it should NOT win the minimum comparison. + let neg_stat = Stat { + st_mode: 0o100644, + st_uid: 0, + st_gid: 0, + st_mtim_sec: -1, + st_mtim_nsec: 0, + xattrs: BTreeMap::new(), + }; + let leaf_id = fs.push_leaf( + neg_stat, + LeafContent::Regular(RegularFile::Inline(Box::new([]))), + ); + fs.root + .insert(OsStr::new("neg"), tree::Inode::leaf(leaf_id)); + + // add_overlay_whiteouts is required for V1 compatibility + fs.add_overlay_whiteouts(); + + let image = mkfs_erofs_versioned(&ValidatedFileSystem::new(fs).unwrap(), FormatVersion::V1); + let img = Image::open(&image).expect("failed to open V1 image"); + + // The superblock build_time must be 1000 (the root mtime), not u64::MAX or 0. + assert_eq!( + img.sb.build_time.get(), + 1000, + "build_time should be the positive minimum mtime (1000), \ + not the wrapped negative value" + ); + } + + /// B3: Directories with enough entries to span multiple 4096-byte blocks must + /// survive a round-trip through the V2 EROFS writer. + /// + /// Each dirent is 12 bytes (header) + name length bytes. With 50 entries of + /// 90-byte names: 50 × (12 + 90) = 5100 bytes > 4096, which forces + /// `Directory::from_entries` to split across at least two blocks. + /// + /// This test verifies that all entry names survive the round-trip intact. 
+ #[test] + fn test_multiblock_directory_round_trip() { + use std::{collections::BTreeMap, ffi::OsStr}; + + use crate::{ + erofs::writer::mkfs_erofs, + fsverity::Sha256HashValue, + generic_tree::{LeafContent, Stat}, + tree::{self, RegularFile}, + }; + + let root_stat = Stat { + st_mode: 0o40755, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 1000, + st_mtim_nsec: 0, + xattrs: BTreeMap::new(), + }; + + let leaf_stat = Stat { + st_mode: 0o100644, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 1000, + st_mtim_nsec: 0, + xattrs: BTreeMap::new(), + }; + + let mut fs = tree::FileSystem::::new(root_stat.clone()); + + const N: usize = 50; + let mut expected_names: Vec = vec![".".into(), "..".into()]; + + // Build a subdirectory with N entries, each with a 90-byte name. + // N × (12 + 90) = 5100 bytes — forces a multi-block directory. + let mut subdir = tree::Directory::::new(root_stat); + for i in 0..N { + let name = format!("{:0>90}", i); + let leaf_id = fs.push_leaf( + leaf_stat.clone(), + LeafContent::Regular(RegularFile::Inline(Box::new([]))), + ); + subdir.insert(OsStr::new(&name), tree::Inode::leaf(leaf_id)); + expected_names.push(name); + } + + fs.root.insert( + OsStr::new("bigdir"), + tree::Inode::Directory(Box::new(subdir)), + ); + + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); + let img = Image::open(&image).expect("failed to open image"); + + // Locate "bigdir" in root + let root_nid = img.sb.root_nid.get() as u64; + let bigdir_nid = img + .find_child_nid(root_nid, b"bigdir") + .expect("find_child_nid error") + .expect("bigdir not found in root"); + + // Collect all entry names from bigdir (blocks + inline) + let bigdir_inode = img.inode(bigdir_nid).unwrap(); + let mut found_names: Vec = Vec::new(); + if let Some(inline) = bigdir_inode.inline() { + let inline_block = DirectoryBlock::ref_from_bytes(inline).unwrap(); + for entry in inline_block.entries().unwrap() { + let entry = entry.unwrap(); + 
found_names.push(String::from_utf8(entry.name.to_vec()).unwrap()); + } + } + for blkid in img.inode_blocks(&bigdir_inode).unwrap() { + let block = img.directory_block(blkid).unwrap(); + for entry in block.entries().unwrap() { + let entry = entry.unwrap(); + found_names.push(String::from_utf8(entry.name.to_vec()).unwrap()); + } + } + + found_names.sort(); + expected_names.sort(); + + assert_eq!( + found_names, expected_names, + "multi-block directory lost entries after round-trip" + ); + + // Verify the image is a valid EROFS filesystem that can be round-tripped + let _fs_rt = erofs_to_filesystem::(&image) + .expect("erofs_to_filesystem failed on multi-block directory image"); + + // Sanity: verify the image passes fsck.erofs if available + if let Some(ok) = run_fsck_erofs(&image) { + assert!( + ok, + "fsck.erofs reported errors in multi-block directory image" + ); + } + } + + /// `ValidatedFileSystem::new` must reject a hardlinked whiteout. + /// A whiteout (chardev rdev=0) with nlink > 1 is semantically invalid. 
+ #[test] + fn test_hardlinked_whiteout_writer_rejects() { + use std::ffi::OsStr; + + use crate::{ + erofs::writer::ValidatedFileSystem, + fsverity::Sha256HashValue, + generic_tree::{LeafContent, Stat}, + tree, + }; + + let root_stat = Stat { + st_mode: 0o40755, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 1000, + st_mtim_nsec: 0, + xattrs: Default::default(), + }; + let whiteout_stat = Stat { + st_mode: 0o20000, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 1000, + st_mtim_nsec: 0, + xattrs: Default::default(), + }; + + let mut fs = tree::FileSystem::::new(root_stat); + let leaf_id = fs.push_leaf(whiteout_stat, LeafContent::CharacterDevice(0)); + fs.root + .insert(OsStr::new("whiteout"), tree::Inode::leaf(leaf_id)); + fs.root.insert( + OsStr::new("hardlink_to_whiteout"), + tree::Inode::leaf(leaf_id), + ); + + let result = ValidatedFileSystem::new(fs); + assert!( + result.is_err(), + "ValidatedFileSystem::new should reject hardlinked whiteout" + ); + let err = result.unwrap_err(); + assert!( + err.to_string().contains("whiteout inode has nlink > 1"), + "unexpected error message: {err}" + ); + } + + /// The reader must reject an image with a hardlinked whiteout. + /// + /// We build a valid image with a hardlinked chardev(rdev=1), which the writer + /// accepts. We then patch the inode's `u` field (rdev) from 1 to 0 in the raw + /// image bytes, turning it into a whiteout on-disk while leaving nlink > 1. + /// The reader must detect this and return an error. 
+ #[test] + fn test_hardlinked_whiteout_reader_rejects() { + use std::ffi::OsStr; + + use crate::{ + fsverity::Sha256HashValue, + generic_tree::{LeafContent, Stat}, + tree, + }; + + let root_stat = Stat { + st_mode: 0o40755, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 1000, + st_mtim_nsec: 0, + xattrs: Default::default(), + }; + let chardev_stat = Stat { + st_mode: 0o20000, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 1000, + st_mtim_nsec: 0, + xattrs: Default::default(), + }; + + let mut fs = tree::FileSystem::::new(root_stat); + // Use rdev=1 (not a whiteout) so the writer accepts the hardlink. + let leaf_id = fs.push_leaf(chardev_stat, LeafContent::CharacterDevice(1)); + fs.root + .insert(OsStr::new("chardev"), tree::Inode::leaf(leaf_id)); + fs.root.insert( + OsStr::new("hardlink_to_chardev"), + tree::Inode::leaf(leaf_id), + ); + + use crate::erofs::writer::mkfs_erofs; + let base_image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); + + // Sanity: the unpatched image must be accepted. + erofs_to_filesystem::(&base_image) + .expect("unmodified image with rdev=1 hardlink should be accepted"); + + // Locate the chardev inode in the image using the erofs Image API. + let img = Image::open(&base_image).unwrap(); + let root_nid = img.sb.root_nid.get() as u64; + let chardev_nid = img + .find_child_nid(root_nid, b"chardev") + .unwrap() + .expect("chardev entry must exist"); + + // Parse the inode via the Image API to learn its layout (compact vs + // extended) and locate its slot in the image. We record what we need + // before releasing the shared borrow so we can take `&mut` afterwards. + let inode = img.inode(chardev_nid).unwrap(); + let is_extended = matches!(inode, InodeType::Extended(_)); + // The inode region is the `inodes` sub-slice of `image`; the slot for + // NID n starts at n*32 bytes into that region. 
+ let inodes_start = img.image.len() - img.inodes.len(); + let inode_slot_start = inodes_start + chardev_nid as usize * 32; + drop(inode); + drop(img); + + // Mutate a copy of the image: set the `u` field (rdev) from 1 → 0, + // turning the chardev into a whiteout on-disk while leaving nlink > 1. + // Use zerocopy to reinterpret the slot bytes as the concrete header type + // so we get a typed `&mut` rather than raw byte arithmetic. + let mut image = base_image.to_vec(); + let slot = &mut image[inode_slot_start..]; + if is_extended { + use core::mem::size_of; + let hdr = + ExtendedInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid ExtendedInodeHeader"); + assert_eq!(hdr.u.get(), 1, "expected rdev=1 before patching"); + hdr.u = zerocopy::little_endian::U32::new(0); + } else { + use core::mem::size_of; + let hdr = + CompactInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid CompactInodeHeader"); + assert_eq!(hdr.u.get(), 1, "expected rdev=1 before patching"); + hdr.u = zerocopy::little_endian::U32::new(0); + } + + // The reader must reject the patched image. + let result = erofs_to_filesystem::(&image); + let err = result.expect_err("reader should reject image with hardlinked whiteout"); + let err_msg = format!("{err:#}"); + assert!( + err_msg.contains("nlink"), + "error message should mention nlink, got: {err_msg}" + ); + } } diff --git a/crates/composefs/src/erofs/writer.rs b/crates/composefs/src/erofs/writer.rs index 7fe59e61..1e0c3f95 100644 --- a/crates/composefs/src/erofs/writer.rs +++ b/crates/composefs/src/erofs/writer.rs @@ -1,12 +1,47 @@ -//! EROFS image generation and writing functionality. +//! EROFS image generation from composefs trees. //! -//! This module provides functionality to generate EROFS filesystem images -//! from composefs tree structures, handling inode layout, directory blocks, -//! and metadata serialization. +//! The public entry points are: +//! +//! 
- [`mkfs_erofs`] — generate a single EROFS image using the repository's default format +//! - [`mkfs_erofs_versioned`] — generate an image for a specific [`FormatVersion`] +//! - [`mkfs_erofs_v1_min_version`] — like `mkfs_erofs_versioned` for V1, but with explicit +//! control over the minimum `composefs_version` field (mirrors `mkcomposefs --min-version`) +//! +//! All three require a [`ValidatedFileSystem`], which is the type-safe gate: constructing +//! one runs [`fsck`](crate::tree::FileSystem::fsck) and checks EROFS-specific invariants +//! (e.g. whiteout inodes must not be hardlinked). A validated filesystem cannot panic the +//! writer. +//! +//! ## Format versions +//! +//! Two on-disk formats are supported, selected by [`FormatVersion`]: +//! +//! **V1** (`FormatVersion::V1`) is byte-for-byte compatible with the C `mkcomposefs` tool. +//! It uses compact inodes (32 bytes) where the inode fits, extended (64 bytes) otherwise; +//! collects inodes in BFS order; includes a 256-entry whiteout stub table at the start of +//! the inode area; sets `build_time` to the minimum mtime; and encodes user-visible whiteout +//! files (chr 0,0) via `trusted.overlay.opaque=x` xattrs rather than storing them directly. +//! The `composefs_version` header field is 0 normally and auto-upgrades to 1 when user +//! whiteouts are present. +//! +//! **V2** (`FormatVersion::V2`, the default) is the composefs-rs native format. It always +//! uses extended inodes (64 bytes), collects inodes in DFS order, omits the whiteout stub +//! table, sets `build_time` to 0, and sets `composefs_version` to 2. Whiteout files are +//! stored without escaping. +//! +//! ## Two-pass layout + emit design +//! +//! `write_erofs` is called twice on the same inode list. The first pass uses a `FirstPass` +//! output that counts bytes and records the byte offset of every inode, block, and data +//! region without writing anything. The second pass uses a `SecondPass` output that +//! 
serializes bytes into a buffer. EROFS node IDs (nids) and cross-region offsets can only +//! be computed after the first pass, so all [`InodeRef::Known`] references are resolved +//! between the two passes. use std::{ - collections::{BTreeMap, HashMap}, + collections::{BTreeMap, HashMap, HashSet}, mem::size_of, + num::NonZeroUsize, os::unix::ffi::OsStrExt, }; @@ -21,42 +56,281 @@ use crate::{ tree, }; -#[derive(Clone, Copy, Debug)] -enum Offset { - Header, - Superblock, - Inode, - XAttr, - Block, - End, +/// A composefs filesystem tree validated for EROFS serialization. +/// +/// Can only be constructed via [`ValidatedFileSystem::new`], which checks +/// that the tree satisfies all EROFS invariants — for example, that no +/// whiteout inode (character device with rdev=0) has `nlink > 1`. +/// +/// Passing a `ValidatedFileSystem` to [`mkfs_erofs`] or +/// [`mkfs_erofs_versioned`] therefore cannot panic. +pub struct ValidatedFileSystem(pub(crate) tree::FileSystem); + +impl std::fmt::Debug + for ValidatedFileSystem +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("ValidatedFileSystem").field(&self.0).finish() + } +} + +impl ValidatedFileSystem { + /// Validate `fs` and wrap it. Returns an error if any invariant is violated. + pub fn new(fs: tree::FileSystem) -> anyhow::Result { + validate_filesystem(&fs)?; + Ok(Self(fs)) + } +} + +impl std::ops::Deref for ValidatedFileSystem { + type Target = tree::FileSystem; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +pub(crate) fn validate_filesystem( + fs: &tree::FileSystem, +) -> anyhow::Result<()> { + // Check structural invariants: leaf ref bounds, no orphaned leaves. + fs.fsck() + .map_err(|e| anyhow::anyhow!("invalid composefs filesystem: {e}"))?; + + // Check EROFS-specific constraint: whiteout inodes (chardev rdev=0) must not be hardlinked. 
+ let nlinks = fs.nlinks(); + for (idx, leaf) in fs.leaves.iter().enumerate() { + if matches!(leaf.content, tree::LeafContent::CharacterDevice(0)) { + let nlink = nlinks[idx]; + if nlink > 1 { + anyhow::bail!("invalid composefs filesystem: whiteout inode has nlink > 1"); + } + } + } + Ok(()) +} + +/// Size of one EROFS inode slot in bytes. All inode offsets must be a multiple of this. +const INODE_SLOT_SIZE: usize = 32; + +/// EROFS xattr values are addressed in 4-byte words; all xattr offsets and counts use this unit. +const XATTR_WORD_SIZE: usize = size_of::(); + +/// Size of the InodeXAttrHeader in bytes, used in xattr_icount calculation. +const INODE_XATTR_HEADER_SIZE: usize = size_of::(); + +/// Returns the byte offset of `pos` within its EROFS block (i.e. `pos % BLOCK_SIZE`). +/// +/// `BLOCK_SIZE` (4096) is a nonzero constant, so this operation never panics. +fn block_offset(pos: u64) -> u64 { + pos % u64::from(format::BLOCK_SIZE) +} + +/// Returns the number of bytes from `pos` to the next EROFS block boundary, +/// or `None` if `pos` is already block-aligned (no padding needed). +/// +/// When `Some`, the result is always in `1..BLOCK_SIZE`. +fn bytes_to_block_boundary(pos: u64) -> Option { + let offset = block_offset(pos); + if offset == 0 { + return None; + } + let block_size = u64::from(format::BLOCK_SIZE); + let padding = block_size + .checked_sub(offset) + .expect("block_offset(pos) < BLOCK_SIZE by construction"); + debug_assert!(padding >= 1 && padding < block_size); + Some(padding) +} + +/// Deterministic fault injector for writer tests. +/// +/// Each field is a probability in [0.0, 1.0]: +/// 0.0 = never inject this fault +/// 1.0 = always inject this fault +/// +/// Construct with `WriterFaults::new(seed)` then set the rates you need. 
+/// Because `write_erofs` runs twice (layout pass then emit pass), decisions +/// are recorded during the first pass and replayed during the second so that +/// both passes make identical choices and produce a structurally coherent image. +#[cfg(test)] +pub(crate) struct WriterFaults { + rng: rand::rngs::SmallRng, + /// Skip the symlink block-boundary padding (produces a malformed image). + pub skip_symlink_pad_rate: f64, + /// Decisions recorded during the first pass; replayed during the second. + decisions: Vec, + /// Index into `decisions` during replay. + replay_idx: usize, + /// True after `start_replay()` is called. + replaying: bool, +} + +#[cfg(test)] +impl WriterFaults { + pub fn new(seed: u64) -> Self { + use rand::SeedableRng; + Self { + rng: rand::rngs::SmallRng::seed_from_u64(seed), + skip_symlink_pad_rate: 0.0, + decisions: Vec::new(), + replay_idx: 0, + replaying: false, + } + } + + /// Call between first and second pass to switch to replay mode. + pub(crate) fn start_replay(&mut self) { + self.replaying = true; + self.replay_idx = 0; + } + + fn should_skip_symlink_pad(&mut self) -> bool { + if self.replaying { + let decision = self.decisions[self.replay_idx]; + self.replay_idx += 1; + decision + } else { + use rand::RngExt; + let decision = self.rng.random::() < self.skip_symlink_pad_rate; + self.decisions.push(decision); + decision + } + } +} + +/// Bundles the parameters that are constant across a single `write_erofs` call. +struct WriteContext { + version: format::FormatVersion, + min_mtime: (u64, u32), + header_flags: u32, + /// The `composefs_version` value written to the ComposefsHeader. + /// + /// For V2: always 2 (COMPOSEFS_VERSION). + /// For V1: 0 normally, but 1 when the tree contains user-land whiteouts (char + /// devices with rdev=0 that were escaped by the V1 writer). 
This matches C + /// mkcomposefs, which bumps `options->version` from 0 to 1 when it encounters + /// a whiteout in the input tree (before adding the 256 overlay stubs). + composefs_version: u32, + #[cfg(test)] + faults: Option, } trait Output { - fn note_offset(&mut self, offset_type: Offset); - fn get(&self, offset_type: Offset, idx: usize) -> usize; + // --- Recording (first pass only, no-ops in second pass) --- + fn note_header_emitted(&mut self); + fn note_superblock_emitted(&mut self); + fn note_inode(&mut self); + fn note_inodes_end(&mut self); + fn note_xattr(&mut self); + fn note_block(&mut self); + fn note_end(&mut self); + + // --- Retrieval (None in first pass when offsets not yet known, Some in second pass) --- + fn get_inode_offset(&self, idx: usize) -> Option; + fn get_inodes_end(&self) -> Option; + fn get_xattr_offset(&self, idx: usize) -> Option; + fn get_block_offset(&self, idx: usize) -> Option; + fn get_end(&self) -> Option; + + // --- I/O --- fn write(&mut self, data: &[u8]); fn pad(&mut self, alignment: usize); fn len(&self) -> usize; - fn get_div(&self, offset_type: Offset, idx: usize, div: usize) -> usize { - let offset = self.get(offset_type, idx); - assert_eq!(offset % div, 0); - offset / div + /// Write `n` zero bytes. Default implementation avoids heap allocation. + fn write_zeros(&mut self, n: usize) { + const BUF: [u8; 1024] = [0u8; 1024]; + let mut remaining = n; + while remaining > 0 { + let chunk = remaining.min(BUF.len()); + self.write(&BUF[..chunk]); + remaining -= chunk; + } } - fn get_nid(&self, idx: usize) -> u64 { - self.get_div(Offset::Inode, idx, 32) as u64 + // --- Typed write methods: note + write bundled, removing duplication --- + + /// Write the composefs header and pad to 1024 bytes. 
+ fn write_composefs_header(&mut self, hdr: format::ComposefsHeader) { + self.note_header_emitted(); + self.write(hdr.as_bytes()); + self.pad(1024); } - fn get_xattr(&self, idx: usize) -> u32 { - self.get_div(Offset::XAttr, idx, 4).try_into().unwrap() + /// Write the EROFS superblock. + fn write_superblock(&mut self, sb: format::Superblock) { + self.note_superblock_emitted(); + self.write(sb.as_bytes()); } + // --- Derived helpers --- + fn write_struct(&mut self, st: impl IntoBytes + Immutable) { self.write(st.as_bytes()); } + + /// Node ID for inode `idx`, or 0 as a placeholder in the first pass. + fn get_nid(&self, idx: usize) -> u64 { + let Some(offset) = self.get_inode_offset(idx) else { + return 0; + }; + assert_eq!(offset.get() % INODE_SLOT_SIZE, 0); + (offset.get() / INODE_SLOT_SIZE) as u64 + } + + /// Shared xattr reference value (V1 format), or 0 as a placeholder in the first pass. + fn get_xattr_v1(&self, idx: usize) -> u32 { + let (Some(absolute_offset), Some(inodes_end)) = + (self.get_xattr_offset(idx), self.get_inodes_end()) + else { + return 0; + }; + let (absolute_offset, inodes_end) = (absolute_offset.get(), inodes_end.get()); + let offset_within_block = inodes_end % format::BLOCK_SIZE as usize; + let xattr_offset_from_inodes_end = absolute_offset + .checked_sub(inodes_end) + .expect("shared xattr offset must be >= inode table end"); + let raw_ref = (offset_within_block + xattr_offset_from_inodes_end) / XATTR_WORD_SIZE; + raw_ref + .try_into() + .expect("xattr reference index exceeds u32::MAX") + } + + /// Shared xattr reference value (V2 format), or 0 as a placeholder in the first pass. + fn get_xattr_v2(&self, idx: usize) -> u32 { + let Some(offset) = self.get_xattr_offset(idx) else { + return 0; + }; + assert_eq!(offset.get() % XATTR_WORD_SIZE, 0); + (offset.get() / XATTR_WORD_SIZE) + .try_into() + .expect("xattr reference index exceeds u32::MAX") + } + + /// Byte offset of inode `idx`'s block data, or 0 as a placeholder in the first pass. 
+ fn get_block_start(&self, idx: usize) -> usize { + self.get_block_offset(idx).map_or(0, NonZeroUsize::get) + } + + /// Block index of the V1 xattr region, or 0 as a placeholder in the first pass. + fn get_xattr_blkaddr(&self) -> u32 { + self.get_inodes_end() + .map_or(0, |end| (end.get() / format::BLOCK_SIZE as usize) as u32) + } + + /// Total number of blocks in the image, or 0 as a placeholder in the first pass. + fn get_block_count(&self) -> u32 { + self.get_end() + .map_or(0, |end| (end.get() / format::BLOCK_SIZE as usize) as u32) + } } +/// Extended attribute stored in EROFS format. +/// +/// The derived Ord sorts by (prefix, suffix, value) which is used for V2. +/// For V1, use `cmp_by_full_key` which sorts by full key name (prefix string + suffix) +/// to match C mkcomposefs behavior. #[derive(PartialOrd, PartialEq, Eq, Ord, Clone)] struct XAttr { prefix: u8, @@ -64,6 +338,32 @@ struct XAttr { value: Box<[u8]>, } +impl XAttr { + /// Compare by full key name (prefix string + suffix), then by value. + /// This matches C mkcomposefs `cmp_xattr` which uses `strcmp(na->key, nb->key)`. + /// Uses lazy iterator chaining to avoid heap allocation on every comparison. + /// + /// Value tiebreaker uses length-first comparison to match C `xattrs_ht_sort()`, + /// which compares `value_len` before `memcmp`. This differs from Rust's + /// lexicographic `[u8]::cmp` when values have different lengths (e.g. + /// `\x00\x00` vs `\xee`: lexicographic says `\x00\x00 < \xee`, but + /// length-first says `\xee < \x00\x00` because 1 < 2). 
+ fn cmp_by_full_key(&self, other: &Self) -> std::cmp::Ordering { + let self_key = format::XATTR_PREFIXES[self.prefix as usize] + .iter() + .chain(self.suffix.iter()); + let other_key = format::XATTR_PREFIXES[other.prefix as usize] + .iter() + .chain(other.suffix.iter()); + self_key.cmp(other_key).then_with(|| { + self.value + .len() + .cmp(&other.value.len()) + .then_with(|| self.value.cmp(&other.value)) + }) + } +} + #[derive(Clone, Default)] struct InodeXAttrs { shared: Vec, @@ -71,13 +371,37 @@ struct InodeXAttrs { filter: u32, } +/// Index into [`InodeCollector::inodes`]. This is NOT an EROFS nid; the nid is computed +/// from the byte offset of the inode during the second pass via [`Output::get_nid`]. +type InodeIdx = usize; + +/// Reference to an inode in a directory entry. +/// +/// Used in [`DirEnt`] during BFS in [`InodeCollector::collect_tree`]. When a hardlink's +/// canonical occurrence hasn't been BFS-processed yet, the entry is stored as +/// `Deferred(leaf_id)` and resolved to `Known(idx)` in the post-BFS resolution pass. +#[derive(Debug, Clone, Copy)] +enum InodeRef { + Known(InodeIdx), + Deferred(LeafId), +} + #[derive(Debug)] struct DirEnt<'a> { name: &'a [u8], - inode: usize, + inode: InodeRef, file_type: format::FileType, } +/// Metadata returned by `Inode::inode_meta` used to fill inode header fields. +struct InodeMeta { + layout: format::DataLayout, + /// The `i_u` field: meaning depends on layout (rdev, chunk format, or block offset / BLOCK_SIZE). + i_u: u32, + size: u64, + nlink: usize, +} + #[derive(Debug, Default)] struct Directory<'a> { blocks: Box<[Box<[DirEnt<'a>]>]>, @@ -102,6 +426,9 @@ struct Inode<'a, ObjectID: FsVerityHashValue> { stat: &'a tree::Stat, xattrs: InodeXAttrs, content: InodeContent<'a, ObjectID>, + /// V1 only: this inode was originally a char device with rdev=0 (overlay whiteout) + /// and has been escaped to a regular file per C mkcomposefs v1.0.8 behavior.
+ escaped_whiteout: bool, } impl XAttr { @@ -113,13 +440,26 @@ impl XAttr { }); output.write(&self.suffix); output.write(&self.value); - output.pad(4); + output.pad(XATTR_WORD_SIZE); } } impl InodeXAttrs { - fn add(&mut self, name: &[u8], value: &[u8]) { + /// Returns the serialized byte size of this xattr block. + fn byte_size(&self, version: format::FormatVersion) -> usize { + let mut counter = FirstPass::default(); + self.write(&mut counter, version); + counter.offset + } + + fn add(&mut self, name: &[u8], value: &[u8], version: format::FormatVersion) { for (idx, prefix) in format::XATTR_PREFIXES.iter().enumerate().rev() { + // V1 compatibility: C mkcomposefs v1.0.8 does not include lustre. (index 5) + // in its prefix table, so lustre.* xattrs use index 0 (raw fallback) in C. + // Skip index 5 for V1 images to match that behavior. + if version == format::FormatVersion::V1 && idx == 5 { + continue; + } if let Some(suffix) = name.strip_prefix(*prefix) { self.filter |= 1 << (xxh32(suffix, format::XATTR_FILTER_SEED + idx as u32) % 32); self.local.push(XAttr { @@ -133,7 +473,7 @@ impl InodeXAttrs { unreachable!("{:?}", std::str::from_utf8(name)); // worst case: we matched the empty prefix (0) } - fn write(&self, output: &mut impl Output) { + fn write(&self, output: &mut impl Output, version: format::FormatVersion) { if self.filter != 0 { trace!(" write xattrs block"); output.write_struct(format::InodeXAttrHeader { @@ -143,7 +483,11 @@ impl InodeXAttrs { }); for idx in &self.shared { trace!(" shared {} @{}", idx, output.len()); - output.write(&output.get_xattr(*idx).to_le_bytes()); + let xattr_ref = match version { + format::FormatVersion::V1 => output.get_xattr_v1(*idx), + format::FormatVersion::V2 => output.get_xattr_v2(*idx), + }; + output.write(&xattr_ref.to_le_bytes()); } for attr in &self.local { trace!(" local @{}", output.len()); @@ -226,9 +570,13 @@ impl<'a> Directory<'a> { nameofs, output.len() ); + let inode_idx = match entry.inode { + InodeRef::Known(idx) 
=> idx, + InodeRef::Deferred(_) => panic!("all inodes must be resolved before writing"), + }; output.write_struct(format::DirectoryEntryHeader { name_offset: (nameofs as u16).into(), - inode_offset: output.get_nid(entry.inode).into(), + inode_offset: output.get_nid(inode_idx).into(), file_type: entry.file_type.into(), ..Default::default() }); @@ -260,21 +608,65 @@ impl<'a> Directory<'a> { } } - fn inode_meta(&self, block_offset: usize) -> (format::DataLayout, u32, u64, usize) { - let (layout, u) = if self.inline.is_empty() { - (format::DataLayout::FlatPlain, block_offset as u32 / 4096) + fn inode_meta(&self, block_offset: usize) -> InodeMeta { + let blkaddr: u32 = (block_offset / 4096) + .try_into() + .expect("block address exceeds u32::MAX"); + let (layout, i_u) = if self.inline.is_empty() { + (format::DataLayout::FlatPlain, blkaddr) } else if !self.blocks.is_empty() { - (format::DataLayout::FlatInline, block_offset as u32 / 4096) + (format::DataLayout::FlatInline, blkaddr) } else { (format::DataLayout::FlatInline, 0) }; - (layout, u, self.size, self.nlink) + InodeMeta { + layout, + i_u, + size: self.size, + nlink: self.nlink, + } } } +/// Calculates the chunk format bits for an external file based on its size. +/// +/// For EROFS chunk-based inodes, the `u` field contains the chunk format +/// which encodes the chunk size as `chunkbits - BLOCK_BITS`. +/// +/// The algorithm matches the C implementation: +/// 1. Calculate chunkbits = ilog2(size - 1) + 1 +/// 2. Clamp to at least BLOCK_BITS (12) +/// 3. Clamp to at most BLOCK_BITS + 31 (max representable) +/// 4. Return chunkbits - BLOCK_BITS +fn compute_chunk_format(file_size: u64) -> u32 { + const BLOCK_BITS: u32 = format::BLOCK_BITS as u32; + const CHUNK_FORMAT_BLKBITS_MASK: u32 = 0x001F; // 31 + + // Compute the chunkbits to use for the file size. + // We want as few chunks as possible, but not an unnecessarily large chunk. 
+ let mut chunkbits = if file_size > 1 { + // ilog2(file_size - 1) + 1 + 64 - (file_size - 1).leading_zeros() + } else { + 1 + }; + + // At least one logical block + if chunkbits < BLOCK_BITS { + chunkbits = BLOCK_BITS; + } + + // Not larger chunks than max possible + if chunkbits - BLOCK_BITS > CHUNK_FORMAT_BLKBITS_MASK { + chunkbits = CHUNK_FORMAT_BLKBITS_MASK + BLOCK_BITS; + } + + chunkbits - BLOCK_BITS +} + impl Leaf<'_, ObjectID> { - fn inode_meta(&self) -> (format::DataLayout, u32, u64, usize) { - let (layout, u, size) = match &self.content { + fn inode_meta(&self, version: format::FormatVersion) -> InodeMeta { + let (layout, i_u, size) = match &self.content { tree::LeafContent::Regular(tree::RegularFile::Inline(data)) => { if data.is_empty() { (format::DataLayout::FlatPlain, 0, data.len() as u64) @@ -283,10 +675,19 @@ impl Leaf<'_, ObjectID> { } } tree::LeafContent::Regular(tree::RegularFile::External(.., size)) => { - (format::DataLayout::ChunkBased, 31, *size) + // V1: compute chunk format from file size + // V2: hardcode 31 (origin/main behavior) + let chunk_format = match version { + format::FormatVersion::V1 => compute_chunk_format(*size), + format::FormatVersion::V2 => 31, + }; + (format::DataLayout::ChunkBased, chunk_format, *size) } tree::LeafContent::CharacterDevice(rdev) | tree::LeafContent::BlockDevice(rdev) => { - (format::DataLayout::FlatPlain, *rdev as u32, 0) + let rdev32: u32 = (*rdev) + .try_into() + .expect("device number exceeds EROFS u32 limit"); + (format::DataLayout::FlatPlain, rdev32, 0) } tree::LeafContent::Fifo | tree::LeafContent::Socket => { (format::DataLayout::FlatPlain, 0, 0) @@ -301,7 +702,12 @@ impl Leaf<'_, ObjectID> { (format::DataLayout::FlatInline, 0, target.len() as u64) } }; - (layout, u, size, self.nlink) + InodeMeta { + layout, + i_u, + size, + nlink: self.nlink, + } } fn write_inline(&self, output: &mut impl Output) { @@ -316,6 +722,11 @@ impl Leaf<'_, ObjectID> { impl Inode<'_, ObjectID> { fn file_type(&self) -> 
format::FileType { + // V1 whiteout escaping: char device (rdev=0) entries are written as regular files + // to match C mkcomposefs v1.0.8 behavior. + if self.escaped_whiteout { + return format::FileType::RegularFile; + } match &self.content { InodeContent::Directory(..) => format::FileType::Directory, InodeContent::Leaf(leaf) => match &leaf.content { @@ -329,84 +740,248 @@ impl Inode<'_, ObjectID> { } } - fn write_inode(&self, output: &mut impl Output, idx: usize) { - let (layout, u, size, nlink) = match &self.content { - InodeContent::Directory(dir) => dir.inode_meta(output.get(Offset::Block, idx)), - InodeContent::Leaf(leaf) => leaf.inode_meta(), + /// Check if this inode can use compact format (32 bytes instead of 64). + /// + /// Compact format is used when: + /// - mtime matches min_mtime (stored in superblock build_time) + /// - nlink, uid, gid fit in u16 + /// - size fits in u32 + fn fits_in_compact(&self, min_mtime: (u64, u32), size: u64, nlink: usize) -> bool { + // mtime (both sec and nsec) must match the minimum (which will be stored in superblock + // build_time / build_time_nsec). The C implementation requires both to match. + if self.stat.st_mtim_sec as u64 != min_mtime.0 { + return false; + } + if self.stat.st_mtim_nsec != min_mtime.1 { + return false; + } + + // nlink must fit in u16 + if nlink > u16::MAX as usize { + return false; + } + + // uid and gid must fit in u16 + if self.stat.st_uid > u16::MAX as u32 || self.stat.st_gid > u16::MAX as u32 { + return false; + } + + // size must fit in u32 + if size > u32::MAX as u64 { + return false; + } + + true + } + + /// Handle inline tail padding for V1 format. + /// + /// Port of C mkcomposefs `compute_erofs_inode_padding_for_tail()`. + /// + /// Two branches based on file type: + /// - Symlinks: pad the *inode start* to a block boundary whenever the inode + xattrs + + /// symlink target would cross into a new block (prevents EFSCORRUPTED on old kernels). 
+ /// - All other FlatInline types (dirs, inline files): pad the *tail* only if it would + /// cross into yet another block after inline_start. + fn pad_inline_tail_v1( + &self, + output: &mut impl Output, + inode_and_xattr_size: u64, + size: u64, + #[cfg(test)] ctx: &mut WriteContext, + #[cfg(not(test))] _ctx: &mut WriteContext, + ) { + let block_size = u64::from(format::BLOCK_SIZE); + let current_pos: u64 = output.len().try_into().unwrap(); + let inline_size = size % block_size; + + if matches!(self.file_type(), format::FileType::Symlink) { + // Symlink branch: pad *inode start* to a block boundary when + // inode + xattrs + symlink target would cross into a new block. + // Matches C: pos_block != end_block. + // + // Old kernels (< 6.12) return EFSCORRUPTED from erofs_fill_symlink() + // when (inode_offset % block_size) + inode_and_xattr_size + inline_size + // > block_size. Padding the inode start to a block boundary prevents + // this because then inode_offset % block_size == 0. + #[cfg(test)] + let skip_pad = ctx + .faults + .as_mut() + .map(|f| f.should_skip_symlink_pad()) + .unwrap_or(false); + #[cfg(not(test))] + let skip_pad = false; + + if !skip_pad { + let total_size = inode_and_xattr_size + inline_size; + // Does [current_pos, current_pos+total_size) cross a block boundary? + // block_offset tells us how far into the current block we are; + // if adding total_size exceeds block_size, we spill into the next block. + if block_offset(current_pos) + total_size > block_size { + // Align inode start to the next block boundary so the inode + // doesn't straddle a block (prevents EUCLEAN on old kernels). + // block_size (4096) is divisible by 32 (EROFS slot size), + // so slot alignment is preserved after this padding. + // None means current_pos is already block-aligned; no padding needed. 
+ if let Some(pad_size) = bytes_to_block_boundary(current_pos) { + output.write_zeros(pad_size as usize); + } + } + } + } else { + // Non-symlink branch (dirs, inline files): pad the *tail* to fit + // within the block that inline_start lands in. + // Matches C: block_remainder < inline_size, pad = block_remainder + // rounded up to the next 32-byte slot boundary. + let inline_start = current_pos + .checked_add(inode_and_xattr_size) + .expect("image position + inode header size cannot overflow u64"); + // If inline_start is block-aligned, block_remainder would be BLOCK_SIZE which + // always exceeds inline_size (< BLOCK_SIZE), so no padding — None is correct. + if let Some(block_remainder) = bytes_to_block_boundary(inline_start) + && block_remainder < inline_size + { + let pad_size = (block_remainder.div_ceil(INODE_SLOT_SIZE as u64) + * INODE_SLOT_SIZE as u64) as usize; + output.write_zeros(pad_size); + } + } + } + + /// Handle inline tail padding for V2 format (origin/main algorithm). + fn pad_inline_tail_v2(&self, output: &mut impl Output, inode_and_xattr_size: u64, size: u64) { + let block_size = u64::from(format::BLOCK_SIZE); + let inline_start: u64 = output.len().try_into().unwrap(); + let inline_start = inline_start + .checked_add(inode_and_xattr_size) + .expect("image position + inode header size cannot overflow u64"); + // Restore origin/main logic: end_of_metadata is the last byte of the metadata, + // inline_end is the last byte of the inline data. If they land in different + // blocks we must pad so the inline data starts at a fresh block boundary. 
+ let end_of_metadata = inline_start - 1; + let inline_end = inline_start + (size % block_size); + if end_of_metadata / block_size != inline_end / block_size { + let pad_size = (block_size - end_of_metadata % block_size) as usize; + output.write_zeros(pad_size); + output.pad(INODE_SLOT_SIZE); + } + } + + fn write_inode(&self, output: &mut impl Output, idx: usize, ctx: &mut WriteContext) { + let version = ctx.version; + let min_mtime = ctx.min_mtime; + let meta = match &self.content { + InodeContent::Directory(dir) => dir.inode_meta(output.get_block_start(idx)), + InodeContent::Leaf(leaf) => leaf.inode_meta(version), }; + let InodeMeta { + layout, + i_u: u, + size, + nlink, + } = meta; - let xattr_size = { - let mut xattr = FirstPass::default(); - self.xattrs.write(&mut xattr); - xattr.offset + let xattr_size = self.xattrs.byte_size(version); + + // V1: compact inodes when possible; V2: always extended + let use_compact = + version == format::FormatVersion::V1 && self.fits_in_compact(min_mtime, size, nlink); + + let inode_header_size = if use_compact { + size_of::() + } else { + size_of::() }; // We need to make sure the inline part doesn't overlap a block boundary - output.pad(32); + output.pad(INODE_SLOT_SIZE); if matches!(layout, format::DataLayout::FlatInline) { - let block_size = u64::from(format::BLOCK_SIZE); - let inode_and_xattr_size: u64 = (size_of::() + xattr_size) - .try_into() - .unwrap(); - let inline_start: u64 = output.len().try_into().unwrap(); - let inline_start = inline_start + inode_and_xattr_size; - let end_of_metadata = inline_start - 1; - let inline_end = inline_start + (size % block_size); - if end_of_metadata / block_size != inline_end / block_size { - // If we proceed, then we'll violate the rule about crossing block boundaries. - // The easiest thing to do is to add padding so that the inline data starts close - // to the start of a fresh block boundary, while ensuring inode alignment. 
- // pad_size is always < block_size (4096), so fits in usize - let pad_size = (block_size - end_of_metadata % block_size) as usize; - let pad = vec![0; pad_size]; - trace!("added pad {}", pad.len()); - output.write(&pad); - output.pad(32); + let inode_and_xattr_size: u64 = (inode_header_size + xattr_size).try_into().unwrap(); + + match version { + format::FormatVersion::V1 => { + self.pad_inline_tail_v1(output, inode_and_xattr_size, size, ctx); + } + format::FormatVersion::V2 => { + self.pad_inline_tail_v2(output, inode_and_xattr_size, size); + } } } - let format = format::InodeLayout::Extended | layout; + let xattr_icount: u16 = match xattr_size { + 0 => 0, + n => { + let word_count = n + .checked_sub(INODE_XATTR_HEADER_SIZE) + .expect("non-empty xattr block must be >= header size") + / XATTR_WORD_SIZE; + (1 + word_count) as u16 + } + }; - trace!( - "write inode {idx} nid {} {:?} {:?} xattrsize{xattr_size} icount{} inline{} @{}", - output.len() / 32, - format, - self.file_type(), - match xattr_size { - 0 => 0, - n => (1 + (n - 12) / 4) as u16, - }, - size % 4096, - output.len() - ); + output.note_inode(); + + if use_compact { + let format = format::InodeLayout::Compact | layout; + + // V1: use sequential ino + let ino = idx as u32; + + output.write_struct(format::CompactInodeHeader { + format, + xattr_icount: xattr_icount.into(), + mode: self.file_type() | self.stat.st_mode, + nlink: (nlink as u16).into(), + size: (size as u32).into(), + reserved: 0.into(), + u: u.into(), + ino: ino.into(), + uid: (self.stat.st_uid as u16).into(), + gid: (self.stat.st_gid as u16).into(), + reserved2: [0; 4], + }); + } else { + let format = format::InodeLayout::Extended | layout; - output.note_offset(Offset::Inode); - output.write_struct(format::ExtendedInodeHeader { - format, - xattr_icount: match xattr_size { - 0 => 0, - n => (1 + (n - 12) / 4) as u16, - } - .into(), - mode: self.file_type() | self.stat.st_mode, - size: size.into(), - u: u.into(), - ino: ((output.len() / 32) as 
u32).into(), - uid: self.stat.st_uid.into(), - gid: self.stat.st_gid.into(), - mtime: (self.stat.st_mtim_sec as u64).into(), - nlink: (nlink as u32).into(), - ..Default::default() - }); + // V1 uses the BFS index as i_ino (matching C mkcomposefs behaviour). + // V2 uses the NID (byte offset / INODE_SLOT_SIZE) for 32-bit stat compatibility. + let ino = match version { + format::FormatVersion::V1 => idx as u32, + format::FormatVersion::V2 => (output.len() / INODE_SLOT_SIZE) as u32, + }; - self.xattrs.write(output); + // V2 does not store sub-second mtime precision (mtime_nsec=0), + // matching origin/main which used ..Default::default() to zero it. + // V1 preserves full nanosecond precision. + let mtime_nsec: u32 = match version { + format::FormatVersion::V1 => self.stat.st_mtim_nsec, + format::FormatVersion::V2 => 0, + }; + output.write_struct(format::ExtendedInodeHeader { + format, + xattr_icount: xattr_icount.into(), + mode: self.file_type() | self.stat.st_mode, + size: size.into(), + u: u.into(), + ino: ino.into(), + uid: self.stat.st_uid.into(), + gid: self.stat.st_gid.into(), + mtime: (self.stat.st_mtim_sec as u64).into(), + mtime_nsec: mtime_nsec.into(), + nlink: (nlink as u32).into(), + ..Default::default() + }); + } + + self.xattrs.write(output, version); match &self.content { InodeContent::Directory(dir) => dir.write_inline(output), InodeContent::Leaf(leaf) => leaf.write_inline(output), }; - output.pad(32); + output.pad(INODE_SLOT_SIZE); } fn write_blocks(&self, output: &mut impl Output) { @@ -418,13 +993,18 @@ impl Inode<'_, ObjectID> { struct InodeCollector<'a, ObjectID: FsVerityHashValue> { inodes: Vec>, - hardlinks: HashMap, + hardlinks: HashMap, fs: &'a tree::FileSystem, - nlink_map: &'a [u32], + nlink_map: Vec, + version: format::FormatVersion, } impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> { - fn push_inode(&mut self, stat: &'a tree::Stat, content: InodeContent<'a, ObjectID>) -> usize { + fn push_inode( + &mut self, + stat: &'a 
tree::Stat, + content: InodeContent<'a, ObjectID>, + ) -> InodeIdx { let mut xattrs = InodeXAttrs::default(); // We need to record extra xattrs for some files. These come first. @@ -434,23 +1014,28 @@ impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> { }) = content { xattrs.add( - b"trusted.overlay.metacopy", + format::XATTR_OVERLAY_METACOPY, OverlayMetacopy::new(id).as_bytes(), + self.version, ); let redirect = format!("/{}", id.to_object_pathname()); - xattrs.add(b"trusted.overlay.redirect", redirect.as_bytes()); + xattrs.add( + format::XATTR_OVERLAY_REDIRECT, + redirect.as_bytes(), + self.version, + ); } // Add the normal xattrs. They're already listed in sorted order. for (name, value) in stat.xattrs.iter() { let name = name.as_bytes(); - if let Some(escapee) = name.strip_prefix(b"trusted.overlay.") { - let escaped = [b"trusted.overlay.overlay.", escapee].concat(); - xattrs.add(&escaped, value); + if let Some(escapee) = name.strip_prefix(format::XATTR_OVERLAY_PREFIX) { + let escaped = [format::XATTR_OVERLAY_ESCAPED_PREFIX, escapee].concat(); + xattrs.add(&escaped, value, self.version); } else { - xattrs.add(name, value); + xattrs.add(name, value, self.version); } } @@ -461,11 +1046,12 @@ impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> { stat, xattrs, content, + escaped_whiteout: false, }); inode } - fn collect_leaf(&mut self, leaf_id: LeafId) -> usize { + fn collect_leaf(&mut self, leaf_id: LeafId) -> InodeIdx { let nlink = self.nlink_map[leaf_id.0] as usize; if nlink > 1 @@ -475,6 +1061,14 @@ impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> { } let leaf = self.fs.leaf(leaf_id); + + // Hardlinked whiteouts are semantically invalid: a whiteout represents the + // absence of a file in an overlay, so having nlink > 1 is meaningless. + // ValidatedFileSystem guarantees this invariant was checked at construction time. 
+ debug_assert!( + !(matches!(leaf.content, tree::LeafContent::CharacterDevice(0)) && nlink > 1), + "ValidatedFileSystem guarantees whiteout nlink == 1" + ); let inode = self.push_inode( &leaf.stat, InodeContent::Leaf(Leaf { @@ -490,27 +1084,24 @@ impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> { inode } - fn insert_sorted( - entries: &mut Vec>, - name: &'a [u8], - inode: usize, - file_type: format::FileType, - ) { - let entry = DirEnt { - name, - inode, - file_type, - }; - let point = entries.partition_point(|e| e.name < entry.name); - entries.insert(point, entry); - } - - fn collect_dir(&mut self, dir: &'a tree::Directory, parent: usize) -> usize { + /// Collect inodes using depth-first traversal (V2 / origin/main behavior). + fn collect_dir(&mut self, dir: &'a tree::Directory, parent: InodeIdx) -> InodeIdx { // The root inode number needs to fit in a u16. That more or less compels us to write the // directory inode before the inode of the children of the directory. Reserve a slot. 
let me = self.push_inode(&dir.stat, InodeContent::Directory(Directory::default())); - let mut entries = vec![]; + let mut entries = vec![ + DirEnt { + name: b".", + inode: InodeRef::Known(me), + file_type: format::FileType::Directory, + }, + DirEnt { + name: b"..", + inode: InodeRef::Known(parent), + file_type: format::FileType::Directory, + }, + ]; for (name, inode) in dir.sorted_entries() { let child = match inode { @@ -519,34 +1110,359 @@ impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> { }; entries.push(DirEnt { name: name.as_bytes(), - inode: child, + inode: InodeRef::Known(child), file_type: self.inodes[child].file_type(), }); } - // We're expected to add those, too - Self::insert_sorted(&mut entries, b".", me, format::FileType::Directory); - Self::insert_sorted(&mut entries, b"..", parent, format::FileType::Directory); + entries.sort_unstable_by_key(|e| e.name); // Now that we know the actual content, we can write it to our reserved slot self.inodes[me].content = InodeContent::Directory(Directory::from_entries(entries)); me } + /// Returns true if this leaf entry is an overlay whiteout stub generated internally + /// by `add_overlay_whiteouts()`, as opposed to a user-provided whiteout. These stubs + /// must NOT be escaped during V1 whiteout processing. 
+ fn is_overlay_whiteout_stub( + &self, + name: &[u8], + leaf_id: LeafId, + me: InodeIdx, + root_inode: InodeIdx, + ) -> bool { + let root_stat = &self.fs.root.stat; + let leaf_stat = &self.fs.leaf(leaf_id).stat; + let selinux_key = std::ffi::OsStr::new("security.selinux"); + let expected_xattrs = if root_stat.xattrs.contains_key(selinux_key) { + 1 + } else { + 0 + }; + let has_correct_xattrs = leaf_stat.xattrs.len() == expected_xattrs + && (expected_xattrs == 0 + || leaf_stat.xattrs.get(selinux_key) == root_stat.xattrs.get(selinux_key)); + + me == root_inode + && name.len() == 2 + && name + .iter() + .all(|b| b.is_ascii_digit() || matches!(b, b'a'..=b'f')) + && leaf_stat.st_mode == 0o644 + && leaf_stat.st_uid == root_stat.st_uid + && leaf_stat.st_gid == root_stat.st_gid + && leaf_stat.st_mtim_sec == root_stat.st_mtim_sec + && leaf_stat.st_mtim_nsec == root_stat.st_mtim_nsec + && has_correct_xattrs + } + + /// Returns true if a leaf content is a V1 overlay whiteout (char device, rdev=0). + fn is_v1_whiteout(content: &tree::LeafContent) -> bool { + matches!(content, tree::LeafContent::CharacterDevice(0)) + } + + /// Collect all inodes using queue-based breadth-first traversal (V1). + /// + /// This algorithm matches the C mkcomposefs `lcfs_compute_tree()` function which uses + /// a linked-list queue to process directories. All nodes at depth N are assigned inode + /// numbers before any nodes at depth N+1. + /// + /// For V1, char device entries with rdev=0 (overlay whiteouts) are escaped to regular + /// files matching C mkcomposefs v1.0.8 `add_overlayfs_xattrs()` behavior: + /// - Child entry: converted to regular file + gets `trusted.overlay.overlay.whiteout=""` + /// and `user.overlay.whiteout=""` xattrs. + /// - Parent directory: gets `trusted.overlay.overlay.whiteouts=""`, + /// `user.overlay.whiteouts=""`, `trusted.overlay.overlay.opaque=x`, + /// `user.overlay.opaque=x` xattrs (added at most once per directory). 
+ fn collect_tree(&mut self, root: &'a tree::Directory) { + use std::collections::VecDeque; + + // Pre-pass: for each multi-link leaf, find which directory holds the canonical + // (first DFS sorted-order) occurrence. + // + // In C mkcomposefs, when a dumpfile is parsed, the first occurrence of each + // inode (same content / nlink target) is the "original" and subsequent occurrences + // are "hardlinks" (with link_to pointer). During BFS, hardlinks are SKIPPED — only + // originals get inode numbers. Hardlink directory entries use the original's nid. + // + // The dumpfile is written in DFS sorted order (see write_dumpfile). So the canonical + // occurrence is whichever path appears first in that DFS traversal. + // + // We replicate this: when BFS encounters a non-canonical occurrence of a multi-link + // leaf (its canonical directory doesn't match the current directory), we defer the + // nid assignment until the canonical occurrence is processed. + // + // KEY: we record the DIRECTORY POINTER of the canonical occurrence, not just the + // leaf_id, because two occurrences of the same leaf share the same leaf_id — we + // need the directory pointer to distinguish canonical from non-canonical at BFS time. + let canonical_dirs = Self::find_canonical_dirs(root, &self.nlink_map); + + let root_inode = self.push_inode(&root.stat, InodeContent::Directory(Directory::default())); + let mut queue: VecDeque<(&'a tree::Directory, InodeIdx, InodeIdx)> = + VecDeque::new(); + queue.push_back((root, root_inode, root_inode)); + + // dir_entries: accumulates (me, parent, entries) for each directory processed in BFS order. + // Leaf entries whose canonical occurrence hasn't been BFS-processed yet are stored as + // InodeRef::Deferred(leaf_id) and resolved in a single post-BFS pass once all canonical + // inodes have been assigned. 
+ let mut dir_entries: Vec<(InodeIdx, InodeIdx, Vec>)> = vec![]; // (me, parent, entries) + + while let Some((dir, parent, me)) = queue.pop_front() { + let mut entries = vec![ + DirEnt { + name: b".", + inode: InodeRef::Known(me), + file_type: format::FileType::Directory, + }, + DirEnt { + name: b"..", + inode: InodeRef::Known(parent), + file_type: format::FileType::Directory, + }, + ]; + let mut dir_has_whiteout = false; + + for (name, inode) in dir.sorted_entries() { + match inode { + tree::Inode::Directory(subdir) => { + let child = self.push_inode( + &subdir.stat, + InodeContent::Directory(Directory::default()), + ); + queue.push_back((subdir, me, child)); + entries.push(DirEnt { + name: name.as_bytes(), + inode: InodeRef::Known(child), + file_type: format::FileType::Directory, + }); + } + tree::Inode::Leaf(leaf_id, _) => { + // V1 whiteout escaping: char device with rdev=0 → regular file. + // Matches C mkcomposefs v1.0.8 `rewrite_tree_node_for_erofs()`, which + // escapes user-provided char devices. + // + // IMPORTANT: the 256 stubs added by add_overlay_whiteouts() are NOT + // escaped in C — they are added AFTER `rewrite_tree_node_for_erofs()` + // so they never go through escaping. We skip them by detecting root-level + // 2-char hex entries (the names used by add_overlay_whiteouts()) THAT ALSO + // exactly match the metadata applied by add_overlay_whiteouts(). This + // correctly distinguishes them from user-provided whiteouts that happen + // to have a 2-char hex name. + let name_bytes = name.as_bytes(); + let is_stub = + self.is_overlay_whiteout_stub(name_bytes, *leaf_id, me, root_inode); + + // Determine if this occurrence is canonical (first in DFS order). + // + // For multi-link leaves (nlink > 1), the canonical occurrence is the + // one in the directory recorded by find_canonical_dirs(). We compare + // the current directory pointer to identify it precisely. 
+ // + // For single-link leaves (nlink = 1), there is only one occurrence, + // so it is always canonical (no entry in canonical_dirs). + let nlink = self.nlink_map[leaf_id.0]; + let is_canonical = if nlink > 1 { + // Multi-link: canonical iff this is the recorded canonical directory. + // We use pointer identity (std::ptr::eq) to match the current + // directory reference against the one recorded during the DFS + // pre-pass. The pointers are stable borrows from the tree, which + // outlives this entire function. + canonical_dirs + .get(leaf_id) + .is_some_and(|&p| std::ptr::eq(p, dir)) + } else { + // Single-link: always canonical + true + }; + + let child_ref = if is_canonical { + // Canonical occurrence: create nid now. + InodeRef::Known(self.collect_leaf(*leaf_id)) + } else if let Some(&nid) = self.hardlinks.get(leaf_id) { + // Non-canonical, and the canonical has already been processed. + InodeRef::Known(nid) + } else { + // Non-canonical, and canonical hasn't been assigned a nid yet + // (canonical is in a deeper directory, not yet BFS-processed). + // Store as Deferred; resolved in the post-BFS pass. + InodeRef::Deferred(*leaf_id) + }; + + // Apply whiteout escaping on the first canonical occurrence only. + // + // `is_canonical` is true for any entry whose directory pointer matches + // the canonical directory, so if a whiteout leaf and its hardlink both + // live in the same directory, both appear "canonical" by that check. + // We guard with `!escaped_whiteout` to ensure the xattrs are added + // exactly once — on the very first encounter of the inode. 
+ if is_canonical + && matches!(child_ref, InodeRef::Known(_)) + && self.version == format::FormatVersion::V1 + && !is_stub + && Self::is_v1_whiteout(&self.fs.leaf(*leaf_id).content) + { + let InodeRef::Known(child) = child_ref else { + unreachable!() + }; + if !self.inodes[child].escaped_whiteout { + self.inodes[child].escaped_whiteout = true; + // Add per-entry whiteout xattrs (already-escaped names): + // C adds OVERLAY_XATTR_ESCAPED_WHITEOUT and OVERLAY_XATTR_USERXATTR_WHITEOUT. + self.inodes[child].xattrs.add( + format::XATTR_OVERLAY_WHITEOUT, + b"", + self.version, + ); + self.inodes[child].xattrs.add( + format::XATTR_USERXATTR_WHITEOUT, + b"", + self.version, + ); + dir_has_whiteout = true; + } + } + + // file_type for the dir entry: for Deferred entries, use a placeholder; + // it will be corrected in the post-BFS resolution pass. + let file_type = if let InodeRef::Known(child) = child_ref { + // file_type() already returns RegularFile when escaped_whiteout=true + self.inodes[child].file_type() + } else { + // Deferred; file_type will be updated in the resolution pass + format::FileType::RegularFile + }; + + entries.push(DirEnt { + name: name.as_bytes(), + inode: child_ref, + file_type, + }); + } + } + } + + // V1: if this directory had whiteout children, add parent xattrs. + // C adds these once per directory, on first whiteout child found. + // Matches OVERLAY_XATTR_ESCAPED_WHITEOUTS, OVERLAY_XATTR_USERXATTR_WHITEOUTS, + // OVERLAY_XATTR_ESCAPED_OPAQUE (=x), OVERLAY_XATTR_USERXATTR_OPAQUE (=x). 
+ if self.version == format::FormatVersion::V1 && dir_has_whiteout { + self.inodes[me] + .xattrs + .add(format::XATTR_OVERLAY_WHITEOUTS, b"", self.version); + self.inodes[me] + .xattrs + .add(format::XATTR_USERXATTR_WHITEOUTS, b"", self.version); + self.inodes[me] + .xattrs + .add(format::XATTR_OVERLAY_OPAQUE, b"x", self.version); + self.inodes[me] + .xattrs + .add(format::XATTR_USERXATTR_OPAQUE, b"x", self.version); + } + + entries.sort_unstable_by_key(|e| e.name); + + dir_entries.push((me, parent, entries)); + } + + // Post-BFS: resolve all Deferred entries. + // At this point all canonical leaves have been assigned nids and are in self.hardlinks. + for (_me, _parent, entries) in &mut dir_entries { + for entry in entries.iter_mut() { + if let InodeRef::Deferred(leaf_id) = entry.inode { + let nid = *self + .hardlinks + .get(&leaf_id) + .expect("canonical leaf must have been assigned a nid during BFS"); + entry.inode = InodeRef::Known(nid); + entry.file_type = self.inodes[nid].file_type(); + } + } + } + + // Build directory content for each directory inode. + for (me, _parent, entries) in dir_entries { + self.inodes[me].content = InodeContent::Directory(Directory::from_entries(entries)); + } + } + + /// DFS pre-pass: find which directory contains the canonical occurrence of each + /// multi-link leaf (first encounter in DFS sorted order). + /// + /// C mkcomposefs parses dumpfiles in DFS sorted order. The first occurrence of each + /// leaf (by `LeafId`) is the "original"; subsequent occurrences are "hardlinks". + /// Only originals get inode numbers in BFS; hardlinks reuse the original's nid. + /// + /// The dumpfile writer (`write_dumpfile`) uses DFS sorted traversal, so we replicate + /// the same traversal here to determine canonical occurrences. + /// + /// Note: we cannot simplify to "first BFS encounter wins" because DFS and BFS visit + /// directories at different depths in different order (e.g. 
DFS visits `/a/deep/` + /// before `/b/`, while BFS visits `/b/` first). Changing the canonical ordering + /// would break binary compatibility with C mkcomposefs. + /// + /// Returns a `HashMap` mapping each multi-link leaf to the + /// directory pointer where its canonical (first DFS) occurrence lives. + /// Single-link leaves are NOT in the map (they're trivially canonical anywhere). + /// + /// We use raw pointers for directory identity comparison (`std::ptr::eq`) rather + /// than dereferencing. The pointers are stable `&'a` borrows from the tree which + /// outlives the entire `collect_tree` call. + fn find_canonical_dirs( + root: &'a tree::Directory, + nlink_map: &[u32], + ) -> HashMap> { + let mut seen: HashSet = HashSet::new(); + let mut canonical_dirs: HashMap> = HashMap::new(); + Self::dfs_find_canonical(root, nlink_map, &mut seen, &mut canonical_dirs); + canonical_dirs + } + + fn dfs_find_canonical( + dir: &'a tree::Directory, + nlink_map: &[u32], + seen: &mut HashSet, + canonical_dirs: &mut HashMap>, + ) { + let dir_ptr: *const tree::Directory = dir; + for (_, inode) in dir.sorted_entries() { + match inode { + tree::Inode::Directory(subdir) => { + Self::dfs_find_canonical(subdir, nlink_map, seen, canonical_dirs); + } + tree::Inode::Leaf(leaf_id, _) => { + if nlink_map[leaf_id.0] > 1 && seen.insert(*leaf_id) { + // First DFS encounter → canonical occurrence is in this directory. + canonical_dirs.insert(*leaf_id, dir_ptr); + // Second+ encounter → non-canonical (hardlink), dir not recorded + } + // Single-link leaves are always canonical; no need to record them + } + } + } + } + pub fn collect( fs: &'a tree::FileSystem, - nlink_map: &'a [u32], + version: format::FormatVersion, ) -> Vec> { let mut this = Self { inodes: vec![], hardlinks: HashMap::new(), fs, - nlink_map, + nlink_map: fs.nlinks(), + version, }; - // '..' 
of the root directory is the root directory again - let root_inode = this.collect_dir(&fs.root, 0); - assert_eq!(root_inode, 0); + match version { + format::FormatVersion::V1 => this.collect_tree(&fs.root), + format::FormatVersion::V2 => { + let root_inode = this.collect_dir(&fs.root, 0); + assert_eq!(root_inode, 0); + } + } this.inodes } @@ -554,9 +1470,23 @@ impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> { /// Takes a list of inodes where each inode contains only local xattr values, determines which /// xattrs (key, value) pairs appear more than once, and shares them. -fn share_xattrs(inodes: &mut [Inode]) -> Vec { +/// +/// For V1: sorts locals by full key, reverses shared table, uses InodesEnd-relative xattr offsets. +/// For V2: uses natural BTreeMap order (derived Ord), ascending shared table. +fn share_xattrs( + inodes: &mut [Inode], + version: format::FormatVersion, +) -> Vec { let mut xattrs: BTreeMap = BTreeMap::new(); + // V1: sort local xattrs by full key to match C behavior + // V2: don't sort (insertion order is fine, BTreeMap handles shared ordering) + if version == format::FormatVersion::V1 { + for inode in inodes.iter_mut() { + inode.xattrs.local.sort_by(|a, b| a.cmp_by_full_key(b)); + } + } + // Collect all xattrs from the inodes for inode in inodes.iter() { for attr in &inode.xattrs.local { @@ -571,88 +1501,158 @@ fn share_xattrs(inodes: &mut [Inode]) -> Vec { // Share only xattrs with more than one user xattrs.retain(|_k, v| *v > 1); - // Repurpose the refcount field as an index lookup - for (idx, value) in xattrs.values_mut().enumerate() { - *value = idx; - } + let (xattrs, shared): (BTreeMap, Vec) = match version { + format::FormatVersion::V1 => { + // C mkcomposefs sorts shared xattrs by full key string (strcmp), then writes + // them in DESCENDING order in the shared xattr block. 
Our BTreeMap is ordered + // by (prefix_index, suffix, value) which differs from strcmp order when prefix + // indices don't sort the same way as prefix strings (e.g. "security."=6 sorts + // numerically after "trusted."=4, but 'security.' < 'trusted.' lexicographically). + // Collect into a Vec, sort by full key ascending, then reverse = descending. + let mut sorted: Vec<_> = xattrs.into_iter().collect(); + sorted.sort_by(|(a, _), (b, _)| a.cmp_by_full_key(b)); + let n_shared = sorted.len(); + // Assign each key the index of its position in the returned (descending) table. + // After reversal, sorted[0] (ascending-smallest) ends up last on disk. + // We iterate ascending-sorted and assign index = n-1-i so that the entry + // written LAST (smallest key) gets index n-1 and the entry written FIRST gets 0. + // Reconstruct a map for the lookup phase below. + let xattrs_map: BTreeMap = sorted + .iter() + .enumerate() + .map(|(i, (k, _))| (k.clone(), n_shared - 1 - i)) + .collect(); + + // Return in descending full-key order (last in ascending = first written) + let mut out = sorted; + out.reverse(); + let shared_vec = out.into_iter().map(|(k, _)| k).collect(); + (xattrs_map, shared_vec) + } + format::FormatVersion::V2 => { + // Ascending order: sequential index assignment + for (idx, value) in xattrs.values_mut().enumerate() { + *value = idx; + } - // Visit each inode and change local xattrs into shared xattrs + // Return in ascending order (natural BTreeMap order) + let shared_vec = xattrs.keys().cloned().collect(); + (xattrs, shared_vec) + } + }; + + // Visit each inode and promote xattrs that are in the shared table. + // This is the same for both V1 and V2: remove from local, push index to shared. 
for inode in inodes.iter_mut() { inode.xattrs.local.retain(|attr| { if let Some(idx) = xattrs.get(attr) { inode.xattrs.shared.push(*idx); - false // drop the local xattr: we converted it + false } else { - true // retain the local xattr: we didn't convert it + true } }); } - // Return the shared xattrs as a vec - xattrs.into_keys().collect() + shared } fn write_erofs( output: &mut impl Output, inodes: &[Inode], xattrs: &[XAttr], + ctx: &mut WriteContext, ) { - // Write composefs header - output.note_offset(Offset::Header); - output.write_struct(format::ComposefsHeader { + let version = ctx.version; + let min_mtime = ctx.min_mtime; + let header_flags = ctx.header_flags; + let composefs_version: u32 = ctx.composefs_version; + // Determine build_time based on format version + // V1: use minimum mtime across all inodes for reproducibility + // V2: use 0 (not used) + let (build_time, build_time_nsec) = match version { + format::FormatVersion::V1 => min_mtime, + format::FormatVersion::V2 => (0, 0), + }; + + // Write composefs header (pads to 1024 bytes internally) + output.write_composefs_header(format::ComposefsHeader { magic: format::COMPOSEFS_MAGIC, version: format::VERSION, - flags: 0.into(), - composefs_version: format::COMPOSEFS_VERSION, + flags: header_flags.into(), + composefs_version: composefs_version.into(), ..Default::default() }); - output.pad(1024); // Write superblock - output.note_offset(Offset::Superblock); - output.write_struct(format::Superblock { + // V1: set xattr_blkaddr to computed value; V2: leave as 0 + let xattr_blkaddr = match version { + format::FormatVersion::V1 => output.get_xattr_blkaddr(), + format::FormatVersion::V2 => 0, + }; + output.write_superblock(format::Superblock { magic: format::MAGIC_V1, blkszbits: format::BLOCK_BITS, feature_compat: (format::FEATURE_COMPAT_MTIME | format::FEATURE_COMPAT_XATTR_FILTER).into(), root_nid: (output.get_nid(0) as u16).into(), inos: (inodes.len() as u64).into(), - blocks: ((output.get(Offset::End, 0) / 
usize::from(format::BLOCK_SIZE)) as u32).into(), + blocks: output.get_block_count().into(), + build_time: build_time.into(), + build_time_nsec: build_time_nsec.into(), + xattr_blkaddr: xattr_blkaddr.into(), ..Default::default() }); // Write inode table for (idx, inode) in inodes.iter().enumerate() { // The inode may add padding to itself, so it notes its own offset - inode.write_inode(output, idx); + inode.write_inode(output, idx, ctx); } + // Mark end of inode table (slot-aligned) + output.pad(INODE_SLOT_SIZE); + output.note_inodes_end(); + // Write shared xattr table for xattr in xattrs { - output.note_offset(Offset::XAttr); + output.note_xattr(); xattr.write(output); } // Write blocks from inodes that have them output.pad(4096); for inode in inodes.iter() { - output.note_offset(Offset::Block); + output.note_block(); inode.write_blocks(output); } // That's it - output.note_offset(Offset::End); + output.note_end(); } +/// Offsets recorded during the first pass and consumed by the second pass. +/// Only contains values that are actually retrieved; singletons that are +/// write-only (header, superblock) are tracked as bools in `FirstPass`. #[derive(Default)] struct Layout { - offset_types: Vec, - offsets: Vec, + /// Byte offset of each inode, indexed by InodeIdx. + inodes: Vec, + /// Byte offset immediately after the last inode (slot-aligned). + inodes_end: Option, + /// Byte offset of each shared xattr entry, indexed sequentially. + xattrs: Vec, + /// Byte offset of each inode's block data region, indexed by InodeIdx. + blocks: Vec, + /// Total byte length of the image. 
+ end: Option, } #[derive(Default)] struct FirstPass { offset: usize, layout: Layout, + header_emitted: bool, + superblock_emitted: bool, } struct SecondPass { @@ -660,87 +1660,349 @@ struct SecondPass { layout: Layout, } -impl Output for SecondPass { - fn note_offset(&mut self, _offset_type: Offset) { - /* no-op */ +impl Output for FirstPass { + fn note_header_emitted(&mut self) { + assert!(!self.header_emitted, "composefs header written twice"); + self.header_emitted = true; + } + fn note_superblock_emitted(&mut self) { + assert!(!self.superblock_emitted, "superblock written twice"); + self.superblock_emitted = true; + } + fn note_inode(&mut self) { + self.layout + .inodes + .push(NonZeroUsize::new(self.offset).expect("inode recorded at offset 0")); + } + fn note_inodes_end(&mut self) { + assert!( + self.layout.inodes_end.is_none(), + "inodes_end recorded twice" + ); + self.layout.inodes_end = NonZeroUsize::new(self.offset); + } + fn note_xattr(&mut self) { + self.layout + .xattrs + .push(NonZeroUsize::new(self.offset).expect("xattr recorded at offset 0")); + } + fn note_block(&mut self) { + debug_assert_eq!( + self.offset % format::BLOCK_SIZE as usize, + 0, + "block data must start at a block-aligned offset" + ); + self.layout + .blocks + .push(NonZeroUsize::new(self.offset).expect("block recorded at offset 0")); + } + fn note_end(&mut self) { + assert!(self.layout.end.is_none(), "end recorded twice"); + self.layout.end = NonZeroUsize::new(self.offset); } - fn get(&self, offset_type: Offset, idx: usize) -> usize { - let start = self.layout.offset_types[offset_type as usize]; - self.layout.offsets[start + idx] + fn get_inode_offset(&self, _idx: usize) -> Option { + None + } + fn get_inodes_end(&self) -> Option { + None + } + fn get_xattr_offset(&self, _idx: usize) -> Option { + None + } + fn get_block_offset(&self, _idx: usize) -> Option { + None + } + fn get_end(&self) -> Option { + None } fn write(&mut self, data: &[u8]) { - self.output.extend_from_slice(data); 
+ self.offset += data.len(); } - fn pad(&mut self, alignment: usize) { - self.output - .resize(round_up(self.output.len(), alignment), 0); + self.offset = round_up(self.offset, alignment); } - fn len(&self) -> usize { - self.output.len() + self.offset } } -impl Output for FirstPass { - fn note_offset(&mut self, offset_type: Offset) { - while self.layout.offset_types.len() <= offset_type as usize { - self.layout.offset_types.push(self.layout.offsets.len()); - } - assert_eq!(self.layout.offset_types.len(), offset_type as usize + 1); - - trace!( - "{:?} #{} @{}", - offset_type, - self.layout.offsets.len() - self.layout.offset_types[offset_type as usize], - self.offset +impl Output for SecondPass { + fn note_header_emitted(&mut self) {} + fn note_superblock_emitted(&mut self) {} + fn note_inode(&mut self) {} + fn note_inodes_end(&mut self) { + debug_assert_eq!( + self.output.len(), + self.layout + .inodes_end + .expect("inodes_end not recorded") + .get(), + "second pass diverged from first at inodes_end" + ); + } + fn note_xattr(&mut self) {} + fn note_block(&mut self) {} + fn note_end(&mut self) { + debug_assert_eq!( + self.output.len(), + self.layout.end.expect("end not recorded").get(), + "second pass diverged from first at end" ); - self.layout.offsets.push(self.offset); } - fn get(&self, _: Offset, _: usize) -> usize { - 0 // We don't know offsets in the first pass, so fake it + fn get_inode_offset(&self, idx: usize) -> Option { + Some(self.layout.inodes[idx]) + } + fn get_inodes_end(&self) -> Option { + Some(self.layout.inodes_end.expect("inodes_end not recorded")) + } + fn get_xattr_offset(&self, idx: usize) -> Option { + Some(self.layout.xattrs[idx]) + } + fn get_block_offset(&self, idx: usize) -> Option { + Some(self.layout.blocks[idx]) + } + fn get_end(&self) -> Option { + Some(self.layout.end.expect("end not recorded")) } fn write(&mut self, data: &[u8]) { - self.offset += data.len(); + self.output.extend_from_slice(data); } - fn pad(&mut self, alignment: 
usize) { - self.offset = round_up(self.offset, alignment); + self.output + .resize(round_up(self.output.len(), alignment), 0); } - fn len(&self) -> usize { - self.offset + self.output.len() } } -/// Creates an EROFS filesystem image from a composefs tree +/// Calculates the minimum mtime across all inodes in the collection. +/// +/// This is used for V1 compatibility where build_time is set to the +/// minimum mtime for reproducibility. Returns `(0, 0)` for an empty slice. +fn calculate_min_mtime(inodes: &[Inode]) -> (u64, u32) { + inodes + .iter() + .map(|inode| (inode.stat.st_mtim_sec as u64, inode.stat.st_mtim_nsec)) + .reduce(|(a_sec, a_nsec), (b_sec, b_nsec)| { + if (b_sec, b_nsec) < (a_sec, a_nsec) { + (b_sec, b_nsec) + } else { + (a_sec, a_nsec) + } + }) + .unwrap_or((0, 0)) +} + +/// Return type of [`prepare_erofs_inodes`]: +/// `(inodes, shared_xattrs, min_mtime, header_flags, composefs_version)`. +type PreparedInodes<'a, ObjectID> = (Vec>, Vec, (u64, u32), u32, u32); + +/// Shared setup for all `mkfs_erofs_*` entry points. +/// +/// Collects inodes from the filesystem, injects the V1 opaque xattr on the +/// root directory, computes `header_flags` and `composefs_version`, promotes +/// repeated xattrs to the shared table, and calculates `min_mtime`. +/// +/// `min_composefs_version` is only meaningful for V1: it sets a lower bound on +/// the `composefs_version` header field, mirroring C mkcomposefs `--min-version`. +/// Pass `0` for the default auto-upgrade behaviour (0→1 when user whiteouts present). +/// Pass `1` to force `composefs_version=1` even when no user whiteouts exist. +/// +/// Returns `(inodes, shared_xattrs, min_mtime, header_flags, composefs_version)`. 
+fn prepare_erofs_inodes<'a, ObjectID: FsVerityHashValue>( + fs: &'a tree::FileSystem, + version: format::FormatVersion, + min_composefs_version: u32, +) -> PreparedInodes<'a, ObjectID> { + let mut inodes = InodeCollector::collect(fs, version); + + // For V1, add trusted.overlay.opaque xattr to root directory. + // This is done after collection (and thus after xattr escaping) to match + // the C implementation behavior. + if version == format::FormatVersion::V1 && !inodes.is_empty() { + inodes[0] + .xattrs + .add(format::XATTR_OVERLAY_OPAQUE_ROOT, b"y", version); + } + + // For V1, compute header flags and composefs_version matching C mkcomposefs behavior. + // This must be checked before share_xattrs(), while all xattrs are still local. + let (header_flags, composefs_version) = if version == format::FormatVersion::V1 { + // COMPOSEFS_FLAGS_HAS_ACL (bit 0) is set when any inode has POSIX ACL xattrs. + let has_acl = inodes.iter().any(|inode| { + inode.xattrs.local.iter().any(|xattr| { + xattr.prefix == format::XATTR_INDEX_POSIX_ACL_ACCESS + || xattr.prefix == format::XATTR_INDEX_POSIX_ACL_DEFAULT + }) + }); + let flags = if has_acl { + format::COMPOSEFS_FLAGS_HAS_ACL.get() + } else { + 0 + }; + + // C mkcomposefs bumps composefs_version from 0 to 1 when any user-provided + // node is a whiteout (char device with rdev=0). In the Rust writer, such + // nodes are detected and marked as `escaped_whiteout` by InodeCollector + // (stubs added by add_overlay_whiteouts() are deliberately excluded from + // this flag via the root-level 2-hex-char name heuristic). + // `min_composefs_version` mirrors C's `--min-version=1`: it forces + // composefs_version=1 even when no user whiteouts are present. 
+ let has_user_whiteout = inodes.iter().any(|inode| inode.escaped_whiteout); + let cfs_ver = if has_user_whiteout || min_composefs_version >= 1 { + 1u32 + } else { + 0u32 + }; + + (flags, cfs_ver) + } else { + (0u32, format::COMPOSEFS_VERSION.get()) + }; + + let xattrs = share_xattrs(&mut inodes, version); + let min_mtime = calculate_min_mtime(&inodes); + + (inodes, xattrs, min_mtime, header_flags, composefs_version) +} + +/// Creates an EROFS filesystem image from a composefs tree using the default format (V2). /// /// This function performs a two-pass generation: /// 1. First pass determines the layout and sizes of all structures /// 2. Second pass writes the actual image data /// /// Returns the complete EROFS image as a byte array. -pub fn mkfs_erofs(fs: &tree::FileSystem) -> Box<[u8]> { - // Create the intermediate representation: flattened inodes and shared xattrs - let nlink_map = fs.nlinks(); - let mut inodes = InodeCollector::collect(fs, &nlink_map); - let xattrs = share_xattrs(&mut inodes); +pub fn mkfs_erofs(fs: &ValidatedFileSystem) -> Box<[u8]> { + mkfs_erofs_versioned(fs, format::FormatVersion::default()) +} + +/// Internal two-pass EROFS image generator shared by all public entry points. +/// +/// Runs a layout pass (first pass) followed by an emit pass (second pass). +/// When `faults` is `Some`, decisions are recorded during the first pass and +/// replayed during the second so both passes make identical choices. +pub(crate) fn mkfs_erofs_inner( + fs: &tree::FileSystem, + version: format::FormatVersion, + min_composefs_version: u32, + #[cfg(test)] faults: Option, +) -> Box<[u8]> { + let (inodes, xattrs, min_mtime, header_flags, composefs_version) = + prepare_erofs_inodes(fs, version, min_composefs_version); + + let mut ctx = WriteContext { + version, + min_mtime, + header_flags, + composefs_version, + #[cfg(test)] + faults, + }; - // Do a first pass with the writer to determine the layout + // First pass: determine the layout. 
let mut first_pass = FirstPass::default(); - write_erofs(&mut first_pass, &inodes, &xattrs); + write_erofs(&mut first_pass, &inodes, &xattrs, &mut ctx); - // Do a second pass with the writer to get the actual bytes + // Switch fault injector to replay mode so the second pass makes identical choices. + #[cfg(test)] + if let Some(ref mut f) = ctx.faults { + f.start_replay(); + } + + // Second pass: emit the actual bytes. let mut second_pass = SecondPass { output: vec![], layout: first_pass.layout, }; - write_erofs(&mut second_pass, &inodes, &xattrs); + write_erofs(&mut second_pass, &inodes, &xattrs, &mut ctx); - // That's it second_pass.output.into_boxed_slice() } + +/// Creates an EROFS filesystem image from a composefs tree with an explicit format version. +/// +/// The `version` parameter controls the format version: +/// - `FormatVersion::V1`: C mkcomposefs compatible (compact inodes, BFS) +/// - `FormatVersion::V2`: Current default (composefs_version=2, extended inodes, DFS) +/// +/// Returns the complete EROFS image as a byte array. +pub fn mkfs_erofs_versioned( + fs: &ValidatedFileSystem, + version: format::FormatVersion, +) -> Box<[u8]> { + mkfs_erofs_inner( + fs, + version, + 0, + #[cfg(test)] + None, + ) +} + +/// Like [`mkfs_erofs_versioned`] for V1, but with an explicit minimum +/// `composefs_version` header value. +/// +/// This mirrors C mkcomposefs `--min-version=N`: passing `min_composefs_version=1` +/// forces `composefs_version=1` in the EROFS header even when no user-visible +/// whiteout devices are present (the default auto-upgrade only sets it to 1 +/// when a whiteout is encountered). +/// +/// `min_composefs_version` must be 0 or 1; values above 1 are clamped to 1 +/// (there is no C-compatible format above version 1). 
+pub fn mkfs_erofs_v1_min_version( + fs: &ValidatedFileSystem, + min_composefs_version: u32, +) -> Box<[u8]> { + mkfs_erofs_inner( + fs, + format::FormatVersion::V1, + min_composefs_version.min(1), + #[cfg(test)] + None, + ) +} + +/// Test-only: write a versioned EROFS image with fault injection. +/// +/// `faults` controls which writer invariants are intentionally violated. +/// Pass `WriterFaults::new(seed)` with the desired rates set. +#[cfg(test)] +pub(crate) fn mkfs_erofs_with_faults( + fs: &ValidatedFileSystem, + version: format::FormatVersion, + faults: WriterFaults, +) -> Box<[u8]> { + mkfs_erofs_inner(&fs.0, version, 0, Some(faults)) +} + +#[cfg(test)] +mod tests { + use super::compute_chunk_format; + + /// Unit tests for `compute_chunk_format` with boundary values. + /// + /// The function converts a file size into the EROFS chunk-format field: + /// chunkbits = ilog2(size - 1) + 1, clamped to [BLOCK_BITS=12, 43] + /// result = chunkbits - BLOCK_BITS + #[test] + fn test_compute_chunk_format_boundary_values() { + // size=1: file_size <= 1 branch → chunkbits=1 → clamped to 12 → result 0 + assert_eq!(compute_chunk_format(1), 0, "size=1"); + // size=2: ilog2(1)+1=1 → clamped to 12 → result 0 + assert_eq!(compute_chunk_format(2), 0, "size=2"); + // size=4096: ilog2(4095)+1=12 → no clamp → result 0 + assert_eq!(compute_chunk_format(4096), 0, "size=4096"); + // size=4097: ilog2(4096)+1=13 → result 1 + assert_eq!(compute_chunk_format(4097), 1, "size=4097"); + // size=1<<20: ilog2((1<<20)-1)+1=20 → result 8 + assert_eq!(compute_chunk_format(1 << 20), 8, "size=1<<20"); + // size=(1<<20)+1: ilog2(1<<20)+1=21 → result 9 + assert_eq!(compute_chunk_format((1 << 20) + 1), 9, "size=(1<<20)+1"); + } +} diff --git a/crates/composefs/src/filesystem_ops.rs b/crates/composefs/src/filesystem_ops.rs index 31012230..55c932e3 100644 --- a/crates/composefs/src/filesystem_ops.rs +++ b/crates/composefs/src/filesystem_ops.rs @@ -4,22 +4,69 @@ //! 
FileSystem objects, including computing image IDs, committing to //! repositories, and generating dumpfiles. +use std::collections::HashMap; + use anyhow::Result; use fn_error_context::context; use crate::{ dumpfile::write_dumpfile, - erofs::writer::mkfs_erofs, + erofs::{ + format::{FormatSet, FormatVersion}, + writer::{mkfs_erofs_inner, validate_filesystem}, + }, fsverity::{FsVerityHashValue, compute_verity}, repository::Repository, tree::FileSystem, }; impl FileSystem { + /// Commits this filesystem as EROFS images for each version in `formats`. + /// + /// Returns a map from [`FormatVersion`] to the fsverity digest of the + /// stored image for that version. + /// + /// The `image_name` named ref (if provided) is assigned to the **first** + /// version yielded by `formats.iter()` (i.e. V1 when the set includes V1). + /// All subsequent versions are stored anonymously (no named ref). This + /// prevents the ref from silently being overwritten and left pointing at the + /// last written version. + /// + /// Note: Callers should ensure root metadata is set before calling this, + /// typically via `copy_root_metadata_from_usr()` or `set_root_stat()`. + #[context("Committing filesystem as EROFS images")] + pub fn commit_images( + &self, + repository: &Repository, + image_name: Option<&str>, + formats: FormatSet, + ) -> Result> { + // Validate once before writing any version. + validate_filesystem(self)?; + let mut result = HashMap::new(); + let mut first = true; + for version in formats.iter() { + // Only the primary (first) version claims the named ref. + let name = if first { image_name } else { None }; + first = false; + let image_data = mkfs_erofs_inner( + self, + version, + 0, + #[cfg(test)] + None, + ); + let id = repository.write_image(name, &image_data)?; + result.insert(version, id); + } + Ok(result) + } + /// Commits this filesystem as an EROFS image to the repository. 
/// - /// Generates an EROFS filesystem image and writes it to the repository - /// with the optional name. Returns the fsverity digest of the committed image. + /// Generates an EROFS filesystem image using the repository's configured + /// EROFS format version and writes it with the optional name. Returns the + /// fsverity digest of the committed image. /// /// Note: Callers should ensure root metadata is set before calling this, /// typically via `copy_root_metadata_from_usr()` or `set_root_stat()`. @@ -29,18 +76,32 @@ impl FileSystem { repository: &Repository, image_name: Option<&str>, ) -> Result { - repository.write_image(image_name, &mkfs_erofs(self)) + let version = repository.erofs_version(); + let formats = FormatSet::from(version); + let mut map = self.commit_images(repository, image_name, formats)?; + Ok(map.remove(&version).expect("format version must be in map")) } /// Computes the fsverity digest for this filesystem as an EROFS image. /// - /// Generates the EROFS image and returns its fsverity digest without - /// writing to a repository. + /// The digest depends on the EROFS format version: V1 and V2 produce + /// different on-disk layouts and therefore different digests. Callers + /// must supply the version explicitly so that the digest matches what is + /// actually stored (or will be stored) in the repository. /// /// Note: Callers should ensure root metadata is set before calling this, /// typically via `copy_root_metadata_from_usr()` or `set_root_stat()`. - pub fn compute_image_id(&self) -> ObjectID { - compute_verity(&mkfs_erofs(self)) + pub fn compute_image_id(&self, version: FormatVersion) -> ObjectID { + // Callers are responsible for ensuring the tree is valid before calling this. + // In practice this is always called on freshly-built trees that don't have + // invalid constructs like hardlinked whiteouts. 
+ compute_verity(&mkfs_erofs_inner( + self, + version, + 0, + #[cfg(test)] + None, + )) } /// Prints this filesystem in dumpfile format to stdout. diff --git a/crates/composefs/src/fs.rs b/crates/composefs/src/fs.rs index 790537e0..029f3c5a 100644 --- a/crates/composefs/src/fs.rs +++ b/crates/composefs/src/fs.rs @@ -408,6 +408,7 @@ fn stat_fd(fd: &OwnedFd, ifmt: FileType) -> Result<(rustix::fs::Stat, generic_tr st_uid: buf.st_uid, st_gid: buf.st_gid, st_mtim_sec: buf.st_mtime as i64, + st_mtim_nsec: buf.st_mtime_nsec as u32, xattrs: read_xattrs(fd)?, }, )) @@ -926,6 +927,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: Default::default(), + st_mtim_nsec: Default::default(), xattrs: Default::default(), }; set_file_contents(&td, OsStr::new("testfile"), &st, b"new contents").unwrap(); diff --git a/crates/composefs/src/generic_tree.rs b/crates/composefs/src/generic_tree.rs index 77eb5b4e..a5192728 100644 --- a/crates/composefs/src/generic_tree.rs +++ b/crates/composefs/src/generic_tree.rs @@ -21,6 +21,8 @@ pub struct Stat { pub st_gid: u32, /// Modification time in seconds since Unix epoch. pub st_mtim_sec: i64, + /// Modification time nanosecond component (0..999_999_999). + pub st_mtim_nsec: u32, /// Extended attributes as key-value pairs. pub xattrs: BTreeMap, Box<[u8]>>, } @@ -46,6 +48,7 @@ impl Stat { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), } } @@ -100,7 +103,7 @@ pub struct Leaf { } /// A directory node containing named entries. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Directory { /// Metadata for this directory. pub stat: Stat, @@ -109,7 +112,7 @@ pub struct Directory { } /// A filesystem inode representing either a directory or a leaf node. -#[derive(Debug)] +#[derive(Debug, Clone)] pub enum Inode { /// A directory inode. Directory(Box>), @@ -508,17 +511,30 @@ impl Directory { } } + /// Retains only top-level entries whose names satisfy the predicate. 
+ /// This is used for filtering dump output to specific entries. + pub fn retain_top_level(&mut self, mut f: impl FnMut(&str) -> bool) { + self.entries.retain(|name, _| { + // Convert OsStr to str for comparison; non-UTF8 names never match + name.to_str().is_some_and(&mut f) + }); + } + /// Recursively finds the newest modification time in this directory tree. /// /// Returns the maximum modification time among this directory's metadata - /// and all files and subdirectories it contains. + /// and all files and subdirectories it contains, as a `(sec, nsec)` tuple + /// for full nanosecond precision. /// /// The `leaves` table is needed to resolve leaf mtimes. - pub fn newest_file(&self, leaves: &[Leaf]) -> i64 { - let mut newest = self.stat.st_mtim_sec; + pub fn newest_file(&self, leaves: &[Leaf]) -> (i64, u32) { + let mut newest = (self.stat.st_mtim_sec, self.stat.st_mtim_nsec); for inode in self.entries.values() { let mtime = match inode { - Inode::Leaf(id, _) => leaves[id.0].stat.st_mtim_sec, + Inode::Leaf(id, _) => { + let s = &leaves[id.0].stat; + (s.st_mtim_sec, s.st_mtim_nsec) + } Inode::Directory(dir) => dir.newest_file(leaves), }; if mtime > newest { @@ -595,6 +611,71 @@ pub struct FileSystem { } impl FileSystem { + /// Add 256 overlay whiteout stub entries to the root directory. + /// + /// This is required for Format 1.0 compatibility with the C mkcomposefs. + /// Each whiteout is a character device named "00" through "ff" with rdev=0. + /// They inherit uid/gid/mtime from the root directory; the only xattr copied is `security.selinux`. + /// + /// These entries allow overlay filesystems to efficiently represent + /// deleted files using device stubs that match the naming convention. + /// + /// Adds the 256 two-character hex-named whiteout stub entries (`00`..`ff`) to + /// the root directory, skipping any that already exist. 
+ /// + /// Matches C mkcomposefs v1.0.8 `add_overlay_whiteouts()`: each stub inherits + /// `uid`, `gid`, and `mtime` from root, gets mode `S_IFCHR|0644` with `rdev=0`, + /// and **only** the `security.selinux` xattr from root (if present). No other + /// xattrs are propagated — copying all root xattrs would make them appear on 257 + /// inodes instead of 1, causing the xattr-sharing pass to turn them into shared + /// references and bloating the inode body in a way C does not. + pub fn add_overlay_whiteouts(&mut self) { + use std::ffi::OsString; + + // C mkcomposefs only inherits security.selinux from root for the stubs. + // Copying all root xattrs would change shared-vs-local xattr storage and + // produce a binary-incompatible image. + let selinux_key = std::ffi::OsStr::new("security.selinux"); + let mut whiteout_xattrs = std::collections::BTreeMap::new(); + if let Some(val) = self.root.stat.xattrs.get(selinux_key) { + whiteout_xattrs.insert(Box::from(selinux_key), val.clone()); + } + + let whiteout_stat = Stat { + st_mode: 0o644, + st_uid: self.root.stat.st_uid, + st_gid: self.root.stat.st_gid, + st_mtim_sec: self.root.stat.st_mtim_sec, + st_mtim_nsec: self.root.stat.st_mtim_nsec, + xattrs: whiteout_xattrs, + }; + + for i in 0..=255u8 { + let name = OsString::from(format!("{:02x}", i)); + + // Skip if entry already exists + if self.root.entries.contains_key(name.as_os_str()) { + continue; + } + + let leaf_id = self.push_leaf(whiteout_stat.clone(), LeafContent::CharacterDevice(0)); + self.root + .entries + .insert(name.into_boxed_os_str(), Inode::leaf(leaf_id)); + } + } + + /// Add trusted.overlay.opaque="y" xattr to root directory. + /// + /// This is required for Format 1.0 when whiteout entries are present, + /// marking the directory as opaque for the overlay filesystem. 
+ pub fn set_overlay_opaque(&mut self) { + self.root.stat.xattrs.insert( + Box::from(std::ffi::OsStr::new("trusted.overlay.opaque")), + Box::from(*b"y"), + ); + } + /// Creates a new filesystem with a root directory having the given metadata. pub fn new(root_stat: Stat) -> Self { Self { @@ -643,6 +724,7 @@ impl FileSystem { let st_uid = usr.stat.st_uid; let st_gid = usr.stat.st_gid; let st_mtim_sec = usr.stat.st_mtim_sec; + let st_mtim_nsec = usr.stat.st_mtim_nsec; let xattrs = usr.stat.xattrs.clone(); // Apply copied metadata to root @@ -650,6 +732,7 @@ impl FileSystem { self.root.stat.st_uid = st_uid; self.root.stat.st_gid = st_gid; self.root.stat.st_mtim_sec = st_mtim_sec; + self.root.stat.st_mtim_nsec = st_mtim_nsec; self.root.stat.xattrs = xattrs; Ok(()) @@ -734,9 +817,10 @@ impl FileSystem { /// Returns an error if `/usr` does not exist (needed to get the mtime). pub fn canonicalize_run(&mut self) -> Result<(), ImageError> { if self.root.get_directory_opt(OsStr::new("run"))?.is_some() { - let usr_mtime = self.root.get_directory(OsStr::new("usr"))?.stat.st_mtim_sec; + let usr = self.root.get_directory(OsStr::new("usr"))?.stat.clone(); let run_dir = self.root.get_directory_mut(OsStr::new("run"))?; - run_dir.stat.st_mtim_sec = usr_mtime; + run_dir.stat.st_mtim_sec = usr.st_mtim_sec; + run_dir.stat.st_mtim_nsec = usr.st_mtim_nsec; run_dir.clear(); } Ok(()) @@ -991,7 +1075,9 @@ impl<'a, T> DirectoryRef<'a, T> { } /// Recursively finds the newest modification time in this directory tree. - pub fn newest_file(&self) -> i64 { + /// + /// Returns a `(sec, nsec)` tuple for full nanosecond precision. 
+ pub fn newest_file(&self) -> (i64, u32) { self.dir.newest_file(self.leaves) } } @@ -1013,6 +1099,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), } } @@ -1024,6 +1111,7 @@ mod tests { st_uid: 1000, st_gid: 1000, st_mtim_sec: mtime, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), } } @@ -1253,27 +1341,27 @@ mod tests { let mut leaves = Vec::new(); let mut root = Directory::new(stat_with_mtime(5)); - assert_eq!(root.newest_file(&leaves), 5); + assert_eq!(root.newest_file(&leaves), (5, 0)); let leaf_id_10 = push_leaf_file(&mut leaves, 10); root.insert(OsStr::new("file1"), Inode::leaf(leaf_id_10)); - assert_eq!(root.newest_file(&leaves), 10); + assert_eq!(root.newest_file(&leaves), (10, 0)); let subdir_stat = stat_with_mtime(15); let mut subdir = Box::new(Directory::new(subdir_stat)); let leaf_id_12 = push_leaf_file(&mut leaves, 12); subdir.insert(OsStr::new("subfile1"), Inode::leaf(leaf_id_12)); root.insert(OsStr::new("subdir"), Inode::Directory(subdir)); - assert_eq!(root.newest_file(&leaves), 15); + assert_eq!(root.newest_file(&leaves), (15, 0)); if let Some(Inode::Directory(sd)) = root.entries.get_mut(OsStr::new("subdir")) { let leaf_id_20 = push_leaf_file(&mut leaves, 20); sd.insert(OsStr::new("subfile2"), Inode::leaf(leaf_id_20)); } - assert_eq!(root.newest_file(&leaves), 20); + assert_eq!(root.newest_file(&leaves), (20, 0)); root.stat.st_mtim_sec = 25; - assert_eq!(root.newest_file(&leaves), 25); + assert_eq!(root.newest_file(&leaves), (25, 0)); } #[test] @@ -1325,6 +1413,7 @@ mod tests { st_uid: 42, st_gid: 43, st_mtim_sec: 1234567890, + st_mtim_nsec: 0, xattrs: BTreeMap::from([( Box::from(OsStr::new("security.selinux")), Box::from(b"system_u:object_r:usr_t:s0".as_slice()), @@ -1370,6 +1459,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::from([ ( Box::from(OsStr::new("security.selinux")), @@ -1622,6 +1712,7 @@ mod tests { st_uid: 100, st_gid: 200, st_mtim_sec: 54321, + 
st_mtim_nsec: 0, xattrs: BTreeMap::from([( Box::from(OsStr::new("user.test")), Box::from(b"val".as_slice()), @@ -1794,4 +1885,96 @@ mod tests { assert_eq!(fs.root.stat.st_mtim_sec, 200); assert_eq!(fs.leaves[0].stat.st_mtim_sec, 400); } + + #[test] + fn test_add_overlay_whiteouts() { + let root_stat = Stat { + st_mode: 0o755, + st_uid: 1000, + st_gid: 2000, + st_mtim_sec: 12345, + st_mtim_nsec: 0, + xattrs: BTreeMap::from([( + Box::from(OsStr::new("security.selinux")), + Box::from(b"system_u:object_r:root_t:s0".as_slice()), + )]), + }; + let mut fs = FileSystem::::new(root_stat); + + // Add a pre-existing entry that should not be overwritten + let pre_id = fs.push_leaf( + stat_with_mtime(99999), + LeafContent::Regular(FileContents {}), + ); + fs.root.insert(OsStr::new("00"), Inode::leaf(pre_id)); + + fs.add_overlay_whiteouts(); + + // Should have 256 whiteout entries (255 new + 1 pre-existing) + assert_eq!(fs.root.entries.len(), 256); + + // The pre-existing "00" should still have its original mtime + if let Some(Inode::Leaf(id, _)) = fs.root.entries.get(OsStr::new("00")) { + assert_eq!(fs.leaf(*id).stat.st_mtim_sec, 99999); + } else { + panic!("Expected '00' to remain a leaf"); + } + + // Check a newly created whiteout entry + if let Some(Inode::Leaf(id, _)) = fs.root.entries.get(OsStr::new("ff")) { + let leaf = fs.leaf(*id); + // Should be a character device with rdev=0 + assert!(matches!(leaf.content, LeafContent::CharacterDevice(0))); + // Should have mode 0o644 + assert_eq!(leaf.stat.st_mode, 0o644); + // Should inherit uid/gid/mtime from root + assert_eq!(leaf.stat.st_uid, 1000); + assert_eq!(leaf.stat.st_gid, 2000); + assert_eq!(leaf.stat.st_mtim_sec, 12345); + // Should inherit xattrs from root (e.g. SELinux label) — matching + // C mkcomposefs behaviour where whiteout entries copy root metadata. 
+ assert_eq!( + leaf.stat + .xattrs + .get(OsStr::new("security.selinux")) + .map(|v| v.as_ref()), + Some(b"system_u:object_r:root_t:s0".as_slice()) + ); + } else { + panic!("Expected 'ff' to be a leaf"); + } + + // Check some middle entries exist + assert!(fs.root.entries.contains_key(OsStr::new("7f"))); + assert!(fs.root.entries.contains_key(OsStr::new("a0"))); + } + + #[test] + fn test_set_overlay_opaque() { + let mut fs = FileSystem::::new(default_stat()); + + fs.set_overlay_opaque(); + + let opaque = fs + .root + .stat + .xattrs + .get(OsStr::new("trusted.overlay.opaque")); + assert!(opaque.is_some()); + assert_eq!(opaque.unwrap().as_ref(), b"y"); + } + + #[test] + fn test_add_overlay_whiteouts_empty_fs() { + let mut fs = FileSystem::::new(default_stat()); + + fs.add_overlay_whiteouts(); + + // Should have exactly 256 entries + assert_eq!(fs.root.entries.len(), 256); + + // Check first and last entries + assert!(fs.root.entries.contains_key(OsStr::new("00"))); + assert!(fs.root.entries.contains_key(OsStr::new("ff"))); + } } diff --git a/crates/composefs/src/repository.rs b/crates/composefs/src/repository.rs index a569dd06..a08b03b8 100644 --- a/crates/composefs/src/repository.rs +++ b/crates/composefs/src/repository.rs @@ -107,6 +107,7 @@ use rustix::{ }; use crate::{ + erofs::format::{FormatSet, FormatVersion}, fsverity::{ Algorithm, CompareVerityError, DEFAULT_LG_BLOCKSIZE, EnableVerityError, FsVerityHashValue, FsVerityHasher, MeasureVerityError, compute_verity, enable_verity_maybe_copy, @@ -192,13 +193,19 @@ pub const REPO_FORMAT_VERSION: u32 = 1; /// but prevent any writes (adding objects, creating images, GC, …). /// - Unknown **incompatible** features cause the repository to be /// rejected entirely. -/// -/// There are currently no defined features. pub mod known_features { + /// The ro-compat feature flag for V1 EROFS repositories. + /// + /// When present in `read_only_compatible`, the repository uses the V1 + /// (C-tool compatible) EROFS format. 
Old tools that don't recognize this + /// flag will open the repository as read-only, preventing accidental V2 + /// image writes into a V1 repo. + pub const V1_EROFS: &str = "v1_erofs"; + /// Compatible features understood by this version. pub const COMPAT: &[&str] = &[]; /// Read-only compatible features understood by this version. - pub const RO_COMPAT: &[&str] = &[]; + pub const RO_COMPAT: &[&str] = &[V1_EROFS]; /// Incompatible features understood by this version. pub const INCOMPAT: &[&str] = &[]; } @@ -282,6 +289,10 @@ impl FeatureFlags { /// (ext4, XFS, EROFS): a base version integer for fundamental layout /// changes, plus three tiers of feature flags for finer-grained /// evolution. +/// +/// The EROFS format version is not stored as an explicit field; it is +/// derived from the feature flags: the presence of `"v1_erofs"` +/// in `read_only_compatible` means V1, its absence means V2. #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct RepoMetadata { /// Base repository format version. Tools must refuse to operate @@ -297,21 +308,61 @@ pub struct RepoMetadata { } impl RepoMetadata { - /// Build metadata for a repository using the given hash type. - pub fn for_hash() -> Self { - Self { - version: REPO_FORMAT_VERSION, - algorithm: Algorithm::for_hash::(), - features: FeatureFlags::default(), + /// Derive the default EROFS format version from the feature flags. + /// + /// - `"v1_erofs"` present in `read_only_compatible` → [`FormatVersion::V1`] + /// - absent → [`FormatVersion::V2`] + pub fn erofs_version(&self) -> FormatVersion { + if self + .features + .read_only_compatible + .iter() + .any(|f| f == known_features::V1_EROFS) + { + FormatVersion::V1 + } else { + FormatVersion::V2 } } +} - /// Build metadata from an explicit [`Algorithm`]. +impl RepoMetadata { + /// Build metadata for a repository using the given hash type, with the default (V2) EROFS version. 
+ pub fn for_hash() -> Self { + Self::new_with_formats( + Algorithm::for_hash::(), + FormatVersion::default(), + FormatSet::BOTH, + ) + } + + /// Build metadata from an explicit [`Algorithm`], with the default (V2) EROFS format version. pub fn new(algorithm: Algorithm) -> Self { + Self::new_with_formats(algorithm, FormatVersion::default(), FormatSet::BOTH) + } + + /// Build metadata with the correct feature flags for the given EROFS format version + /// and format set. + /// + /// The EROFS format version is encoded in the feature flags with a single flag: + /// - V1 repositories (both V1_ONLY and BOTH) add `"v1_erofs"` to `ro_compat` so that + /// older tools open them read-only rather than writing images in the wrong format. + /// - V2-only repositories omit `"v1_erofs"`. + pub fn new_with_formats( + algorithm: Algorithm, + erofs_version: FormatVersion, + _erofs_formats: FormatSet, + ) -> Self { + let mut features = FeatureFlags::default(); + if erofs_version == FormatVersion::V1 { + features + .read_only_compatible + .push(known_features::V1_EROFS.to_string()); + } Self { version: REPO_FORMAT_VERSION, algorithm, - features: FeatureFlags::default(), + features, } } @@ -351,6 +402,81 @@ impl RepoMetadata { } } +/// Configuration for initializing a new composefs repository. +/// +/// Passed to [`Repository::init_path`] to specify the algorithm, +/// fs-verity policy, and default EROFS format version. +/// +/// fs-verity is **required by default**. Call [`set_insecure`](Self::set_insecure) +/// to opt out (e.g. on tmpfs or in tests). +/// +/// # Examples +/// +/// ```no_run +/// use composefs::repository::RepositoryConfig; +/// use composefs::fsverity::Algorithm; +/// +/// // Default: SHA-256, fs-verity required, EROFS V2. +/// let config = RepositoryConfig::default(); +/// +/// // SHA-512 with fs-verity required. +/// let config = RepositoryConfig::new(Algorithm::SHA512); +/// +/// // Insecure mode (tmpfs, testing). 
+/// let config = RepositoryConfig::default().set_insecure(); +/// +/// // Custom algorithm, insecure. +/// let config = RepositoryConfig::new(Algorithm::SHA512).set_insecure(); +/// ``` +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct RepositoryConfig { + /// The fs-verity hash algorithm for content-addressed objects. + pub algorithm: Algorithm, + /// Default EROFS format version for images produced by this repository. + /// V1 is compatible with C `mkcomposefs` 1.0.8; V2 is the composefs-rs native format. + pub erofs_version: FormatVersion, + /// The set of EROFS format versions to generate when committing images. + /// + /// Defaults to V2-only. Set to [`FormatSet::V1_ONLY`] for C-tool + /// compatible output or [`FormatSet::BOTH`] when both V1 and V2 images + /// should be produced (e.g. for bootc workflows). + pub erofs_formats: FormatSet, + /// When `true`, fs-verity is NOT enabled on `meta.json` and is not required + /// on stored objects. Use [`set_insecure`](Self::set_insecure) to set this. + insecure: bool, +} + +impl RepositoryConfig { + /// Create a config with the given algorithm and all other settings at their defaults + /// (fs-verity required, `erofs_version = V2`, `erofs_formats = V2_ONLY`). + pub fn new(algorithm: Algorithm) -> Self { + Self { + algorithm, + ..Self::default() + } + } + + /// Disable fs-verity for this repository. + /// + /// Suitable for use on filesystems that do not support fs-verity (tmpfs, + /// overlayfs) or in test environments. Returns `self` for chaining. + pub fn set_insecure(mut self) -> Self { + self.insecure = true; + self + } +} + +impl Default for RepositoryConfig { + fn default() -> Self { + Self { + algorithm: Algorithm::SHA256, + erofs_version: FormatVersion::default(), + erofs_formats: FormatSet::from(FormatVersion::default()), + insecure: false, + } + } +} + /// Read the fs-verity algorithm from a repository's `meta.json`. 
/// /// This is the public API for determining which algorithm a repository @@ -440,6 +566,24 @@ pub fn system_path() -> PathBuf { PathBuf::from("/sysroot/composefs") } +/// Derive the [`FormatSet`] from a [`RepoMetadata`]. +/// +/// - `"v1_erofs"` present in `ro_compat` → [`FormatSet::V1_ONLY`] +/// - `"v1_erofs"` absent → V2-only (reported as [`FormatSet::BOTH`] for +/// forward-compatibility; dual V1+V2 mode will add its own flag later) +fn repo_formats_from_meta(meta: &RepoMetadata) -> FormatSet { + if meta + .features + .read_only_compatible + .iter() + .any(|f| f == known_features::V1_EROFS) + { + FormatSet::V1_ONLY + } else { + FormatSet::BOTH + } +} + /// Write `meta.json` into a repository directory fd. /// /// This atomically writes (via O_TMPFILE + linkat) the metadata file. @@ -741,6 +885,9 @@ pub struct Repository { write_concurrency: Option, insecure: bool, metadata: RepoMetadata, + /// Per-invocation EROFS version override set by [`set_erofs_version`](Self::set_erofs_version). + /// Does not rewrite `meta.json`; only affects this `Repository` instance. + erofs_version_override: Option, /// When true, SplitStreamWriter::done() writes old-format (pre-repr(C)) /// headers. Used to test backward compatibility with splitstreams /// written before #[repr(C)] was added to SplitstreamHeader. @@ -1074,17 +1221,17 @@ impl Repository { /// Initialize a new repository at the target path and open it. /// /// Creates the directory (mode 0700) if it does not exist, writes - /// `meta.json` for the given `algorithm`, and returns the opened + /// `meta.json` using the parameters from `config`, and returns the opened /// repository together with a flag indicating whether this was a /// fresh initialization (`true`) or an idempotent open of an /// existing repository with the same algorithm (`false`). /// - /// The `algorithm` must be compatible with this repository's + /// The `config.algorithm` must be compatible with this repository's /// `ObjectID` type (e.g. 
`Algorithm::Sha512` for /// `Repository`). /// - /// If `enable_verity` is true, fs-verity is enabled on `meta.json`, - /// signaling that all objects must also have verity. + /// Unless `config` has been made insecure via [`RepositoryConfig::set_insecure`], + /// fs-verity is enabled on `meta.json`, signaling that all objects must also have verity. /// /// If `meta.json` already exists with a different algorithm, an /// error is returned. @@ -1092,10 +1239,16 @@ impl Repository { pub fn init_path( dirfd: impl AsFd, path: impl AsRef, - algorithm: Algorithm, - enable_verity: bool, + config: RepositoryConfig, ) -> Result<(Self, bool)> { let path = path.as_ref(); + let RepositoryConfig { + algorithm, + erofs_version, + erofs_formats, + insecure, + } = config; + let require_fsverity = !insecure; if !algorithm.is_compatible::() { bail!( @@ -1117,11 +1270,12 @@ impl Repository { ) .with_context(|| format!("opening repository directory {}", path.display()))?; - let meta = RepoMetadata::new(algorithm); + let meta = RepoMetadata::new_with_formats(algorithm, erofs_version, erofs_formats); // Try to write meta.json. If it already exists, check for - // idempotency: same algorithm is fine, different is an error. - if let Err(write_err) = write_repo_metadata(&repo_fd, &meta, enable_verity) { + // idempotency: identical metadata is fine; any other existing + // configuration (algorithm or EROFS version) is an error. + if let Err(write_err) = write_repo_metadata(&repo_fd, &meta, require_fsverity) { match read_repo_metadata(&repo_fd)? { Some(existing) if existing == meta => { // Idempotent: same config, already initialized. 
@@ -1130,10 +1284,13 @@ impl Repository { } Some(existing) => { bail!( - "repository already initialized with algorithm '{}'; \ - cannot re-initialize with '{}'", + "repository already initialized with different configuration \ + (algorithm: {}, erofs_version: {:?}); \ + cannot re-initialize with (algorithm: {}, erofs_version: {:?})", existing.algorithm, + existing.erofs_version(), meta.algorithm, + meta.erofs_version(), ); } None => { @@ -1184,6 +1341,7 @@ impl Repository { write_concurrency: None, insecure: !has_verity, metadata, + erofs_version_override: None, #[cfg(any(test, feature = "test"))] write_old_splitstream_format: std::sync::atomic::AtomicBool::new(false), _data: std::marker::PhantomData, @@ -1227,6 +1385,10 @@ impl Repository { ); } + // Use `new` (no `v1_erofs` flag) for legacy repos + // that pre-date the format-set feature. No feature flags → V2 + BOTH, which + // is correct: old repos may contain images of any version and should not be + // artificially restricted. let meta = RepoMetadata::new(algorithm); write_repo_metadata(&repo_fd, &meta, has_verity)?; @@ -1880,6 +2042,19 @@ impl Repository { self.insecure } + /// Override the EROFS format version for this repository session. + /// + /// Changes the in-memory default used by [`FileSystem::commit_image`] + /// and [`FileSystem::compute_image_id`] for the lifetime of this + /// `Repository` instance only. + /// + /// Does **not** rewrite `meta.json`. Intended for CLI tools that accept a + /// per-invocation `--erofs-version` flag to override the repository's stored default. + pub fn set_erofs_version(&mut self, version: FormatVersion) -> &mut Self { + self.erofs_version_override = Some(version); + self + } + /// Mark this repository as insecure, disabling verification of /// fs-verity digests. This allows operation on filesystems /// without verity support. 
@@ -3275,6 +3450,25 @@ impl Repository { &self.metadata } + /// Returns the effective EROFS format version for this repository. + /// + /// Returns the per-invocation override set by [`set_erofs_version`](Self::set_erofs_version) + /// if one is active, otherwise derives the version from the `meta.json` feature flags + /// (presence of `"v1_erofs"` in `read_only_compatible` → V1, absent → V2). + pub fn erofs_version(&self) -> FormatVersion { + self.erofs_version_override + .unwrap_or_else(|| self.metadata.erofs_version()) + } + + /// Returns the [`FormatSet`] configured for this repository. + /// + /// Derived from the `"v1_erofs"` ro_compat feature flag in `meta.json`: + /// - flag present → [`FormatSet::V1_ONLY`] + /// - flag absent → [`FormatSet::BOTH`] + pub fn default_format_set(&self) -> FormatSet { + repo_formats_from_meta(&self.metadata) + } + /// Lists all named stream references under a given prefix. /// /// Returns (name, target) pairs where name is relative to the prefix. @@ -3436,7 +3630,11 @@ mod tests { /// Create a test repository in insecure mode (no fs-verity required). 
fn create_test_repo(path: &Path) -> Result>> { - let (repo, _) = Repository::init_path(CWD, path, Algorithm::SHA512, false)?; + let (repo, _) = Repository::init_path( + CWD, + path, + RepositoryConfig::new(Algorithm::SHA512).set_insecure(), + )?; Ok(Arc::new(repo)) } @@ -3946,6 +4144,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: Default::default(), } } @@ -3959,6 +4158,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: Default::default(), }, LeafContent::Regular(RegularFile::External(obj.clone(), size)), @@ -4121,6 +4321,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: Default::default(), }, LeafContent::Regular(RegularFile::External(obj2.clone(), size2)), @@ -4785,8 +4986,7 @@ mod tests { #[tokio::test] async fn test_fsck_detects_corrupt_erofs_image() -> Result<()> { // Exercises fsck_image: corrupts the erofs image data so that - // parsing fails. The catch_unwind should catch the panic from - // the current erofs reader. + // parsing fails. fsck_image returns an error rather than panicking. let tmp = tempdir(); let repo = create_test_repo(&tmp.path().join("repo"))?; @@ -4823,6 +5023,45 @@ mod tests { Ok(()) } + /// Helper to create a V1 (C-compatible) EROFS image and write it to the repo. + fn commit_v1_image( + repo: &Repository, + obj_id: &Sha512HashValue, + obj_size: u64, + ) -> Result { + use crate::erofs::writer::{ValidatedFileSystem, mkfs_erofs_versioned}; + + let mut fs = make_test_fs(obj_id, obj_size); + fs.add_overlay_whiteouts(); + let image_data = + mkfs_erofs_versioned(&ValidatedFileSystem::new(fs).unwrap(), FormatVersion::V1); + repo.write_image(None, &image_data) + } + + #[tokio::test] + async fn test_fsck_validates_v1_erofs_image() -> Result<()> { + // V1 images (C-compatible format) should pass fsck just like V2. + // This catches regressions where fsck or the reader doesn't handle + // compact inodes, BFS ordering, or the whiteout table. 
+ let tmp = tempdir(); + let repo = create_test_repo(&tmp.path().join("repo"))?; + + let obj_size: u64 = 32 * 1024; + let obj = generate_test_data(obj_size, 0xBB); + let obj_id = repo.ensure_object(&obj)?; + + commit_v1_image(&repo, &obj_id, obj_size)?; + repo.sync()?; + + let result = repo.fsck().await?; + assert!( + result.is_ok(), + "V1 (C-compatible) erofs image should pass fsck: {result}" + ); + assert!(result.images_checked > 0, "should have checked the image"); + Ok(()) + } + // ---- Fsck metadata validation tests ---- #[tokio::test] @@ -4897,7 +5136,12 @@ mod tests { // Open a sha512 repo as sha256 → AlgorithmMismatch. let tmp = tempdir(); let path = tmp.path().join("sha512-repo"); - Repository::::init_path(CWD, &path, Algorithm::SHA512, false).unwrap(); + Repository::::init_path( + CWD, + &path, + RepositoryConfig::new(Algorithm::SHA512).set_insecure(), + ) + .unwrap(); assert!(matches!( Repository::::open_path(CWD, &path), Err(RepositoryOpenError::AlgorithmMismatch { .. }) @@ -4968,6 +5212,155 @@ mod tests { ); } + // ---- erofs_version / v1_erofs feature tests ---- + + #[test] + fn test_init_v1_repo_metadata() { + let meta = RepoMetadata::new_with_formats( + Algorithm::SHA256, + FormatVersion::V1, + FormatSet::V1_ONLY, + ); + assert_eq!(meta.erofs_version(), FormatVersion::V1); + assert!( + meta.features + .read_only_compatible + .contains(&known_features::V1_EROFS.to_string()), + "V1 repo must list v1_erofs in ro_compat, got: {:?}", + meta.features.read_only_compatible + ); + } + + #[test] + fn test_init_v2_repo_metadata() { + let meta = + RepoMetadata::new_with_formats(Algorithm::SHA256, FormatVersion::V2, FormatSet::BOTH); + assert_eq!(meta.erofs_version(), FormatVersion::V2); + assert!( + !meta + .features + .read_only_compatible + .contains(&known_features::V1_EROFS.to_string()), + "V2 repo must NOT list v1_erofs in ro_compat" + ); + } + + #[test] + fn test_init_path_erofs_version_mismatch() -> Result<()> { + let tmp = tempdir(); + let path = 
tmp.path().join("repo"); + + // First init: V1 + let config_v1 = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_version: FormatVersion::V1, + ..RepositoryConfig::default().set_insecure() + }; + Repository::::init_path(CWD, &path, config_v1)?; + + // Second init: V2 — should fail because meta.json already exists with V1 config + let config_v2 = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_version: FormatVersion::V2, + ..RepositoryConfig::default().set_insecure() + }; + let result = Repository::::init_path(CWD, &path, config_v2); + assert!( + result.is_err(), + "re-initializing with different erofs_version must fail" + ); + let err = result.unwrap_err(); + // Use the full chain representation so we see the inner bail! message, + // not just the outermost fn_error_context wrapper. + let msg = format!("{err:#}"); + assert!( + msg.contains("erofs_version"), + "error message must mention erofs_version, got: {msg}" + ); + Ok(()) + } + + #[test] + fn test_init_path_same_erofs_version_is_idempotent() -> Result<()> { + let tmp = tempdir(); + let path = tmp.path().join("repo"); + + let config = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_version: FormatVersion::V1, + ..RepositoryConfig::default().set_insecure() + }; + let (_, was_new) = Repository::::init_path(CWD, &path, config.clone())?; + assert!(was_new, "first init must be fresh"); + + let (repo, was_new) = Repository::::init_path(CWD, &path, config)?; + assert!(!was_new, "second init with same config must be idempotent"); + assert_eq!(repo.erofs_version(), FormatVersion::V1); + Ok(()) + } + + #[test] + fn test_legacy_repo_defaults_to_v2() { + // A repo with no feature flags → no v1_erofs → derived version is V2. 
+ let json = br#"{"version":1,"algorithm":"fsverity-sha256-12","features":{}}"#; + let meta: RepoMetadata = serde_json::from_slice(json).unwrap(); + assert_eq!( + meta.erofs_version(), + FormatVersion::V2, + "repo with no v1_erofs flag should derive V2" + ); + + // A repo with v1_erofs in ro_compat → derived version is V1. + let json_v1 = br#"{"version":1,"algorithm":"fsverity-sha256-12","features":{"read-only-compatible":["v1_erofs"]}}"#; + let meta_v1: RepoMetadata = serde_json::from_slice(json_v1).unwrap(); + assert_eq!( + meta_v1.erofs_version(), + FormatVersion::V1, + "repo with v1_erofs flag should derive V1" + ); + + // Old JSON that happens to have an erofs_version field (written by a previous + // version of this code) must deserialize successfully — serde ignores unknown fields. + let json_old = + br#"{"version":1,"algorithm":"fsverity-sha256-12","features":{},"erofs_version":2}"#; + let meta_old: RepoMetadata = serde_json::from_slice(json_old).unwrap(); + assert_eq!( + meta_old.erofs_version(), + FormatVersion::V2, + "old JSON with explicit erofs_version field should still derive V2 from flags" + ); + } + + #[test] + fn test_old_tool_blocked_on_v1_repo() { + // Simulate an old tool that does not know about "v1_erofs". + // A V1 repo places "v1_erofs" in ro_compat, so any tool that + // does not recognise that feature must open the repo read-only. + // We model this by constructing the FeatureFlags directly and filtering + // against an empty ro_compat allowlist. + let features = FeatureFlags { + compatible: vec![], + read_only_compatible: vec![known_features::V1_EROFS.to_string()], + incompatible: vec![], + }; + + // An unknown ro_compat feature must not prevent opening, but must + // signal read-only access. 
+ let unknown_ro: Vec = features + .read_only_compatible + .iter() + .filter(|f| ![].contains(&f.as_str())) // empty old-tool allowlist + .cloned() + .collect(); + assert_eq!( + unknown_ro, + vec![known_features::V1_EROFS.to_string()], + "old tool should see v1_erofs as an unknown ro_compat feature" + ); + // And the current tool knows about it, so check() returns ReadWrite. + assert_eq!(features.check().unwrap(), FeatureCheck::ReadWrite); + } + #[test] fn test_object_store_method_variants() { // Verify all variants exist and are distinct @@ -5001,9 +5394,12 @@ mod tests { // Create a repo, store an object, then remove meta.json to // simulate an old-format repository. - let (repo, _) = - Repository::::init_path(CWD, &repo_path, Algorithm::SHA256, false) - .unwrap(); + let (repo, _) = Repository::::init_path( + CWD, + &repo_path, + RepositoryConfig::default().set_insecure(), + ) + .unwrap(); let data = b"hello world"; let obj_id = repo.ensure_object(data).unwrap(); drop(repo); @@ -5052,9 +5448,12 @@ mod tests { let tmp = tempdir(); let repo_path = tmp.path().join("repo"); - let (repo, _) = - Repository::::init_path(CWD, &repo_path, Algorithm::SHA512, false) - .unwrap(); + let (repo, _) = Repository::::init_path( + CWD, + &repo_path, + RepositoryConfig::new(Algorithm::SHA512).set_insecure(), + ) + .unwrap(); let data = b"sha512 test data"; let obj_id = repo.ensure_object(data).unwrap(); drop(repo); @@ -5089,9 +5488,12 @@ mod tests { let tmp = tempdir(); let repo_path = tmp.path().join("repo"); - let (repo, _) = - Repository::::init_path(CWD, &repo_path, Algorithm::SHA512, false) - .unwrap(); + let (repo, _) = Repository::::init_path( + CWD, + &repo_path, + RepositoryConfig::new(Algorithm::SHA512).set_insecure(), + ) + .unwrap(); repo.ensure_object(b"some data").unwrap(); drop(repo); @@ -5129,11 +5531,234 @@ mod tests { let tmp = tempdir(); let repo_path = tmp.path().join("repo"); - Repository::::init_path(CWD, &repo_path, Algorithm::SHA256, false) - .unwrap(); + 
Repository::::init_path( + CWD, + &repo_path, + RepositoryConfig::default().set_insecure(), + ) + .unwrap(); let (_repo, upgraded) = Repository::::open_upgrade(CWD, &repo_path).unwrap(); assert!(!upgraded); } + + #[tokio::test] + async fn test_fsck_v1_image_detects_missing_object() -> Result<()> { + // Same as test_fsck_validates_erofs_image_objects but with a V1 image, + // ensuring fsck correctly parses V1 images to find object references. + let tmp = tempdir(); + let repo = create_test_repo(&tmp.path().join("repo"))?; + + let obj_size: u64 = 32 * 1024; + let obj = generate_test_data(obj_size, 0xBC); + let obj_id = repo.ensure_object(&obj)?; + + commit_v1_image(&repo, &obj_id, obj_size)?; + repo.sync()?; + + // Sanity: passes before we break it + let result = repo.fsck().await?; + assert!( + result.is_ok(), + "healthy V1 image should pass fsck: {result}" + ); + + // Delete the referenced object + let hex = obj_id.to_hex(); + let (prefix, rest) = hex.split_at(2); + let dir = open_test_repo_dir(&tmp); + dir.remove_file(format!("objects/{prefix}/{rest}"))?; + + let result = repo.fsck().await?; + assert!( + !result.is_ok(), + "fsck should detect missing object in V1 erofs image: {result}" + ); + assert!( + result.missing_objects > 0, + "should report missing objects: {result}" + ); + Ok(()) + } + + // ---- FormatSet / v1_erofs feature flag tests ---- + // + // The `v1_erofs` ro_compat flag is the single on-disk signal for V1 EROFS. + // It is set when `erofs_version == V1`; `erofs_formats` is not encoded on + // disk in this commit (dual V1+V2 / BOTH mode is deferred to the OCI commit). 
+ + #[test] + fn test_v1_erofs_flag_set_for_v1_repos() { + // V1 format → v1_erofs present + let meta = RepoMetadata::new_with_formats( + Algorithm::SHA256, + FormatVersion::V1, + FormatSet::V1_ONLY, + ); + assert!( + meta.features + .read_only_compatible + .contains(&known_features::V1_EROFS.to_string()), + "V1 repo must set v1_erofs in ro_compat, got: {:?}", + meta.features.read_only_compatible + ); + assert_eq!(meta.erofs_version(), FormatVersion::V1); + } + + #[test] + fn test_v1_erofs_flag_absent_for_v2_repos() { + // V2 format → v1_erofs absent + let meta = + RepoMetadata::new_with_formats(Algorithm::SHA256, FormatVersion::V2, FormatSet::BOTH); + assert!( + !meta + .features + .read_only_compatible + .contains(&known_features::V1_EROFS.to_string()), + "V2 repo must NOT set v1_erofs in ro_compat, got: {:?}", + meta.features.read_only_compatible + ); + assert_eq!(meta.erofs_version(), FormatVersion::V2); + } + + #[test] + fn test_default_format_set_from_v1_erofs_flag() { + // v1_erofs present → V1_ONLY + let meta_v1 = RepoMetadata::new_with_formats( + Algorithm::SHA256, + FormatVersion::V1, + FormatSet::V1_ONLY, + ); + assert_eq!(repo_formats_from_meta(&meta_v1), FormatSet::V1_ONLY); + + // v1_erofs absent → BOTH (V2-only default) + let meta_v2 = + RepoMetadata::new_with_formats(Algorithm::SHA256, FormatVersion::V2, FormatSet::BOTH); + assert_eq!(repo_formats_from_meta(&meta_v2), FormatSet::BOTH); + } + + #[test] + fn test_init_path_v1_format_set() -> Result<()> { + let tmp = tempdir(); + let path = tmp.path().join("repo"); + + let config = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_version: FormatVersion::V1, + erofs_formats: FormatSet::V1_ONLY, + ..RepositoryConfig::default().set_insecure() + }; + let (repo, was_new) = Repository::::init_path(CWD, &path, config)?; + assert!(was_new); + assert_eq!(repo.erofs_version(), FormatVersion::V1); + assert_eq!(repo.default_format_set(), FormatSet::V1_ONLY); + assert!( + repo.metadata() + .features + 
.read_only_compatible + .contains(&known_features::V1_EROFS.to_string()), + "v1_erofs must be in ro_compat for V1 repos" + ); + Ok(()) + } + + #[test] + fn test_init_path_v2_format_set() -> Result<()> { + let tmp = tempdir(); + let path = tmp.path().join("repo"); + + let config = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_version: FormatVersion::V2, + erofs_formats: FormatSet::BOTH, + ..RepositoryConfig::default().set_insecure() + }; + let (repo, was_new) = Repository::::init_path(CWD, &path, config)?; + assert!(was_new); + assert_eq!(repo.erofs_version(), FormatVersion::V2); + assert!( + !repo + .metadata() + .features + .read_only_compatible + .contains(&known_features::V1_EROFS.to_string()), + "v1_erofs must NOT be in ro_compat for V2 repos" + ); + Ok(()) + } + + /// Verify `commit_images` with `BOTH` and a named ref: + /// - both ObjectIDs are in the returned map, + /// - both image symlinks exist in `images/`, + /// - the named ref points to the V1 image (the primary / first version). + #[test] + fn test_commit_images_both_named_ref_points_to_v1() -> Result<()> { + use crate::tree::{FileSystem, Stat}; + + let tmp = tempdir(); + let repo_path = tmp.path().join("repo"); + let config = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_version: FormatVersion::V2, + erofs_formats: FormatSet::BOTH, + ..RepositoryConfig::default().set_insecure() + }; + let (repo, _) = Repository::::init_path(CWD, &repo_path, config)?; + + // Build a minimal filesystem (empty root dir is enough). + let root_stat = Stat { + st_mode: 0o755, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 0, + st_mtim_nsec: 0, + xattrs: Default::default(), + }; + let fs: FileSystem = FileSystem::new(root_stat); + + let map = fs.commit_images(&repo, Some("myref"), FormatSet::BOTH)?; + repo.sync()?; + + // Both versions must be in the result. 
+ let v1_id = map + .get(&FormatVersion::V1) + .expect("V1 must be in result map"); + let v2_id = map + .get(&FormatVersion::V2) + .expect("V2 must be in result map"); + + // Both image symlinks must exist under images/. + let v1_image_path = format!("images/{}", v1_id.to_hex()); + let v2_image_path = format!("images/{}", v2_id.to_hex()); + assert!( + test_path_exists_in_repo(&tmp, &v1_image_path)?, + "V1 image symlink must exist: {v1_image_path}" + ); + assert!( + test_path_exists_in_repo(&tmp, &v2_image_path)?, + "V2 image symlink must exist: {v2_image_path}" + ); + + // The named ref must exist and must point to the V1 image (primary). + let ref_path = "images/refs/myref"; + assert!( + test_path_exists_in_repo(&tmp, ref_path)?, + "named ref images/refs/myref must exist" + ); + // The ref symlink target should contain the V1 image hex, not V2. + let ref_full = tmp.path().join("repo").join(ref_path); + let target = readlinkat(CWD, &ref_full, Vec::new())?; + let target_str = target.to_str()?; + assert!( + target_str.contains(&v1_id.to_hex()), + "named ref must point to V1 image ({}), but points to: {target_str}", + v1_id.to_hex() + ); + assert!( + !target_str.contains(&v2_id.to_hex()), + "named ref must NOT point to V2 image, but points to: {target_str}" + ); + Ok(()) + } } diff --git a/crates/composefs/src/splitstream.rs b/crates/composefs/src/splitstream.rs index 6b338dee..8fd45f0b 100644 --- a/crates/composefs/src/splitstream.rs +++ b/crates/composefs/src/splitstream.rs @@ -1168,8 +1168,11 @@ mod tests { /// Create a test repository in insecure mode (no fs-verity required). 
fn create_test_repo(path: &Path) -> Result>> { - let (repo, _) = - Repository::init_path(CWD, path, crate::fsverity::Algorithm::SHA256, false)?; + let (repo, _) = Repository::init_path( + CWD, + path, + crate::repository::RepositoryConfig::default().set_insecure(), + )?; Ok(Arc::new(repo)) } diff --git a/crates/composefs/src/test.rs b/crates/composefs/src/test.rs index b5674e9c..194cf98c 100644 --- a/crates/composefs/src/test.rs +++ b/crates/composefs/src/test.rs @@ -9,7 +9,10 @@ use once_cell::sync::Lazy; use rustix::fs::CWD; use tempfile::TempDir; -use crate::{fsverity::FsVerityHashValue, repository::Repository}; +use crate::{ + fsverity::FsVerityHashValue, + repository::{Repository, RepositoryConfig}, +}; static TMPDIR: Lazy = Lazy::new(|| { if let Some(path) = std::env::var_os("CFS_TEST_TMPDIR") { @@ -63,8 +66,12 @@ impl TestRepo { pub fn new() -> Self { let dir = tempdir(); let repo_path = dir.path().join("repo"); - let (repo, _) = Repository::init_path(CWD, &repo_path, ObjectID::ALGORITHM, false) - .expect("initializing test repo"); + let (repo, _) = Repository::init_path( + CWD, + &repo_path, + RepositoryConfig::new(ObjectID::ALGORITHM).set_insecure(), + ) + .expect("initializing test repo"); Self { repo: Arc::new(repo), repo_path, @@ -139,19 +146,37 @@ pub(crate) mod proptest_strategies { /// /// Linux filenames are arbitrary bytes except `/` (0x2F) and `\0` (0x00), /// with a max length of [`NAME_MAX`] (255) bytes. We generate a mix of - /// ASCII names and binary names, occasionally long, to exercise directory - /// entry layout edge cases. 
+ /// lengths to exercise directory entry layout edge cases: + /// + /// - Short ASCII (common case) + /// - Binary bytes (no NUL or `/`) + /// - Long ASCII (crosses xattr/inode inline-data boundaries) + /// - Near-NAME_MAX: lengths 252–255 exercise all four 4-byte padding + /// residues in the erofs directory entry format (names are padded to the + /// next 4-byte boundary, so a 255-byte name has 1 pad byte, 254 has 2, + /// 253 has 3, 252 has 0) + /// - Exactly NAME_MAX (255 bytes): the hard limit pub fn filename() -> impl Strategy { prop_oneof![ // Short ASCII names (common case) - 6 => proptest::string::string_regex("[a-zA-Z0-9._-]{1,20}") + 5 => proptest::string::string_regex("[a-zA-Z0-9._-]{1,20}") .expect("valid regex") .prop_map(OsString::from), // Binary names with arbitrary bytes (no NUL or /) - 3 => prop::collection::vec(1..=0xFEu8, 1..=30) + 2 => prop::collection::vec(1..=0xFEu8, 1..=30) .prop_map(|mut v| { v.iter_mut().for_each(|b| if *b == b'/' { *b = b'_' }); OsString::from_vec(v) }), - // Long ASCII names (up to NAME_MAX) - 1 => proptest::string::string_regex(&format!("[a-zA-Z0-9._-]{{100,{NAME_MAX}}}")) + // Long ASCII names (100..=251) — crosses inline-data boundaries + 1 => proptest::string::string_regex("[a-zA-Z0-9._-]{100,251}") + .expect("valid regex") + .prop_map(OsString::from), + // Near-NAME_MAX (252–254): all four mod-4 padding residues in erofs dirents + 1 => (252usize..=254).prop_flat_map(|len| { + proptest::string::string_regex(&format!("[a-zA-Z0-9._-]{{{len}}}")) + .expect("valid regex") + .prop_map(OsString::from) + }), + // Exactly NAME_MAX (255): the hard limit + 1 => proptest::string::string_regex(&format!("[a-zA-Z0-9._-]{{{NAME_MAX}}}")) .expect("valid regex") .prop_map(OsString::from), ] @@ -162,29 +187,38 @@ pub(crate) mod proptest_strategies { pub fn stat() -> impl Strategy { ( 0..=0o7777u32, // permission bits - 0..=65535u32, // uid - 0..=65535u32, // gid - 0..=2_000_000_000i64, // mtime + 0..=131071u32, // uid — crosses 
u16::MAX to exercise extended inodes + 0..=131071u32, // gid — crosses u16::MAX to exercise extended inodes + 0..=2_000_000_000i64, // mtime sec + 0..1_000_000_000u32, // mtime nsec xattrs(), ) - .prop_map(|(mode, uid, gid, mtime, xattrs)| tree::Stat { - st_mode: mode, - st_uid: uid, - st_gid: gid, - st_mtim_sec: mtime, - xattrs, - }) + .prop_map( + |(mode, uid, gid, mtime_sec, mtime_nsec, xattrs)| tree::Stat { + st_mode: mode, + st_uid: uid, + st_gid: gid, + st_mtim_sec: mtime_sec, + st_mtim_nsec: mtime_nsec, + xattrs, + }, + ) } /// Strategy for xattr keys covering all erofs prefix namespaces. /// /// The erofs format uses prefix indices to compress xattr names: - /// 0 = "" (fallback), 1 = "user.", 2 = "system.posix_acl_access", + /// 0 = "" (fallback, for unrecognized prefixes like com.example.*), + /// 1 = "user.", 2 = "system.posix_acl_access", /// 3 = "system.posix_acl_default", 4 = "trusted.", 5 = "lustre.", /// 6 = "security." /// /// The writer also escapes `trusted.overlay.*` → `trusted.overlay.overlay.*`, /// so we must test that path too. + /// + /// `lustre.*` keys are included here. For V1 images the writer skips index 5 during + /// prefix matching, so lustre.* xattrs fall through to prefix index 0 (raw fallback), + /// matching C mkcomposefs v1.0.8 behavior. fn xattr_key() -> impl Strategy { prop_oneof![ // user.* namespace (index 1) — most common @@ -210,6 +244,16 @@ pub(crate) mod proptest_strategies { 1 => Just("system.posix_acl_access".to_string()), // system.posix_acl_default (index 3) — exact name, no suffix 1 => Just("system.posix_acl_default".to_string()), + // Fallback prefix (index 0) — unrecognized prefix, full key stored as suffix. + // Both Rust and C agree on index 0 for these keys. + 1 => (0..3u32).prop_map(|n| format!("com.example.test_{n}")), + // lustre.* (index 5 in EROFS spec, but index 0 in C mkcomposefs v1.0.8). 
+ // For V1 images, the writer skips index 5 so lustre.* falls through to index 0, + // matching C behavior for binary compatibility. + 1 => prop_oneof![ + Just("lustre.lov".to_string()), + Just("lustre.lma".to_string()), + ], ] } @@ -228,6 +272,58 @@ pub(crate) mod proptest_strategies { }) } + /// Strategy for xattr keys that stress corner cases in the V1 writer: + /// - Multiple `trusted.overlay.*` keys → all get escaped on disk + /// - `trusted.overlay.overlay.X` → double-escaped to `trusted.overlay.overlay.overlay.X` + /// - `security.selinux` + `security.ima` combinations + /// - `system.posix_acl_access` → triggers LCFS_EROFS_FLAGS_HAS_ACL header bit + fn xattr_key_unusual() -> impl Strategy { + prop_oneof![ + // trusted.overlay.* — each gets escaped to trusted.overlay.overlay.* on disk + 4 => prop_oneof![ + Just("trusted.overlay.custom".to_string()), + Just("trusted.overlay.origin".to_string()), + Just("trusted.overlay.upper".to_string()), + Just("trusted.overlay.redirect".to_string()), + Just("trusted.overlay.nfs_fh".to_string()), + ], + // Already-escaped key: trusted.overlay.overlay.X → double-escape on disk + 2 => Just("trusted.overlay.overlay.nested".to_string()), + // security.* — two labels on same inode + 3 => prop_oneof![ + Just("security.selinux".to_string()), + Just("security.ima".to_string()), + Just("security.capability".to_string()), + ], + // ACL — triggers LCFS_EROFS_FLAGS_HAS_ACL + 2 => Just("system.posix_acl_access".to_string()), + // user.* — filler + 1 => proptest::string::string_regex("user\\.[a-z]{1,10}") + .expect("valid regex"), + ] + } + + /// Xattr strategy for the unusual generator: 2–8 xattr pairs (key collisions are silently deduplicated by BTreeMap) with long values allowed. 
+ fn xattrs_unusual() -> impl Strategy, Box<[u8]>>> { + prop::collection::vec( + ( + xattr_key_unusual(), + // Mix of short and long values — long values stress xattr dedup/block layout + prop_oneof![ + 3 => prop::collection::vec(any::(), 0..=20), + 1 => prop::collection::vec(any::(), 64..=512), + ], + ), + 2..=8, + ) + .prop_map(|pairs| { + pairs + .into_iter() + .map(|(k, v)| (OsStr::new(&k).into(), v.into_boxed_slice())) + .collect() + }) + } + /// Strategy for symlink targets as OsString. /// /// Symlink targets on Linux are arbitrary bytes except `\0`, up to @@ -252,7 +348,7 @@ pub(crate) mod proptest_strategies { /// /// External file references store raw hash bytes rather than a concrete /// `ObjectID` type, so the same spec works with any hash algorithm. - #[derive(Debug)] + #[derive(Debug, Clone)] pub enum LeafContentSpec { Inline(Vec), /// External file: random hash bytes (truncated to hash size at build time) and size. @@ -261,6 +357,10 @@ pub(crate) mod proptest_strategies { BlockDevice(u64), CharacterDevice(u64), Fifo, + Socket, + /// Overlay whiteout: char device with rdev=0. Always maps to CharacterDevice(0). + /// Distinct from CharacterDevice(rdev) to allow weighted generation. + Whiteout, } /// Strategy for hash-type-agnostic leaf content. @@ -270,7 +370,7 @@ pub(crate) mod proptest_strategies { // Inline file data is capped at INLINE_CONTENT_MAX_V0 (64 bytes) to match // the composefs invariant: larger files must be external (ChunkBased). ( - 0..10u8, + 0..11u8, prop::collection::vec(any::(), 0..=INLINE_CONTENT_MAX_V0), symlink_target(), prop::collection::vec(any::(), 64..=64), @@ -284,13 +384,14 @@ pub(crate) mod proptest_strategies { 5..=6 => LeafContentSpec::Symlink(symlink_target), 7 => LeafContentSpec::BlockDevice(rdev), 8 => LeafContentSpec::CharacterDevice(rdev), - _ => LeafContentSpec::Fifo, + 9 => LeafContentSpec::Fifo, + _ => LeafContentSpec::Socket, }, ) } /// A hash-type-agnostic leaf node specification. 
- #[derive(Debug)] + #[derive(Debug, Clone)] pub struct LeafSpec { pub stat: tree::Stat, pub content: LeafContentSpec, @@ -301,8 +402,17 @@ pub(crate) mod proptest_strategies { } /// Strategy for a list of uniquely-named leaf specs. - fn named_leaf_specs(max_entries: usize) -> impl Strategy> { - prop::collection::vec((filename(), leaf_spec()), 0..=max_entries).prop_map(|entries| { + /// Strategy for a list of uniquely-named leaf specs with a given entry count range. + /// + /// The `min..=max` range controls how many entries are attempted before + /// deduplication. Use `named_leaf_specs(0, 30)` for a small directory and + /// `named_leaf_specs(150, 300)` to reliably cross a 4 KiB directory block + /// boundary (~170 entries with typical short names × ~20 bytes each). + fn named_leaf_specs( + min: usize, + max: usize, + ) -> impl Strategy> { + prop::collection::vec((filename(), leaf_spec()), min..=max).prop_map(|entries| { let mut seen = std::collections::HashSet::new(); entries .into_iter() @@ -312,7 +422,7 @@ pub(crate) mod proptest_strategies { } /// Description of a directory to be built, including potential hardlinks. - #[derive(Debug)] + #[derive(Debug, Clone)] pub struct DirSpec { /// Stat metadata for this directory. pub stat: tree::Stat, @@ -323,7 +433,7 @@ pub(crate) mod proptest_strategies { } /// Description of a filesystem to be built, with hardlink info. - #[derive(Debug)] + #[derive(Debug, Clone)] pub struct FsSpec { /// Root directory specification. pub root: DirSpec, @@ -340,9 +450,35 @@ pub(crate) mod proptest_strategies { pub link_name: OsString, } + /// Hardlink spec for the unusual generator: places hardlink in root or a named subdir. + /// `target_dir_index: None` → root; `Some(i)` → subdirs[i % subdirs.len()]`. + #[derive(Debug, Clone)] + pub struct UnusualHardlinkSpec { + /// Index into the flat all-leaves list (root leaves first, then subdir leaves in order). + pub source_leaf_index: usize, + /// Name for the hardlink entry. 
+ pub link_name: OsString, + /// Which directory receives the hardlink entry. + pub target_dir_index: Option, + } + + /// Filesystem description for the unusual generator. + #[derive(Debug, Clone)] + pub struct UnusualFsSpec { + pub root: DirSpec, + pub hardlinks: Vec, + } + /// Strategy for a subdirectory (no further nesting). + /// + /// Usually small (0–20 entries), but 1-in-4 times generates a large + /// directory (150–300 entries) to exercise multi-block directory layout. fn subdir_spec() -> impl Strategy { - (filename(), stat(), named_leaf_specs(10)).prop_map(|(name, stat, leaves)| { + let leaves_strat = prop_oneof![ + 3 => named_leaf_specs(0, 20), + 1 => named_leaf_specs(150, 300), + ]; + (filename(), stat(), leaves_strat).prop_map(|(name, stat, leaves)| { ( name, DirSpec { @@ -366,14 +502,19 @@ pub(crate) mod proptest_strategies { /// Strategy for generating a complete `FsSpec`. /// - /// Generates a root directory with up to 15 file entries and up to 5 - /// subdirectories (each with up to 10 entries, max depth 2). Then - /// optionally generates 0-3 hardlinks that reference existing leaves. + /// Root directory entry count is weighted: usually small (0–30), but + /// 1-in-4 times large (150–300) to reliably cross the 4 KiB directory + /// block boundary. Subdirectories use the same weighted split inside + /// `subdir_spec`. pub fn filesystem_spec() -> impl Strategy { + let root_leaves_strat = prop_oneof![ + 3 => named_leaf_specs(0, 30), + 1 => named_leaf_specs(150, 300), + ]; ( stat(), - named_leaf_specs(15), - unique_subdirs(5), + root_leaves_strat, + unique_subdirs(10), // Hardlink candidates: (source index placeholder, link name) prop::collection::vec((any::(), filename()), 0..=3), ) @@ -416,6 +557,153 @@ pub(crate) mod proptest_strategies { ) } + /// Strategy for the "unusual content" proptest generator. 
+ /// + /// Explicitly constructs filesystem trees that stress corner cases in the V1 writer: + /// - Whiteout files (rdev=0 char devices) at root and in subdirs + /// - Multiple trusted.overlay.* xattrs per inode (escape path) + /// - Large external file sizes (up to 30 GB) + /// - Hardlinks across all leaf types and directories (post-generation pass) + pub fn unusual_filesystem_spec() -> impl Strategy { + fn unusual_stat() -> impl Strategy { + ( + 0u32..=0o7777u32, + 0u32..=131071u32, + 0u32..=131071u32, + 0u64..=u32::MAX as u64, + 0u32..=999_999_999u32, + xattrs_unusual(), + ) + .prop_map(|(mode, uid, gid, mtime_sec, mtime_nsec, xattrs)| { + tree::Stat { + st_mode: mode, + st_uid: uid, + st_gid: gid, + st_mtim_sec: mtime_sec as i64, + st_mtim_nsec: mtime_nsec, + xattrs, + } + }) + } + + fn unusual_leaf_content_spec() -> impl Strategy { + let hash_bytes = prop::collection::vec(any::(), 64..=64); + let ext_size = prop_oneof![ + 5 => 1u64..=1_000_000u64, + 3 => 1_000_001u64..=100_000_000u64, + 2 => 100_000_001u64..=30_000_000_000u64, + ]; + ( + 0u8..=10u8, + prop::collection::vec(any::(), 0..=INLINE_CONTENT_MAX_V0), + symlink_target(), + hash_bytes, + ext_size, + 1u64..=65535u64, + ) + .prop_map( + |(tag, file_data, symlink_target, hash_bytes, ext_size, rdev)| match tag { + 0..=1 => LeafContentSpec::Inline(file_data), + 2..=3 => LeafContentSpec::External(hash_bytes, ext_size), + 4..=5 => LeafContentSpec::Symlink(symlink_target), + 6..=7 => LeafContentSpec::Whiteout, + 8 => LeafContentSpec::BlockDevice(rdev), + 9 => LeafContentSpec::Fifo, + _ => LeafContentSpec::Socket, + }, + ) + } + + fn unusual_leaf_spec() -> impl Strategy { + (unusual_stat(), unusual_leaf_content_spec()) + .prop_map(|(stat, content)| LeafSpec { stat, content }) + } + + fn unusual_named_leaves(max: usize) -> impl Strategy> { + prop::collection::vec((filename(), unusual_leaf_spec()), 0..=max).prop_map(|entries| { + let mut seen = std::collections::HashSet::new(); + entries + .into_iter() + 
.filter(|(name, _)| seen.insert(name.clone())) + .collect() + }) + } + + fn unusual_subdir_spec() -> impl Strategy { + (filename(), unusual_stat(), unusual_named_leaves(10)).prop_map( + |(name, stat, leaves)| { + ( + name, + DirSpec { + stat, + leaves, + subdirs: vec![], + }, + ) + }, + ) + } + + fn unusual_unique_subdirs(max: usize) -> impl Strategy> { + prop::collection::vec(unusual_subdir_spec(), 0..=max).prop_map(|dirs| { + let mut seen = std::collections::HashSet::new(); + dirs.into_iter() + .filter(|(name, _)| seen.insert(name.clone())) + .collect() + }) + } + + ( + unusual_stat(), + unusual_named_leaves(15), + unusual_unique_subdirs(5), + prop::collection::vec((any::(), filename(), any::()), 0..=5), + ) + .prop_map( + |(root_stat, mut root_leaves, mut root_subdirs, hl_candidates)| { + let mut seen: std::collections::HashSet = + std::collections::HashSet::new(); + root_subdirs.retain(|(name, _)| seen.insert(name.clone())); + root_leaves.retain(|(name, _)| seen.insert(name.clone())); + + let root_leaf_count = root_leaves.len(); + let total_leaves: usize = root_leaf_count + + root_subdirs + .iter() + .map(|(_, d)| d.leaves.len()) + .sum::(); + + let hardlinks = if total_leaves > 0 { + hl_candidates + .into_iter() + .map(|(src_idx, name, dir_idx)| UnusualHardlinkSpec { + source_leaf_index: src_idx % total_leaves, + link_name: name, + target_dir_index: if root_subdirs.is_empty() { + None + } else if dir_idx % 2 == 0 { + None + } else { + Some(dir_idx % root_subdirs.len()) + }, + }) + .collect() + } else { + vec![] + }; + + UnusualFsSpec { + root: DirSpec { + stat: root_stat, + leaves: root_leaves, + subdirs: root_subdirs, + }, + hardlinks, + } + }, + ) + } + /// Convert a `LeafContentSpec` into a concrete `tree::LeafContent`. 
fn build_leaf_content( spec: LeafContentSpec, @@ -436,6 +724,8 @@ pub(crate) mod proptest_strategies { LeafContentSpec::BlockDevice(rdev) => tree::LeafContent::BlockDevice(rdev), LeafContentSpec::CharacterDevice(rdev) => tree::LeafContent::CharacterDevice(rdev), LeafContentSpec::Fifo => tree::LeafContent::Fifo, + LeafContentSpec::Socket => tree::LeafContent::Socket, + LeafContentSpec::Whiteout => tree::LeafContent::CharacterDevice(0), } } @@ -485,4 +775,79 @@ pub(crate) mod proptest_strategies { fs } + + /// Build a `tree::FileSystem` from an `UnusualFsSpec`. + /// + /// Handles post-generation hardlink injection: hardlinks can target any leaf type + /// (symlinks, whiteouts, devices, FIFOs) and can be placed in root or any subdir. + pub fn build_unusual_filesystem( + spec: UnusualFsSpec, + ) -> tree::FileSystem { + let mut fs = tree::FileSystem::new(spec.root.stat); + + let mut all_leaf_ids: Vec = Vec::new(); + let mut root_used_names: std::collections::HashSet = + std::collections::HashSet::new(); + + // Insert root leaves + for (name, leaf_spec) in spec.root.leaves { + let leaf_id = fs.push_leaf(leaf_spec.stat, build_leaf_content(leaf_spec.content)); + all_leaf_ids.push(leaf_id); + root_used_names.insert(name.clone()); + fs.root.insert(&name, tree::Inode::leaf(leaf_id)); + } + + // Remember subdir names and per-subdir used-name sets for hardlink dedup + let mut subdir_names: Vec = Vec::new(); + let mut subdir_used_names: Vec> = Vec::new(); + + for (dir_name, dir_spec) in spec.root.subdirs { + subdir_names.push(dir_name.clone()); + let mut used: std::collections::HashSet = std::collections::HashSet::new(); + let mut subdir = tree::Directory::new(dir_spec.stat); + for (name, leaf_spec) in dir_spec.leaves { + let leaf_id = fs.push_leaf(leaf_spec.stat, build_leaf_content(leaf_spec.content)); + all_leaf_ids.push(leaf_id); + used.insert(name.clone()); + subdir.insert(&name, tree::Inode::leaf(leaf_id)); + } + subdir_used_names.push(used); + 
root_used_names.insert(dir_name.clone()); + fs.root + .insert(&dir_name, tree::Inode::Directory(Box::new(subdir))); + } + + // Post-generation hardlink pass: inject hardlinks to any leaf type, any dir. + // Whiteouts (chardev rdev=0) are excluded: hardlinked whiteouts are invalid. + let non_whiteout_leaf_ids: Vec = all_leaf_ids + .iter() + .copied() + .filter(|&id| !matches!(fs.leaf(id).content, tree::LeafContent::CharacterDevice(0))) + .collect(); + if !non_whiteout_leaf_ids.is_empty() { + for hl in spec.hardlinks { + let leaf_id = + non_whiteout_leaf_ids[hl.source_leaf_index % non_whiteout_leaf_ids.len()]; + match hl.target_dir_index { + None => { + if root_used_names.insert(hl.link_name.clone()) { + fs.root.insert(&hl.link_name, tree::Inode::leaf(leaf_id)); + } + } + Some(raw_idx) => { + let idx = raw_idx % subdir_names.len(); + if subdir_used_names[idx].insert(hl.link_name.clone()) { + if let Ok(subdir) = + fs.root.get_directory_mut(subdir_names[idx].as_os_str()) + { + subdir.insert(&hl.link_name, tree::Inode::leaf(leaf_id)); + } + } + } + } + } + } + + fs + } } diff --git a/crates/composefs/src/tree.rs b/crates/composefs/src/tree.rs index dd8865d4..ddfc61bf 100644 --- a/crates/composefs/src/tree.rs +++ b/crates/composefs/src/tree.rs @@ -57,6 +57,7 @@ mod tests { st_uid: 1000, st_gid: 1000, st_mtim_sec: mtime, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), } } @@ -76,6 +77,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), } } diff --git a/crates/composefs/tests/mkfs.rs b/crates/composefs/tests/mkfs.rs index b2896c69..3944bd14 100644 --- a/crates/composefs/tests/mkfs.rs +++ b/crates/composefs/tests/mkfs.rs @@ -11,8 +11,14 @@ use similar_asserts::assert_eq; use tempfile::NamedTempFile; use composefs::{ - dumpfile::write_dumpfile, - erofs::{debug::debug_img, writer::mkfs_erofs}, + dumpfile::{dumpfile_to_filesystem, write_dumpfile}, + erofs::{ + debug::debug_img, + format::FormatVersion, + writer::{ + 
ValidatedFileSystem, mkfs_erofs, mkfs_erofs_v1_min_version, mkfs_erofs_versioned, + }, + }, fsverity::{FsVerityHashValue, Sha256HashValue}, tree::{FileSystem, Inode, LeafContent, RegularFile, Stat}, }; @@ -23,12 +29,13 @@ fn default_stat() -> Stat { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), } } fn debug_fs(fs: FileSystem) -> String { - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); let mut output = vec![]; debug_img(&mut output, &image).unwrap(); String::from_utf8(output).unwrap() @@ -54,6 +61,7 @@ fn add_leaf( st_uid: 0, st_mode: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), }, content, @@ -94,22 +102,80 @@ fn test_simple() { insta::assert_snapshot!(debug_fs(fs)); } -fn foreach_case(f: fn(&FileSystem)) { +fn foreach_case(f: fn(FileSystem)) { for case in [empty, simple] { let mut fs = FileSystem::new(default_stat()); case(&mut fs); - f(&fs); + f(fs); } } #[test_with::executable(fsck.erofs)] fn test_fsck() { foreach_case(|fs| { + // V2 (default) let mut tmp = NamedTempFile::new().unwrap(); - tmp.write_all(&mkfs_erofs(fs)).unwrap(); + tmp.write_all(&mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap())) + .unwrap(); let mut fsck = Command::new("fsck.erofs").arg(tmp.path()).spawn().unwrap(); assert!(fsck.wait().unwrap().success()); }); + + // V1 — needs its own filesystem instances for add_overlay_whiteouts + for case in [empty, simple] { + let mut fs = FileSystem::::new(default_stat()); + case(&mut fs); + fs.add_overlay_whiteouts(); + let image = mkfs_erofs_versioned(&ValidatedFileSystem::new(fs).unwrap(), FormatVersion::V1); + let mut tmp = NamedTempFile::new().unwrap(); + tmp.write_all(&image).unwrap(); + let mut fsck = Command::new("fsck.erofs").arg(tmp.path()).spawn().unwrap(); + assert!(fsck.wait().unwrap().success()); + } +} + +/// Verify byte-for-byte identity with C mkcomposefs for the pinned test cases. 
+/// +/// These fixed cases (`empty`, `simple`) complement the proptest binary-compat +/// tests in reader.rs which cover random trees. Keeping them pinned here means +/// a regression on these canonical shapes is immediately visible without proptest +/// shrinking, and is also validated by the digest stability tests above. +#[test_with::executable(mkcomposefs)] +fn test_vs_mkcomposefs() { + for case in [empty, simple] { + let mut fs_rust = FileSystem::new(default_stat()); + case(&mut fs_rust); + let mut fs_c = FileSystem::new(default_stat()); + case(&mut fs_c); + + fs_rust.add_overlay_whiteouts(); + let image = mkfs_erofs_versioned( + &ValidatedFileSystem::new(fs_rust).unwrap(), + FormatVersion::V1, + ); + + let mut mkcomposefs = Command::new("mkcomposefs") + .args(["--from-file", "-", "-"]) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .spawn() + .unwrap(); + + let mut stdin = mkcomposefs.stdin.take().unwrap(); + write_dumpfile(&mut stdin, &fs_c).unwrap(); + drop(stdin); + + let output = mkcomposefs.wait_with_output().unwrap(); + assert!(output.status.success()); + let mkcomposefs_image = output.stdout.into_boxed_slice(); + + if image != mkcomposefs_image { + let dump = dump_image(&image); + let mkcomposefs_dump = dump_image(&mkcomposefs_image); + assert_eq!(mkcomposefs_dump, dump, "structural diff (rust vs C)"); + } + assert_eq!(image, mkcomposefs_image); + } } fn dump_image(img: &[u8]) -> String { @@ -139,7 +205,7 @@ fn test_erofs_digest_stability() { for (name, case, expected_digest) in cases { let mut fs = FileSystem::::new(default_stat()); case(&mut fs); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&ValidatedFileSystem::new(fs).unwrap()); let digest = composefs::fsverity::compute_verity::(&image); let hex = digest.to_hex(); assert_eq!( @@ -149,32 +215,93 @@ fn test_erofs_digest_stability() { } } -#[should_panic] +#[test] +fn test_erofs_v1_digest_stability() { + // Same as test_erofs_digest_stability but for V1 (C-compatible) format. 
+ // V1 output must be byte-stable since it needs to match C mkcomposefs. + let cases: &[(&str, fn(&mut FileSystem), &str)] = &[ + ( + "empty_v1", + empty, + "8f589e8f57ecb88823736b0d857ddca1e1068a23e264fad164b28f7038eb3682", + ), + ( + "simple_v1", + simple, + "9f3f5620ee0c54708516467d0d58741e7963047c7106b245d94c298259d0fa01", + ), + ]; + + for (name, case, expected_digest) in cases { + let mut fs = FileSystem::::new(default_stat()); + case(&mut fs); + fs.add_overlay_whiteouts(); + let image = mkfs_erofs_versioned(&ValidatedFileSystem::new(fs).unwrap(), FormatVersion::V1); + let digest = composefs::fsverity::compute_verity::(&image); + let hex = digest.to_hex(); + assert_eq!( + &hex, expected_digest, + "{name}: V1 EROFS digest changed — if this is intentional, update the pinned value" + ); + } +} + +/// Test that `--min-version=1` forces `composefs_version=1` in the EROFS header +/// even when no user-visible whiteout devices are present, matching C mkcomposefs +/// `--min-version=1 --max-version=1` behaviour. +/// +/// Uses a trimmed version of the C test suite's `special_v1.dump` fixture +/// (the `inline-large*` entries were removed since Rust intentionally rejects +/// inline content larger than `MAX_INLINE_CONTENT`). +/// +/// Golden digest verified against C mkcomposefs 1.0.8+. #[test_with::executable(mkcomposefs)] -fn test_vs_mkcomposefs() { - foreach_case(|fs| { - let image = mkfs_erofs(fs); +fn test_vs_mkcomposefs_min_version_1() { + let dump = include_str!("special_v1.dump"); - let mut mkcomposefs = Command::new("mkcomposefs") - .args(["--min-version=3", "--from-file", "-", "-"]) - .stdin(Stdio::piped()) - .stdout(Stdio::piped()) - .spawn() - .unwrap(); + // Parse the dumpfile and build Rust image with --min-version=1. 
+ let mut fs_rust = dumpfile_to_filesystem::(dump).unwrap(); + fs_rust.add_overlay_whiteouts(); + let rust_image = mkfs_erofs_v1_min_version(&ValidatedFileSystem::new(fs_rust).unwrap(), 1); - let mut stdin = mkcomposefs.stdin.take().unwrap(); - write_dumpfile(&mut stdin, fs).unwrap(); - drop(stdin); + // Also generate via C mkcomposefs --min-version=1 --max-version=1. + let mut mkcomposefs = Command::new("mkcomposefs") + .args([ + "--min-version=1", + "--max-version=1", + "--from-file", + "-", + "-", + ]) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .spawn() + .unwrap(); + mkcomposefs + .stdin + .take() + .unwrap() + .write_all(dump.as_bytes()) + .unwrap(); + let output = mkcomposefs.wait_with_output().unwrap(); + assert!(output.status.success()); + let c_image = output.stdout.into_boxed_slice(); - let output = mkcomposefs.wait_with_output().unwrap(); - assert!(output.status.success()); - let mkcomposefs_image = output.stdout.into_boxed_slice(); + if rust_image != c_image { + let rust_dump = dump_image(&rust_image); + let c_dump = dump_image(&c_image); + assert_eq!( + c_dump, rust_dump, + "structural diff (rust vs C --min-version=1)" + ); + } + assert_eq!(rust_image, c_image); - if image != mkcomposefs_image { - let dump = dump_image(&image); - let mkcomposefs_dump = dump_image(&mkcomposefs_image); - assert_eq!(mkcomposefs_dump, dump); - } - assert_eq!(image, mkcomposefs_image); // fallback if the dump is somehow the same - }); + // Pin the expected digest so any regression is immediately visible. 
+ let digest = composefs::fsverity::compute_verity::(&rust_image); + assert_eq!( + digest.to_hex(), + "b1c78c25db8638be5b9b483472d32ee9624d8d32ded626a91cd536116e8df97c", + "special_v1 --min-version=1 digest changed" + ); } diff --git a/crates/composefs/tests/special_v1.dump b/crates/composefs/tests/special_v1.dump new file mode 100644 index 00000000..214b606d --- /dev/null +++ b/crates/composefs/tests/special_v1.dump @@ -0,0 +1,7 @@ +/ 4096 40555 2 0 0 0 1633950376.0 - - - trusted.foo1=bar-1 user.foo2=bar-2 +/blockdev 0 60777 1 0 0 107690 1633950376.0 - - - trusted.bar=bar-2 +/chardev 0 20777 1 0 0 10769 1633950376.0 - - - trusted.foo=bar-2 +/escaped-xattr 0 100777 1 0 0 0 1633950376.0 - - - trusted.overlay.redirect=/foo\n user.overlay.redirect=/foo\n user.foo=bar-2 +/fifo 0 10777 1 0 0 0 1633950376.0 - - - trusted.bar=bar-2 +/inline 15 100777 1 0 0 0 1633950376.0 - FOOBAR\nINAFILE\n - user.foo=bar-2 +/whiteout 0 20777 1 0 0 0 1633950376.0 - - - trusted.foo=bar-2 diff --git a/doc/repository.md b/doc/repository.md index e3188305..7e400040 100644 --- a/doc/repository.md +++ b/doc/repository.md @@ -65,11 +65,42 @@ created by `cfsctl init` and contains: - `read-only-compatible` — old tools may read but must not write. - `incompatible` — old tools must refuse the repository entirely. + The currently defined feature flags are: + - `v1_erofs` (read-only-compatible) — present on repositories whose + EROFS image format is V1 (C-tool compatible: compact inodes, BFS + ordering, whiteout table). This is the single flag that encodes the + EROFS format version: present → V1, absent → V2 (the default). Old + tools that do not recognise this flag open the repository read-only + rather than accidentally writing images in the wrong format. + When `meta.json` is present, `cfsctl` auto-detects the hash algorithm and errors if `--hash` is explicitly passed with a conflicting value. 
When the file is absent (for repositories created before this feature), `--hash` is honored as before and defaults to `sha512`. +### `cfsctl init --erofs-version` + +The `--erofs-version` flag selects the EROFS format for newly committed +images. It controls the `v1_erofs` feature flag in `meta.json`: + +``` +cfsctl init # default: V2 EROFS (composefs-rs native) +cfsctl init --erofs-version v1 # V1 EROFS (C-tool compatible) +``` + +**V2** (default) uses extended inodes, DFS ordering, and `composefs_version=2` +in the EROFS superblock. This is the composefs-rs native format and is what +all repositories created before V1 support was added use. + +**V1** uses compact inodes where possible, BFS ordering, and a whiteout stub +table, producing output byte-for-byte identical to the C `mkcomposefs` tool. +The `v1_erofs` ro-compat flag is written to `meta.json` so that tools which +predate V1 support open the repository read-only rather than writing images +in the wrong format. + +Re-initializing an existing repository with a different `--erofs-version` is +rejected with an error; the format version is fixed at init time. + ## `objects/` This is where the content-addressed data is stored. The immediate children of diff --git a/docs/booting.md b/docs/booting.md new file mode 100644 index 00000000..52282eb8 --- /dev/null +++ b/docs/booting.md @@ -0,0 +1,77 @@ +# Booting from a composefs image + +This document describes how composefs-rs sets up the root filesystem during +early boot. It covers the kernel command-line interface, the expected on-disk +layout, kernel requirements, and the step-by-step mount sequence performed by +`composefs-setup-root`. + +The target audience is system integrators and OS developers who are packaging a +bootable system using composefs. Familiarity with Linux mount namespaces, +overlayfs, and fs-verity is assumed. 
+ +## Kernel command-line + +A single kernel argument controls which image is booted: + +``` +composefs=<digest> +``` + +`<digest>` is the hex-encoded fs-verity digest of the EROFS metadata image to +mount as root. SHA-256 digests are 64 hex characters; SHA-512 digests are 128 +hex characters. `composefs-setup-root` tries SHA-512 first and falls back to +SHA-256 if the length does not match, so both algorithms are supported without +any additional configuration. + +**Insecure mode.** Prefixing the digest with `?` (e.g. `composefs=?<digest>`) +makes fs-verity verification optional. The system will boot even when the +underlying filesystem does not support fs-verity or the image has no verity +metadata attached. This mode exists for development and testing only; it must +not be used in production. + +Parsing is handled by `composefs_boot::cmdline::get_cmdline_composefs` +(`crates/composefs-boot/src/cmdline.rs`). The splitter follows the kernel's +own logic: tokens are separated by ASCII whitespace, and whitespace inside +double-quoted strings is treated as literal. There is no escape mechanism, so a +literal double-quote character cannot appear in a token value. + +## On-disk layout + +The composefs repository must be present at `/sysroot/composefs` with the +standard layout described in `doc/repository.md`. + +The `composefs=` digest must correspond to a symlink under `images/`. + +Persistent per-deployment state lives at `/sysroot/state/deploy/<digest>/`, +where `<digest>` matches the `composefs=` kernel argument exactly. The `etc/` +and `var/` subdirectories within that directory serve as the upper layers for +the corresponding overlayfs mounts.
+ +## Kernel requirements + +The following kernel features must be available: + +- **EROFS** filesystem driver (`CONFIG_EROFS_FS`) +- **overlayfs** with `metacopy=on` and `redirect_dir=on` + (`CONFIG_OVERLAY_FS`, `CONFIG_OVERLAY_FS_METACOPY`, `CONFIG_OVERLAY_FS_REDIRECT_DIR`) +- **fs-verity** unless insecure mode is used (`CONFIG_FS_VERITY`) +- The modern Linux mount API (`fsopen` / `fsconfig` / `fsmount` / `move_mount`), + available since kernel 5.2. Kernel ≥ 6.15 is required for the atomic root + replacement path (the default build). On kernels without `fsconfig_set_fd` + support (e.g. RHEL 9 / kernel < 5.15), a loopback device is created + automatically by `composefs::mountcompat`. + +## Kernel argument + +The `composefs=` kernel argument is the authoritative selector for which image is booted. +Without the `?` insecure prefix, every file access through the overlayfs is +verified against the object's stored digest by the kernel, combining fs-verity +on the data objects with overlayfs `verity=require`. + +## Other notes + +As a workaround for a GPT auto-root issue in systemd +([systemd#35017](https://github.com/systemd/systemd/issues/35017)), +`composefs-setup-root` attempts to create `/run/systemd/volatile-root` as a +symlink pointing to the real block device before performing any mounts. Failure +to do so is non-fatal and does not abort the boot sequence. diff --git a/docs/erofs.md b/docs/erofs.md new file mode 100644 index 00000000..5ccb8306 --- /dev/null +++ b/docs/erofs.md @@ -0,0 +1,82 @@ +# composefs EROFS image format + +composefs images are EROFS filesystem images with composefs-specific extensions. They encode +a directory tree where regular files are stored externally in a content-addressed object store +and referenced by their fs-verity digest. The EROFS image itself carries only metadata: inodes, +directory entries, extended attributes, and chunk index entries that point to the external files. + +composefs-rs supports two EROFS format versions. 
V1 is byte-for-byte compatible with the C +`mkcomposefs` tool. V2 is the composefs-rs native default and drops several V1 constraints +that exist only for C compatibility. New repositories use V2 unless `--erofs-version v1` is +passed to `cfsctl init`. + +However, V2 is not mountable by RHEL 9-era EROFS, so a goal is to transition to V1 as the +default for maximum compatibility. + +## Format V1 + +V1 is selected with `cfsctl init --erofs-version v1`. The `v1_erofs` ro-compat feature flag +is written to `meta.json` so that tools without V1 support open the repository read-only. + +**`composefs_version` field values in V1:** + +- `0` — no user-visible whiteout files (character devices with rdev=0) in the tree +- `1` — at least one user-visible whiteout file is present + +The constant `COMPOSEFS_VERSION_V1` is 0; the field only reaches 1 when user whiteouts are +found. The `--min-version` flag in `mkcomposefs` (mirrored by `mkfs_erofs_v1_min_version`) +forces the value to 1 even when no user whiteouts exist, for forward compatibility. + +**Inode layout:** V1 uses compact inodes (32 bytes) when the file data and inode fit within +the constraints of the compact format, and extended inodes (64 bytes) otherwise. + +**Inode traversal order:** V1 collects inodes in breadth-first order — all entries at one +directory level before descending. + +**Whiteout stub table:** V1 includes 256 synthetic inode entries at the start of the inode +area, one per two-hex-character prefix `00`–`ff`. Each entry is a character-device stub +(chr 0,0) used by the overlay filesystem to resolve whiteout paths against the object store. +V2 omits them entirely. + +**Whiteout escaping:** User-visible whiteout files (chr 0,0) in the tree are not stored as +character devices on disk. Instead they receive a `trusted.overlay.opaque=x` xattr and are +serialized differently. The stub entries in the whiteout table are not escaped. 
+ +**`build_time`:** The superblock `build_time` field is set to the minimum mtime across all inodes. + +**xattr sharing:** Xattr entries are deduplicated using a sort key that is the full xattr name (prefix string concatenated with the suffix). + +## Format V2 — Created in composefs-rs + +V2 is the default for all repositories created without `--erofs-version v1`. + +**`composefs_version` field:** Always `2` (the constant `COMPOSEFS_VERSION`). + +**Inode layout:** V2 always uses extended inodes (64 bytes). + +**Inode traversal order:** V2 collects inodes in depth-first order — all descendants of a directory before moving to the next sibling. + +**No whiteout stub table:** V2 has no synthetic stub entries; whiteout files are stored directly without escaping. + +**`build_time`:** Always 0. + +**xattr sharing:** Xattr entries are deduplicated using a sort key of (prefix, suffix, value) +rather than the full name string, which can produce a smaller shared xattr area. + +## Selecting the format + +The format is fixed at repository initialization time and cannot be changed afterward. + +``` +cfsctl init # V2 (default) +cfsctl init --erofs-version v1 # V1 (C-tool compatible) +``` + +The format is recorded in `meta.json` as the `v1_erofs` ro-compat feature flag: present +means V1, absent means V2. Tools that do not recognize this flag open the repository +read-only rather than writing images in the wrong format. + +For the standalone `mkcomposefs` tool, the equivalent flag is `--erofs-version`. The +`--min-version` flag (`mkfs_erofs_v1_min_version` in the Rust API) controls whether the +`composefs_version` field starts at 0 or 1 in V1 images regardless of whether user whiteouts +are present.