From 0f7e1936315ca85c482e2875632bccc60c0803e5 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Tue, 31 Mar 2026 11:38:20 -0700 Subject: [PATCH 01/10] fix(sandbox): add GPU device nodes and nvidia-persistenced to landlock baseline Landlock READ_FILE/WRITE_FILE restricts open(2) on character device files even when DAC permissions would otherwise allow it. GPU sandboxes need /dev/nvidiactl, /dev/nvidia-uvm, /dev/nvidia-uvm-tools, /dev/nvidia-modeset, and per-GPU /dev/nvidiaX nodes in the policy to allow NVML initialization. Additionally, CDI bind-mounts /run/nvidia-persistenced/socket into the container. NVML tries to connect to this socket at init time; if the directory is not in the landlock policy, it receives EACCES (not ECONNREFUSED), which causes NVML to abort with NVML_ERROR_INSUFFICIENT_PERMISSIONS even though nvidia-persistenced is optional. Both classes of paths are auto-added to the baseline when /dev/nvidiactl is present. Per-GPU device nodes are enumerated at runtime to handle multi-GPU configurations. --- crates/openshell-sandbox/src/lib.rs | 84 +++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 14963244..6b34a0d7 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -884,6 +884,53 @@ const PROXY_BASELINE_READ_ONLY: &[&str] = &["/usr", "/lib", "/etc", "/app", "/va /// user working directory and temporary files. const PROXY_BASELINE_READ_WRITE: &[&str] = &["/sandbox", "/tmp"]; +/// Fixed read-write paths required when a GPU is present. +/// +/// - `/run/nvidia-persistenced`: NVML tries to connect to the persistenced +/// socket at init time. If the socket exists but landlock denies traversal +/// (EACCES vs ECONNREFUSED), NVML returns NVML_ERROR_INSUFFICIENT_PERMISSIONS +/// even though the daemon is optional. +/// - `/dev/nvidiactl`, `/dev/nvidia-uvm`, `/dev/nvidia-uvm-tools`, +/// `/dev/nvidia-modeset`: control and UVM devices injected by CDI. +/// Landlock READ_FILE/WRITE_FILE restricts open(2) on device files even +/// when DAC permissions would otherwise allow it. +/// +/// Per-GPU device files (`/dev/nvidia0`, `/dev/nvidia1`, …) are enumerated +/// at runtime by `gpu_baseline_read_write_paths()` since the count varies. +const GPU_BASELINE_READ_WRITE_FIXED: &[&str] = &[ + "/run/nvidia-persistenced", + "/dev/nvidiactl", + "/dev/nvidia-uvm", + "/dev/nvidia-uvm-tools", + "/dev/nvidia-modeset", +]; + +/// Returns true if GPU devices are present in the container. +fn has_gpu_devices() -> bool { + std::path::Path::new("/dev/nvidiactl").exists() +} + +/// Collect all GPU read-write paths: fixed devices + per-GPU `/dev/nvidiaX`. +fn gpu_baseline_read_write_paths() -> Vec { + let mut paths: Vec = GPU_BASELINE_READ_WRITE_FIXED + .iter() + .map(|p| std::path::PathBuf::from(p)) + .collect(); + + // Enumerate per-GPU device nodes injected by CDI (nvidia0, nvidia1, …). + if let Ok(entries) = std::fs::read_dir("/dev") { + for entry in entries.flatten() { + let name = entry.file_name(); + let name = name.to_string_lossy(); + if name.starts_with("nvidia") && name[6..].chars().all(|c| c.is_ascii_digit()) { + paths.push(entry.path()); + } + } + } + + paths +} + /// Ensure a proto `SandboxPolicy` includes the baseline filesystem paths /// required for proxy-mode sandboxes. Paths are only added if missing; /// user-specified paths are never removed. @@ -935,6 +982,27 @@ fn enrich_proto_baseline_paths(proto: &mut openshell_core::proto::SandboxPolicy) } } + if has_gpu_devices() { + for path in gpu_baseline_read_write_paths() { + let path_str = path.to_string_lossy(); + if !fs + .read_write + .iter() + .any(|p| p.as_str() == path_str.as_ref()) + { + if !path.exists() { + debug!( + path = %path.display(), + "GPU baseline read-write path does not exist, skipping enrichment" + ); + continue; + } + fs.read_write.push(path_str.into_owned()); + modified = true; + } + } + } + if modified { info!("Enriched policy with baseline filesystem paths for proxy mode"); } @@ -982,6 +1050,22 @@ fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) { } } + if has_gpu_devices() { + for p in gpu_baseline_read_write_paths() { + if !policy.filesystem.read_write.contains(&p) { + if !p.exists() { + debug!( + path = %p.display(), + "GPU baseline read-write path does not exist, skipping enrichment" + ); + continue; + } + policy.filesystem.read_write.push(p); + modified = true; + } + } + } + if modified { info!("Enriched policy with baseline filesystem paths for proxy mode"); } From eed1ae285a848fc19a7df33ea4bfe163456a57ed Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Tue, 31 Mar 2026 12:18:40 -0700 Subject: [PATCH 02/10] fix(sandbox): narrow /run/nvidia-persistenced to read-only in GPU baseline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NVML only needs to traverse the directory and connect to the Unix socket — it does not create or modify files there. Read-only (traversal) access is sufficient; read-write was unnecessarily broad. --- crates/openshell-sandbox/src/lib.rs | 65 +++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 9 deletions(-) diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 6b34a0d7..e6ceb779 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -884,21 +884,26 @@ const PROXY_BASELINE_READ_ONLY: &[&str] = &["/usr", "/lib", "/etc", "/app", "/va /// user working directory and temporary files. const PROXY_BASELINE_READ_WRITE: &[&str] = &["/sandbox", "/tmp"]; +/// Fixed read-only paths required when a GPU is present. +/// +/// `/run/nvidia-persistenced`: NVML tries to connect to the persistenced +/// socket at init time. If the directory exists but landlock denies traversal +/// (EACCES vs ECONNREFUSED), NVML returns NVML_ERROR_INSUFFICIENT_PERMISSIONS +/// even though the daemon is optional. Only read/traversal access is needed — +/// NVML connects to the existing socket but does not create or modify files. +const GPU_BASELINE_READ_ONLY_FIXED: &[&str] = &["/run/nvidia-persistenced"]; + /// Fixed read-write paths required when a GPU is present. /// -/// - `/run/nvidia-persistenced`: NVML tries to connect to the persistenced -/// socket at init time. If the socket exists but landlock denies traversal -/// (EACCES vs ECONNREFUSED), NVML returns NVML_ERROR_INSUFFICIENT_PERMISSIONS -/// even though the daemon is optional. -/// - `/dev/nvidiactl`, `/dev/nvidia-uvm`, `/dev/nvidia-uvm-tools`, -/// `/dev/nvidia-modeset`: control and UVM devices injected by CDI. -/// Landlock READ_FILE/WRITE_FILE restricts open(2) on device files even -/// when DAC permissions would otherwise allow it. +/// `/dev/nvidiactl`, `/dev/nvidia-uvm`, `/dev/nvidia-uvm-tools`, +/// `/dev/nvidia-modeset`: control and UVM devices injected by CDI. +/// Landlock READ_FILE/WRITE_FILE restricts open(2) on device files even +/// when DAC permissions would otherwise allow it. Device nodes need +/// read-write because NVML opens them with O_RDWR. /// /// Per-GPU device files (`/dev/nvidia0`, `/dev/nvidia1`, …) are enumerated /// at runtime by `gpu_baseline_read_write_paths()` since the count varies. const GPU_BASELINE_READ_WRITE_FIXED: &[&str] = &[ - "/run/nvidia-persistenced", "/dev/nvidiactl", "/dev/nvidia-uvm", "/dev/nvidia-uvm-tools", @@ -910,6 +915,14 @@ fn has_gpu_devices() -> bool { std::path::Path::new("/dev/nvidiactl").exists() } +/// Collect all GPU read-only paths (currently just the persistenced directory). +fn gpu_baseline_read_only_paths() -> Vec { + GPU_BASELINE_READ_ONLY_FIXED + .iter() + .map(|p| std::path::PathBuf::from(p)) + .collect() +} + /// Collect all GPU read-write paths: fixed devices + per-GPU `/dev/nvidiaX`. fn gpu_baseline_read_write_paths() -> Vec { let mut paths: Vec = GPU_BASELINE_READ_WRITE_FIXED @@ -983,6 +996,25 @@ fn enrich_proto_baseline_paths(proto: &mut openshell_core::proto::SandboxPolicy) } if has_gpu_devices() { + for path in gpu_baseline_read_only_paths() { + let path_str = path.to_string_lossy(); + if !fs.read_only.iter().any(|p| p.as_str() == path_str.as_ref()) + && !fs + .read_write + .iter() + .any(|p| p.as_str() == path_str.as_ref()) + { + if !path.exists() { + debug!( + path = %path.display(), + "GPU baseline read-only path does not exist, skipping enrichment" + ); + continue; + } + fs.read_only.push(path_str.into_owned()); + modified = true; + } + } for path in gpu_baseline_read_write_paths() { let path_str = path.to_string_lossy(); if !fs @@ -1051,6 +1083,21 @@ fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) { } if has_gpu_devices() { + for p in gpu_baseline_read_only_paths() { + if !policy.filesystem.read_only.contains(&p) + && !policy.filesystem.read_write.contains(&p) + { + if !p.exists() { + debug!( + path = %p.display(), + "GPU baseline read-only path does not exist, skipping enrichment" + ); + continue; + } + policy.filesystem.read_only.push(p); + modified = true; + } + } for p in gpu_baseline_read_write_paths() { if !policy.filesystem.read_write.contains(&p) { if !p.exists() { From 978de1db79db532b281d9dadacd051320270c43d Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Tue, 31 Mar 2026 12:41:35 -0700 Subject: [PATCH 03/10] fix(bootstrap): diagnose CDI specs missing when GPU passthrough fails When Docker has CDI configured but no CDI spec files exist on the host, container startup fails with an opaque error. Add a failure pattern to detect this and surface actionable recovery steps pointing to `nvidia-ctk cdi generate`. --- crates/openshell-bootstrap/src/errors.rs | 90 ++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/crates/openshell-bootstrap/src/errors.rs b/crates/openshell-bootstrap/src/errors.rs index b487c94a..26d32e2b 100644 --- a/crates/openshell-bootstrap/src/errors.rs +++ b/crates/openshell-bootstrap/src/errors.rs @@ -169,6 +169,19 @@ const FAILURE_PATTERNS: &[FailurePattern] = &[ match_mode: MatchMode::Any, diagnose: diagnose_docker_not_running, }, + // CDI specs missing — Docker daemon has CDI configured but no spec files exist + // or the requested device ID (nvidia.com/gpu=all) is not in any spec. + // Matches errors from Docker 25+ and containerd CDI injection paths. + FailurePattern { + matchers: &[ + "CDI device not found", + "unknown CDI device", + "failed to inject CDI devices", + "no CDI devices found", + ], + match_mode: MatchMode::Any, + diagnose: diagnose_cdi_specs_missing, + }, ]; fn diagnose_corrupted_state(gateway_name: &str) -> GatewayFailureDiagnosis { @@ -396,6 +409,29 @@ fn diagnose_certificate_issue(gateway_name: &str) -> GatewayFailureDiagnosis { } } +fn diagnose_cdi_specs_missing(_gateway_name: &str) -> GatewayFailureDiagnosis { + GatewayFailureDiagnosis { + summary: "CDI specs not found on host".to_string(), + explanation: "GPU passthrough via CDI was selected (your Docker daemon has CDI spec \ + directories configured) but no CDI device specs were found on the host. \ + Specs must be pre-generated before OpenShell can inject the GPU into the \ + cluster container." + .to_string(), + recovery_steps: vec![ + RecoveryStep::with_command( + "Generate CDI specs on the host (nvidia-ctk creates /etc/cdi/ if it does not exist)", + "sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml", + ), + RecoveryStep::with_command( + "Verify the specs were generated and include nvidia.com/gpu entries", + "nvidia-ctk cdi list", + ), + RecoveryStep::new("Then retry: openshell gateway start --gpu"), + ], + retryable: false, + } +} + fn diagnose_docker_not_running(_gateway_name: &str) -> GatewayFailureDiagnosis { GatewayFailureDiagnosis { summary: "Docker is not running".to_string(), @@ -925,4 +961,58 @@ mod tests { "commands should include gateway name, got: {all_commands}" ); } + + #[test] + fn test_diagnose_cdi_device_not_found() { + let diagnosis = diagnose_failure( + "test", + "could not run container: CDI device not found: nvidia.com/gpu=all", + None, + ); + assert!(diagnosis.is_some()); + let d = diagnosis.unwrap(); + assert!( + d.summary.contains("CDI"), + "expected CDI diagnosis, got: {}", + d.summary + ); + assert!(!d.retryable); + } + + #[test] + fn test_diagnose_unknown_cdi_device() { + // containerd error path + let diagnosis = diagnose_failure( + "test", + "unknown CDI device requested: nvidia.com/gpu=all", + None, + ); + assert!(diagnosis.is_some()); + let d = diagnosis.unwrap(); + assert!( + d.summary.contains("CDI"), + "expected CDI diagnosis, got: {}", + d.summary + ); + } + + #[test] + fn test_diagnose_cdi_recovery_mentions_nvidia_ctk() { + let d = diagnose_failure("test", "CDI device not found", None) + .expect("should match CDI pattern"); + let all_steps: String = d + .recovery_steps + .iter() + .map(|s| format!("{} {}", s.description, s.command.as_deref().unwrap_or(""))) + .collect::>() + .join("\n"); + assert!( + all_steps.contains("nvidia-ctk cdi generate"), + "recovery steps should mention nvidia-ctk cdi generate, got: {all_steps}" + ); + assert!( + all_steps.contains("/etc/cdi/"), + "recovery steps should mention /etc/cdi/, got: {all_steps}" + ); + } } From 5f7b8e535b1f6c7c9d9addc3573be9b03c3d3c2f Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Tue, 31 Mar 2026 12:59:25 -0700 Subject: [PATCH 04/10] fix(bootstrap): add missing CDI error patterns from observed Docker 500 response --- crates/openshell-bootstrap/src/errors.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/crates/openshell-bootstrap/src/errors.rs b/crates/openshell-bootstrap/src/errors.rs index 26d32e2b..9e385c68 100644 --- a/crates/openshell-bootstrap/src/errors.rs +++ b/crates/openshell-bootstrap/src/errors.rs @@ -178,6 +178,8 @@ const FAILURE_PATTERNS: &[FailurePattern] = &[ "unknown CDI device", "failed to inject CDI devices", "no CDI devices found", + "CDI device injection failed", + "unresolvable CDI devices", ], match_mode: MatchMode::Any, diagnose: diagnose_cdi_specs_missing, @@ -979,6 +981,24 @@ mod tests { assert!(!d.retryable); } + #[test] + fn test_diagnose_cdi_injection_failed_unresolvable() { + // Exact error observed from Docker 500 response + let diagnosis = diagnose_failure( + "test", + "Docker responded with status code 500: CDI device injection failed: unresolvable CDI devices nvidia.com/gpu=all", + None, + ); + assert!(diagnosis.is_some()); + let d = diagnosis.unwrap(); + assert!( + d.summary.contains("CDI"), + "expected CDI diagnosis, got: {}", + d.summary + ); + assert!(!d.retryable); + } + #[test] fn test_diagnose_unknown_cdi_device() { // containerd error path From f05cdf04a21b445639cd6135c790c50e38d478ee Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Tue, 31 Mar 2026 18:07:39 -0700 Subject: [PATCH 05/10] fix(sandbox): add /proc and /dev/urandom to Landlock baselines for GPU and proxy mode After the Landlock per-path fix (#677), missing paths no longer silently disable the entire ruleset. This exposed two gaps: Proxy baseline: - /proc and /dev/urandom are needed by virtually every process (already in restrictive_default_policy but missing from the enrichment baseline). GPU baseline: - /proc must be read-write because CUDA writes to /proc//task//comm during cuInit() to set thread names. Using /proc (not /proc/self/task) because Landlock rules bind to inodes and child processes have different procfs inodes than the parent shell. Also skips chown for virtual filesystem paths (/proc, /sys) in prepare_filesystem since they are kernel-managed and may contain symlinks (e.g. /proc/self) that trigger the symlink safety check. --- crates/openshell-sandbox/src/lib.rs | 68 +++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 17 deletions(-) diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index e6ceb779..58a54cfb 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -877,8 +877,22 @@ pub(crate) fn spawn_route_refresh( /// Minimum read-only paths required for a proxy-mode sandbox child process to /// function: dynamic linker, shared libraries, DNS resolution, CA certs, -/// Python venv, and openshell logs. -const PROXY_BASELINE_READ_ONLY: &[&str] = &["/usr", "/lib", "/etc", "/app", "/var/log"]; +/// Python venv, openshell logs, process info, and random bytes. +/// +/// `/proc` and `/dev/urandom` are included here for the same reasons they +/// appear in `restrictive_default_policy()`: virtually every process needs +/// them. Before the Landlock per-path fix (#677) these were effectively free +/// because a missing path silently disabled the entire ruleset; now they must +/// be explicit. +const PROXY_BASELINE_READ_ONLY: &[&str] = &[ + "/usr", + "/lib", + "/etc", + "/app", + "/var/log", + "/proc", + "/dev/urandom", +]; /// Minimum read-write paths required for a proxy-mode sandbox child process: /// user working directory and temporary files. @@ -899,7 +913,16 @@ const GPU_BASELINE_READ_ONLY_FIXED: &[&str] = &["/run/nvidia-persistenced"]; /// `/dev/nvidia-modeset`: control and UVM devices injected by CDI. /// Landlock READ_FILE/WRITE_FILE restricts open(2) on device files even /// when DAC permissions would otherwise allow it. Device nodes need -/// read-write because NVML opens them with O_RDWR. +/// read-write because NVML/CUDA opens them with O_RDWR. +/// +/// `/proc`: CUDA writes to `/proc//task//comm` during `cuInit()` +/// to set thread names. Without write access, `cuInit()` returns error 304 +/// (`cudaErrorOperatingSystem`). We must use `/proc` (not `/proc/self/task`) +/// because Landlock rules bind to inodes: `/proc/self/task` in the pre_exec +/// hook resolves to the shell's PID, but child processes (python, etc.) +/// have different PIDs and thus different procfs inodes. Security impact +/// is limited — writable proc entries (`oom_score_adj`, etc.) are already +/// kernel-restricted for non-root users; Landlock is defense-in-depth. /// /// Per-GPU device files (`/dev/nvidia0`, `/dev/nvidia1`, …) are enumerated /// at runtime by `gpu_baseline_read_write_paths()` since the count varies. @@ -908,6 +931,7 @@ const GPU_BASELINE_READ_WRITE_FIXED: &[&str] = &[ "/dev/nvidia-uvm", "/dev/nvidia-uvm-tools", "/dev/nvidia-modeset", + "/proc", ]; /// Returns true if GPU devices are present in the container. @@ -1380,22 +1404,32 @@ fn prepare_filesystem(policy: &SandboxPolicy) -> Result<()> { // The TOCTOU window between lstat and chown is not exploitable because // no untrusted process is running yet (the child has not been forked). for path in &policy.filesystem.read_write { - // Check for symlinks before touching the path. Character/block devices - // (e.g. /dev/null) are legitimate read_write entries and must be allowed. - if let Ok(meta) = std::fs::symlink_metadata(path) { - if meta.file_type().is_symlink() { - return Err(miette::miette!( - "read_write path '{}' is a symlink — refusing to chown (potential privilege escalation)", - path.display() - )); - } + // Virtual filesystems (/proc, /sys) are kernel-managed — skip chown + // and mkdir. These paths are added to the Landlock ruleset only; + // ownership changes are meaningless and may fail (symlinks like + // /proc/self, permission errors on sysfs nodes). + let is_virtual_fs = path.starts_with("/proc") || path.starts_with("/sys"); + + if is_virtual_fs { + debug!(path = %path.display(), "Skipping chown for virtual filesystem path"); } else { - debug!(path = %path.display(), "Creating read_write directory"); - std::fs::create_dir_all(path).into_diagnostic()?; - } + // Check for symlinks before touching the path. Character/block devices + // (e.g. /dev/null) are legitimate read_write entries and must be allowed. + if let Ok(meta) = std::fs::symlink_metadata(path) { + if meta.file_type().is_symlink() { + return Err(miette::miette!( + "read_write path '{}' is a symlink — refusing to chown (potential privilege escalation)", + path.display() + )); + } + } else { + debug!(path = %path.display(), "Creating read_write directory"); + std::fs::create_dir_all(path).into_diagnostic()?; + } - debug!(path = %path.display(), ?uid, ?gid, "Setting ownership on read_write directory"); - chown(path, uid, gid).into_diagnostic()?; + debug!(path = %path.display(), ?uid, ?gid, "Setting ownership on read_write directory"); + chown(path, uid, gid).into_diagnostic()?; + } } Ok(()) From e1fb27dc8d497784d6842d60535bb93cf27c2bbf Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Wed, 1 Apr 2026 00:15:35 -0700 Subject: [PATCH 06/10] refactor(sandbox): remove unnecessary virtual filesystem skip in prepare_filesystem chown on /proc and /sys succeeds silently (kernel ignores it for virtual filesystems), so the special-case skip added in the previous commit is not needed. --- crates/openshell-sandbox/src/lib.rs | 38 +++++++++++------------------ 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 58a54cfb..6d676805 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -1404,32 +1404,22 @@ fn prepare_filesystem(policy: &SandboxPolicy) -> Result<()> { // The TOCTOU window between lstat and chown is not exploitable because // no untrusted process is running yet (the child has not been forked). for path in &policy.filesystem.read_write { - // Virtual filesystems (/proc, /sys) are kernel-managed — skip chown - // and mkdir. These paths are added to the Landlock ruleset only; - // ownership changes are meaningless and may fail (symlinks like - // /proc/self, permission errors on sysfs nodes). - let is_virtual_fs = path.starts_with("/proc") || path.starts_with("/sys"); - - if is_virtual_fs { - debug!(path = %path.display(), "Skipping chown for virtual filesystem path"); - } else { - // Check for symlinks before touching the path. Character/block devices - // (e.g. /dev/null) are legitimate read_write entries and must be allowed. - if let Ok(meta) = std::fs::symlink_metadata(path) { - if meta.file_type().is_symlink() { - return Err(miette::miette!( - "read_write path '{}' is a symlink — refusing to chown (potential privilege escalation)", - path.display() - )); - } - } else { - debug!(path = %path.display(), "Creating read_write directory"); - std::fs::create_dir_all(path).into_diagnostic()?; + // Check for symlinks before touching the path. Character/block devices + // (e.g. /dev/null) are legitimate read_write entries and must be allowed. + if let Ok(meta) = std::fs::symlink_metadata(path) { + if meta.file_type().is_symlink() { + return Err(miette::miette!( + "read_write path '{}' is a symlink — refusing to chown (potential privilege escalation)", + path.display() + )); } - - debug!(path = %path.display(), ?uid, ?gid, "Setting ownership on read_write directory"); - chown(path, uid, gid).into_diagnostic()?; + } else { + debug!(path = %path.display(), "Creating read_write directory"); + std::fs::create_dir_all(path).into_diagnostic()?; } + + debug!(path = %path.display(), ?uid, ?gid, "Setting ownership on read_write directory"); + chown(path, uid, gid).into_diagnostic()?; } Ok(()) From 336f61973a23910176a6558bc61aad62f7d3b787 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Wed, 1 Apr 2026 11:08:31 -0700 Subject: [PATCH 07/10] fix(e2e): include /proc and /dev/urandom in test baseline filesystem The baseline enrichment now adds these paths for proxy-mode sandboxes, which changes the policy hash. Without them in the test fixture, the live policy update test sees a version bump when enrichment syncs back to the gateway, causing the "same policy should return existing version" assertion to fail. --- e2e/python/test_inference_routing.py | 2 +- e2e/python/test_sandbox_policy.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/e2e/python/test_inference_routing.py b/e2e/python/test_inference_routing.py index bda0d8cb..c83049c1 100644 --- a/e2e/python/test_inference_routing.py +++ b/e2e/python/test_inference_routing.py @@ -32,7 +32,7 @@ _BASE_FILESYSTEM = sandbox_pb2.FilesystemPolicy( include_workdir=True, - read_only=["/usr", "/lib", "/etc", "/app", "/var/log"], + read_only=["/usr", "/lib", "/etc", "/app", "/var/log", "/proc", "/dev/urandom"], read_write=["/sandbox", "/tmp"], ) _BASE_LANDLOCK = sandbox_pb2.LandlockPolicy(compatibility="best_effort") diff --git a/e2e/python/test_sandbox_policy.py b/e2e/python/test_sandbox_policy.py index 6a4bf5ed..625fe8da 100644 --- a/e2e/python/test_sandbox_policy.py +++ b/e2e/python/test_sandbox_policy.py @@ -22,7 +22,7 @@ _BASE_FILESYSTEM = sandbox_pb2.FilesystemPolicy( include_workdir=True, - read_only=["/usr", "/lib", "/etc", "/app", "/var/log"], + read_only=["/usr", "/lib", "/etc", "/app", "/var/log", "/proc", "/dev/urandom"], read_write=["/sandbox", "/tmp"], ) _BASE_LANDLOCK = sandbox_pb2.LandlockPolicy(compatibility="best_effort") From 997554d01d473d197b38455649e76e8b262f4193 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Wed, 1 Apr 2026 11:18:15 -0700 Subject: [PATCH 08/10] refactor(sandbox): simplify baseline path enrichment for GPU and proxy mode Consolidate GPU and proxy baseline paths into a single baseline_enrichment_paths() function that returns (read_only, read_write) string vecs. Both enrich_proto_baseline_paths and enrich_sandbox_baseline_paths now use the same loop pattern over these combined lists, removing the duplicated GPU-specific enrichment blocks. --- crates/openshell-sandbox/src/lib.rs | 198 ++++++++-------------------- 1 file changed, 58 insertions(+), 140 deletions(-) diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 6d676805..1c4d84ad 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -898,35 +898,29 @@ const PROXY_BASELINE_READ_ONLY: &[&str] = &[ /// user working directory and temporary files. const PROXY_BASELINE_READ_WRITE: &[&str] = &["/sandbox", "/tmp"]; -/// Fixed read-only paths required when a GPU is present. +/// GPU read-only paths. /// /// `/run/nvidia-persistenced`: NVML tries to connect to the persistenced -/// socket at init time. If the directory exists but landlock denies traversal +/// socket at init time. If the directory exists but Landlock denies traversal /// (EACCES vs ECONNREFUSED), NVML returns NVML_ERROR_INSUFFICIENT_PERMISSIONS -/// even though the daemon is optional. Only read/traversal access is needed — -/// NVML connects to the existing socket but does not create or modify files. -const GPU_BASELINE_READ_ONLY_FIXED: &[&str] = &["/run/nvidia-persistenced"]; +/// even though the daemon is optional. Only read/traversal access is needed. +const GPU_BASELINE_READ_ONLY: &[&str] = &["/run/nvidia-persistenced"]; -/// Fixed read-write paths required when a GPU is present. +/// GPU read-write paths (static). /// /// `/dev/nvidiactl`, `/dev/nvidia-uvm`, `/dev/nvidia-uvm-tools`, /// `/dev/nvidia-modeset`: control and UVM devices injected by CDI. -/// Landlock READ_FILE/WRITE_FILE restricts open(2) on device files even -/// when DAC permissions would otherwise allow it. Device nodes need -/// read-write because NVML/CUDA opens them with O_RDWR. +/// Landlock restricts `open(2)` on device files even when DAC allows it; +/// these need read-write because NVML/CUDA opens them with `O_RDWR`. /// /// `/proc`: CUDA writes to `/proc//task//comm` during `cuInit()` -/// to set thread names. Without write access, `cuInit()` returns error 304 -/// (`cudaErrorOperatingSystem`). We must use `/proc` (not `/proc/self/task`) -/// because Landlock rules bind to inodes: `/proc/self/task` in the pre_exec -/// hook resolves to the shell's PID, but child processes (python, etc.) -/// have different PIDs and thus different procfs inodes. Security impact -/// is limited — writable proc entries (`oom_score_adj`, etc.) are already -/// kernel-restricted for non-root users; Landlock is defense-in-depth. +/// to set thread names. Without write access, `cuInit()` returns error 304. +/// Must use `/proc` (not `/proc/self/task`) because Landlock rules bind to +/// inodes and child processes have different procfs inodes than the parent. /// -/// Per-GPU device files (`/dev/nvidia0`, `/dev/nvidia1`, …) are enumerated -/// at runtime by `gpu_baseline_read_write_paths()` since the count varies. -const GPU_BASELINE_READ_WRITE_FIXED: &[&str] = &[ +/// Per-GPU device files (`/dev/nvidia0`, …) are enumerated at runtime by +/// `enumerate_gpu_device_nodes()` since the count varies. +const GPU_BASELINE_READ_WRITE: &[&str] = &[ "/dev/nvidiactl", "/dev/nvidia-uvm", "/dev/nvidia-uvm-tools", @@ -939,35 +933,36 @@ fn has_gpu_devices() -> bool { std::path::Path::new("/dev/nvidiactl").exists() } -/// Collect all GPU read-only paths (currently just the persistenced directory). -fn gpu_baseline_read_only_paths() -> Vec { - GPU_BASELINE_READ_ONLY_FIXED - .iter() - .map(|p| std::path::PathBuf::from(p)) - .collect() -} - -/// Collect all GPU read-write paths: fixed devices + per-GPU `/dev/nvidiaX`. -fn gpu_baseline_read_write_paths() -> Vec { - let mut paths: Vec = GPU_BASELINE_READ_WRITE_FIXED - .iter() - .map(|p| std::path::PathBuf::from(p)) - .collect(); - - // Enumerate per-GPU device nodes injected by CDI (nvidia0, nvidia1, …). +/// Enumerate per-GPU device nodes (`/dev/nvidia0`, `/dev/nvidia1`, …). +fn enumerate_gpu_device_nodes() -> Vec { + let mut paths = Vec::new(); if let Ok(entries) = std::fs::read_dir("/dev") { for entry in entries.flatten() { let name = entry.file_name(); let name = name.to_string_lossy(); if name.starts_with("nvidia") && name[6..].chars().all(|c| c.is_ascii_digit()) { - paths.push(entry.path()); + paths.push(entry.path().to_string_lossy().into_owned()); } } } - paths } +/// Collect all baseline paths for enrichment: proxy defaults + GPU (if present). +/// Returns `(read_only, read_write)` as owned `String` vecs. +fn baseline_enrichment_paths() -> (Vec, Vec) { + let mut ro: Vec = PROXY_BASELINE_READ_ONLY.iter().map(|&s| s.to_string()).collect(); + let mut rw: Vec = PROXY_BASELINE_READ_WRITE.iter().map(|&s| s.to_string()).collect(); + + if has_gpu_devices() { + ro.extend(GPU_BASELINE_READ_ONLY.iter().map(|&s| s.to_string())); + rw.extend(GPU_BASELINE_READ_WRITE.iter().map(|&s| s.to_string())); + rw.extend(enumerate_gpu_device_nodes()); + } + + (ro, rw) +} + /// Ensure a proto `SandboxPolicy` includes the baseline filesystem paths /// required for proxy-mode sandboxes. Paths are only added if missing; /// user-specified paths are never removed. @@ -986,79 +981,37 @@ fn enrich_proto_baseline_paths(proto: &mut openshell_core::proto::SandboxPolicy) ..Default::default() }); + let (ro, rw) = baseline_enrichment_paths(); + + // Baseline paths are system-injected, not user-specified. Skip paths + // that do not exist in this container image to avoid noisy warnings from + // Landlock and, more critically, to prevent a single missing baseline + // path from abandoning the entire Landlock ruleset under best-effort + // mode (see issue #664). let mut modified = false; - for &path in PROXY_BASELINE_READ_ONLY { - if !fs.read_only.iter().any(|p| p.as_str() == path) { - // Baseline paths are system-injected, not user-specified. Skip - // paths that do not exist in this container image to avoid noisy - // warnings from Landlock and, more critically, to prevent a single - // missing baseline path from abandoning the entire Landlock - // ruleset under best-effort mode (see issue #664). + for path in &ro { + if !fs.read_only.iter().any(|p| p == path) + && !fs.read_write.iter().any(|p| p == path) + { if !std::path::Path::new(path).exists() { - debug!( - path, - "Baseline read-only path does not exist, skipping enrichment" - ); + debug!(path, "Baseline read-only path does not exist, skipping enrichment"); continue; } - fs.read_only.push(path.to_string()); + fs.read_only.push(path.clone()); modified = true; } } - for &path in PROXY_BASELINE_READ_WRITE { - if !fs.read_write.iter().any(|p| p.as_str() == path) { + for path in &rw { + if !fs.read_write.iter().any(|p| p == path) { if !std::path::Path::new(path).exists() { - debug!( - path, - "Baseline read-write path does not exist, skipping enrichment" - ); + debug!(path, "Baseline read-write path does not exist, skipping enrichment"); continue; } - fs.read_write.push(path.to_string()); + fs.read_write.push(path.clone()); modified = true; } } - if has_gpu_devices() { - for path in gpu_baseline_read_only_paths() { - let path_str = path.to_string_lossy(); - if !fs.read_only.iter().any(|p| p.as_str() == path_str.as_ref()) - && !fs - .read_write - .iter() - .any(|p| p.as_str() == path_str.as_ref()) - { - if !path.exists() { - debug!( - path = %path.display(), - "GPU baseline read-only path does not exist, skipping enrichment" - ); - continue; - } - fs.read_only.push(path_str.into_owned()); - modified = true; - } - } - for path in gpu_baseline_read_write_paths() { - let path_str = path.to_string_lossy(); - if !fs - .read_write - .iter() - .any(|p| p.as_str() == path_str.as_ref()) - { - if !path.exists() { - debug!( - path = %path.display(), - "GPU baseline read-write path does not exist, skipping enrichment" - ); - continue; - } - fs.read_write.push(path_str.into_owned()); - modified = true; - } - } - } - if modified { info!("Enriched policy with baseline filesystem paths for proxy mode"); } @@ -1074,31 +1027,27 @@ fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) { return; } + let (ro, rw) = baseline_enrichment_paths(); + let mut modified = false; - for &path in PROXY_BASELINE_READ_ONLY { + for path in &ro { let p = std::path::PathBuf::from(path); - if !policy.filesystem.read_only.contains(&p) { - // Baseline paths are system-injected — skip non-existent paths to - // avoid Landlock ruleset abandonment (issue #664). + if !policy.filesystem.read_only.contains(&p) + && !policy.filesystem.read_write.contains(&p) + { if !p.exists() { - debug!( - path, - "Baseline read-only path does not exist, skipping enrichment" - ); + debug!(path, "Baseline read-only path does not exist, skipping enrichment"); continue; } policy.filesystem.read_only.push(p); modified = true; } } - for &path in PROXY_BASELINE_READ_WRITE { + for path in &rw { let p = std::path::PathBuf::from(path); if !policy.filesystem.read_write.contains(&p) { if !p.exists() { - debug!( - path, - "Baseline read-write path does not exist, skipping enrichment" - ); + debug!(path, "Baseline read-write path does not exist, skipping enrichment"); continue; } policy.filesystem.read_write.push(p); @@ -1106,37 +1055,6 @@ fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) { } } - if has_gpu_devices() { - for p in gpu_baseline_read_only_paths() { - if !policy.filesystem.read_only.contains(&p) - && !policy.filesystem.read_write.contains(&p) - { - if !p.exists() { - debug!( - path = %p.display(), - "GPU baseline read-only path does not exist, skipping enrichment" - ); - continue; - } - policy.filesystem.read_only.push(p); - modified = true; - } - } - for p in gpu_baseline_read_write_paths() { - if !policy.filesystem.read_write.contains(&p) { - if !p.exists() { - debug!( - path = %p.display(), - "GPU baseline read-write path does not exist, skipping enrichment" - ); - continue; - } - policy.filesystem.read_write.push(p); - modified = true; - } - } - } - if modified { info!("Enriched policy with baseline filesystem paths for proxy mode"); } From c5e7cec447d50fafcc497c41038606ab8c2b84ee Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Wed, 1 Apr 2026 11:21:58 -0700 Subject: [PATCH 09/10] style(sandbox): fix clippy doc warning and rustfmt --- crates/openshell-sandbox/src/lib.rs | 40 +++++++++++++++++++---------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 1c4d84ad..9b6372cc 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -902,7 +902,7 @@ const PROXY_BASELINE_READ_WRITE: &[&str] = &["/sandbox", "/tmp"]; /// /// `/run/nvidia-persistenced`: NVML tries to connect to the persistenced /// socket at init time. If the directory exists but Landlock denies traversal -/// (EACCES vs ECONNREFUSED), NVML returns NVML_ERROR_INSUFFICIENT_PERMISSIONS +/// (EACCES vs ECONNREFUSED), NVML returns `NVML_ERROR_INSUFFICIENT_PERMISSIONS` /// even though the daemon is optional. Only read/traversal access is needed. const GPU_BASELINE_READ_ONLY: &[&str] = &["/run/nvidia-persistenced"]; @@ -951,8 +951,14 @@ fn enumerate_gpu_device_nodes() -> Vec { /// Collect all baseline paths for enrichment: proxy defaults + GPU (if present). /// Returns `(read_only, read_write)` as owned `String` vecs. fn baseline_enrichment_paths() -> (Vec, Vec) { - let mut ro: Vec = PROXY_BASELINE_READ_ONLY.iter().map(|&s| s.to_string()).collect(); - let mut rw: Vec = PROXY_BASELINE_READ_WRITE.iter().map(|&s| s.to_string()).collect(); + let mut ro: Vec = PROXY_BASELINE_READ_ONLY + .iter() + .map(|&s| s.to_string()) + .collect(); + let mut rw: Vec = PROXY_BASELINE_READ_WRITE + .iter() + .map(|&s| s.to_string()) + .collect(); if has_gpu_devices() { ro.extend(GPU_BASELINE_READ_ONLY.iter().map(|&s| s.to_string())); @@ -990,11 +996,12 @@ fn enrich_proto_baseline_paths(proto: &mut openshell_core::proto::SandboxPolicy) // mode (see issue #664). let mut modified = false; for path in &ro { - if !fs.read_only.iter().any(|p| p == path) - && !fs.read_write.iter().any(|p| p == path) - { + if !fs.read_only.iter().any(|p| p == path) && !fs.read_write.iter().any(|p| p == path) { if !std::path::Path::new(path).exists() { - debug!(path, "Baseline read-only path does not exist, skipping enrichment"); + debug!( + path, + "Baseline read-only path does not exist, skipping enrichment" + ); continue; } fs.read_only.push(path.clone()); @@ -1004,7 +1011,10 @@ fn enrich_proto_baseline_paths(proto: &mut openshell_core::proto::SandboxPolicy) for path in &rw { if !fs.read_write.iter().any(|p| p == path) { if !std::path::Path::new(path).exists() { - debug!(path, "Baseline read-write path does not exist, skipping enrichment"); + debug!( + path, + "Baseline read-write path does not exist, skipping enrichment" + ); continue; } fs.read_write.push(path.clone()); @@ -1032,11 +1042,12 @@ fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) { let mut modified = false; for path in &ro { let p = std::path::PathBuf::from(path); - if !policy.filesystem.read_only.contains(&p) - && !policy.filesystem.read_write.contains(&p) - { + if !policy.filesystem.read_only.contains(&p) && !policy.filesystem.read_write.contains(&p) { if !p.exists() { - debug!(path, "Baseline read-only path does not exist, skipping enrichment"); + debug!( + path, + "Baseline read-only path does not exist, skipping enrichment" + ); continue; } policy.filesystem.read_only.push(p); @@ -1047,7 +1058,10 @@ fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) { let p = std::path::PathBuf::from(path); if !policy.filesystem.read_write.contains(&p) { if !p.exists() { - debug!(path, "Baseline read-write path does not exist, skipping enrichment"); + debug!( + path, + "Baseline read-write path does not exist, skipping enrichment" + ); continue; } policy.filesystem.read_write.push(p); From 5927f9f07c54206752d576e11879a69d00749b85 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Wed, 1 Apr 2026 17:09:08 -0700 Subject: [PATCH 10/10] fix(sandbox): address PR review nits for GPU baseline paths - Use strip_prefix("nvidia") instead of hardcoded index to match per-GPU device nodes; reject bare "nvidia" (empty suffix) - Deduplicate /proc from read_only when GPU promotes it to read_write in baseline_enrichment_paths() - Add unit tests for baseline path deduplication and device enumeration --- crates/openshell-sandbox/src/lib.rs | 79 ++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 9b6372cc..2384a217 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -940,7 +940,10 @@ fn enumerate_gpu_device_nodes() -> Vec { for entry in entries.flatten() { let name = entry.file_name(); let name = name.to_string_lossy(); - if name.starts_with("nvidia") && name[6..].chars().all(|c| c.is_ascii_digit()) { + if let Some(suffix) = name.strip_prefix("nvidia") { + if suffix.is_empty() || !suffix.chars().all(|c| c.is_ascii_digit()) { + continue; + } paths.push(entry.path().to_string_lossy().into_owned()); } } @@ -966,6 +969,11 @@ fn baseline_enrichment_paths() -> (Vec, Vec) { rw.extend(enumerate_gpu_device_nodes()); } + // A path promoted to read_write (e.g. /proc for GPU) should not also + // appear in read_only — Landlock handles the overlap correctly but the + // duplicate is confusing when inspecting the effective policy. + ro.retain(|p| !rw.contains(p)); + (ro, rw) } @@ -1074,6 +1082,75 @@ fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) { } } +#[cfg(test)] +mod baseline_tests { + use super::*; + + #[test] + fn proc_not_in_both_read_only_and_read_write_when_gpu_present() { + // When GPU devices are present, /proc is promoted to read_write + // (CUDA needs to write /proc//task//comm). It should + // NOT also appear in read_only. + if !has_gpu_devices() { + // Can't test GPU dedup without GPU devices; skip silently. + return; + } + let (ro, rw) = baseline_enrichment_paths(); + assert!( + rw.contains(&"/proc".to_string()), + "/proc should be in read_write when GPU is present" + ); + assert!( + !ro.contains(&"/proc".to_string()), + "/proc should NOT be in read_only when it is already in read_write" + ); + } + + #[test] + fn proc_in_read_only_without_gpu() { + if has_gpu_devices() { + // On a GPU host we can't test the non-GPU path; skip silently. + return; + } + let (ro, _rw) = baseline_enrichment_paths(); + assert!( + ro.contains(&"/proc".to_string()), + "/proc should be in read_only when GPU is not present" + ); + } + + #[test] + fn baseline_read_write_always_includes_sandbox_and_tmp() { + let (_ro, rw) = baseline_enrichment_paths(); + assert!(rw.contains(&"/sandbox".to_string())); + assert!(rw.contains(&"/tmp".to_string())); + } + + #[test] + fn enumerate_gpu_device_nodes_skips_bare_nvidia() { + // "nvidia" (without a trailing digit) is a valid /dev entry on some + // systems but is not a per-GPU device node. The enumerator must + // not match it. + let nodes = enumerate_gpu_device_nodes(); + assert!( + !nodes.contains(&"/dev/nvidia".to_string()), + "bare /dev/nvidia should not be enumerated: {nodes:?}" + ); + } + + #[test] + fn no_duplicate_paths_in_baseline() { + let (ro, rw) = baseline_enrichment_paths(); + // No path should appear in both lists. + for path in &ro { + assert!( + !rw.contains(path), + "path {path} appears in both read_only and read_write" + ); + } + } +} + /// Load sandbox policy from local files or gRPC. /// /// Priority: