From 071e15863a53f087d8a64fa8c4ce2b45ef4e766c Mon Sep 17 00:00:00 2001 From: John Myers <9696606+johntmyers@users.noreply.github.com> Date: Thu, 2 Apr 2026 13:19:00 -0700 Subject: [PATCH] fix(sandbox): harden seccomp filter to block dangerous syscalls --- architecture/sandbox.md | 46 ++++- architecture/security-policy.md | 29 ++- .../src/sandbox/linux/seccomp.rs | 182 ++++++++++++++++++ 3 files changed, 248 insertions(+), 9 deletions(-) diff --git a/architecture/sandbox.md b/architecture/sandbox.md index c870708d..c5e212f8 100644 --- a/architecture/sandbox.md +++ b/architecture/sandbox.md @@ -24,7 +24,7 @@ All paths are relative to `crates/openshell-sandbox/src/`. | `sandbox/mod.rs` | Platform abstraction -- dispatches to Linux or no-op | | `sandbox/linux/mod.rs` | Linux composition: Landlock then seccomp | | `sandbox/linux/landlock.rs` | Filesystem isolation via Landlock LSM (ABI V1) | -| `sandbox/linux/seccomp.rs` | Syscall filtering via BPF on `SYS_socket` | +| `sandbox/linux/seccomp.rs` | Syscall filtering via BPF: socket domain blocks, dangerous syscall blocks, conditional flag blocks | | `bypass_monitor.rs` | Background `/dev/kmsg` reader for iptables bypass detection events | | `sandbox/linux/netns.rs` | Network namespace creation, veth pair setup, bypass detection iptables rules, cleanup on drop | | `l7/mod.rs` | L7 types (`L7Protocol`, `TlsMode`, `EnforcementMode`, `L7EndpointConfig`), config parsing, validation, access preset expansion, deprecated `tls` value handling | @@ -451,13 +451,7 @@ Kernel-level error behavior (e.g., Landlock ABI unavailable) depends on `Landloc **File:** `crates/openshell-sandbox/src/sandbox/linux/seccomp.rs` -Seccomp blocks socket creation for specific address families. The filter targets a single syscall (`SYS_socket`) and inspects argument 0 (the domain). 
- -**Always blocked** (regardless of network mode): -- `AF_NETLINK`, `AF_PACKET`, `AF_BLUETOOTH`, `AF_VSOCK` - -**Additionally blocked in `Block` mode** (no proxy): -- `AF_INET`, `AF_INET6` +Seccomp provides three layers of syscall restriction: socket domain blocks, unconditional syscall blocks, and conditional syscall blocks. The filter uses a default-allow policy (`SeccompAction::Allow`) with targeted rules that return `Errno(EPERM)`. **Skipped entirely** in `Allow` mode. @@ -465,8 +459,44 @@ Setup: 1. `prctl(PR_SET_NO_NEW_PRIVS, 1)` -- required before seccomp 2. `seccompiler::apply_filter()` with default action `Allow` and per-rule action `Errno(EPERM)` +#### Socket domain blocks + +| Domain | Always blocked | Additionally blocked in Block mode | +|--------|:-:|:-:| +| `AF_PACKET` | Yes | | +| `AF_BLUETOOTH` | Yes | | +| `AF_VSOCK` | Yes | | +| `AF_INET` | | Yes | +| `AF_INET6` | | Yes | +| `AF_NETLINK` | | Yes | + In `Proxy` mode, `AF_INET`/`AF_INET6` are allowed because the sandboxed process needs to connect to the proxy over the veth pair. The network namespace ensures it can only reach the proxy's IP (`10.200.0.1`). 
+#### Unconditional syscall blocks + +These syscalls are blocked entirely (EPERM for any invocation): + +| Syscall | Reason | +|---------|--------| +| `memfd_create` | Fileless binary execution bypasses Landlock filesystem restrictions | +| `ptrace` | Cross-process memory inspection and code injection | +| `bpf` | Kernel BPF program loading | +| `process_vm_readv` | Cross-process memory read | +| `io_uring_setup` | Async I/O subsystem with extensive CVE history | +| `mount` | Filesystem mount could subvert Landlock or overlay writable paths | + +#### Conditional syscall blocks + +These syscalls are only blocked when specific flag patterns are present: + +| Syscall | Condition | Reason | +|---------|-----------|--------| +| `execveat` | `AT_EMPTY_PATH` flag set (arg4) | Fileless execution from an anonymous fd | +| `unshare` | `CLONE_NEWUSER` flag set (arg0) | User namespace creation enables privilege escalation | +| `seccomp` | operation == `SECCOMP_SET_MODE_FILTER` (arg0) | Prevents sandboxed code from replacing the active filter | + +Conditional blocks use `MaskedEq` for flag checks (bit-test) and `Eq` for exact-value matches. This allows normal use of these syscalls while blocking the dangerous flag combinations. + ### Network namespace isolation **File:** `crates/openshell-sandbox/src/sandbox/linux/netns.rs` diff --git a/architecture/security-policy.md b/architecture/security-policy.md index 555ba67a..01eb96f9 100644 --- a/architecture/security-policy.md +++ b/architecture/security-policy.md @@ -850,6 +850,10 @@ The response includes an `X-OpenShell-Policy` header and `Connection: close`. Se ## Seccomp Filter Details +The seccomp filter uses a default-allow policy (`SeccompAction::Allow`) with targeted rules that return `EPERM`. It provides three layers of protection: socket domain blocks, unconditional syscall blocks, and conditional syscall blocks. See `crates/openshell-sandbox/src/sandbox/linux/seccomp.rs`. 
+ +### Blocked socket domains + Regardless of network mode, certain socket domains are always blocked: | Domain | Constant | Reason | @@ -861,7 +865,30 @@ Regardless of network mode, certain socket domains are always blocked: In proxy mode (which is always active), `AF_INET` (2) and `AF_INET6` (10) are allowed so the sandbox process can reach the proxy. -The seccomp filter uses a default-allow policy (`SeccompAction::Allow`) with specific `socket()` syscall rules that return `EPERM` when the first argument (domain) matches a blocked value. See `crates/openshell-sandbox/src/sandbox/linux/seccomp.rs`. +### Blocked syscalls + +These syscalls are blocked unconditionally (EPERM for any invocation): + +| Syscall | NR (x86-64) | Reason | +|---------|-------------|--------| +| `memfd_create` | 319 | Fileless binary execution bypasses Landlock filesystem restrictions | +| `ptrace` | 101 | Cross-process memory inspection and code injection | +| `bpf` | 321 | Kernel BPF program loading | +| `process_vm_readv` | 310 | Cross-process memory read | +| `io_uring_setup` | 425 | Async I/O subsystem with extensive CVE history | +| `mount` | 165 | Filesystem mount could subvert Landlock or overlay writable paths | + +### Conditionally blocked syscalls + +These syscalls are blocked only when specific flag patterns are present in their arguments: + +| Syscall | NR (x86-64) | Condition | Reason | +|---------|-------------|-----------|--------| +| `execveat` | 322 | `AT_EMPTY_PATH` (0x1000) set in flags (arg4) | Fileless execution from an anonymous fd | +| `unshare` | 272 | `CLONE_NEWUSER` (0x10000000) set in flags (arg0) | User namespace creation enables privilege escalation | +| `seccomp` | 317 | operation == `SECCOMP_SET_MODE_FILTER` (1) in arg0 | Prevents sandboxed code from replacing the active filter | + +Flag checks use `MaskedEq` (`(arg & mask) == mask`) to detect the flag bit regardless of other bits. 
The `seccomp` syscall check uses `Eq` for exact value comparison on the operation argument. --- diff --git a/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs b/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs index 6c9d8307..e2344749 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs +++ b/crates/openshell-sandbox/src/sandbox/linux/seccomp.rs @@ -2,6 +2,15 @@ // SPDX-License-Identifier: Apache-2.0 //! Seccomp syscall filtering. +//! +//! The filter uses a default-allow policy with targeted blocks: +//! +//! 1. **Socket domain blocks** -- prevent raw/kernel sockets that bypass the proxy +//! 2. **Unconditional syscall blocks** -- block syscalls that enable sandbox escape +//! (fileless exec, ptrace, BPF, cross-process memory access, io_uring, mount) +//! 3. **Conditional syscall blocks** -- block dangerous flag combinations on otherwise +//! needed syscalls (execveat+AT_EMPTY_PATH, unshare+CLONE_NEWUSER, +//! seccomp+SET_MODE_FILTER) use crate::policy::{NetworkMode, SandboxPolicy}; use miette::{IntoDiagnostic, Result}; @@ -13,6 +22,9 @@ use std::collections::BTreeMap; use std::convert::TryInto; use tracing::debug; +/// Value of `SECCOMP_SET_MODE_FILTER` (linux/seccomp.h). +const SECCOMP_SET_MODE_FILTER: u64 = 1; + pub fn apply(policy: &SandboxPolicy) -> Result<()> { if matches!(policy.network.mode, NetworkMode::Allow) { return Ok(()); @@ -37,6 +49,7 @@ pub fn apply(policy: &SandboxPolicy) -> Result<()> { fn build_filter(allow_inet: bool) -> Result { let mut rules: BTreeMap> = BTreeMap::new(); + // --- Socket domain blocks --- let mut blocked_domains = vec![libc::AF_PACKET, libc::AF_BLUETOOTH, libc::AF_VSOCK]; if !allow_inet { blocked_domains.push(libc::AF_INET); @@ -49,6 +62,51 @@ fn build_filter(allow_inet: bool) -> Result { add_socket_domain_rule(&mut rules, domain)?; } + // --- Unconditional syscall blocks --- + // These syscalls are blocked entirely (empty rule vec = unconditional EPERM). 
+ + // Fileless binary execution via memfd bypasses Landlock filesystem restrictions. + rules.entry(libc::SYS_memfd_create).or_default(); + // Cross-process memory inspection and code injection. + rules.entry(libc::SYS_ptrace).or_default(); + // Kernel BPF program loading. + rules.entry(libc::SYS_bpf).or_default(); + // Cross-process memory read. + rules.entry(libc::SYS_process_vm_readv).or_default(); + // Async I/O subsystem with extensive CVE history. + rules.entry(libc::SYS_io_uring_setup).or_default(); + // Filesystem mount could subvert Landlock or overlay writable paths. + rules.entry(libc::SYS_mount).or_default(); + + // --- Conditional syscall blocks --- + + // execveat with AT_EMPTY_PATH enables fileless execution from an anonymous fd. + add_masked_arg_rule( + &mut rules, + libc::SYS_execveat, + 4, // flags argument + libc::AT_EMPTY_PATH as u64, + )?; + + // unshare with CLONE_NEWUSER allows creating user namespaces to escalate privileges. + add_masked_arg_rule( + &mut rules, + libc::SYS_unshare, + 0, // flags argument + libc::CLONE_NEWUSER as u64, + )?; + + // seccomp(SECCOMP_SET_MODE_FILTER) would let sandboxed code replace the active filter. + let condition = SeccompCondition::new( + 0, // operation argument + SeccompCmpArgLen::Dword, + SeccompCmpOp::Eq, + SECCOMP_SET_MODE_FILTER, + ) + .into_diagnostic()?; + let rule = SeccompRule::new(vec![condition]).into_diagnostic()?; + rules.entry(libc::SYS_seccomp).or_default().push(rule); + let arch = std::env::consts::ARCH .try_into() .map_err(|_| miette::miette!("Unsupported architecture for seccomp"))?; @@ -74,3 +132,127 @@ fn add_socket_domain_rule(rules: &mut BTreeMap>, domain: i rules.entry(libc::SYS_socket).or_default().push(rule); Ok(()) } + +/// Block a syscall when a specific bit pattern is set in an argument. +/// +/// Uses `MaskedEq` to check `(arg & flag_bit) == flag_bit`, which triggers +/// EPERM when the flag is present regardless of other bits in the argument. 
+fn add_masked_arg_rule( + rules: &mut BTreeMap>, + syscall: i64, + arg_index: u8, + flag_bit: u64, +) -> Result<()> { + let condition = SeccompCondition::new( + arg_index, + SeccompCmpArgLen::Dword, + SeccompCmpOp::MaskedEq(flag_bit), + flag_bit, + ) + .into_diagnostic()?; + let rule = SeccompRule::new(vec![condition]).into_diagnostic()?; + rules.entry(syscall).or_default().push(rule); + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn build_filter_proxy_mode_compiles() { + let filter = build_filter(true); + assert!(filter.is_ok(), "build_filter(true) should succeed"); + } + + #[test] + fn build_filter_block_mode_compiles() { + let filter = build_filter(false); + assert!(filter.is_ok(), "build_filter(false) should succeed"); + } + + #[test] + fn add_masked_arg_rule_creates_entry() { + let mut rules: BTreeMap> = BTreeMap::new(); + let result = add_masked_arg_rule(&mut rules, libc::SYS_execveat, 4, 0x1000); + assert!(result.is_ok()); + assert!( + rules.contains_key(&libc::SYS_execveat), + "should have an entry for SYS_execveat" + ); + assert_eq!( + rules[&libc::SYS_execveat].len(), + 1, + "should have exactly one rule" + ); + } + + #[test] + fn unconditional_blocks_present_in_filter() { + let mut rules: BTreeMap> = BTreeMap::new(); + + // Mirror build_filter's unconditional blocks by hand (build_filter itself is not called here) + rules.entry(libc::SYS_memfd_create).or_default(); + rules.entry(libc::SYS_ptrace).or_default(); + rules.entry(libc::SYS_bpf).or_default(); + rules.entry(libc::SYS_process_vm_readv).or_default(); + rules.entry(libc::SYS_io_uring_setup).or_default(); + rules.entry(libc::SYS_mount).or_default(); + + // Unconditional blocks have an empty Vec (no conditions = always match) + for syscall in [ + libc::SYS_memfd_create, + libc::SYS_ptrace, + libc::SYS_bpf, + libc::SYS_process_vm_readv, + libc::SYS_io_uring_setup, + libc::SYS_mount, + ] { + assert!( + rules.contains_key(&syscall), + "syscall {syscall} should be in the rules map" + ); + assert!( +
rules[&syscall].is_empty(), + "syscall {syscall} should have empty rules (unconditional block)" + ); + } + } + + #[test] + fn conditional_blocks_have_rules() { + // Rebuild the conditional rules by hand (build_filter is not called) and verify each + // syscall has a rule entry (non-empty Vec means conditional match) + let mut rules: BTreeMap> = BTreeMap::new(); + + add_masked_arg_rule( + &mut rules, + libc::SYS_execveat, + 4, + libc::AT_EMPTY_PATH as u64, + ) + .unwrap(); + add_masked_arg_rule(&mut rules, libc::SYS_unshare, 0, libc::CLONE_NEWUSER as u64).unwrap(); + + let condition = SeccompCondition::new( + 0, + SeccompCmpArgLen::Dword, + SeccompCmpOp::Eq, + SECCOMP_SET_MODE_FILTER, + ) + .unwrap(); + let rule = SeccompRule::new(vec![condition]).unwrap(); + rules.entry(libc::SYS_seccomp).or_default().push(rule); + + for syscall in [libc::SYS_execveat, libc::SYS_unshare, libc::SYS_seccomp] { + assert!( + rules.contains_key(&syscall), + "syscall {syscall} should be in the rules map" + ); + assert!( + !rules[&syscall].is_empty(), + "syscall {syscall} should have conditional rules" + ); + } + } +}