diff --git a/crates/fakecloud-ec2/src/runtime/firewall.rs b/crates/fakecloud-ec2/src/runtime/firewall.rs index 27c44886..4bfcae31 100644 --- a/crates/fakecloud-ec2/src/runtime/firewall.rs +++ b/crates/fakecloud-ec2/src/runtime/firewall.rs @@ -261,17 +261,37 @@ fn push_proto_ports(parts: &mut Vec, protocol: &str, from: i64, to: i64) } } } - other => parts.push(format!("ip protocol {other}")), + // An unrecognized protocol is interpolated into the nft script, so + // restrict it to the protocol-token charset `[a-z0-9-]` to avoid + // ruleset injection (finding 2.2); anything else emits no proto match. + other if other.chars().all(|c| c.is_ascii_alphanumeric() || c == '-') => { + parts.push(format!("ip protocol {other}")) + } + _ => {} } } /// Drop `0.0.0.0/0` (which nft rejects as a no-op match) to `None`, and strip a /// redundant `/32` host suffix so single-host rules read cleanly. +/// +/// Also **sanitizes**: the CIDR comes from an Authorize/RevokeSecurityGroup +/// param and is interpolated raw into the `nft -f -` script, so a value +/// containing nft metacharacters (whitespace, `;`, `{`, newline, …) could +/// inject ruleset syntax. Anything outside the IPv4/IPv6-CIDR character set +/// `[0-9a-fA-F.:/]` is rejected to `None` (the match clause is dropped, never +/// the whole rule), closing that injection surface (bug-hunt 2026-06-18 +/// finding 2.2). fn normalized_cidr(cidr: &Option) -> Option { let c = cidr.as_deref()?; if c == "0.0.0.0/0" || c.is_empty() { return None; } + if !c + .chars() + .all(|ch| ch.is_ascii_hexdigit() || matches!(ch, '.' | ':' | '/')) + { + return None; + } Some(c.trim_end_matches("/32").to_string()) } @@ -285,19 +305,27 @@ pub enum EnforcementMode { } /// Decide the enforcement mode from the environment. Enforcement is opt-in: -/// `FAKECLOUD_EC2_SG_ENFORCEMENT` must be set to `1`/`true`/`nftables`, and -/// `nft` must actually be runnable, or we degrade to `Disabled` with a single -/// warning. `env` and `nft_probe` are injected so the decision is unit-testable -/// without touching the real environment or running `nft`. +/// `FAKECLOUD_EC2_SG_ENFORCEMENT` must be set to `1`/`true`/`nftables`, `nft` +/// must be runnable, AND the daemon must run on this host's network namespace +/// (`host_local`). `env`, `host_local`, and `nft_probe` are injected so the +/// decision is unit-testable without touching the environment or running `nft`. +/// +/// `host_local` guards the false-positive on Docker Desktop / podman-machine +/// (macOS/Windows): there the per-subnet bridges live inside the daemon's Linux +/// VM, so `nft` on the host installs rules against the wrong netfilter and +/// silently filters nothing — yet the probe would pass on a Linux box. We treat +/// only a native-Linux host as able to filter (bug-hunt 2026-06-18 finding 1.5), +/// so `enforced` never claims active enforcement that can't take effect. pub fn resolve_enforcement_mode( env: Option<&str>, + host_local: bool, nft_probe: impl FnOnce() -> bool, ) -> EnforcementMode { let opted_in = matches!( env.map(|v| v.to_ascii_lowercase()).as_deref(), Some("1") | Some("true") | Some("nftables") | Some("on") ); - if !opted_in { + if !opted_in || !host_local { return EnforcementMode::Disabled; } if nft_probe() { @@ -307,6 +335,15 @@ pub fn resolve_enforcement_mode( } } +/// Whether the container daemon shares this process's network namespace, so +/// host nftables rules actually see the inter-container traffic. True only on a +/// native-Linux host; Docker Desktop / podman-machine on macOS/Windows run the +/// daemon in a separate Linux VM. (Honest default; can be overridden by the +/// caller when fakecloud and the daemon are known to share a netns.) +pub fn host_shares_daemon_netns() -> bool { + cfg!(target_os = "linux") +} + /// True when `nft list ruleset` runs successfully — i.e. nft exists and this /// process holds enough capability to read the ruleset (a good proxy for being /// able to write it). @@ -432,6 +469,38 @@ mod tests { .contains("ip saddr 203.0.113.7 tcp dport 22")); } + #[test] + fn cidr_with_nft_metacharacters_is_dropped_not_injected() { + // A CIDR carrying nft syntax (`;`, spaces, words) must never reach the + // `nft -f -` script (finding 2.2). The match clause is omitted; the + // rule still renders safely and terminates in `accept`. + let r = tcp(22, Some("10.0.0.0/8; drop comment \"x\"")); + let line = render_rule(&r, Direction::Ingress, "172.30.0.2"); + assert!(!line.contains(';'), "no injected semicolon: {line}"); + assert!(!line.contains("comment"), "no injected tokens: {line}"); + assert!( + !line.contains("ip saddr"), + "malformed cidr clause omitted: {line}" + ); + assert!(line.ends_with("accept"), "rule still valid: {line}"); + } + + #[test] + fn unknown_protocol_with_bad_chars_emits_no_proto_match() { + let r = FirewallRule { + protocol: "tcp; drop".into(), + from_port: -1, + to_port: -1, + cidr: None, + }; + let line = render_rule(&r, Direction::Ingress, "172.30.0.2"); + assert!( + !line.contains(';') && !line.contains("ip protocol"), + "{line}" + ); + assert_eq!(line, "ip daddr 172.30.0.2 accept"); + } + #[test] fn nacl_deny_emitted_before_instance_rules() { let model = vec![SubnetFirewall { @@ -504,27 +573,33 @@ mod tests { #[test] fn enforcement_mode_is_opt_in_and_capability_gated() { - // not opted in -> disabled regardless of nft availability + // not opted in -> disabled regardless of nft availability / host assert_eq!( - resolve_enforcement_mode(None, || true), + resolve_enforcement_mode(None, true, || true), EnforcementMode::Disabled ); assert_eq!( - resolve_enforcement_mode(Some("0"), || true), + resolve_enforcement_mode(Some("0"), true, || true), EnforcementMode::Disabled ); // opted in but nft missing -> degrade assert_eq!( - resolve_enforcement_mode(Some("1"), || false), + resolve_enforcement_mode(Some("1"), true, || false), + EnforcementMode::Disabled + ); + // opted in + capable but daemon not host-local (Docker Desktop/VM) -> + // degrade rather than falsely claim enforced (finding 1.5) + assert_eq!( + resolve_enforcement_mode(Some("1"), false, || true), EnforcementMode::Disabled ); - // opted in + capable -> nftables + // opted in + host-local + capable -> nftables assert_eq!( - resolve_enforcement_mode(Some("nftables"), || true), + resolve_enforcement_mode(Some("nftables"), true, || true), EnforcementMode::Nftables ); assert_eq!( - resolve_enforcement_mode(Some("TRUE"), || true), + resolve_enforcement_mode(Some("TRUE"), true, || true), EnforcementMode::Nftables ); } diff --git a/crates/fakecloud-ec2/src/runtime/k8s.rs b/crates/fakecloud-ec2/src/runtime/k8s.rs index 0e085624..cf3ee05f 100644 --- a/crates/fakecloud-ec2/src/runtime/k8s.rs +++ b/crates/fakecloud-ec2/src/runtime/k8s.rs @@ -73,7 +73,7 @@ impl K8sInstances { ); let pod_config = K8sPodConfig::resolved_base("FAKECLOUD_EC2_K8S")?; // Detect the CNI so we can warn when NetworkPolicy won't be enforced. - let cni = CniDriver::from_components(client.kube_system_pod_names().await); + let cni = CniDriver::from_components(client.cni_component_names().await); if cni.enforces() { tracing::info!(?cni, "k8s CNI enforces NetworkPolicy; EC2 security groups will be applied as NetworkPolicies"); } else { diff --git a/crates/fakecloud-ec2/src/runtime/mod.rs b/crates/fakecloud-ec2/src/runtime/mod.rs index 8284b699..c6721575 100644 --- a/crates/fakecloud-ec2/src/runtime/mod.rs +++ b/crates/fakecloud-ec2/src/runtime/mod.rs @@ -160,12 +160,18 @@ impl FirewallEnforcer { /// can't be backed (so the operator knows it degraded, not silently). fn detect() -> Self { let requested = std::env::var("FAKECLOUD_EC2_SG_ENFORCEMENT").ok(); - let mode = resolve_enforcement_mode(requested.as_deref(), firewall::nft_available); + let mode = resolve_enforcement_mode( + requested.as_deref(), + firewall::host_shares_daemon_netns(), + firewall::nft_available, + ); if requested.is_some() && mode == EnforcementMode::Disabled { tracing::warn!( - "EC2 security-group enforcement was requested but nftables/CAP_NET_ADMIN \ - is unavailable; falling back to metadata-only (phase-2 L3 isolation stays \ - active, security-group rules are tracked but not enforced)" + "EC2 security-group enforcement was requested but it can't take effect here \ + (needs nftables + CAP_NET_ADMIN on a native-Linux host whose daemon shares this \ + network namespace — Docker Desktop / podman-machine run the daemon in a VM); \ + falling back to metadata-only (phase-2 L3 isolation stays active, security-group \ + rules are tracked but not enforced)" ); } else if mode == EnforcementMode::Nftables { tracing::info!("EC2 security-group enforcement active via nftables"); @@ -241,6 +247,13 @@ pub struct Ec2Runtime { instances: Arc>>, /// Host firewall enforcer for security groups + NACLs. firewall: FirewallEnforcer, + /// Serializes firewall reconciles. Reconcile is fired from many concurrent + /// background tasks (per SG/NACL/lifecycle event); without this, two + /// reconciles built from divergent state could interleave so the k8s + /// apply+prune of one deletes a policy the other just applied (bug-hunt + /// 2026-06-18 finding 4.3). Holding it across the whole reconcile makes the + /// last-started reconcile the last-applied for both backends. + reconcile_lock: Arc>, } impl Ec2Runtime { @@ -255,6 +268,7 @@ impl Ec2Runtime { }), instances: Arc::new(RwLock::new(HashMap::new())), firewall: FirewallEnforcer::detect(), + reconcile_lock: Arc::new(tokio::sync::Mutex::new(())), }) } @@ -268,6 +282,7 @@ impl Ec2Runtime { instances: Arc::new(RwLock::new(HashMap::new())), // k8s isolation is a NetworkPolicy concern (phase 4), not host nft. firewall: FirewallEnforcer::disabled(), + reconcile_lock: Arc::new(tokio::sync::Mutex::new(())), }) } @@ -279,7 +294,9 @@ impl Ec2Runtime { /// Re-render and atomically apply the security-group/NACL ruleset for the /// given per-subnet model. No-op (cheap) when enforcement is disabled. + /// Serialized against other reconciles (finding 4.3). pub async fn reconcile_firewall(&self, subnets: Vec) { + let _guard = self.reconcile_lock.lock().await; self.firewall.reconcile(&subnets).await; } @@ -296,9 +313,12 @@ impl Ec2Runtime { } /// Apply one NetworkPolicy per instance for the k8s backend. No-op on the - /// Docker backend (which uses nftables instead). + /// Docker backend (which uses nftables instead). Serialized against other + /// reconciles so a concurrent apply+prune can't delete a just-applied + /// policy (finding 4.3). pub async fn reconcile_network_policies(&self, rules: Vec) { if let InstanceBackend::K8s(k) = &self.backend { + let _guard = self.reconcile_lock.lock().await; k.reconcile_network_policies(&rules).await; } } diff --git a/crates/fakecloud-ec2/src/service/instance.rs b/crates/fakecloud-ec2/src/service/instance.rs index 34f4dfa6..dd9bf69f 100644 --- a/crates/fakecloud-ec2/src/service/instance.rs +++ b/crates/fakecloud-ec2/src/service/instance.rs @@ -32,6 +32,19 @@ fn state_xml(tag: &str, code: i64, name: &str) -> String { format!("<{tag}>{code}{name}") } +/// First three octets of a subnet CIDR's network address, e.g. +/// `172.31.16.0/20 -> "172.31.16"`, so a synthesized private IP lands inside +/// the subnet. Falls back to `10.0.0` for a non-IPv4-CIDR input. +fn subnet_ip_prefix(cidr: &str) -> String { + let addr = cidr.split('/').next().unwrap_or(cidr); + let octets: Vec<&str> = addr.split('.').collect(); + if octets.len() == 4 && octets.iter().all(|o| o.parse::().is_ok()) { + format!("{}.{}.{}", octets[0], octets[1], octets[2]) + } else { + "10.0.0".to_string() + } +} + /// Map of `security-group-id -> group-name` for the whole state, so /// DescribeInstances can render the real `groupName` (AWS returns the name, not /// the id) instead of echoing the id back. @@ -244,7 +257,7 @@ pub(crate) async fn run_instances( .get("NetworkInterface.1.AssociatePublicIpAddress") .or_else(|| req.query_params.get("AssociatePublicIpAddress")) .map(|v| v == "true"); - let (vpc_id, subnet_auto_public, instance_network) = { + let (vpc_id, subnet_auto_public, instance_network, ip_prefix) = { let accounts = svc.state.read(); let empty = Ec2State::new(&req.account_id, &req.region); let state = accounts.get(&req.account_id).unwrap_or(&empty); @@ -304,7 +317,17 @@ pub(crate) async fn run_instances( // launch, which assigns public IPs by default. None => (Some(crate::defaults::default_vpc_id(&req.account_id)), true), }; - (vpc, auto_public, instance_network) + // Metadata-only private IP base, derived from the resolved subnet's + // CIDR so DescribeInstances reports an IP inside the subnet (was a + // hard-coded 10.0.0.x outside the subnet — bug-hunt finding 1.7). A + // real container-backed instance overwrites this with its true bridge + // IP once running. + let ip_prefix = subnet_id + .as_ref() + .and_then(|sid| state.subnets.get(sid)) + .map(|s| subnet_ip_prefix(&s.cidr_block)) + .unwrap_or_else(|| "10.0.0".to_string()); + (vpc, auto_public, instance_network, ip_prefix) }; let assign_public = assoc_public.unwrap_or(subnet_auto_public); @@ -328,7 +351,7 @@ pub(crate) async fn run_instances( instance_type: instance_type.clone(), state_code: 0, state_name: "pending".to_string(), - private_ip: format!("10.0.0.{}", 10 + idx), + private_ip: format!("{ip_prefix}.{}", 10 + idx), public_ip: if assign_public { Some(format!("52.0.0.{}", 10 + idx)) } else { @@ -1543,3 +1566,25 @@ pub(crate) fn describe_instance_topology( &ec2_list("instanceSet", &[]), )) } + +#[cfg(test)] +mod tests { + use super::subnet_ip_prefix; + + #[test] + fn subnet_ip_prefix_uses_subnet_network() { + // The synthesized metadata IP must land inside the subnet (finding 1.7). + assert_eq!(subnet_ip_prefix("172.31.16.0/20"), "172.31.16"); + assert_eq!(subnet_ip_prefix("10.0.5.0/24"), "10.0.5"); + // bare address (no mask) still works + assert_eq!(subnet_ip_prefix("192.168.1.0"), "192.168.1"); + } + + #[test] + fn subnet_ip_prefix_falls_back_on_garbage() { + assert_eq!(subnet_ip_prefix(""), "10.0.0"); + assert_eq!(subnet_ip_prefix("not-a-cidr"), "10.0.0"); + // IPv6 / non-dotted-quad falls back rather than producing nonsense + assert_eq!(subnet_ip_prefix("fd00::/8"), "10.0.0"); + } +} diff --git a/crates/fakecloud-k8s/src/client.rs b/crates/fakecloud-k8s/src/client.rs index f11e066a..6c996322 100644 --- a/crates/fakecloud-k8s/src/client.rs +++ b/crates/fakecloud-k8s/src/client.rs @@ -431,23 +431,31 @@ impl K8sClient { } } - /// Best-effort detection of the cluster CNI from `kube-system` Pod names - /// (e.g. `calico-node-*`, `cilium-*`, `kindnet-*`). Returns the matched - /// component names; the caller maps them to a driver. An empty result - /// (list failed or no recognizable CNI) maps to "unknown". - pub async fn kube_system_pod_names(&self) -> Vec { - let api: Api = Api::namespaced(self.client.clone(), "kube-system"); - match api.list(&ListParams::default()).await { - Ok(list) => list - .items - .into_iter() - .filter_map(|p| p.metadata.name) - .collect(), - Err(e) => { - tracing::debug!(error = %e, "k8s CNI detect: list kube-system pods failed"); - Vec::new() + /// Best-effort detection of the cluster CNI from Pod names across the + /// namespaces CNIs commonly install into (e.g. `calico-node-*`, `cilium-*`, + /// `kindnet-*`). Returns the matched component names; the caller maps them + /// to a driver. An empty result (lists failed or no recognizable CNI) maps + /// to "unknown". + /// + /// Scans `kube-system` plus the operator namespaces Calico/Cilium use + /// (`calico-system`, `tigera-operator`, `cilium`) so a Tigera-operator or + /// dedicated-namespace install isn't mis-reported as non-enforcing + /// (bug-hunt 2026-06-18 finding 1.6). Per-namespace list errors (RBAC / + /// absent namespace) are swallowed. + pub async fn cni_component_names(&self) -> Vec { + const CNI_NAMESPACES: [&str; 4] = + ["kube-system", "calico-system", "tigera-operator", "cilium"]; + let mut names = Vec::new(); + for ns in CNI_NAMESPACES { + let api: Api = Api::namespaced(self.client.clone(), ns); + match api.list(&ListParams::default()).await { + Ok(list) => names.extend(list.items.into_iter().filter_map(|p| p.metadata.name)), + Err(e) => { + tracing::debug!(namespace = ns, error = %e, "k8s CNI detect: list pods failed"); + } } } + names } }