Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 88 additions & 13 deletions crates/fakecloud-ec2/src/runtime/firewall.rs
Original file line number Diff line number Diff line change
Expand Up @@ -261,17 +261,37 @@ fn push_proto_ports(parts: &mut Vec<String>, protocol: &str, from: i64, to: i64)
}
}
}
other => parts.push(format!("ip protocol {other}")),
// An unrecognized protocol is interpolated into the nft script, so
// restrict it to the protocol-token charset `[a-z0-9-]` to avoid
// ruleset injection (finding 2.2); anything else emits no proto match.
other if other.chars().all(|c| c.is_ascii_alphanumeric() || c == '-') => {
parts.push(format!("ip protocol {other}"))
}
_ => {}
}
}

/// Drop `0.0.0.0/0` (which nft rejects as a no-op match) to `None`, and strip a
/// redundant `/32` host suffix so single-host rules read cleanly.
///
/// Also **sanitizes**: the CIDR comes from an Authorize/RevokeSecurityGroup
/// param and is interpolated raw into the `nft -f -` script, so a value
/// containing nft metacharacters (whitespace, `;`, `{`, newline, …) could
/// inject ruleset syntax. Anything outside the IPv4/IPv6-CIDR character set
/// `[0-9a-fA-F.:/]` is rejected to `None` (the match clause is dropped, never
/// the whole rule), closing that injection surface (bug-hunt 2026-06-18
/// finding 2.2).
fn normalized_cidr(cidr: &Option<String>) -> Option<String> {
let c = cidr.as_deref()?;
if c == "0.0.0.0/0" || c.is_empty() {
return None;
}
if !c
.chars()
.all(|ch| ch.is_ascii_hexdigit() || matches!(ch, '.' | ':' | '/'))
{
return None;
}
Some(c.trim_end_matches("/32").to_string())
}

Expand All @@ -285,19 +305,27 @@ pub enum EnforcementMode {
}

/// Decide the enforcement mode from the environment. Enforcement is opt-in:
/// `FAKECLOUD_EC2_SG_ENFORCEMENT` must be set to `1`/`true`/`nftables`, and
/// `nft` must actually be runnable, or we degrade to `Disabled` with a single
/// warning. `env` and `nft_probe` are injected so the decision is unit-testable
/// without touching the real environment or running `nft`.
/// `FAKECLOUD_EC2_SG_ENFORCEMENT` must be set to `1`/`true`/`nftables`, `nft`
/// must be runnable, AND the daemon must run on this host's network namespace
/// (`host_local`). `env`, `host_local`, and `nft_probe` are injected so the
/// decision is unit-testable without touching the environment or running `nft`.
///
/// `host_local` guards the false-positive on Docker Desktop / podman-machine
/// (macOS/Windows): there the per-subnet bridges live inside the daemon's Linux
/// VM, so `nft` on the host installs rules against the wrong netfilter and
/// silently filters nothing — yet the probe would pass on a Linux box. We treat
/// only a native-Linux host as able to filter (bug-hunt 2026-06-18 finding 1.5),
/// so `enforced` never claims active enforcement that can't take effect.
pub fn resolve_enforcement_mode(
env: Option<&str>,
host_local: bool,
nft_probe: impl FnOnce() -> bool,
) -> EnforcementMode {
let opted_in = matches!(
env.map(|v| v.to_ascii_lowercase()).as_deref(),
Some("1") | Some("true") | Some("nftables") | Some("on")
);
if !opted_in {
if !opted_in || !host_local {
return EnforcementMode::Disabled;
}
if nft_probe() {
Expand All @@ -307,6 +335,15 @@ pub fn resolve_enforcement_mode(
}
}

/// Whether the container daemon shares this process's network namespace, so
/// host nftables rules actually see the inter-container traffic. True only on a
/// native-Linux host; Docker Desktop / podman-machine on macOS/Windows run the
/// daemon in a separate Linux VM. (Honest default; can be overridden by the
/// caller when fakecloud and the daemon are known to share a netns.)
pub fn host_shares_daemon_netns() -> bool {
cfg!(target_os = "linux")
}

/// True when `nft list ruleset` runs successfully — i.e. nft exists and this
/// process holds enough capability to read the ruleset (a good proxy for being
/// able to write it).
Expand Down Expand Up @@ -432,6 +469,38 @@ mod tests {
.contains("ip saddr 203.0.113.7 tcp dport 22"));
}

#[test]
fn cidr_with_nft_metacharacters_is_dropped_not_injected() {
// A CIDR carrying nft syntax (`;`, spaces, words) must never reach the
// `nft -f -` script (finding 2.2). The match clause is omitted; the
// rule still renders safely and terminates in `accept`.
let r = tcp(22, Some("10.0.0.0/8; drop comment \"x\""));
let line = render_rule(&r, Direction::Ingress, "172.30.0.2");
assert!(!line.contains(';'), "no injected semicolon: {line}");
assert!(!line.contains("comment"), "no injected tokens: {line}");
assert!(
!line.contains("ip saddr"),
"malformed cidr clause omitted: {line}"
);
assert!(line.ends_with("accept"), "rule still valid: {line}");
}

#[test]
fn unknown_protocol_with_bad_chars_emits_no_proto_match() {
let r = FirewallRule {
protocol: "tcp; drop".into(),
from_port: -1,
to_port: -1,
cidr: None,
};
let line = render_rule(&r, Direction::Ingress, "172.30.0.2");
assert!(
!line.contains(';') && !line.contains("ip protocol"),
"{line}"
);
assert_eq!(line, "ip daddr 172.30.0.2 accept");
}

#[test]
fn nacl_deny_emitted_before_instance_rules() {
let model = vec![SubnetFirewall {
Expand Down Expand Up @@ -504,27 +573,33 @@ mod tests {

#[test]
fn enforcement_mode_is_opt_in_and_capability_gated() {
// not opted in -> disabled regardless of nft availability
// not opted in -> disabled regardless of nft availability / host
assert_eq!(
resolve_enforcement_mode(None, || true),
resolve_enforcement_mode(None, true, || true),
EnforcementMode::Disabled
);
assert_eq!(
resolve_enforcement_mode(Some("0"), || true),
resolve_enforcement_mode(Some("0"), true, || true),
EnforcementMode::Disabled
);
// opted in but nft missing -> degrade
assert_eq!(
resolve_enforcement_mode(Some("1"), || false),
resolve_enforcement_mode(Some("1"), true, || false),
EnforcementMode::Disabled
);
// opted in + capable but daemon not host-local (Docker Desktop/VM) ->
// degrade rather than falsely claim enforced (finding 1.5)
assert_eq!(
resolve_enforcement_mode(Some("1"), false, || true),
EnforcementMode::Disabled
);
// opted in + capable -> nftables
// opted in + host-local + capable -> nftables
assert_eq!(
resolve_enforcement_mode(Some("nftables"), || true),
resolve_enforcement_mode(Some("nftables"), true, || true),
EnforcementMode::Nftables
);
assert_eq!(
resolve_enforcement_mode(Some("TRUE"), || true),
resolve_enforcement_mode(Some("TRUE"), true, || true),
EnforcementMode::Nftables
);
}
Expand Down
2 changes: 1 addition & 1 deletion crates/fakecloud-ec2/src/runtime/k8s.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ impl K8sInstances {
);
let pod_config = K8sPodConfig::resolved_base("FAKECLOUD_EC2_K8S")?;
// Detect the CNI so we can warn when NetworkPolicy won't be enforced.
let cni = CniDriver::from_components(client.kube_system_pod_names().await);
let cni = CniDriver::from_components(client.cni_component_names().await);
if cni.enforces() {
tracing::info!(?cni, "k8s CNI enforces NetworkPolicy; EC2 security groups will be applied as NetworkPolicies");
} else {
Expand Down
30 changes: 25 additions & 5 deletions crates/fakecloud-ec2/src/runtime/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,12 +160,18 @@ impl FirewallEnforcer {
/// can't be backed (so the operator knows it degraded, not silently).
fn detect() -> Self {
let requested = std::env::var("FAKECLOUD_EC2_SG_ENFORCEMENT").ok();
let mode = resolve_enforcement_mode(requested.as_deref(), firewall::nft_available);
let mode = resolve_enforcement_mode(
requested.as_deref(),
firewall::host_shares_daemon_netns(),
firewall::nft_available,
);
if requested.is_some() && mode == EnforcementMode::Disabled {
tracing::warn!(
"EC2 security-group enforcement was requested but nftables/CAP_NET_ADMIN \
is unavailable; falling back to metadata-only (phase-2 L3 isolation stays \
active, security-group rules are tracked but not enforced)"
"EC2 security-group enforcement was requested but it can't take effect here \
(needs nftables + CAP_NET_ADMIN on a native-Linux host whose daemon shares this \
network namespace — Docker Desktop / podman-machine run the daemon in a VM); \
falling back to metadata-only (phase-2 L3 isolation stays active, security-group \
rules are tracked but not enforced)"
);
} else if mode == EnforcementMode::Nftables {
tracing::info!("EC2 security-group enforcement active via nftables");
Expand Down Expand Up @@ -241,6 +247,13 @@ pub struct Ec2Runtime {
instances: Arc<RwLock<HashMap<String, InstanceRecord>>>,
/// Host firewall enforcer for security groups + NACLs.
firewall: FirewallEnforcer,
/// Serializes firewall reconciles. Reconcile is fired from many concurrent
/// background tasks (per SG/NACL/lifecycle event); without this, two
/// reconciles built from divergent state could interleave so the k8s
/// apply+prune of one deletes a policy the other just applied (bug-hunt
/// 2026-06-18 finding 4.3). Holding it across the whole reconcile makes the
/// last-started reconcile the last-applied for both backends.
reconcile_lock: Arc<tokio::sync::Mutex<()>>,
}

impl Ec2Runtime {
Expand All @@ -255,6 +268,7 @@ impl Ec2Runtime {
}),
instances: Arc::new(RwLock::new(HashMap::new())),
firewall: FirewallEnforcer::detect(),
reconcile_lock: Arc::new(tokio::sync::Mutex::new(())),
})
}

Expand All @@ -268,6 +282,7 @@ impl Ec2Runtime {
instances: Arc::new(RwLock::new(HashMap::new())),
// k8s isolation is a NetworkPolicy concern (phase 4), not host nft.
firewall: FirewallEnforcer::disabled(),
reconcile_lock: Arc::new(tokio::sync::Mutex::new(())),
})
}

Expand All @@ -279,7 +294,9 @@ impl Ec2Runtime {

/// Re-render and atomically apply the security-group/NACL ruleset for the
/// given per-subnet model. No-op (cheap) when enforcement is disabled.
/// Serialized against other reconciles (finding 4.3).
pub async fn reconcile_firewall(&self, subnets: Vec<SubnetFirewall>) {
let _guard = self.reconcile_lock.lock().await;
self.firewall.reconcile(&subnets).await;
}

Expand All @@ -296,9 +313,12 @@ impl Ec2Runtime {
}

/// Apply one NetworkPolicy per instance for the k8s backend. No-op on the
/// Docker backend (which uses nftables instead).
/// Docker backend (which uses nftables instead). Serialized against other
/// reconciles so a concurrent apply+prune can't delete a just-applied
/// policy (finding 4.3).
pub async fn reconcile_network_policies(&self, rules: Vec<InstanceRules>) {
if let InstanceBackend::K8s(k) = &self.backend {
let _guard = self.reconcile_lock.lock().await;
k.reconcile_network_policies(&rules).await;
}
}
Expand Down
51 changes: 48 additions & 3 deletions crates/fakecloud-ec2/src/service/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,19 @@ fn state_xml(tag: &str, code: i64, name: &str) -> String {
format!("<{tag}><code>{code}</code><name>{name}</name></{tag}>")
}

/// First three octets of a subnet CIDR's network address, e.g.
/// `172.31.16.0/20 -> "172.31.16"`, so a synthesized private IP lands inside
/// the subnet. Falls back to `10.0.0` for a non-IPv4-CIDR input.
fn subnet_ip_prefix(cidr: &str) -> String {
let addr = cidr.split('/').next().unwrap_or(cidr);
let octets: Vec<&str> = addr.split('.').collect();
if octets.len() == 4 && octets.iter().all(|o| o.parse::<u8>().is_ok()) {
format!("{}.{}.{}", octets[0], octets[1], octets[2])
} else {
"10.0.0".to_string()
}
}

/// Map of `security-group-id -> group-name` for the whole state, so
/// DescribeInstances can render the real `groupName` (AWS returns the name, not
/// the id) instead of echoing the id back.
Expand Down Expand Up @@ -244,7 +257,7 @@ pub(crate) async fn run_instances(
.get("NetworkInterface.1.AssociatePublicIpAddress")
.or_else(|| req.query_params.get("AssociatePublicIpAddress"))
.map(|v| v == "true");
let (vpc_id, subnet_auto_public, instance_network) = {
let (vpc_id, subnet_auto_public, instance_network, ip_prefix) = {
let accounts = svc.state.read();
let empty = Ec2State::new(&req.account_id, &req.region);
let state = accounts.get(&req.account_id).unwrap_or(&empty);
Expand Down Expand Up @@ -304,7 +317,17 @@ pub(crate) async fn run_instances(
// launch, which assigns public IPs by default.
None => (Some(crate::defaults::default_vpc_id(&req.account_id)), true),
};
(vpc, auto_public, instance_network)
// Metadata-only private IP base, derived from the resolved subnet's
// CIDR so DescribeInstances reports an IP inside the subnet (was a
// hard-coded 10.0.0.x outside the subnet — bug-hunt finding 1.7). A
// real container-backed instance overwrites this with its true bridge
// IP once running.
let ip_prefix = subnet_id
.as_ref()
.and_then(|sid| state.subnets.get(sid))
.map(|s| subnet_ip_prefix(&s.cidr_block))
.unwrap_or_else(|| "10.0.0".to_string());
(vpc, auto_public, instance_network, ip_prefix)
};
let assign_public = assoc_public.unwrap_or(subnet_auto_public);

Expand All @@ -328,7 +351,7 @@ pub(crate) async fn run_instances(
instance_type: instance_type.clone(),
state_code: 0,
state_name: "pending".to_string(),
private_ip: format!("10.0.0.{}", 10 + idx),
private_ip: format!("{ip_prefix}.{}", 10 + idx),
public_ip: if assign_public {
Some(format!("52.0.0.{}", 10 + idx))
} else {
Expand Down Expand Up @@ -1543,3 +1566,25 @@ pub(crate) fn describe_instance_topology(
&ec2_list("instanceSet", &[]),
))
}

#[cfg(test)]
mod tests {
use super::subnet_ip_prefix;

#[test]
fn subnet_ip_prefix_uses_subnet_network() {
// The synthesized metadata IP must land inside the subnet (finding 1.7).
assert_eq!(subnet_ip_prefix("172.31.16.0/20"), "172.31.16");
assert_eq!(subnet_ip_prefix("10.0.5.0/24"), "10.0.5");
// bare address (no mask) still works
assert_eq!(subnet_ip_prefix("192.168.1.0"), "192.168.1");
}

#[test]
fn subnet_ip_prefix_falls_back_on_garbage() {
assert_eq!(subnet_ip_prefix(""), "10.0.0");
assert_eq!(subnet_ip_prefix("not-a-cidr"), "10.0.0");
// IPv6 / non-dotted-quad falls back rather than producing nonsense
assert_eq!(subnet_ip_prefix("fd00::/8"), "10.0.0");
}
}
38 changes: 23 additions & 15 deletions crates/fakecloud-k8s/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -431,23 +431,31 @@ impl K8sClient {
}
}

/// Best-effort detection of the cluster CNI from `kube-system` Pod names
/// (e.g. `calico-node-*`, `cilium-*`, `kindnet-*`). Returns the matched
/// component names; the caller maps them to a driver. An empty result
/// (list failed or no recognizable CNI) maps to "unknown".
pub async fn kube_system_pod_names(&self) -> Vec<String> {
let api: Api<Pod> = Api::namespaced(self.client.clone(), "kube-system");
match api.list(&ListParams::default()).await {
Ok(list) => list
.items
.into_iter()
.filter_map(|p| p.metadata.name)
.collect(),
Err(e) => {
tracing::debug!(error = %e, "k8s CNI detect: list kube-system pods failed");
Vec::new()
/// Best-effort detection of the cluster CNI from Pod names across the
/// namespaces CNIs commonly install into (e.g. `calico-node-*`, `cilium-*`,
/// `kindnet-*`). Returns the matched component names; the caller maps them
/// to a driver. An empty result (lists failed or no recognizable CNI) maps
/// to "unknown".
///
/// Scans `kube-system` plus the operator namespaces Calico/Cilium use
/// (`calico-system`, `tigera-operator`, `cilium`) so a Tigera-operator or
/// dedicated-namespace install isn't mis-reported as non-enforcing
/// (bug-hunt 2026-06-18 finding 1.6). Per-namespace list errors (RBAC /
/// absent namespace) are swallowed.
pub async fn cni_component_names(&self) -> Vec<String> {
const CNI_NAMESPACES: [&str; 4] =
["kube-system", "calico-system", "tigera-operator", "cilium"];
let mut names = Vec::new();
for ns in CNI_NAMESPACES {
let api: Api<Pod> = Api::namespaced(self.client.clone(), ns);
match api.list(&ListParams::default()).await {
Ok(list) => names.extend(list.items.into_iter().filter_map(|p| p.metadata.name)),
Err(e) => {
tracing::debug!(namespace = ns, error = %e, "k8s CNI detect: list pods failed");
}
}
}
names
}
}

Expand Down
Loading