From b5f3f381c00bfb51cc6c479803d155b3d95bb753 Mon Sep 17 00:00:00 2001 From: Lucas Vieira Date: Thu, 18 Jun 2026 11:03:15 -0300 Subject: [PATCH] fix(ec2): re-apply SG firewall after restart-recovery and reboot Bug-hunt 2026-06-18 findings 4.1, 4.2. The security-group nft/NetworkPolicy reconcile is event-triggered, but two lifecycle paths didn't fire it: - 4.1: recover_persisted_containers rebuilds containers after a restart but never reconciled. The startup reaper had cleared the previous process's nft table / NetworkPolicies, so with enforcement enabled the recovered instances ran unfiltered until some unrelated later op triggered a reconcile. Now a coordinator task awaits all per-instance recovery tasks and fires one reconcile once they're up. - 4.2: RebootInstances can change an instance's IP (k8s Pod recreate), leaving a stale /32 in peers' SG rules. The reboot bg task now reconciles after the recreate loop. Both gated on network_isolation_enforced() (no-op when enforcement is off). --- crates/fakecloud-ec2/src/service/instance.rs | 7 ++++++ crates/fakecloud-ec2/src/service/mod.rs | 23 +++++++++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/crates/fakecloud-ec2/src/service/instance.rs b/crates/fakecloud-ec2/src/service/instance.rs index b769b4052..34f4dfa66 100644 --- a/crates/fakecloud-ec2/src/service/instance.rs +++ b/crates/fakecloud-ec2/src/service/instance.rs @@ -688,6 +688,13 @@ pub(crate) async fn reboot_instances( } } } + // A reboot can change the instance's IP (k8s Pod recreate), which + // leaves a stale /32 in every peer's security-group rules until an + // unrelated reconcile fires. Re-apply the firewall now (#1745; + // bug-hunt 2026-06-18 finding 4.2). No-op when enforcement is off. + if rt.network_isolation_enforced() { + super::firewall_model::reconcile(&svc_state, &rt).await; + } }); } Ok(Ec2Service::respond( diff --git a/crates/fakecloud-ec2/src/service/mod.rs b/crates/fakecloud-ec2/src/service/mod.rs index 647b1b2f6..7c65dea6b 100644 --- a/crates/fakecloud-ec2/src/service/mod.rs +++ b/crates/fakecloud-ec2/src/service/mod.rs @@ -1016,10 +1016,11 @@ impl Ec2Service { "recovering backing containers for persisted ec2 instances", ); + let mut handles = Vec::new(); for p in pending { let runtime = runtime.clone(); let state = self.state.clone(); - tokio::spawn(async move { + handles.push(tokio::spawn(async move { let running = runtime .run_instance(&p.id, p.user_data.as_deref(), &p.tags, p.network.as_ref()) .await; @@ -1063,6 +1064,26 @@ impl Ec2Service { if reap { runtime.terminate_instance(&p.id).await; } + })); + } + + // Once every instance is back up, (re)apply the security-group + // firewall. The startup reaper cleared the previous process's nft + // table / NetworkPolicies, and the per-instance recovery tasks above + // don't reconcile — without this, recovered instances would run + // unfiltered until some unrelated later op happened to trigger a + // reconcile (#1745; bug-hunt 2026-06-18 finding 4.1). No-op when + // enforcement is disabled. + { + let runtime = runtime.clone(); + let state = self.state.clone(); + tokio::spawn(async move { + for h in handles { + let _ = h.await; + } + if runtime.network_isolation_enforced() { + firewall_model::reconcile(&state, &runtime).await; + } }); } }