From 1d081af01474951c98e5a15a8178498fba97f526 Mon Sep 17 00:00:00 2001 From: Gerrit Date: Tue, 10 Feb 2026 16:00:42 +0100 Subject: [PATCH 1/8] Re-iterate status evaluation. --- api/v2/types_firewall.go | 2 +- controllers/firewall/reconcile.go | 5 - controllers/firewall/status.go | 12 +- controllers/set/delete.go | 8 +- controllers/set/status.go | 148 +++++++++++++++--------- controllers/set/status_test.go | 185 ++++++++++++++++++++++++++++++ 6 files changed, 292 insertions(+), 68 deletions(-) create mode 100644 controllers/set/status_test.go diff --git a/api/v2/types_firewall.go b/api/v2/types_firewall.go index d6fc16d..bd2da5c 100644 --- a/api/v2/types_firewall.go +++ b/api/v2/types_firewall.go @@ -185,7 +185,7 @@ const ( FirewallDistanceConfigured ConditionType = "Distance" // FirewallProvisioned indicates that all health conditions have been met at least once. // Once set to true, it stays true and is used to detect condition degradation. - FirewallHealthy ConditionType = "Healthy" + FirewallProvisioned ConditionType = "Provisioned" ) // ShootAccess contains secret references to construct a shoot client in the firewall-controller to update its firewall monitor. diff --git a/controllers/firewall/reconcile.go b/controllers/firewall/reconcile.go index 42467b0..26170ae 100644 --- a/controllers/firewall/reconcile.go +++ b/controllers/firewall/reconcile.go @@ -30,11 +30,6 @@ func (c *controller) Reconcile(r *controllers.Ctx[*v2.Firewall]) error { } SetFirewallStatusFromMonitor(r.Target, mon) - - if isAllConditionsMet(r.Target) { - cond := v2.NewCondition(v2.FirewallHealthy, v2.ConditionTrue, "Healthy", "All firewall conditions have been met.") - r.Target.Status.Conditions.Set(cond) - } }() fws, err := c.firewallCache.Get(r.Ctx, r.Target) diff --git a/controllers/firewall/status.go b/controllers/firewall/status.go index b5b1b0f..5473377 100644 --- a/controllers/firewall/status.go +++ b/controllers/firewall/status.go @@ -129,6 +129,11 @@ func SetFirewallStatusFromMonitor(fw *v2.Firewall, mon *v2.FirewallMonitor) { cond = v2.NewCondition(v2.FirewallDistanceConfigured, v2.ConditionTrue, "NotChecking", "Not checking distance due to firewall annotation.") fw.Status.Conditions.Set(cond) + if isProvisioned(fw) { + cond := v2.NewCondition(v2.FirewallProvisioned, v2.ConditionTrue, "Provisioned", "All firewall conditions have been met.") + fw.Status.Conditions.Set(cond) + } + return } @@ -190,9 +195,14 @@ func SetFirewallStatusFromMonitor(fw *v2.Firewall, mon *v2.FirewallMonitor) { cond := v2.NewCondition(v2.FirewallDistanceConfigured, v2.ConditionFalse, "NotConfigured", fmt.Sprintf("Controller has configured distance %d, but %d is specified.", connection.ActualDistance, fw.Distance)) fw.Status.Conditions.Set(cond) } + + if isProvisioned(fw) { + cond := v2.NewCondition(v2.FirewallProvisioned, v2.ConditionTrue, "Provisioned", "All firewall conditions have been met.") + fw.Status.Conditions.Set(cond) + } } -func isAllConditionsMet(fw *v2.Firewall) bool { +func isProvisioned(fw *v2.Firewall) bool { for _, ct := range []v2.ConditionType{ v2.FirewallCreated, v2.FirewallReady, diff --git a/controllers/set/delete.go b/controllers/set/delete.go index 14cb3a4..d6385c9 100644 --- a/controllers/set/delete.go +++ b/controllers/set/delete.go @@ -44,15 +44,13 @@ func (c *controller) deleteFirewalls(r *controllers.Ctx[*v2.FirewallSet], fws .. } func (c *controller) deleteIfUnhealthyOrTimeout(r *controllers.Ctx[*v2.FirewallSet], fws ...*v2.Firewall) ([]*v2.Firewall, error) { var result []*v2.Firewall - createTimeout := c.c.GetCreateTimeout() - healthTimeout := c.c.GetFirewallHealthTimeout() for _, fw := range fws { status := c.evaluateFirewallConditions(fw) - switch { - case (createTimeout > 0 && status.CreateTimeout) || (healthTimeout > 0 && status.HealthTimeout): - r.Log.Info("firewall health or creation timeout exceeded, deleting from set", "firewall-name", fw.Name) + switch status { + case statusCreateTimeout, statusHealthTimeout: + r.Log.Info("firewall creation or health timeout exceeded, deleting from set", "firewall-name", fw.Name) err := c.deleteFirewalls(r, fw) if err != nil { diff --git a/controllers/set/status.go b/controllers/set/status.go index 5098d57..11d3ade 100644 --- a/controllers/set/status.go +++ b/controllers/set/status.go @@ -8,59 +8,83 @@ import ( "github.com/metal-stack/metal-lib/pkg/pointer" ) -type firewallConditionStatus struct { - IsReady bool - CreateTimeout bool - HealthTimeout bool -} +type status string -func (c *controller) evaluateFirewallConditions(fw *v2.Firewall) firewallConditionStatus { - var ( - unhealthyTimeout = c.c.GetFirewallHealthTimeout() - allocationTimeout = c.c.GetCreateTimeout() - - created = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallCreated)).Status == v2.ConditionTrue - ready = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallReady)).Status == v2.ConditionTrue - connected = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallControllerConnected)).Status == v2.ConditionTrue - seedConnected = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallControllerSeedConnected)).Status == v2.ConditionTrue - distanceConfigured = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallDistanceConfigured)).Status == v2.ConditionTrue - allConditionsMet = created && ready && connected && seedConnected && distanceConfigured - - seedUpdatedTime = pointer.SafeDeref(fw.Status.ControllerStatus).SeedUpdated.Time - timeSinceReconcile = time.Since(seedUpdatedTime) - allocationTime = pointer.SafeDeref(fw.Status.MachineStatus).AllocationTimestamp.Time - ) - - if allConditionsMet { - return firewallConditionStatus{IsReady: true} - } +const ( + statusReady status = "ready" + statusProgressing status = "progressing" + statusUnhealthy status = "unhealthy" + statusHealthTimeout status = "health-timeout" + statusCreateTimeout status = "create-timeout" +) + +func (c *controller) evaluateFirewallConditions(fw *v2.Firewall) status { + switch fw.Status.Phase { + case v2.FirewallPhaseCreating, v2.FirewallPhaseCrashing: + var ( + createTimeout = c.c.GetCreateTimeout() + provisioned = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallProvisioned)).Status == v2.ConditionTrue + ) - // duration after which a firewall in the creation phase will be recreated, exceeded - if allocationTimeout > 0 && fw.Status.Phase == v2.FirewallPhaseCreating && !allocationTime.IsZero() { - if time.Since(allocationTime) > allocationTimeout { - c.log.Info("create timeout exceeded", "firewall-name", fw.Name, "allocated-at", allocationTime.String(), "timeout-after", allocationTimeout.String()) - return firewallConditionStatus{CreateTimeout: true} + if provisioned { + return statusReady } - } - // Only apply health timeout once we have a non-zero seed reconcile timestamp. - if (!ready || !seedConnected || !connected) && unhealthyTimeout > 0 && created && !seedUpdatedTime.IsZero() && timeSinceReconcile > unhealthyTimeout { - c.log.Info("health timeout exceeded", "firewall-name", fw.Name, "last-reconciled-at", seedUpdatedTime.String(), "timeout-after", unhealthyTimeout.String()) - return firewallConditionStatus{HealthTimeout: true} - } - // Firewall was healthy at one point (all conditions were met), but then one of the monitor conditions - // degraded so the firewall is unhealthy. Only check monitor conditions (connected, seedConnected, distanceConfigured) - // because the ready condition degradation is already handled by the time-based health timeout above. - wasHealthy := pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallHealthy)).Status == v2.ConditionTrue - monitorConditionsDegraded := !connected || !seedConnected || !distanceConfigured - if monitorConditionsDegraded && wasHealthy && unhealthyTimeout > 0 { - c.log.Info("firewall monitor conditions degraded", "firewall-name", fw.Name) - return firewallConditionStatus{HealthTimeout: true} - } - //if everything returns false, it is progressing - return firewallConditionStatus{ - IsReady: allConditionsMet, - CreateTimeout: false, - HealthTimeout: false, + + if createTimeout > 0 { + createTimeout := c.c.GetCreateTimeout() + + if ok := checkForTimeout(fw, v2.FirewallReady, createTimeout); ok { + c.log.Info("create timeout exceeded, firewall not provisioned in time", "firewall-name", fw.Name, "timeout-after", createTimeout.String()) + return statusCreateTimeout + } + } + + return statusProgressing + + case v2.FirewallPhaseRunning: + fallthrough + + default: + var ( + created = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallCreated)).Status == v2.ConditionTrue + ready = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallReady)).Status == v2.ConditionTrue + provisioned = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallProvisioned)).Status == v2.ConditionTrue + connected = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallControllerConnected)).Status == v2.ConditionTrue + seedConnected = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallControllerSeedConnected)).Status == v2.ConditionTrue + distanceConfigured = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallDistanceConfigured)).Status == v2.ConditionTrue + + allConditionsMet = created && ready && provisioned && connected && seedConnected && distanceConfigured + ) + + if allConditionsMet { + return statusReady + } + + if provisioned { + healthTimeout := c.c.GetFirewallHealthTimeout() + + switch { + case !seedConnected: + if ok := checkForTimeout(fw, v2.FirewallControllerSeedConnected, healthTimeout); ok { + c.log.Info("health timeout exceeded, seed connection lost", "firewall-name", fw.Name, "timeout-after", healthTimeout.String()) + return statusHealthTimeout + } + + case !connected: + if ok := checkForTimeout(fw, v2.FirewallControllerConnected, healthTimeout); ok { + c.log.Info("health timeout exceeded, firewall monitor not reconciled anymore by controller", "firewall-name", fw.Name, "timeout-after", healthTimeout.String()) + return statusHealthTimeout + } + + case !ready: + if ok := checkForTimeout(fw, v2.FirewallReady, healthTimeout); ok { + c.log.Info("health timeout exceeded, firewall is not ready from perspective of the metal-api", "firewall-name", fw.Name, "timeout-after", healthTimeout.String()) + return statusHealthTimeout + } + } + } + + return statusUnhealthy } } @@ -73,17 +97,19 @@ func (c *controller) setStatus(r *controllers.Ctx[*v2.FirewallSet], ownedFirewal for _, fw := range ownedFirewalls { statusReport := c.evaluateFirewallConditions(fw) - switch { - case statusReport.IsReady: + switch statusReport { + case statusReady: r.Target.Status.ReadyReplicas++ continue - - case statusReport.CreateTimeout || statusReport.HealthTimeout: + case statusProgressing: + r.Target.Status.ProgressingReplicas++ + continue + case statusUnhealthy, statusCreateTimeout, statusHealthTimeout: + fallthrough + default: r.Target.Status.UnhealthyReplicas++ continue } - - r.Target.Status.ProgressingReplicas++ } revision, err := controllers.Revision(r.Target) @@ -94,3 +120,13 @@ func (c *controller) setStatus(r *controllers.Ctx[*v2.FirewallSet], ownedFirewal return nil } + +func checkForTimeout(fw *v2.Firewall, condition v2.ConditionType, timeout time.Duration) bool { + if timeout == 0 { + return false + } + + cond := pointer.SafeDeref(fw.Status.Conditions.Get(condition)) + + return time.Since(cond.LastTransitionTime.Time) > timeout +} diff --git a/controllers/set/status_test.go b/controllers/set/status_test.go new file mode 100644 index 0000000..9ecc625 --- /dev/null +++ b/controllers/set/status_test.go @@ -0,0 +1,185 @@ +package set + +import ( + "testing" + "time" + + "github.com/google/go-cmp/cmp" + v2 "github.com/metal-stack/firewall-controller-manager/api/v2" + "github.com/metal-stack/firewall-controller-manager/api/v2/config" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func Test_controller_evaluateFirewallConditions(t *testing.T) { + tenMinutesAgo := time.Now().Add(-10 * time.Minute) + + tests := []struct { + name string + modFn func(fw *v2.Firewall) + healthTimeout time.Duration + createTimeout time.Duration + want status + }{ + { + name: "ready firewall in running phase", + modFn: nil, + want: statusReady, + }, + { + name: "unhealthy firewall in running phase due to firewall monitor not reconciling", + modFn: func(fw *v2.Firewall) { + fw.Status.Conditions.Set(v2.Condition{ + Type: v2.FirewallControllerConnected, + Status: v2.ConditionFalse, + }) + }, + want: statusUnhealthy, + }, + { + name: "unhealthy firewall in running phase due to firewall not reconciling", + modFn: func(fw *v2.Firewall) { + fw.Status.Conditions.Set(v2.Condition{ + Type: v2.FirewallControllerSeedConnected, + Status: v2.ConditionFalse, + }) + }, + want: statusUnhealthy, + }, + { + name: "unhealthy firewall in running phase due to readiness condition false", + modFn: func(fw *v2.Firewall) { + fw.Status.Conditions.Set(v2.Condition{ + Type: v2.FirewallReady, + Status: v2.ConditionFalse, + }) + }, + want: statusUnhealthy, + }, + { + name: "unhealthy firewall in running phase due to readiness condition false", + modFn: func(fw *v2.Firewall) { + fw.Status.Conditions.Set(v2.Condition{ + Type: v2.FirewallReady, + Status: v2.ConditionFalse, + }) + }, + want: statusUnhealthy, + }, + { + name: "health timeout reached because seed not connected", + healthTimeout: 5 * time.Minute, + modFn: func(fw *v2.Firewall) { + cond := fw.Status.Conditions.Get(v2.FirewallControllerSeedConnected) + cond.Status = v2.ConditionFalse + fw.Status.Conditions.Set(*cond) + }, + want: statusHealthTimeout, + }, + { + name: "health timeout not yet reached", + healthTimeout: 15 * time.Minute, + modFn: func(fw *v2.Firewall) { + cond := fw.Status.Conditions.Get(v2.FirewallControllerSeedConnected) + cond.Status = v2.ConditionFalse + fw.Status.Conditions.Set(*cond) + }, + want: statusUnhealthy, + }, + { + name: "create timeout reached because not provisioned", + createTimeout: 5 * time.Minute, + modFn: func(fw *v2.Firewall) { + fw.Status.Phase = v2.FirewallPhaseCreating + cond := fw.Status.Conditions.Get(v2.FirewallProvisioned) + cond.Status = v2.ConditionFalse + fw.Status.Conditions.Set(*cond) + }, + want: statusCreateTimeout, + }, + { + name: "create timeout not yet reached", + createTimeout: 15 * time.Minute, + modFn: func(fw *v2.Firewall) { + fw.Status.Phase = v2.FirewallPhaseCreating + cond := fw.Status.Conditions.Get(v2.FirewallProvisioned) + cond.Status = v2.ConditionFalse + fw.Status.Conditions.Set(*cond) + }, + want: statusProgressing, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cfg, err := config.New(&config.NewControllerConfig{ + FirewallHealthTimeout: tt.healthTimeout, + CreateTimeout: tt.createTimeout, + SkipValidation: true, + }) + require.NoError(t, err) + + c := controller{ + c: cfg, + } + + fw := &v2.Firewall{ + Status: v2.FirewallStatus{ + Phase: v2.FirewallPhaseRunning, + Conditions: v2.Conditions{ + { + Type: v2.FirewallControllerConnected, + Status: v2.ConditionTrue, + LastTransitionTime: metav1.NewTime(tenMinutesAgo), + LastUpdateTime: metav1.NewTime(tenMinutesAgo), + }, + { + Type: v2.FirewallControllerSeedConnected, + Status: v2.ConditionTrue, + LastTransitionTime: metav1.NewTime(tenMinutesAgo), + LastUpdateTime: metav1.NewTime(tenMinutesAgo), + }, + { + Type: v2.FirewallCreated, + Status: v2.ConditionTrue, + LastTransitionTime: metav1.NewTime(tenMinutesAgo), + LastUpdateTime: metav1.NewTime(tenMinutesAgo), + }, + { + Type: v2.FirewallReady, + Status: v2.ConditionTrue, + LastTransitionTime: metav1.NewTime(tenMinutesAgo), + LastUpdateTime: metav1.NewTime(tenMinutesAgo), + }, + { + Type: v2.FirewallProvisioned, + Status: v2.ConditionTrue, + LastTransitionTime: metav1.NewTime(tenMinutesAgo), + LastUpdateTime: metav1.NewTime(tenMinutesAgo), + }, + { + Type: v2.FirewallDistanceConfigured, + Status: v2.ConditionTrue, + LastTransitionTime: metav1.NewTime(tenMinutesAgo), + LastUpdateTime: metav1.NewTime(tenMinutesAgo), + }, + { + Type: v2.FirewallMonitorDeployed, + Status: v2.ConditionTrue, + LastTransitionTime: metav1.NewTime(tenMinutesAgo), + LastUpdateTime: metav1.NewTime(tenMinutesAgo), + }, + }, + }, + } + + if tt.modFn != nil { + tt.modFn(fw) + } + + got := c.evaluateFirewallConditions(fw) + if diff := cmp.Diff(tt.want, got); diff != "" { + t.Errorf("diff = %s", diff) + } + }) + } +} From e6c7b688bbc8a3438421a146cdec052dd7e6d80a Mon Sep 17 00:00:00 2001 From: Gerrit Date: Wed, 11 Feb 2026 14:12:45 +0100 Subject: [PATCH 2/8] Turn into timeout controller. --- api/v2/types_firewall.go | 164 ++++++++++++++++++++++++++ api/v2/types_firewall_test.go | 183 +++++++++++++++++++++++++++++ controllers/firewall/status.go | 23 +++- controllers/set/delete.go | 21 ---- controllers/set/reconcile.go | 7 -- controllers/set/status.go | 103 +---------------- controllers/set/status_test.go | 185 ------------------------------ controllers/timeout/controller.go | 49 ++++++++ controllers/timeout/reconcile.go | 77 +++++++++++++ main.go | 4 + 10 files changed, 503 insertions(+), 313 deletions(-) delete mode 100644 controllers/set/status_test.go create mode 100644 controllers/timeout/controller.go create mode 100644 controllers/timeout/reconcile.go diff --git a/api/v2/types_firewall.go b/api/v2/types_firewall.go index bd2da5c..bbfa351 100644 --- a/api/v2/types_firewall.go +++ b/api/v2/types_firewall.go @@ -1,8 +1,10 @@ package v2 import ( + "fmt" "sort" "strconv" + "time" "github.com/metal-stack/metal-lib/pkg/pointer" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -354,3 +356,165 @@ func SortFirewallsByImportance(fws []*Firewall) { return !a.CreationTimestamp.Before(&b.CreationTimestamp) }) } + +type ( + FirewallStatusResult string + + FirewallStatusEvalResult struct { + Result FirewallStatusResult + Reason string + TimeoutIn *time.Duration + } +) + +const ( + FirewallStatusReady FirewallStatusResult = "ready" + FirewallStatusProgressing FirewallStatusResult = "progressing" + FirewallStatusUnhealthy FirewallStatusResult = "unhealthy" + FirewallStatusHealthTimeout FirewallStatusResult = "health-timeout" + FirewallStatusCreateTimeout FirewallStatusResult = "create-timeout" +) + +func EvaluateFirewallStatus(fw *Firewall, createTimeout, healthTimeout time.Duration) *FirewallStatusEvalResult { + var ( + checkForTimeout = func(fw *Firewall, condition ConditionType, timeout time.Duration) (time.Duration, bool) { + if timeout == 0 { + return 0, false + } + + var ( + cond = pointer.SafeDeref(fw.Status.Conditions.Get(condition)) + transitionTime = cond.LastTransitionTime.Time + deadline = time.Until(transitionTime.Add(timeout)) + ) + + if deadline < 0 { + return 0, true + } + + return deadline, false + } + + unhealthyConditions = func(cts ...ConditionType) []*Condition { + var res []*Condition + + for _, ct := range cts { + cond := fw.Status.Conditions.Get(ct) + if cond == nil || cond.Status != ConditionTrue { + res = append(res, cond) + } + } + + return res + } + + unhealthyTypes []string + timeoutIn *time.Duration + ) + + switch fw.Status.Phase { + case FirewallPhaseCreating, FirewallPhaseCrashing: + conditions := unhealthyConditions(FirewallCreated, + FirewallReady, + FirewallProvisioned, + ) + + if len(conditions) == 0 { + return &FirewallStatusEvalResult{ + Result: FirewallStatusReady, + Reason: "", + } + } + + if createTimeout > 0 { + if t, ok := checkForTimeout(fw, FirewallReady, createTimeout); ok { + return &FirewallStatusEvalResult{ + Result: FirewallStatusCreateTimeout, + Reason: fmt.Sprintf("%s create timeout exceeded, firewall not provisioned in time", createTimeout.String()), + } + } else if createTimeout != 0 { + timeoutIn = &t + } + } + + for _, c := range conditions { + unhealthyTypes = append(unhealthyTypes, string(c.Type)) + } + + return &FirewallStatusEvalResult{ + Result: FirewallStatusProgressing, + Reason: fmt.Sprintf("not all health conditions are true: %v", unhealthyTypes), + TimeoutIn: timeoutIn, + } + + case FirewallPhaseRunning: + fallthrough + + default: + conditions := unhealthyConditions(FirewallCreated, + FirewallReady, + FirewallProvisioned, + FirewallControllerConnected, + FirewallControllerSeedConnected, + FirewallDistanceConfigured, + ) + + if len(conditions) == 0 { + return &FirewallStatusEvalResult{ + Result: FirewallStatusReady, + Reason: "", + } + } + + var ( + ready = pointer.SafeDeref(fw.Status.Conditions.Get(FirewallReady)).Status == ConditionTrue + provisioned = pointer.SafeDeref(fw.Status.Conditions.Get(FirewallProvisioned)).Status == ConditionTrue + connected = pointer.SafeDeref(fw.Status.Conditions.Get(FirewallControllerConnected)).Status == ConditionTrue + seedConnected = pointer.SafeDeref(fw.Status.Conditions.Get(FirewallControllerSeedConnected)).Status == ConditionTrue + ) + + if provisioned { + switch { + case !seedConnected: + if t, ok := checkForTimeout(fw, FirewallControllerSeedConnected, healthTimeout); ok { + return &FirewallStatusEvalResult{ + Result: FirewallStatusHealthTimeout, + Reason: fmt.Sprintf("%s health timeout exceeded, seed connection lost", healthTimeout.String()), + } + } else if healthTimeout != 0 { + timeoutIn = &t + } + + case !connected: + if t, ok := checkForTimeout(fw, FirewallControllerConnected, healthTimeout); ok { + return &FirewallStatusEvalResult{ + Result: FirewallStatusHealthTimeout, + Reason: fmt.Sprintf("%s health timeout exceeded, firewall monitor not reconciled anymore", healthTimeout.String()), + } + } else if healthTimeout != 0 { + timeoutIn = &t + } + + case !ready: + if t, ok := checkForTimeout(fw, FirewallReady, healthTimeout); ok { + return &FirewallStatusEvalResult{ + Result: FirewallStatusHealthTimeout, + Reason: fmt.Sprintf("%s health timeout exceeded, firewall is not ready from perspective of the metal-api", healthTimeout.String()), + } + } else if healthTimeout != 0 { + timeoutIn = &t + } + } + } + + for _, c := range conditions { + unhealthyTypes = append(unhealthyTypes, string(c.Type)) + } + + return &FirewallStatusEvalResult{ + Result: FirewallStatusUnhealthy, + Reason: fmt.Sprintf("not all health conditions are true: %v", unhealthyTypes), + TimeoutIn: timeoutIn, + } + } +} diff --git a/api/v2/types_firewall_test.go b/api/v2/types_firewall_test.go index e145730..142a912 100644 --- a/api/v2/types_firewall_test.go +++ b/api/v2/types_firewall_test.go @@ -6,7 +6,10 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" + "github.com/metal-stack/metal-lib/pkg/pointer" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "testing/synctest" ) func Test_SortFirewallsByImportance(t *testing.T) { @@ -107,3 +110,183 @@ func Test_SortFirewallsByImportance(t *testing.T) { }) } } + +func Test_EvaluateFirewallStatus(t *testing.T) { + tests := []struct { + name string + modFn func(fw *Firewall) + healthTimeout time.Duration + createTimeout time.Duration + want *FirewallStatusEvalResult + wantReason string + }{ + { + name: "ready firewall in running phase", + modFn: nil, + want: &FirewallStatusEvalResult{ + Result: FirewallStatusReady, + }, + }, + { + name: "unhealthy firewall in running phase due to firewall monitor not reconciling", + modFn: func(fw *Firewall) { + fw.Status.Conditions.Set(Condition{ + Type: FirewallControllerConnected, + Status: ConditionFalse, + }) + }, + want: &FirewallStatusEvalResult{ + Result: FirewallStatusUnhealthy, + Reason: "not all health conditions are true: [Connected]", + }, + }, + { + name: "unhealthy firewall in running phase due to firewall not reconciling", + modFn: func(fw *Firewall) { + fw.Status.Conditions.Set(Condition{ + Type: FirewallControllerSeedConnected, + Status: ConditionFalse, + }) + }, + want: &FirewallStatusEvalResult{ + Result: FirewallStatusUnhealthy, + Reason: "not all health conditions are true: [SeedConnected]", + }, + }, + { + name: "unhealthy firewall in running phase due to readiness condition false", + modFn: func(fw *Firewall) { + fw.Status.Conditions.Set(Condition{ + Type: FirewallReady, + Status: ConditionFalse, + }) + }, + want: &FirewallStatusEvalResult{ + Result: FirewallStatusUnhealthy, + Reason: "not all health conditions are true: [Ready]", + }, + }, + { + name: "health timeout reached because seed not connected", + healthTimeout: 5 * time.Minute, + modFn: func(fw *Firewall) { + cond := fw.Status.Conditions.Get(FirewallControllerSeedConnected) + cond.Status = ConditionFalse + fw.Status.Conditions.Set(*cond) + }, + want: &FirewallStatusEvalResult{ + Result: FirewallStatusHealthTimeout, + Reason: "5m0s health timeout exceeded, seed connection lost", + }, + }, + { + name: "health timeout not yet reached", + healthTimeout: 15 * time.Minute, + modFn: func(fw *Firewall) { + cond := fw.Status.Conditions.Get(FirewallControllerSeedConnected) + cond.Status = ConditionFalse + fw.Status.Conditions.Set(*cond) + }, + want: &FirewallStatusEvalResult{ + Result: FirewallStatusUnhealthy, + Reason: "not all health conditions are true: [SeedConnected]", + TimeoutIn: pointer.Pointer(5 * time.Minute), + }, + }, + { + name: "create timeout reached because not provisioned", + createTimeout: 5 * time.Minute, + modFn: func(fw *Firewall) { + fw.Status.Phase = FirewallPhaseCreating + cond := fw.Status.Conditions.Get(FirewallProvisioned) + cond.Status = ConditionFalse + fw.Status.Conditions.Set(*cond) + }, + want: &FirewallStatusEvalResult{ + Result: FirewallStatusCreateTimeout, + Reason: "5m0s create timeout exceeded, firewall not provisioned in time", + }, + }, + { + name: "create timeout not yet reached", + createTimeout: 15 * time.Minute, + modFn: func(fw *Firewall) { + fw.Status.Phase = FirewallPhaseCreating + cond := fw.Status.Conditions.Get(FirewallProvisioned) + cond.Status = ConditionFalse + fw.Status.Conditions.Set(*cond) + }, + want: &FirewallStatusEvalResult{ + Result: FirewallStatusProgressing, + Reason: "not all health conditions are true: [Provisioned]", + TimeoutIn: pointer.Pointer(5 * time.Minute), + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + synctest.Test(t, func(t *testing.T) { + tenMinutesAgo := time.Now().Add(-10 * time.Minute) + + fw := &Firewall{ + Status: FirewallStatus{ + Phase: FirewallPhaseRunning, + Conditions: Conditions{ + { + Type: FirewallControllerConnected, + Status: ConditionTrue, + LastTransitionTime: metav1.NewTime(tenMinutesAgo), + LastUpdateTime: metav1.NewTime(tenMinutesAgo), + }, + { + Type: FirewallControllerSeedConnected, + Status: ConditionTrue, + LastTransitionTime: metav1.NewTime(tenMinutesAgo), + LastUpdateTime: metav1.NewTime(tenMinutesAgo), + }, + { + Type: FirewallCreated, + Status: ConditionTrue, + LastTransitionTime: metav1.NewTime(tenMinutesAgo), + LastUpdateTime: metav1.NewTime(tenMinutesAgo), + }, + { + Type: FirewallReady, + Status: ConditionTrue, + LastTransitionTime: metav1.NewTime(tenMinutesAgo), + LastUpdateTime: metav1.NewTime(tenMinutesAgo), + }, + { + Type: FirewallProvisioned, + Status: ConditionTrue, + LastTransitionTime: metav1.NewTime(tenMinutesAgo), + LastUpdateTime: metav1.NewTime(tenMinutesAgo), + }, + { + Type: FirewallDistanceConfigured, + Status: ConditionTrue, + LastTransitionTime: metav1.NewTime(tenMinutesAgo), + LastUpdateTime: metav1.NewTime(tenMinutesAgo), + }, + { + Type: FirewallMonitorDeployed, + Status: ConditionTrue, + LastTransitionTime: metav1.NewTime(tenMinutesAgo), + LastUpdateTime: metav1.NewTime(tenMinutesAgo), + }, + }, + }, + } + + if tt.modFn != nil { + tt.modFn(fw) + } + + got := EvaluateFirewallStatus(fw, tt.createTimeout, tt.healthTimeout) + if diff := cmp.Diff(tt.want, got); diff != "" { + t.Errorf("diff = %s", diff) + } + }) + }) + } +} diff --git a/controllers/firewall/status.go b/controllers/firewall/status.go index 5473377..38a1e09 100644 --- a/controllers/firewall/status.go +++ b/controllers/firewall/status.go @@ -160,11 +160,30 @@ func SetFirewallStatusFromMonitor(fw *v2.Firewall, mon *v2.FirewallMonitor) { fw.Status.ControllerStatus = connection + var ( + // currently, the firewall-controller writes the reconcile time hard-coded every three minutes + // the FCM reconciles the firewall hard-coded at least every two minutes + // + // this can be visualized as: + // + // fc (write) w w w w + // | | | | + // t (minutes) 0--1--2--3--4--5--6--7--8--9--10-- + // | | | | | | + // FCM (read) r r r r r r + // + // so, read out data will contain t={0, 0, 3, 6, 6, 9}, which shows that the maximum distance is three minutes + maximumSeedUpdateDrift = 3 * time.Minute + // in this case, the firewall-controller almost permanently updates this value (fw.Spec.Interval, by default 10s) + // so we can assume the read out interval from the fcm firewall reconcile, which is maximum two minutes as described above + maximumShootUpdateDrift = 2 * time.Minute + ) + // Check if the firewall-controller has reconciled the shoot if connection.Updated.Time.IsZero() { cond := v2.NewCondition(v2.FirewallControllerConnected, v2.ConditionFalse, "NotConnected", "Controller has not yet connected to shoot.") fw.Status.Conditions.Set(cond) - } else if time.Since(connection.Updated.Time) > 5*time.Minute { + } else if time.Since(connection.Updated.Time) > maximumShootUpdateDrift { cond := v2.NewCondition(v2.FirewallControllerConnected, v2.ConditionFalse, "StoppedReconciling", fmt.Sprintf("Controller has stopped reconciling since %s to shoot.", connection.Updated.String())) fw.Status.Conditions.Set(cond) } else { @@ -176,7 +195,7 @@ func SetFirewallStatusFromMonitor(fw *v2.Firewall, mon *v2.FirewallMonitor) { if connection.SeedUpdated.Time.IsZero() { cond := v2.NewCondition(v2.FirewallControllerSeedConnected, v2.ConditionFalse, "NotConnected", "Controller has not yet connected to seed.") fw.Status.Conditions.Set(cond) - } else if time.Since(connection.SeedUpdated.Time) > 5*time.Minute { + } else if time.Since(connection.SeedUpdated.Time) > maximumSeedUpdateDrift { cond := v2.NewCondition(v2.FirewallControllerSeedConnected, v2.ConditionFalse, "StoppedReconciling", fmt.Sprintf("Controller has stopped reconciling since %s to seed.", connection.SeedUpdated.String())) fw.Status.Conditions.Set(cond) } else { diff --git a/controllers/set/delete.go b/controllers/set/delete.go index d6385c9..8f44a3c 100644 --- a/controllers/set/delete.go +++ b/controllers/set/delete.go @@ -42,24 +42,3 @@ func (c *controller) deleteFirewalls(r *controllers.Ctx[*v2.FirewallSet], fws .. return nil } -func (c *controller) deleteIfUnhealthyOrTimeout(r *controllers.Ctx[*v2.FirewallSet], fws ...*v2.Firewall) ([]*v2.Firewall, error) { - var result []*v2.Firewall - - for _, fw := range fws { - status := c.evaluateFirewallConditions(fw) - - switch status { - case statusCreateTimeout, statusHealthTimeout: - r.Log.Info("firewall creation or health timeout exceeded, deleting from set", "firewall-name", fw.Name) - - err := c.deleteFirewalls(r, fw) - if err != nil { - return nil, err - } - - result = append(result, fw) - } - - } - return result, nil -} diff --git a/controllers/set/reconcile.go b/controllers/set/reconcile.go index 50231af..0cb8c38 100644 --- a/controllers/set/reconcile.go +++ b/controllers/set/reconcile.go @@ -110,13 +110,6 @@ func (c *controller) Reconcile(r *controllers.Ctx[*v2.FirewallSet]) error { } } - deletedFws, err := c.deleteIfUnhealthyOrTimeout(r, ownedFirewalls...) - if err != nil { - return err - } - - ownedFirewalls = controllers.Except(ownedFirewalls, deletedFws...) - err = c.setStatus(r, ownedFirewalls) if err != nil { return err diff --git a/controllers/set/status.go b/controllers/set/status.go index 11d3ade..7b05812 100644 --- a/controllers/set/status.go +++ b/controllers/set/status.go @@ -1,93 +1,10 @@ package set import ( - "time" - v2 "github.com/metal-stack/firewall-controller-manager/api/v2" "github.com/metal-stack/firewall-controller-manager/controllers" - "github.com/metal-stack/metal-lib/pkg/pointer" -) - -type status string - -const ( - statusReady status = "ready" - statusProgressing status = "progressing" - statusUnhealthy status = "unhealthy" - statusHealthTimeout status = "health-timeout" - statusCreateTimeout status = "create-timeout" ) -func (c *controller) evaluateFirewallConditions(fw *v2.Firewall) status { - switch fw.Status.Phase { - case v2.FirewallPhaseCreating, v2.FirewallPhaseCrashing: - var ( - createTimeout = c.c.GetCreateTimeout() - provisioned = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallProvisioned)).Status == v2.ConditionTrue - ) - - if provisioned { - return statusReady - } - - if createTimeout > 0 { - createTimeout := c.c.GetCreateTimeout() - - if ok := checkForTimeout(fw, v2.FirewallReady, createTimeout); ok { - c.log.Info("create timeout exceeded, firewall not provisioned in time", "firewall-name", fw.Name, "timeout-after", createTimeout.String()) - return statusCreateTimeout - } - } - - return statusProgressing - - case v2.FirewallPhaseRunning: - fallthrough - - default: - var ( - created = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallCreated)).Status == v2.ConditionTrue - ready = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallReady)).Status == v2.ConditionTrue - provisioned = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallProvisioned)).Status == v2.ConditionTrue - connected = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallControllerConnected)).Status == v2.ConditionTrue - seedConnected = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallControllerSeedConnected)).Status == v2.ConditionTrue - distanceConfigured = pointer.SafeDeref(fw.Status.Conditions.Get(v2.FirewallDistanceConfigured)).Status == v2.ConditionTrue - - allConditionsMet = created && ready && provisioned && connected && seedConnected && distanceConfigured - ) - - if allConditionsMet { - return statusReady - } - - if provisioned { - healthTimeout := c.c.GetFirewallHealthTimeout() - - switch { - case !seedConnected: - if ok := checkForTimeout(fw, v2.FirewallControllerSeedConnected, healthTimeout); ok { - c.log.Info("health timeout exceeded, seed connection lost", "firewall-name", fw.Name, "timeout-after", healthTimeout.String()) - return statusHealthTimeout - } - - case !connected: - if ok := checkForTimeout(fw, v2.FirewallControllerConnected, healthTimeout); ok { - c.log.Info("health timeout exceeded, firewall monitor not reconciled anymore by controller", "firewall-name", fw.Name, "timeout-after", healthTimeout.String()) - return statusHealthTimeout - } - - case !ready: - if ok := checkForTimeout(fw, v2.FirewallReady, healthTimeout); ok { - c.log.Info("health timeout exceeded, firewall is not ready from perspective of the metal-api", "firewall-name", fw.Name, "timeout-after", healthTimeout.String()) - return statusHealthTimeout - } - } - } - - return statusUnhealthy - } -} - func (c *controller) setStatus(r *controllers.Ctx[*v2.FirewallSet], ownedFirewalls []*v2.Firewall) error { r.Target.Status.TargetReplicas = r.Target.Spec.Replicas r.Target.Status.ReadyReplicas = 0 @@ -95,16 +12,16 @@ func (c *controller) setStatus(r *controllers.Ctx[*v2.FirewallSet], ownedFirewal r.Target.Status.UnhealthyReplicas = 0 for _, fw := range ownedFirewalls { - statusReport := c.evaluateFirewallConditions(fw) + status := v2.EvaluateFirewallStatus(fw, c.c.GetCreateTimeout(), c.c.GetFirewallHealthTimeout()) - switch statusReport { - case statusReady: + switch status.Result { + case v2.FirewallStatusReady: r.Target.Status.ReadyReplicas++ continue - case statusProgressing: + case v2.FirewallStatusProgressing: r.Target.Status.ProgressingReplicas++ continue - case statusUnhealthy, statusCreateTimeout, statusHealthTimeout: + case v2.FirewallStatusUnhealthy, v2.FirewallStatusCreateTimeout, v2.FirewallStatusHealthTimeout: fallthrough default: r.Target.Status.UnhealthyReplicas++ @@ -120,13 +37,3 @@ func (c *controller) setStatus(r *controllers.Ctx[*v2.FirewallSet], ownedFirewal return nil } - -func checkForTimeout(fw *v2.Firewall, condition v2.ConditionType, timeout time.Duration) bool { - if timeout == 0 { - return false - } - - cond := pointer.SafeDeref(fw.Status.Conditions.Get(condition)) - - return time.Since(cond.LastTransitionTime.Time) > timeout -} diff --git a/controllers/set/status_test.go b/controllers/set/status_test.go deleted file mode 100644 index 9ecc625..0000000 --- a/controllers/set/status_test.go +++ /dev/null @@ -1,185 +0,0 @@ -package set - -import ( - "testing" - "time" - - "github.com/google/go-cmp/cmp" - v2 "github.com/metal-stack/firewall-controller-manager/api/v2" - "github.com/metal-stack/firewall-controller-manager/api/v2/config" - "github.com/stretchr/testify/require" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -func Test_controller_evaluateFirewallConditions(t *testing.T) { - tenMinutesAgo := time.Now().Add(-10 * time.Minute) - - tests := []struct { - name string - modFn func(fw *v2.Firewall) - healthTimeout time.Duration - createTimeout time.Duration - want status - }{ - { - name: "ready firewall in running phase", - modFn: nil, - want: statusReady, - }, - { - name: "unhealthy firewall in running phase due to firewall monitor not reconciling", - modFn: func(fw *v2.Firewall) { - fw.Status.Conditions.Set(v2.Condition{ - Type: v2.FirewallControllerConnected, - Status: v2.ConditionFalse, - }) - }, - want: statusUnhealthy, - }, - { - name: "unhealthy firewall in running phase due to firewall not reconciling", - modFn: func(fw *v2.Firewall) { - fw.Status.Conditions.Set(v2.Condition{ - Type: v2.FirewallControllerSeedConnected, - Status: v2.ConditionFalse, - }) - }, - want: statusUnhealthy, - }, - { - name: "unhealthy firewall in running phase due to readiness condition false", - modFn: func(fw *v2.Firewall) { - fw.Status.Conditions.Set(v2.Condition{ - Type: v2.FirewallReady, - Status: v2.ConditionFalse, - }) - }, - want: statusUnhealthy, - }, - { - name: "unhealthy firewall in running phase due to readiness condition false", - modFn: func(fw *v2.Firewall) { - fw.Status.Conditions.Set(v2.Condition{ - Type: v2.FirewallReady, - Status: v2.ConditionFalse, - }) - }, - want: statusUnhealthy, - }, - { - name: "health timeout reached because seed not connected", - healthTimeout: 5 * time.Minute, - modFn: func(fw *v2.Firewall) { - cond := fw.Status.Conditions.Get(v2.FirewallControllerSeedConnected) - cond.Status = v2.ConditionFalse - fw.Status.Conditions.Set(*cond) - }, - want: statusHealthTimeout, - }, - { - name: "health timeout not yet reached", - healthTimeout: 15 * time.Minute, - modFn: func(fw *v2.Firewall) { - cond := fw.Status.Conditions.Get(v2.FirewallControllerSeedConnected) - cond.Status = v2.ConditionFalse - fw.Status.Conditions.Set(*cond) - }, - want: statusUnhealthy, - }, - { - name: "create timeout reached because not provisioned", - createTimeout: 5 * time.Minute, - modFn: func(fw *v2.Firewall) { - fw.Status.Phase = v2.FirewallPhaseCreating - cond := fw.Status.Conditions.Get(v2.FirewallProvisioned) - cond.Status = v2.ConditionFalse - fw.Status.Conditions.Set(*cond) - }, - want: statusCreateTimeout, - }, - { - name: "create timeout not yet reached", - createTimeout: 15 * time.Minute, - modFn: func(fw *v2.Firewall) { - fw.Status.Phase = v2.FirewallPhaseCreating - cond := fw.Status.Conditions.Get(v2.FirewallProvisioned) - cond.Status = v2.ConditionFalse - fw.Status.Conditions.Set(*cond) - }, - want: statusProgressing, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - cfg, err := config.New(&config.NewControllerConfig{ - FirewallHealthTimeout: tt.healthTimeout, - CreateTimeout: tt.createTimeout, - SkipValidation: true, - }) - require.NoError(t, err) - - c := controller{ - c: cfg, - } - - fw := &v2.Firewall{ - Status: v2.FirewallStatus{ - Phase: v2.FirewallPhaseRunning, - Conditions: v2.Conditions{ - { - Type: v2.FirewallControllerConnected, - Status: v2.ConditionTrue, - LastTransitionTime: metav1.NewTime(tenMinutesAgo), - LastUpdateTime: metav1.NewTime(tenMinutesAgo), - }, - { - Type: v2.FirewallControllerSeedConnected, - Status: v2.ConditionTrue, - LastTransitionTime: metav1.NewTime(tenMinutesAgo), - LastUpdateTime: metav1.NewTime(tenMinutesAgo), - }, - { - Type: v2.FirewallCreated, - Status: v2.ConditionTrue, - LastTransitionTime: metav1.NewTime(tenMinutesAgo), - LastUpdateTime: metav1.NewTime(tenMinutesAgo), - }, - { - Type: v2.FirewallReady, - Status: v2.ConditionTrue, - LastTransitionTime: metav1.NewTime(tenMinutesAgo), - LastUpdateTime: metav1.NewTime(tenMinutesAgo), - }, - { - Type: v2.FirewallProvisioned, - Status: v2.ConditionTrue, - LastTransitionTime: metav1.NewTime(tenMinutesAgo), - LastUpdateTime: metav1.NewTime(tenMinutesAgo), - }, - { - Type: v2.FirewallDistanceConfigured, - Status: v2.ConditionTrue, - LastTransitionTime: metav1.NewTime(tenMinutesAgo), - LastUpdateTime: metav1.NewTime(tenMinutesAgo), - }, - { - Type: v2.FirewallMonitorDeployed, - Status: v2.ConditionTrue, - LastTransitionTime: metav1.NewTime(tenMinutesAgo), - LastUpdateTime: metav1.NewTime(tenMinutesAgo), - }, - }, - }, - } - - if tt.modFn != nil { - tt.modFn(fw) - } - - got := c.evaluateFirewallConditions(fw) - if diff := cmp.Diff(tt.want, got); diff != "" { - t.Errorf("diff = %s", diff) - } - }) - } -} diff --git a/controllers/timeout/controller.go b/controllers/timeout/controller.go new file mode 100644 index 0000000..f7831a2 --- /dev/null +++ b/controllers/timeout/controller.go @@ -0,0 +1,49 @@ +package timeout + +import ( + "github.com/go-logr/logr" + "k8s.io/client-go/tools/record" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/predicate" + + v2 "github.com/metal-stack/firewall-controller-manager/api/v2" + "github.com/metal-stack/firewall-controller-manager/api/v2/config" + "github.com/metal-stack/firewall-controller-manager/controllers" +) + +type controller struct { + c *config.ControllerConfig + log logr.Logger + recorder record.EventRecorder +} + +func SetupWithManager(log logr.Logger, recorder record.EventRecorder, mgr ctrl.Manager, c *config.ControllerConfig) error { + if c.GetFirewallHealthTimeout() <= 0 && c.GetCreateTimeout() <= 0 { + log.Info("not registering timeout controller because neither create nor health timeout configured") + return nil + } + + g := controllers.NewGenericController(log, c.GetSeedClient(), c.GetSeedNamespace(), &controller{ + c: c, + log: log, + recorder: recorder, + }).WithoutStatus() + + return ctrl.NewControllerManagedBy(mgr). + For( + &v2.FirewallSet{}, + ). + Named("FirewallHealthTimeout"). + WithEventFilter(predicate.NewPredicateFuncs(controllers.SkipOtherNamespace(c.GetSeedNamespace()))). + Complete(g) +} + +func (c *controller) New() *v2.FirewallSet { + return &v2.FirewallSet{} +} + +func (c *controller) SetStatus(_ *v2.FirewallSet, _ *v2.FirewallSet) {} + +func (c *controller) Delete(_ *controllers.Ctx[*v2.FirewallSet]) error { + return nil +} diff --git a/controllers/timeout/reconcile.go b/controllers/timeout/reconcile.go new file mode 100644 index 0000000..bca9400 --- /dev/null +++ b/controllers/timeout/reconcile.go @@ -0,0 +1,77 @@ +package timeout + +import ( + "fmt" + "sort" + + v2 "github.com/metal-stack/firewall-controller-manager/api/v2" + "github.com/metal-stack/firewall-controller-manager/controllers" +) + +func (c *controller) Reconcile(r *controllers.Ctx[*v2.FirewallSet]) error { + ownedFirewalls, _, err := controllers.GetOwnedResources(r.Ctx, c.c.GetSeedClient(), r.Target.Spec.Selector, r.Target, &v2.FirewallList{}, func(fl *v2.FirewallList) []*v2.Firewall { + return fl.GetItems() + }) + if err != nil { + return fmt.Errorf("unable to get owned firewalls: %w", err) + } + + err = c.deleteIfUnhealthyOrTimeout(r, ownedFirewalls...) + if err != nil { + return err + } + + return nil +} + +func (c *controller) deleteIfUnhealthyOrTimeout(r *controllers.Ctx[*v2.FirewallSet], fws ...*v2.Firewall) error { + type fwWithStatus struct { + firewall *v2.Firewall + status *v2.FirewallStatusEvalResult + } + + var nextTimeouts []*fwWithStatus + + for _, fw := range fws { + status := v2.EvaluateFirewallStatus(fw, c.c.GetCreateTimeout(), c.c.GetFirewallHealthTimeout()) + + switch status.Result { + case v2.FirewallStatusCreateTimeout, v2.FirewallStatusHealthTimeout: + r.Log.Info("firewall timeout exceeded, deleting from set", "reason", status.Reason, "firewall-name", fw.Name) + + if fw.DeletionTimestamp != nil { + r.Log.Info("deletion timestamp on firewall already set", "firewall-name", fw.Name) + continue + } + + err := c.c.GetSeedClient().Delete(r.Ctx, fw) + if err != nil { + return err + } + + c.recorder.Eventf(fw, "Normal", "Delete", "deleted firewall %s due to %s", fw.Name, status) + + case v2.FirewallStatusUnhealthy: + if status.TimeoutIn != nil { + nextTimeouts = append(nextTimeouts, &fwWithStatus{ + firewall: fw, + status: status, + }) + } + } + } + + if len(nextTimeouts) > 0 { + sort.SliceStable(nextTimeouts, func(i, j int) bool { + return *nextTimeouts[i].status.TimeoutIn < *nextTimeouts[j].status.TimeoutIn + }) + + nextTimeout := nextTimeouts[0] + + return controllers.RequeueAfter( + *nextTimeout.status.TimeoutIn, + fmt.Sprintf("checking for timeout of firewall %q (reason: %s)", nextTimeout.firewall.Name, nextTimeout.status.Reason)) + } + + return nil +} diff --git a/main.go b/main.go index e68aad6..2d43000 100644 --- a/main.go +++ b/main.go @@ -32,6 +32,7 @@ import ( "github.com/metal-stack/firewall-controller-manager/controllers/firewall" "github.com/metal-stack/firewall-controller-manager/controllers/monitor" "github.com/metal-stack/firewall-controller-manager/controllers/set" + "github.com/metal-stack/firewall-controller-manager/controllers/timeout" "github.com/metal-stack/firewall-controller-manager/controllers/update" ) @@ -282,6 +283,9 @@ func main() { if err := update.SetupWithManager(ctrl.Log.WithName("controllers").WithName("update"), seedMgr.GetEventRecorder("update-controller"), seedMgr, cc); err != nil { log.Fatalf("unable to setup update controller: %v", err) } + if err := timeout.SetupWithManager(ctrl.Log.WithName("controllers").WithName("timeout"), seedMgr.GetEventRecorderFor("timeout-controller"), seedMgr, cc); err != nil { + log.Fatalf("unable to setup timeout controller: %v", err) + } if err := deployment.SetupWebhookWithManager(ctrl.Log.WithName("defaulting-webhook"), seedMgr, cc); err != nil { log.Fatalf("unable to setup webhook, controller deployment %v", err) From f8be5a818462afad1494909a7805476e4239c526 Mon Sep 17 00:00:00 2001 From: Gerrit Date: Wed, 11 Feb 2026 14:22:40 +0100 Subject: [PATCH 3/8] Naming. --- api/v2/types_firewall.go | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/api/v2/types_firewall.go b/api/v2/types_firewall.go index bbfa351..3eae677 100644 --- a/api/v2/types_firewall.go +++ b/api/v2/types_firewall.go @@ -395,7 +395,7 @@ func EvaluateFirewallStatus(fw *Firewall, createTimeout, healthTimeout time.Dura return deadline, false } - unhealthyConditions = func(cts ...ConditionType) []*Condition { + collectUnhealthyConditions = func(cts ...ConditionType) []*Condition { var res []*Condition for _, ct := range cts { @@ -414,12 +414,13 @@ func EvaluateFirewallStatus(fw *Firewall, createTimeout, healthTimeout time.Dura switch fw.Status.Phase { case FirewallPhaseCreating, FirewallPhaseCrashing: - conditions := unhealthyConditions(FirewallCreated, + unhealthyConds := collectUnhealthyConditions( + FirewallCreated, FirewallReady, FirewallProvisioned, ) - if len(conditions) == 0 { + if len(unhealthyConds) == 0 { return &FirewallStatusEvalResult{ Result: FirewallStatusReady, Reason: "", @@ -437,7 +438,7 @@ func EvaluateFirewallStatus(fw *Firewall, createTimeout, healthTimeout time.Dura } } - for _, c := range conditions { + for _, c := range unhealthyConds { unhealthyTypes = append(unhealthyTypes, string(c.Type)) } @@ -451,7 +452,8 @@ func EvaluateFirewallStatus(fw *Firewall, createTimeout, healthTimeout time.Dura fallthrough default: - conditions := unhealthyConditions(FirewallCreated, + unhealthyConds := collectUnhealthyConditions( + FirewallCreated, FirewallReady, FirewallProvisioned, FirewallControllerConnected, @@ -459,7 +461,7 @@ func EvaluateFirewallStatus(fw *Firewall, createTimeout, healthTimeout time.Dura FirewallDistanceConfigured, ) - if len(conditions) == 0 { + if len(unhealthyConds) == 0 { return &FirewallStatusEvalResult{ Result: FirewallStatusReady, Reason: "", @@ -507,7 +509,7 @@ func EvaluateFirewallStatus(fw *Firewall, createTimeout, healthTimeout time.Dura } } - for _, c := range conditions { + for _, c := range unhealthyConds { unhealthyTypes = append(unhealthyTypes, string(c.Type)) } From 11f19207bd2940b596cbda55288d63754398fd60 Mon Sep 17 00:00:00 2001 From: Gerrit Date: Wed, 11 Feb 2026 14:23:14 +0100 Subject: [PATCH 4/8] Build. --- .github/workflows/docker.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 6b42cde..a02d760 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -2,8 +2,8 @@ name: Docker Build Action on: pull_request: - branches: - - main + # branches: + # - main release: types: - published From 8cd0fad6c05e2c6a1f048e73dbb75e6f5360fd9d Mon Sep 17 00:00:00 2001 From: Gerrit Date: Wed, 11 Feb 2026 14:26:05 +0100 Subject: [PATCH 5/8] Synctest is not available. --- api/v2/zz_generated.deepcopy.go | 21 +++++++++++++++++++++ integration/suite_test.go | 6 +++--- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/api/v2/zz_generated.deepcopy.go b/api/v2/zz_generated.deepcopy.go index b9d9399..393c86c 100644 --- a/api/v2/zz_generated.deepcopy.go +++ b/api/v2/zz_generated.deepcopy.go @@ -6,6 +6,7 @@ package v2 import ( runtime "k8s.io/apimachinery/pkg/runtime" + timex "time" ) // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. @@ -742,6 +743,26 @@ func (in *FirewallStatus) DeepCopy() *FirewallStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FirewallStatusEvalResult) DeepCopyInto(out *FirewallStatusEvalResult) { + *out = *in + if in.TimeoutIn != nil { + in, out := &in.TimeoutIn, &out.TimeoutIn + *out = new(timex.Duration) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FirewallStatusEvalResult. +func (in *FirewallStatusEvalResult) DeepCopy() *FirewallStatusEvalResult { + if in == nil { + return nil + } + out := new(FirewallStatusEvalResult) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *FirewallTemplateSpec) DeepCopyInto(out *FirewallTemplateSpec) { *out = *in diff --git a/integration/suite_test.go b/integration/suite_test.go index e4c633b..5619ac3 100644 --- a/integration/suite_test.go +++ b/integration/suite_test.go @@ -32,9 +32,9 @@ import ( ) const ( - namespaceName = "test" - firewallHealthTimeout = 19 * 24 * time.Hour - firewallCreateTimeout = 19 * 24 * time.Hour + namespaceName = "test" + firewallHealthTimeout = 19 * 24 * time.Hour + firewallCreateTimeout = 19 * 24 * time.Hour ) var ( From 193ebc826571fe72bd2373e5a70bb03f7c6b1819 Mon Sep 17 00:00:00 2001 From: Gerrit Date: Wed, 11 Feb 2026 15:08:17 +0100 Subject: [PATCH 6/8] Update event recorder. --- controllers/timeout/controller.go | 6 +++--- controllers/timeout/reconcile.go | 2 +- main.go | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/controllers/timeout/controller.go b/controllers/timeout/controller.go index f7831a2..317a96a 100644 --- a/controllers/timeout/controller.go +++ b/controllers/timeout/controller.go @@ -2,7 +2,7 @@ package timeout import ( "github.com/go-logr/logr" - "k8s.io/client-go/tools/record" + "k8s.io/client-go/tools/events" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/predicate" @@ -14,10 +14,10 @@ import ( type controller struct { c *config.ControllerConfig log logr.Logger - recorder record.EventRecorder + recorder events.EventRecorder } -func SetupWithManager(log logr.Logger, recorder record.EventRecorder, mgr ctrl.Manager, c *config.ControllerConfig) error { +func SetupWithManager(log logr.Logger, recorder events.EventRecorder, mgr ctrl.Manager, c *config.ControllerConfig) error { if c.GetFirewallHealthTimeout() <= 0 && c.GetCreateTimeout() <= 0 { log.Info("not registering timeout controller because neither create nor health timeout configured") return nil diff --git a/controllers/timeout/reconcile.go b/controllers/timeout/reconcile.go index bca9400..2386c6a 100644 --- a/controllers/timeout/reconcile.go +++ b/controllers/timeout/reconcile.go @@ -49,7 +49,7 @@ func (c *controller) deleteIfUnhealthyOrTimeout(r *controllers.Ctx[*v2.FirewallS return err } - c.recorder.Eventf(fw, "Normal", "Delete", "deleted firewall %s due to %s", fw.Name, status) + c.recorder.Eventf(fw, nil, "Normal", "Delete", "deleted firewall %s due to %s", fw.Name, status) case v2.FirewallStatusUnhealthy: if status.TimeoutIn != nil { diff --git a/main.go b/main.go index 2d43000..deec2c4 100644 --- a/main.go +++ b/main.go @@ -283,7 +283,7 @@ func main() { if err := update.SetupWithManager(ctrl.Log.WithName("controllers").WithName("update"), seedMgr.GetEventRecorder("update-controller"), seedMgr, cc); err != nil { log.Fatalf("unable to setup update controller: %v", err) } - if err := timeout.SetupWithManager(ctrl.Log.WithName("controllers").WithName("timeout"), seedMgr.GetEventRecorderFor("timeout-controller"), seedMgr, cc); err != nil { + if err := timeout.SetupWithManager(ctrl.Log.WithName("controllers").WithName("timeout"), seedMgr.GetEventRecorder("timeout-controller"), seedMgr, cc); err != nil { log.Fatalf("unable to setup timeout controller: %v", err) } From 7ab67a34d546c3dbde6efccfaa1949db602af9aa Mon Sep 17 00:00:00 2001 From: Gerrit Date: Wed, 11 Feb 2026 16:26:44 +0100 Subject: [PATCH 7/8] First set of fixes and integration tests. --- api/v2/types_firewall.go | 4 +- controllers/deployment/delete.go | 3 +- controllers/deployment/reconcile.go | 3 +- controllers/deployment/recreate.go | 4 +- controllers/deployment/rolling.go | 3 +- controllers/firewall/delete.go | 4 +- controllers/firewall/reconcile.go | 11 ++++- controllers/generic_controller.go | 14 ++---- controllers/set/delete.go | 4 +- controllers/set/reconcile.go | 6 ++- controllers/timeout/reconcile.go | 4 +- integration/integration_test.go | 65 ++++++++++++++++++++++++++ integration/metal_resources_test.go | 72 ++++++++++++++--------------- integration/suite_test.go | 8 ++++ 14 files changed, 149 insertions(+), 56 deletions(-) diff --git a/api/v2/types_firewall.go b/api/v2/types_firewall.go index 3eae677..c8cf401 100644 --- a/api/v2/types_firewall.go +++ b/api/v2/types_firewall.go @@ -400,7 +400,9 @@ func EvaluateFirewallStatus(fw *Firewall, createTimeout, healthTimeout time.Dura for _, ct := range cts { cond := fw.Status.Conditions.Get(ct) - if cond == nil || cond.Status != ConditionTrue { + if cond == nil { + res = append(res, &Condition{Type: ct}) + } else if cond.Status != ConditionTrue { res = append(res, cond) } } diff --git a/controllers/deployment/delete.go b/controllers/deployment/delete.go index 7d0298c..5e77bd4 100644 --- a/controllers/deployment/delete.go +++ b/controllers/deployment/delete.go @@ -6,6 +6,7 @@ import ( v2 "github.com/metal-stack/firewall-controller-manager/api/v2" "github.com/metal-stack/firewall-controller-manager/controllers" + corev1 "k8s.io/api/core/v1" ) func (c *controller) Delete(r *controllers.Ctx[*v2.FirewallDeployment]) error { @@ -33,7 +34,7 @@ func (c *controller) deleteFirewallSets(r *controllers.Ctx[*v2.FirewallDeploymen r.Log.Info("set deletion timestamp on firewall set", "set-name", set.Name) - c.recorder.Eventf(set, nil, "Normal", "Delete", "deleted firewallset %s", set.Name) + c.recorder.Eventf(set, nil, corev1.EventTypeNormal, "Delete", "deleting set", "deleted firewall set %s", set.Name) } if len(sets) > 0 { diff --git a/controllers/deployment/reconcile.go b/controllers/deployment/reconcile.go index ef9c981..7ab6c5c 100644 --- a/controllers/deployment/reconcile.go +++ b/controllers/deployment/reconcile.go @@ -8,6 +8,7 @@ import ( "github.com/google/uuid" v2 "github.com/metal-stack/firewall-controller-manager/api/v2" "github.com/metal-stack/firewall-controller-manager/controllers" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/util/retry" @@ -217,7 +218,7 @@ func (c *controller) syncFirewallSet(r *controllers.Ctx[*v2.FirewallDeployment], cond := v2.NewCondition(v2.FirewallDeplomentProgressing, v2.ConditionTrue, "FirewallSetUpdated", fmt.Sprintf("Updated firewall set %q.", set.Name)) r.Target.Status.Conditions.Set(cond) - c.recorder.Eventf(set, nil, "Normal", "Update", "updated firewallset %s", set.Name) + c.recorder.Eventf(set, nil, corev1.EventTypeNormal, "Update", "updating set", "updated firewall set %s", set.Name) return nil } diff --git a/controllers/deployment/recreate.go b/controllers/deployment/recreate.go index da483c1..f5de6ba 100644 --- a/controllers/deployment/recreate.go +++ b/controllers/deployment/recreate.go @@ -6,6 +6,8 @@ import ( v2 "github.com/metal-stack/firewall-controller-manager/api/v2" "github.com/metal-stack/firewall-controller-manager/controllers" + + corev1 "k8s.io/api/core/v1" ) // recreateStrategy first deletes the existing firewall sets and then creates a new one @@ -20,7 +22,7 @@ func (c *controller) recreateStrategy(r *controllers.Ctx[*v2.FirewallDeployment] return err } - c.recorder.Eventf(set, nil, "Normal", "Recreate", "recreated firewallset old: %s new: %s", latestSet.Name, set.Name) + c.recorder.Eventf(set, nil, corev1.EventTypeNormal, "Recreate", "recreating set", "recreated firewall set, old: %s new: %s", latestSet.Name, set.Name) latestSet = set } diff --git a/controllers/deployment/rolling.go b/controllers/deployment/rolling.go index 07de9f6..61c48a5 100644 --- a/controllers/deployment/rolling.go +++ b/controllers/deployment/rolling.go @@ -6,6 +6,7 @@ import ( v2 "github.com/metal-stack/firewall-controller-manager/api/v2" "github.com/metal-stack/firewall-controller-manager/controllers" + corev1 "k8s.io/api/core/v1" ) // rollingUpdateStrategy first creates a new set and deletes the old one's when the new one becomes ready @@ -20,7 +21,7 @@ func (c *controller) rollingUpdateStrategy(r *controllers.Ctx[*v2.FirewallDeploy return err } - c.recorder.Eventf(newSet, nil, "Normal", "Create", "created firewallset %s", newSet.Name) + c.recorder.Eventf(newSet, nil, corev1.EventTypeNormal, "Create", "creating set", "created firewall set %s", newSet.Name) ownedSets = append(ownedSets, newSet) diff --git a/controllers/firewall/delete.go b/controllers/firewall/delete.go index 191584c..20f1de1 100644 --- a/controllers/firewall/delete.go +++ b/controllers/firewall/delete.go @@ -9,6 +9,8 @@ import ( "github.com/metal-stack/firewall-controller-manager/controllers" "github.com/metal-stack/metal-go/api/client/machine" apierrors "k8s.io/apimachinery/pkg/api/errors" + + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -46,7 +48,7 @@ func (c *controller) Delete(r *controllers.Ctx[*v2.Firewall]) error { r.Log.Info("deleted firewall", "firewall-name", f.Name, "id", *resp.Payload.ID) - c.recorder.Eventf(r.Target, nil, "Normal", "Delete", "deleted firewall %s id %s", r.Target.Name, *resp.Payload.ID) + c.recorder.Eventf(r.Target, nil, corev1.EventTypeNormal, "Delete", "deleting firewall", "deleted firewall %s id %s", r.Target.Name, *resp.Payload.ID) } return nil diff --git a/controllers/firewall/reconcile.go b/controllers/firewall/reconcile.go index 26170ae..13f25e6 100644 --- a/controllers/firewall/reconcile.go +++ b/controllers/firewall/reconcile.go @@ -12,6 +12,8 @@ import ( "github.com/metal-stack/metal-go/api/client/machine" "github.com/metal-stack/metal-go/api/models" "github.com/metal-stack/metal-lib/pkg/pointer" + + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/sets" ) @@ -54,6 +56,11 @@ func (c *controller) Reconcile(r *controllers.Ctx[*v2.Firewall]) error { cond := v2.NewCondition(v2.FirewallCreated, v2.ConditionTrue, "Created", fmt.Sprintf("Firewall %q created successfully.", pointer.SafeDeref(pointer.SafeDeref(f.Allocation).Name))) r.Target.Status.Conditions.Set(cond) + // this is mainly for tests when the firewall is already present + if r.Target.Status.Phase == v2.FirewallPhase("") { + r.Target.Status.Phase = v2.FirewallPhaseCreating + } + var currentStatus *v2.MachineStatus currentStatus, err = getMachineStatus(f) if err != nil { @@ -169,7 +176,7 @@ func (c *controller) createFirewall(r *controllers.Ctx[*v2.Firewall]) (*models.V cond := v2.NewCondition(v2.FirewallCreated, v2.ConditionTrue, "Created", fmt.Sprintf("Firewall %q created successfully.", pointer.SafeDeref(pointer.SafeDeref(resp.Payload.Allocation).Name))) r.Target.Status.Conditions.Set(cond) - c.recorder.Eventf(r.Target, nil, "Normal", "Create", "created firewall %s id %s", r.Target.Name, pointer.SafeDeref(resp.Payload.ID)) + c.recorder.Eventf(r.Target, nil, corev1.EventTypeNormal, "Create", "created firewall %s id %s", "creating firewall", r.Target.Name, pointer.SafeDeref(resp.Payload.ID)) return resp.Payload, nil } @@ -189,6 +196,7 @@ func isFirewallProgressing(status *v2.MachineStatus) bool { if status.LastEvent.Event != "Phoned Home" { return true } + return false } @@ -207,6 +215,7 @@ func isFirewallReady(status *v2.MachineStatus) bool { if status.LastEvent.Event == "Phoned Home" { return true } + return false } diff --git a/controllers/generic_controller.go b/controllers/generic_controller.go index 5a585e2..73408e3 100644 --- a/controllers/generic_controller.go +++ b/controllers/generic_controller.go @@ -103,8 +103,7 @@ func (g GenericController[O]) Reconcile(ctx context.Context, req ctrl.Request) ( log.Info("reconciling resource deletion flow") err := g.reconciler.Delete(rctx) if err != nil { - var requeueErr *requeueError - if errors.As(err, &requeueErr) { + if requeueErr, ok := errors.AsType[*requeueError](err); ok { log.Info(requeueErr.Error()) return ctrl.Result{RequeueAfter: requeueErr.after}, nil //nolint:nilerr we need to return nil such that the requeue works } @@ -192,16 +191,13 @@ func (g GenericController[O]) Reconcile(ctx context.Context, req ctrl.Request) ( err := g.reconciler.Reconcile(rctx) if err != nil { - var requeueErr *requeueError - - switch { - case errors.As(err, &requeueErr): + if requeueErr, ok := errors.AsType[*requeueError](err); ok { log.Info(requeueErr.Error()) return ctrl.Result{RequeueAfter: requeueErr.after}, nil //nolint:nilerr we need to return nil such that the requeue works - default: - log.Error(err, "error during reconcile") - return ctrl.Result{}, err } + + log.Error(err, "error during reconcile") + return ctrl.Result{}, err } return ctrl.Result{}, statusErr diff --git a/controllers/set/delete.go b/controllers/set/delete.go index 8f44a3c..1ab22a4 100644 --- a/controllers/set/delete.go +++ b/controllers/set/delete.go @@ -6,6 +6,8 @@ import ( v2 "github.com/metal-stack/firewall-controller-manager/api/v2" "github.com/metal-stack/firewall-controller-manager/controllers" + + corev1 "k8s.io/api/core/v1" ) func (c *controller) Delete(r *controllers.Ctx[*v2.FirewallSet]) error { @@ -33,7 +35,7 @@ func (c *controller) deleteFirewalls(r *controllers.Ctx[*v2.FirewallSet], fws .. r.Log.Info("set deletion timestamp on firewall", "firewall-name", fw.Name) - c.recorder.Eventf(fw, nil, "Normal", "Delete", "deleted firewall %s", fw.Name) + c.recorder.Eventf(fw, nil, corev1.EventTypeNormal, "Delete", "deleting firewall", "deleted firewall %s", fw.Name) } if len(fws) > 0 { diff --git a/controllers/set/reconcile.go b/controllers/set/reconcile.go index 0cb8c38..3f0aa53 100644 --- a/controllers/set/reconcile.go +++ b/controllers/set/reconcile.go @@ -8,8 +8,10 @@ import ( "github.com/google/uuid" v2 "github.com/metal-stack/firewall-controller-manager/api/v2" "github.com/metal-stack/firewall-controller-manager/controllers" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) func (c *controller) Reconcile(r *controllers.Ctx[*v2.FirewallSet]) error { @@ -90,7 +92,7 @@ func (c *controller) Reconcile(r *controllers.Ctx[*v2.FirewallSet]) error { r.Log.Info("firewall created", "firewall-name", fw.Name) - c.recorder.Eventf(r.Target, nil, "Normal", "Create", "created firewall %s", fw.Name) + c.recorder.Eventf(r.Target, nil, corev1.EventTypeNormal, "Create", "creating firewall", "created firewall %s", fw.Name) ownedFirewalls = append(ownedFirewalls, fw) } diff --git a/controllers/timeout/reconcile.go b/controllers/timeout/reconcile.go index 2386c6a..ba97a85 100644 --- a/controllers/timeout/reconcile.go +++ b/controllers/timeout/reconcile.go @@ -6,6 +6,8 @@ import ( v2 "github.com/metal-stack/firewall-controller-manager/api/v2" "github.com/metal-stack/firewall-controller-manager/controllers" + + corev1 "k8s.io/api/core/v1" ) func (c *controller) Reconcile(r *controllers.Ctx[*v2.FirewallSet]) error { @@ -49,7 +51,7 @@ func (c *controller) deleteIfUnhealthyOrTimeout(r *controllers.Ctx[*v2.FirewallS return err } - c.recorder.Eventf(fw, nil, "Normal", "Delete", "deleted firewall %s due to %s", fw.Name, status) + c.recorder.Eventf(fw, nil, corev1.EventTypeNormal, "Delete", "deleting firewall", "deleted firewall %s due to %s", fw.Name, status) case v2.FirewallStatusUnhealthy: if status.TimeoutIn != nil { diff --git a/integration/integration_test.go b/integration/integration_test.go index df10bc3..1b51a7f 100644 --- a/integration/integration_test.go +++ b/integration/integration_test.go @@ -315,6 +315,7 @@ var _ = Context("integration test", Ordered, func() { Expect(cond.Reason).To(Equal("Connected")) Expect(cond.Message).To(Equal(fmt.Sprintf("Controller reconciled shoot at %s.", mon.ControllerStatus.Updated.String()))) }) + It("should have the firewall-controller connected to seed condition true", func() { cond := testcommon.WaitForCondition(k8sClient, ctx, fw.DeepCopy(), func(fd *v2.Firewall) v2.Conditions { return fd.Status.Conditions @@ -325,6 +326,7 @@ var _ = Context("integration test", Ordered, func() { Expect(cond.Reason).To(Equal("Connected")) Expect(cond.Message).To(Equal(fmt.Sprintf("Controller reconciled firewall at %s.", mon.ControllerStatus.SeedUpdated.String()))) }) + It("should have configured the distance", func() { cond := testcommon.WaitForCondition(k8sClient, ctx, fw.DeepCopy(), func(fd *v2.Firewall) v2.Conditions { return fd.Status.Conditions @@ -335,6 +337,13 @@ var _ = Context("integration test", Ordered, func() { Expect(cond.Reason).To(Equal("Configured")) Expect(cond.Message).To(Equal(fmt.Sprintf("Controller has configured the specified distance %d.", v2.FirewallShortestDistance))) }) + + It("should be in the running phase", func() { + Eventually(func() v2.FirewallPhase { + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(fw), fw)).To(Succeed()) + return fw.Status.Phase + }, 5*time.Second, interval).Should(Equal(v2.FirewallPhaseRunning)) + }) }) Context("the firewall set resource", func() { @@ -580,6 +589,13 @@ var _ = Context("integration test", Ordered, func() { Expect(cond.LastTransitionTime).NotTo(BeZero()) }) + It("should be in the creating phase", func() { + Eventually(func() v2.FirewallPhase { + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(fw), fw)).To(Succeed()) + return fw.Status.Phase + }, 5*time.Second, interval).Should(Equal(v2.FirewallPhaseCreating)) + }) + It("should have firewall networks populated", func() { var nws []v2.FirewallNetwork var fw = fw.DeepCopy() @@ -1006,6 +1022,17 @@ var _ = Context("integration test", Ordered, func() { Expect(cond.Message).To(Equal(fmt.Sprintf("Firewall %q is phoning home and alive.", *firewall1.Allocation.Name))) }) + It("should have the provisioned condition true", func() { + cond := testcommon.WaitForCondition(k8sClient, ctx, fw.DeepCopy(), func(fd *v2.Firewall) v2.Conditions { + return fd.Status.Conditions + }, v2.FirewallProvisioned, v2.ConditionTrue, 15*time.Second) + + Expect(cond.LastTransitionTime).NotTo(BeZero()) + Expect(cond.LastUpdateTime).NotTo(BeZero()) + Expect(cond.Reason).To(Equal("Provisioned")) + Expect(cond.Message).To(Equal("All firewall conditions have been met.")) + }) + It("should have the monitor condition true", func() { cond := testcommon.WaitForCondition(k8sClient, ctx, fw.DeepCopy(), func(fd *v2.Firewall) v2.Conditions { return fd.Status.Conditions @@ -1027,6 +1054,7 @@ var _ = Context("integration test", Ordered, func() { Expect(cond.Reason).To(Equal("Connected")) Expect(cond.Message).To(Equal(fmt.Sprintf("Controller reconciled shoot at %s.", mon.ControllerStatus.Updated.String()))) }) + It("should have the firewall-controller connected to seed condition true", func() { cond := testcommon.WaitForCondition(k8sClient, ctx, fw.DeepCopy(), func(fd *v2.Firewall) v2.Conditions { return fd.Status.Conditions @@ -1037,6 +1065,7 @@ var _ = Context("integration test", Ordered, func() { Expect(cond.Reason).To(Equal("Connected")) Expect(cond.Message).To(Equal(fmt.Sprintf("Controller reconciled firewall at %s.", mon.ControllerStatus.SeedUpdated.String()))) }) + It("should have configured the distance", func() { cond := testcommon.WaitForCondition(k8sClient, ctx, fw.DeepCopy(), func(fd *v2.Firewall) v2.Conditions { return fd.Status.Conditions @@ -1047,6 +1076,13 @@ var _ = Context("integration test", Ordered, func() { Expect(cond.Reason).To(Equal("Configured")) Expect(cond.Message).To(Equal(fmt.Sprintf("Controller has configured the specified distance %d.", v2.FirewallShortestDistance))) }) + + It("should be in the running phase", func() { + Eventually(func() v2.FirewallPhase { + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(fw), fw)).To(Succeed()) + return fw.Status.Phase + }, 5*time.Second, interval).Should(Equal(v2.FirewallPhaseRunning)) + }) }) Context("the firewall set resource", func() { @@ -1374,6 +1410,17 @@ var _ = Context("integration test", Ordered, func() { Expect(cond.Message).To(Equal(fmt.Sprintf("Firewall %q is phoning home and alive.", *readyFirewall.Allocation.Name))) }) + It("should have the provisioned condition true", func() { + cond := testcommon.WaitForCondition(k8sClient, ctx, newFw.DeepCopy(), func(fd *v2.Firewall) v2.Conditions { + return fd.Status.Conditions + }, v2.FirewallProvisioned, v2.ConditionTrue, 15*time.Second) + + Expect(cond.LastTransitionTime).NotTo(BeZero()) + Expect(cond.LastUpdateTime).NotTo(BeZero()) + Expect(cond.Reason).To(Equal("Provisioned")) + Expect(cond.Message).To(Equal("All firewall conditions have been met.")) + }) + It("should have the monitor condition true", func() { cond := testcommon.WaitForCondition(k8sClient, ctx, newFw.DeepCopy(), func(fd *v2.Firewall) v2.Conditions { return fd.Status.Conditions @@ -1768,6 +1815,17 @@ var _ = Context("integration test", Ordered, func() { Expect(cond.Message).To(Equal(fmt.Sprintf("Firewall %q is phoning home and alive.", *firewall1.Allocation.Name))) }) + It("should have the provisioned condition true", func() { + cond := testcommon.WaitForCondition(k8sClient, ctx, fw.DeepCopy(), func(fd *v2.Firewall) v2.Conditions { + return fd.Status.Conditions + }, v2.FirewallProvisioned, v2.ConditionTrue, 15*time.Second) + + Expect(cond.LastTransitionTime).NotTo(BeZero()) + Expect(cond.LastUpdateTime).NotTo(BeZero()) + Expect(cond.Reason).To(Equal("Provisioned")) + Expect(cond.Message).To(Equal("All firewall conditions have been met.")) + }) + It("should have the monitor condition true", func() { cond := testcommon.WaitForCondition(k8sClient, ctx, fw.DeepCopy(), func(fd *v2.Firewall) v2.Conditions { return fd.Status.Conditions @@ -1789,6 +1847,13 @@ var _ = Context("integration test", Ordered, func() { Expect(cond.Reason).To(Equal("NotChecking")) Expect(cond.Message).To(Equal("Not checking controller connection due to firewall annotation.")) }) + + It("should be in the running phase", func() { + Eventually(func() v2.FirewallPhase { + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(fw), fw)).To(Succeed()) + return fw.Status.Phase + }, 5*time.Second, interval).Should(Equal(v2.FirewallPhaseRunning)) + }) }) Context("the firewall set resource", func() { diff --git a/integration/metal_resources_test.go b/integration/metal_resources_test.go index 64972da..24e7925 100644 --- a/integration/metal_resources_test.go +++ b/integration/metal_resources_test.go @@ -313,86 +313,86 @@ var ( firewall3 = &models.V1FirewallResponse{ Allocation: &models.V1MachineAllocation{ BootInfo: &models.V1BootInfo{ - Bootloaderid: pointer.Pointer("bootloaderid"), - Cmdline: pointer.Pointer("cmdline"), - ImageID: pointer.Pointer("imageid"), - Initrd: pointer.Pointer("initrd"), - Kernel: pointer.Pointer("kernel"), - OsPartition: pointer.Pointer("ospartition"), - PrimaryDisk: pointer.Pointer("primarydisk"), + Bootloaderid: new("bootloaderid"), + Cmdline: new("cmdline"), + ImageID: new("imageid"), + Initrd: new("initrd"), + Kernel: new("kernel"), + OsPartition: new("ospartition"), + PrimaryDisk: new("primarydisk"), }, - Created: pointer.Pointer(strfmt.DateTime(testTime.Add(-20 * 24 * time.Hour))), - Creator: pointer.Pointer("creator"), + Created: new(strfmt.DateTime(testTime.Add(-20 * 24 * time.Hour))), + Creator: new("creator"), Description: "firewall allocation 3", Filesystemlayout: fsl1, - Hostname: pointer.Pointer("firewall-hostname-3"), + Hostname: new("firewall-hostname-3"), Image: image1, - Name: pointer.Pointer("firewall-3"), + Name: new("firewall-3"), Networks: []*models.V1MachineNetwork{ { - Asn: pointer.Pointer(int64(200)), + Asn: new(int64(200)), Destinationprefixes: []string{"2.2.2.2"}, Ips: []string{"1.1.1.1"}, - Nat: pointer.Pointer(false), - Networkid: pointer.Pointer("private"), + Nat: new(false), + Networkid: new("private"), Networktype: pointer.Pointer(net.PrivatePrimaryUnshared), Prefixes: []string{"prefixes"}, - Private: pointer.Pointer(true), - Underlay: pointer.Pointer(false), - Vrf: pointer.Pointer(int64(100)), + Private: new(true), + Underlay: new(false), + Vrf: new(int64(100)), }, }, - Project: pointer.Pointer("project-1"), - Reinstall: pointer.Pointer(false), + Project: new("project-1"), + Reinstall: new(false), Role: pointer.Pointer(models.V1MachineAllocationRoleFirewall), SSHPubKeys: []string{"sshpubkey"}, - Succeeded: pointer.Pointer(true), + Succeeded: new(true), UserData: "---userdata---", }, Bios: &models.V1MachineBIOS{ - Date: pointer.Pointer("biosdata"), - Vendor: pointer.Pointer("biosvendor"), - Version: pointer.Pointer("biosversion"), + Date: new("biosdata"), + Vendor: new("biosvendor"), + Version: new("biosversion"), }, Description: "firewall 1", Events: &models.V1MachineRecentProvisioningEvents{ - CrashLoop: pointer.Pointer(true), - FailedMachineReclaim: pointer.Pointer(true), + CrashLoop: new(true), + FailedMachineReclaim: new(true), LastErrorEvent: &models.V1MachineProvisioningEvent{ - Event: pointer.Pointer("Crashed"), + Event: new("Crashed"), Message: "crash", Time: strfmt.DateTime(testTime.Add(-10 * 24 * time.Hour)), }, LastEventTime: strfmt.DateTime(testTime.Add(-7 * 24 * time.Hour)), Log: []*models.V1MachineProvisioningEvent{ { - Event: pointer.Pointer("Phoned Home"), + Event: new("Phoned Home"), Message: "phoning home", Time: strfmt.DateTime(testTime.Add(-7 * 24 * time.Hour)), }, }, }, Hardware: &models.V1MachineHardware{ - CPUCores: pointer.Pointer(int32(16)), + CPUCores: new(int32(16)), Disks: []*models.V1MachineBlockDevice{}, - Memory: pointer.Pointer(int64(32)), + Memory: new(int64(32)), Nics: []*models.V1MachineNic{}, }, - ID: pointer.Pointer("3"), + ID: new("3"), Ledstate: &models.V1ChassisIdentifyLEDState{ - Description: pointer.Pointer(""), - Value: pointer.Pointer(""), + Description: new(""), + Value: new(""), }, - Liveliness: pointer.Pointer("Unhealthy"), + Liveliness: new("Unhealthy"), Name: "firewall-3", Partition: partition1, Rackid: "rack-1", Size: size1, State: &models.V1MachineState{ - Description: pointer.Pointer("state"), + Description: new("state"), Issuer: "issuer", - MetalHammerVersion: pointer.Pointer("version"), - Value: pointer.Pointer(""), + MetalHammerVersion: new("version"), + Value: new(""), }, Tags: []string{"a"}, } diff --git a/integration/suite_test.go b/integration/suite_test.go index 5619ac3..fe5055b 100644 --- a/integration/suite_test.go +++ b/integration/suite_test.go @@ -169,6 +169,14 @@ var _ = BeforeSuite(func() { ) Expect(err).ToNot(HaveOccurred()) + // err = timeout.SetupWithManager( + // ctrl.Log.WithName("controllers").WithName("timeout"), + // mgr.GetEventRecorder("timeout-controller"), + // mgr, + // cc, + // ) + // Expect(err).ToNot(HaveOccurred()) + err = deployment.SetupWebhookWithManager(ctrl.Log.WithName("defaulting-webhook"), mgr, cc) Expect(err).ToNot(HaveOccurred()) err = set.SetupWebhookWithManager(ctrl.Log.WithName("defaulting-webhook"), mgr, cc) From 8b221843b4ae6b8b9f2a66d3b6f2a623f7ae186a Mon Sep 17 00:00:00 2001 From: Gerrit Date: Wed, 11 Feb 2026 16:47:09 +0100 Subject: [PATCH 8/8] Do not use generic controller for timeout controller. Otherwise it will try to deal with finalizers, deletion flows and so on. --- controllers/timeout/controller.go | 31 +++++++--------- controllers/timeout/reconcile.go | 61 +++++++++++++++++++++---------- integration/suite_test.go | 15 ++++---- 3 files changed, 63 insertions(+), 44 deletions(-) diff --git a/controllers/timeout/controller.go b/controllers/timeout/controller.go index 317a96a..a7bbc31 100644 --- a/controllers/timeout/controller.go +++ b/controllers/timeout/controller.go @@ -4,6 +4,7 @@ import ( "github.com/go-logr/logr" "k8s.io/client-go/tools/events" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/predicate" v2 "github.com/metal-stack/firewall-controller-manager/api/v2" @@ -12,9 +13,11 @@ import ( ) type controller struct { - c *config.ControllerConfig - log logr.Logger - recorder events.EventRecorder + c *config.ControllerConfig + client client.Client + namespace string + log logr.Logger + recorder events.EventRecorder } func SetupWithManager(log logr.Logger, recorder events.EventRecorder, mgr ctrl.Manager, c *config.ControllerConfig) error { @@ -23,11 +26,13 @@ func SetupWithManager(log logr.Logger, recorder events.EventRecorder, mgr ctrl.M return nil } - g := controllers.NewGenericController(log, c.GetSeedClient(), c.GetSeedNamespace(), &controller{ - c: c, - log: log, - recorder: recorder, - }).WithoutStatus() + g := &controller{ + c: c, + log: log, + client: c.GetSeedClient(), + namespace: c.GetSeedNamespace(), + recorder: recorder, + } return ctrl.NewControllerManagedBy(mgr). For( @@ -37,13 +42,3 @@ func SetupWithManager(log logr.Logger, recorder events.EventRecorder, mgr ctrl.M WithEventFilter(predicate.NewPredicateFuncs(controllers.SkipOtherNamespace(c.GetSeedNamespace()))). Complete(g) } - -func (c *controller) New() *v2.FirewallSet { - return &v2.FirewallSet{} -} - -func (c *controller) SetStatus(_ *v2.FirewallSet, _ *v2.FirewallSet) {} - -func (c *controller) Delete(_ *controllers.Ctx[*v2.FirewallSet]) error { - return nil -} diff --git a/controllers/timeout/reconcile.go b/controllers/timeout/reconcile.go index ba97a85..95748e3 100644 --- a/controllers/timeout/reconcile.go +++ b/controllers/timeout/reconcile.go @@ -1,32 +1,50 @@ package timeout import ( + "context" "fmt" "sort" v2 "github.com/metal-stack/firewall-controller-manager/api/v2" "github.com/metal-stack/firewall-controller-manager/controllers" + apierrors "k8s.io/apimachinery/pkg/api/errors" + + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" corev1 "k8s.io/api/core/v1" ) -func (c *controller) Reconcile(r *controllers.Ctx[*v2.FirewallSet]) error { - ownedFirewalls, _, err := controllers.GetOwnedResources(r.Ctx, c.c.GetSeedClient(), r.Target.Spec.Selector, r.Target, &v2.FirewallList{}, func(fl *v2.FirewallList) []*v2.Firewall { - return fl.GetItems() - }) - if err != nil { - return fmt.Errorf("unable to get owned firewalls: %w", err) +func (c *controller) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + if req.Namespace != c.namespace { // should already be filtered out through predicate, but we will check anyway + return ctrl.Result{}, nil } - err = c.deleteIfUnhealthyOrTimeout(r, ownedFirewalls...) + set := &v2.FirewallSet{} + if err := c.client.Get(ctx, req.NamespacedName, set, &client.GetOptions{}); err != nil { + if apierrors.IsNotFound(err) { + c.log.Info("resource no longer exists") + return ctrl.Result{}, nil + } + + return ctrl.Result{}, fmt.Errorf("error retrieving resource: %w", err) + } + + if !set.GetDeletionTimestamp().IsZero() { + return ctrl.Result{}, nil + } + + ownedFirewalls, _, err := controllers.GetOwnedResources(ctx, c.c.GetSeedClient(), set.Spec.Selector, set, &v2.FirewallList{}, func(fl *v2.FirewallList) []*v2.Firewall { + return fl.GetItems() + }) if err != nil { - return err + return ctrl.Result{}, fmt.Errorf("unable to get owned firewalls: %w", err) } - return nil + return c.deleteIfUnhealthyOrTimeout(ctx, ownedFirewalls...) } -func (c *controller) deleteIfUnhealthyOrTimeout(r *controllers.Ctx[*v2.FirewallSet], fws ...*v2.Firewall) error { +func (c *controller) deleteIfUnhealthyOrTimeout(ctx context.Context, fws ...*v2.Firewall) (ctrl.Result, error) { type fwWithStatus struct { firewall *v2.Firewall status *v2.FirewallStatusEvalResult @@ -39,16 +57,16 @@ func (c *controller) deleteIfUnhealthyOrTimeout(r *controllers.Ctx[*v2.FirewallS switch status.Result { case v2.FirewallStatusCreateTimeout, v2.FirewallStatusHealthTimeout: - r.Log.Info("firewall timeout exceeded, deleting from set", "reason", status.Reason, "firewall-name", fw.Name) + c.log.Info("firewall timeout exceeded, deleting from set", "reason", status.Reason, "firewall-name", fw.Name) if fw.DeletionTimestamp != nil { - r.Log.Info("deletion timestamp on firewall already set", "firewall-name", fw.Name) + c.log.Info("deletion timestamp on firewall already set", "firewall-name", fw.Name) continue } - err := c.c.GetSeedClient().Delete(r.Ctx, fw) + err := c.c.GetSeedClient().Delete(ctx, fw) if err != nil { - return err + return ctrl.Result{}, err } c.recorder.Eventf(fw, nil, corev1.EventTypeNormal, "Delete", "deleting firewall", "deleted firewall %s due to %s", fw.Name, status) @@ -68,12 +86,17 @@ func (c *controller) deleteIfUnhealthyOrTimeout(r *controllers.Ctx[*v2.FirewallS return *nextTimeouts[i].status.TimeoutIn < *nextTimeouts[j].status.TimeoutIn }) - nextTimeout := nextTimeouts[0] + var ( + nextTimeout = nextTimeouts[0] + in = *nextTimeout.status.TimeoutIn + ) + + c.log.Info("scheduled check for next health timeout", "firewall-name", nextTimeout.firewall, "reason", nextTimeout.status.Reason, "in", in.String()) - return controllers.RequeueAfter( - *nextTimeout.status.TimeoutIn, - fmt.Sprintf("checking for timeout of firewall %q (reason: %s)", nextTimeout.firewall.Name, nextTimeout.status.Reason)) + return ctrl.Result{ + RequeueAfter: in, + }, nil } - return nil + return ctrl.Result{}, nil } diff --git a/integration/suite_test.go b/integration/suite_test.go index fe5055b..8db2fed 100644 --- a/integration/suite_test.go +++ b/integration/suite_test.go @@ -16,6 +16,7 @@ import ( "github.com/metal-stack/firewall-controller-manager/controllers/firewall" "github.com/metal-stack/firewall-controller-manager/controllers/monitor" "github.com/metal-stack/firewall-controller-manager/controllers/set" + "github.com/metal-stack/firewall-controller-manager/controllers/timeout" "github.com/metal-stack/firewall-controller-manager/controllers/update" metalclient "github.com/metal-stack/metal-go/test/client" "github.com/metal-stack/metal-lib/pkg/tag" @@ -169,13 +170,13 @@ var _ = BeforeSuite(func() { ) Expect(err).ToNot(HaveOccurred()) - // err = timeout.SetupWithManager( - // ctrl.Log.WithName("controllers").WithName("timeout"), - // mgr.GetEventRecorder("timeout-controller"), - // mgr, - // cc, - // ) - // Expect(err).ToNot(HaveOccurred()) + err = timeout.SetupWithManager( + ctrl.Log.WithName("controllers").WithName("timeout"), + mgr.GetEventRecorder("timeout-controller"), + mgr, + cc, + ) + Expect(err).ToNot(HaveOccurred()) err = deployment.SetupWebhookWithManager(ctrl.Log.WithName("defaulting-webhook"), mgr, cc) Expect(err).ToNot(HaveOccurred())