From d29005b533f4dc2fd6e25b213b897ea12b8216ae Mon Sep 17 00:00:00 2001 From: Amine Date: Tue, 3 Mar 2026 15:49:13 -0800 Subject: [PATCH] Verify resource deletion before removing finalizer Some AWS APIs return success from Delete immediately while the resource transitions through a deleting state. This change ensures the reconciler verifies the resource is fully deleted (ReadOne returns NotFound) before removing the finalizer and allowing the Kubernetes CR to be deleted. When a resource still exists after Delete, the reconciler sets ACK.ResourceSynced to False with reason "Deleting" and requeues after 15 seconds. This prevents orphaned AWS resources when the CR is deleted before AWS finishes cleaning up. --- pkg/condition/condition.go | 2 + pkg/runtime/reconciler.go | 31 +++++- pkg/runtime/reconciler_test.go | 169 +++++++++++++++++++++++++++++++++ 3 files changed, 198 insertions(+), 4 deletions(-) diff --git a/pkg/condition/condition.go b/pkg/condition/condition.go index 8e9c449e..a4ac9297 100644 --- a/pkg/condition/condition.go +++ b/pkg/condition/condition.go @@ -37,6 +37,8 @@ var ( SyncedMessage = "Resource synced successfully" FailedReferenceResolutionMessage = "Reference resolution failed" UnavailableIAMRoleMessage = "IAM Role is not available" + DeletingMessage = "deletion in progress" + DeletingReason = "Deleting" IAMRoleSelectedReason = "Selected" IAMRoleSelectedMessage = "roleARN: %s, selectorName: %s, selectorResourceVersion: %s" diff --git a/pkg/runtime/reconciler.go b/pkg/runtime/reconciler.go index 2abf836d..3049526f 100644 --- a/pkg/runtime/reconciler.go +++ b/pkg/runtime/reconciler.go @@ -1129,13 +1129,36 @@ func (r *resourceReconciler) deleteResource( latest, _ = r.patchResourceMetadataAndSpec(ctx, rm, current, latest) } if err != nil { - // NOTE: Delete() implementations that have asynchronously-completing - // deletions should return a RequeueNeededAfter. return latest, err } - // Now that external AWS service resources have been appropriately cleaned - // up, we remove the finalizer representing the CR is managed by ACK, + // Verify the AWS resource is actually deleted before removing the + // finalizer. Some AWS APIs return success immediately while the resource + // transitions through a "deleting" state. We need to ensure the resource + // is fully gone (ReadOne returns NotFound) before allowing the CR to be + // deleted from Kubernetes. + rlog.Enter("rm.ReadOne (verify deletion)") + _, verifyErr := rm.ReadOne(ctx, current) + rlog.Exit("rm.ReadOne (verify deletion)", verifyErr) + if verifyErr == nil { + // Resource still exists, requeue to wait for deletion to complete + resForCondition := latest + if !ackcompare.IsNotNil(resForCondition) { + resForCondition = current + } + ackcondition.SetSynced(resForCondition, corev1.ConditionFalse, &condition.DeletingMessage, &condition.DeletingReason) + return resForCondition, requeue.NeededAfter( + errors.New(condition.DeletingMessage), + 15*time.Second, + ) + } + if verifyErr != ackerr.NotFound { + // Some other error occurred during verification, return it + return latest, verifyErr + } + + // Now that external AWS service resources have been confirmed deleted, + // we remove the finalizer representing the CR is managed by ACK, // allowing the CR to be deleted by the Kubernetes API server if ackcompare.IsNotNil(latest) { err = r.setResourceUnmanaged(ctx, rm, latest) diff --git a/pkg/runtime/reconciler_test.go b/pkg/runtime/reconciler_test.go index 9dd9cac5..1cb5b3d3 100644 --- a/pkg/runtime/reconciler_test.go +++ b/pkg/runtime/reconciler_test.go @@ -1912,3 +1912,172 @@ func TestReconcile_AccountDrifted(t *testing.T) { assert.Contains(t, err.Error(), "Resource already exists in account 111111111111") assert.Contains(t, err.Error(), "but the role used for reconciliation is in account 222222222222") } + +// TestReconcilerDelete_VerifyDeletionNotFound tests that when a resource is deleted +// and the verification ReadOne returns NotFound, the finalizer is removed. +func TestReconcilerDelete_VerifyDeletionNotFound(t *testing.T) { + require := require.New(t) + + ctx := context.TODO() + arn := ackv1alpha1.AWSResourceName("mybook-arn") + + desired, desiredRTObj, metaObj := resourceMocks() + // Set deletion policy annotation to avoid namespace cache lookup + metaObj.SetAnnotations(map[string]string{ + ackv1alpha1.AnnotationDeletionPolicy: string(ackv1alpha1.DeletionPolicyDelete), + }) + desired.On("ReplaceConditions", []*ackv1alpha1.Condition{}).Return() + desired.On("IsBeingDeleted").Return(true) + + ids := &ackmocks.AWSResourceIdentifiers{} + ids.On("ARN").Return(&arn) + desired.On("Identifiers").Return(ids) + desired.On("Conditions").Return([]*ackv1alpha1.Condition{}) + + // First ReadOne returns the resource (exists) + // Delete succeeds + // Second ReadOne (verification) returns NotFound + rm := &ackmocks.AWSResourceManager{} + rm.On("ResolveReferences", ctx, nil, desired).Return(desired, false, nil) + rm.On("ClearResolvedReferences", desired).Return(desired) + rm.On("ReadOne", ctx, desired).Return(desired, nil).Once() + rm.On("Delete", ctx, desired).Return(desired, nil) + rm.On("ReadOne", ctx, desired).Return(nil, ackerr.NotFound).Once() + + rmf, rd := managedResourceManagerFactoryMocks(desired, desired) + rd.On("IsManaged", desired).Return(true) + rd.On("MarkUnmanaged", desired).Return() + rd.On("ResourceFromRuntimeObject", desiredRTObj).Return(desired) + rd.On("Delta", desired, desired).Return(ackcompare.NewDelta()) + + r, kc, _ := reconcilerMocks(rmf) + kc.On("Patch", withoutCancelContextMatcher, desiredRTObj, mock.AnythingOfType("*client.mergeFromPatch")).Return(nil) + + // Set DeletionTimestamp on the meta object + now := metav1.Now() + metaObj.SetDeletionTimestamp(&now) + + // Call reconcile (not Sync) since delete happens in reconcile + reconciler := r.(*resourceReconciler) + _, err := reconciler.reconcile(ctx, rm, desired) + + require.Nil(err) + rm.AssertCalled(t, "ReadOne", ctx, desired) + rm.AssertCalled(t, "Delete", ctx, desired) + rm.AssertNumberOfCalls(t, "ReadOne", 2) + rd.AssertCalled(t, "MarkUnmanaged", desired) +} + +// TestReconcilerDelete_VerifyDeletionStillExists tests that when a resource is deleted +// but the verification ReadOne returns success (resource still exists), it requeues. +func TestReconcilerDelete_VerifyDeletionStillExists(t *testing.T) { + require := require.New(t) + + ctx := context.TODO() + arn := ackv1alpha1.AWSResourceName("mybook-arn") + + desired, _, metaObj := resourceMocks() + // Set deletion policy annotation to avoid namespace cache lookup + metaObj.SetAnnotations(map[string]string{ + ackv1alpha1.AnnotationDeletionPolicy: string(ackv1alpha1.DeletionPolicyDelete), + }) + desired.On("ReplaceConditions", mock.AnythingOfType("[]*v1alpha1.Condition")).Return() + desired.On("IsBeingDeleted").Return(true) + + ids := &ackmocks.AWSResourceIdentifiers{} + ids.On("ARN").Return(&arn) + desired.On("Identifiers").Return(ids) + desired.On("Conditions").Return([]*ackv1alpha1.Condition{}) + + // First ReadOne returns the resource (exists) + // Delete succeeds + // Second ReadOne (verification) also returns success (resource still exists, e.g., in "deleting" state) + rm := &ackmocks.AWSResourceManager{} + rm.On("ResolveReferences", ctx, nil, desired).Return(desired, false, nil) + rm.On("ClearResolvedReferences", desired).Return(desired) + rm.On("ReadOne", ctx, desired).Return(desired, nil).Once() + rm.On("Delete", ctx, desired).Return(desired, nil) + rm.On("ReadOne", ctx, desired).Return(desired, nil).Once() + + rmf, rd := managedResourceManagerFactoryMocks(desired, desired) + rd.On("IsManaged", desired).Return(true) + rd.On("Delta", desired, desired).Return(ackcompare.NewDelta()) + + r, _, _ := reconcilerMocks(rmf) + + // Set DeletionTimestamp on the meta object + now := metav1.Now() + metaObj.SetDeletionTimestamp(&now) + + reconciler := r.(*resourceReconciler) + _, err := reconciler.reconcile(ctx, rm, desired) + + // Should return a requeue error + require.NotNil(err) + var requeueErr *requeue.RequeueNeededAfter + require.True(errors.As(err, &requeueErr)) + assert.Equal(t, 15*time.Second, requeueErr.Duration()) + + rm.AssertCalled(t, "ReadOne", ctx, desired) + rm.AssertCalled(t, "Delete", ctx, desired) + rm.AssertNumberOfCalls(t, "ReadOne", 2) + // MarkUnmanaged should NOT be called since resource still exists + rd.AssertNotCalled(t, "MarkUnmanaged", desired) +} + +// TestReconcilerDelete_VerifyDeletionError tests that when a resource is deleted +// but the verification ReadOne returns an error (other than NotFound), it returns that error. +func TestReconcilerDelete_VerifyDeletionError(t *testing.T) { + require := require.New(t) + + ctx := context.TODO() + arn := ackv1alpha1.AWSResourceName("mybook-arn") + + desired, _, metaObj := resourceMocks() + // Set deletion policy annotation to avoid namespace cache lookup + metaObj.SetAnnotations(map[string]string{ + ackv1alpha1.AnnotationDeletionPolicy: string(ackv1alpha1.DeletionPolicyDelete), + }) + desired.On("ReplaceConditions", []*ackv1alpha1.Condition{}).Return() + desired.On("IsBeingDeleted").Return(true) + + ids := &ackmocks.AWSResourceIdentifiers{} + ids.On("ARN").Return(&arn) + desired.On("Identifiers").Return(ids) + desired.On("Conditions").Return([]*ackv1alpha1.Condition{}) + + verifyError := errors.New("AWS API error during verification") + + // First ReadOne returns the resource (exists) + // Delete succeeds + // Second ReadOne (verification) returns an error + rm := &ackmocks.AWSResourceManager{} + rm.On("ResolveReferences", ctx, nil, desired).Return(desired, false, nil) + rm.On("ClearResolvedReferences", desired).Return(desired) + rm.On("ReadOne", ctx, desired).Return(desired, nil).Once() + rm.On("Delete", ctx, desired).Return(desired, nil) + rm.On("ReadOne", ctx, desired).Return(nil, verifyError).Once() + + rmf, rd := managedResourceManagerFactoryMocks(desired, desired) + rd.On("IsManaged", desired).Return(true) + rd.On("Delta", desired, desired).Return(ackcompare.NewDelta()) + + r, _, _ := reconcilerMocks(rmf) + + // Set DeletionTimestamp on the meta object + now := metav1.Now() + metaObj.SetDeletionTimestamp(&now) + + reconciler := r.(*resourceReconciler) + _, err := reconciler.reconcile(ctx, rm, desired) + + // Should return the verification error + require.NotNil(err) + assert.Equal(t, verifyError, err) + + rm.AssertCalled(t, "ReadOne", ctx, desired) + rm.AssertCalled(t, "Delete", ctx, desired) + rm.AssertNumberOfCalls(t, "ReadOne", 2) + // MarkUnmanaged should NOT be called since verification failed + rd.AssertNotCalled(t, "MarkUnmanaged", desired) +}