Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pkg/condition/condition.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ var (
SyncedMessage = "Resource synced successfully"
FailedReferenceResolutionMessage = "Reference resolution failed"
UnavailableIAMRoleMessage = "IAM Role is not available"
DeletingMessage = "deletion in progress"
DeletingReason = "Deleting"

IAMRoleSelectedReason = "Selected"
IAMRoleSelectedMessage = "roleARN: %s, selectorName: %s, selectorResourceVersion: %s"
Expand Down
31 changes: 27 additions & 4 deletions pkg/runtime/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -1129,13 +1129,36 @@ func (r *resourceReconciler) deleteResource(
latest, _ = r.patchResourceMetadataAndSpec(ctx, rm, current, latest)
}
if err != nil {
// NOTE: Delete() implementations that have asynchronously-completing
// deletions should return a RequeueNeededAfter.
return latest, err
}

// Now that external AWS service resources have been appropriately cleaned
// up, we remove the finalizer representing the CR is managed by ACK,
// Verify the AWS resource is actually deleted before removing the
// finalizer. Some AWS APIs return success immediately while the resource
// transitions through a "deleting" state. We need to ensure the resource
// is fully gone (ReadOne returns NotFound) before allowing the CR to be
// deleted from Kubernetes.
Comment on lines +1138 to +1139
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be good enough for most resources, but there could be cases where the AWS resource is still returned by ReadOne, but with a state indicating that deletion was successful. One example that comes to mind is JobRuns from emr-containers. We likely need to added a check for a "deleted" state similar to our sycned:when check.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We'll also need to consider how this is rolled out since we'll need to have those checks in place before applying this to avoid these kinds of resources becoming stuck during deletion.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm aware of those edge cases and willing to make a code-gen patch to support that. I just wanted to get a feel on this change, and alignment that this is something we want to support in ACK.

rlog.Enter("rm.ReadOne (verify deletion)")
_, verifyErr := rm.ReadOne(ctx, current)
rlog.Exit("rm.ReadOne (verify deletion)", verifyErr)
if verifyErr == nil {
// Resource still exists, requeue to wait for deletion to complete
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On the second reconcile we may need to protect against sdkDelete returning an error if the AWS API returns a validation error when called on a deleting resource.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah we'll probably need a both a IsDeleted and IsDeleting

resForCondition := latest
if !ackcompare.IsNotNil(resForCondition) {
resForCondition = current
}
ackcondition.SetSynced(resForCondition, corev1.ConditionFalse, &condition.DeletingMessage, &condition.DeletingReason)
return resForCondition, requeue.NeededAfter(
errors.New(condition.DeletingMessage),
15*time.Second,
)
}
if verifyErr != ackerr.NotFound {
// Some other error occurred during verification, return it
return latest, verifyErr
}

// Now that external AWS service resources have been confirmed deleted,
// we remove the finalizer representing the CR is managed by ACK,
// allowing the CR to be deleted by the Kubernetes API server
if ackcompare.IsNotNil(latest) {
err = r.setResourceUnmanaged(ctx, rm, latest)
Expand Down
169 changes: 169 additions & 0 deletions pkg/runtime/reconciler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1912,3 +1912,172 @@ func TestReconcile_AccountDrifted(t *testing.T) {
assert.Contains(t, err.Error(), "Resource already exists in account 111111111111")
assert.Contains(t, err.Error(), "but the role used for reconciliation is in account 222222222222")
}

// TestReconcilerDelete_VerifyDeletionNotFound tests that when a resource is deleted
// and the verification ReadOne returns NotFound, the finalizer is removed.
func TestReconcilerDelete_VerifyDeletionNotFound(t *testing.T) {
require := require.New(t)

ctx := context.TODO()
arn := ackv1alpha1.AWSResourceName("mybook-arn")

desired, desiredRTObj, metaObj := resourceMocks()
// Set deletion policy annotation to avoid namespace cache lookup
metaObj.SetAnnotations(map[string]string{
ackv1alpha1.AnnotationDeletionPolicy: string(ackv1alpha1.DeletionPolicyDelete),
})
desired.On("ReplaceConditions", []*ackv1alpha1.Condition{}).Return()
desired.On("IsBeingDeleted").Return(true)

ids := &ackmocks.AWSResourceIdentifiers{}
ids.On("ARN").Return(&arn)
desired.On("Identifiers").Return(ids)
desired.On("Conditions").Return([]*ackv1alpha1.Condition{})

// First ReadOne returns the resource (exists)
// Delete succeeds
// Second ReadOne (verification) returns NotFound
rm := &ackmocks.AWSResourceManager{}
rm.On("ResolveReferences", ctx, nil, desired).Return(desired, false, nil)
rm.On("ClearResolvedReferences", desired).Return(desired)
rm.On("ReadOne", ctx, desired).Return(desired, nil).Once()
rm.On("Delete", ctx, desired).Return(desired, nil)
rm.On("ReadOne", ctx, desired).Return(nil, ackerr.NotFound).Once()

rmf, rd := managedResourceManagerFactoryMocks(desired, desired)
rd.On("IsManaged", desired).Return(true)
rd.On("MarkUnmanaged", desired).Return()
rd.On("ResourceFromRuntimeObject", desiredRTObj).Return(desired)
rd.On("Delta", desired, desired).Return(ackcompare.NewDelta())

r, kc, _ := reconcilerMocks(rmf)
kc.On("Patch", withoutCancelContextMatcher, desiredRTObj, mock.AnythingOfType("*client.mergeFromPatch")).Return(nil)

// Set DeletionTimestamp on the meta object
now := metav1.Now()
metaObj.SetDeletionTimestamp(&now)

// Call reconcile (not Sync) since delete happens in reconcile
reconciler := r.(*resourceReconciler)
_, err := reconciler.reconcile(ctx, rm, desired)

require.Nil(err)
rm.AssertCalled(t, "ReadOne", ctx, desired)
rm.AssertCalled(t, "Delete", ctx, desired)
rm.AssertNumberOfCalls(t, "ReadOne", 2)
rd.AssertCalled(t, "MarkUnmanaged", desired)
}

// TestReconcilerDelete_VerifyDeletionStillExists tests that when a resource is deleted
// but the verification ReadOne returns success (resource still exists), it requeues.
func TestReconcilerDelete_VerifyDeletionStillExists(t *testing.T) {
require := require.New(t)

ctx := context.TODO()
arn := ackv1alpha1.AWSResourceName("mybook-arn")

desired, _, metaObj := resourceMocks()
// Set deletion policy annotation to avoid namespace cache lookup
metaObj.SetAnnotations(map[string]string{
ackv1alpha1.AnnotationDeletionPolicy: string(ackv1alpha1.DeletionPolicyDelete),
})
desired.On("ReplaceConditions", mock.AnythingOfType("[]*v1alpha1.Condition")).Return()
desired.On("IsBeingDeleted").Return(true)

ids := &ackmocks.AWSResourceIdentifiers{}
ids.On("ARN").Return(&arn)
desired.On("Identifiers").Return(ids)
desired.On("Conditions").Return([]*ackv1alpha1.Condition{})

// First ReadOne returns the resource (exists)
// Delete succeeds
// Second ReadOne (verification) also returns success (resource still exists, e.g., in "deleting" state)
rm := &ackmocks.AWSResourceManager{}
rm.On("ResolveReferences", ctx, nil, desired).Return(desired, false, nil)
rm.On("ClearResolvedReferences", desired).Return(desired)
rm.On("ReadOne", ctx, desired).Return(desired, nil).Once()
rm.On("Delete", ctx, desired).Return(desired, nil)
rm.On("ReadOne", ctx, desired).Return(desired, nil).Once()

rmf, rd := managedResourceManagerFactoryMocks(desired, desired)
rd.On("IsManaged", desired).Return(true)
rd.On("Delta", desired, desired).Return(ackcompare.NewDelta())

r, _, _ := reconcilerMocks(rmf)

// Set DeletionTimestamp on the meta object
now := metav1.Now()
metaObj.SetDeletionTimestamp(&now)

reconciler := r.(*resourceReconciler)
_, err := reconciler.reconcile(ctx, rm, desired)

// Should return a requeue error
require.NotNil(err)
var requeueErr *requeue.RequeueNeededAfter
require.True(errors.As(err, &requeueErr))
assert.Equal(t, 15*time.Second, requeueErr.Duration())

rm.AssertCalled(t, "ReadOne", ctx, desired)
rm.AssertCalled(t, "Delete", ctx, desired)
rm.AssertNumberOfCalls(t, "ReadOne", 2)
// MarkUnmanaged should NOT be called since resource still exists
rd.AssertNotCalled(t, "MarkUnmanaged", desired)
}

// TestReconcilerDelete_VerifyDeletionError tests that when a resource is deleted
// but the verification ReadOne returns an error (other than NotFound), it returns that error.
func TestReconcilerDelete_VerifyDeletionError(t *testing.T) {
require := require.New(t)

ctx := context.TODO()
arn := ackv1alpha1.AWSResourceName("mybook-arn")

desired, _, metaObj := resourceMocks()
// Set deletion policy annotation to avoid namespace cache lookup
metaObj.SetAnnotations(map[string]string{
ackv1alpha1.AnnotationDeletionPolicy: string(ackv1alpha1.DeletionPolicyDelete),
})
desired.On("ReplaceConditions", []*ackv1alpha1.Condition{}).Return()
desired.On("IsBeingDeleted").Return(true)

ids := &ackmocks.AWSResourceIdentifiers{}
ids.On("ARN").Return(&arn)
desired.On("Identifiers").Return(ids)
desired.On("Conditions").Return([]*ackv1alpha1.Condition{})

verifyError := errors.New("AWS API error during verification")

// First ReadOne returns the resource (exists)
// Delete succeeds
// Second ReadOne (verification) returns an error
rm := &ackmocks.AWSResourceManager{}
rm.On("ResolveReferences", ctx, nil, desired).Return(desired, false, nil)
rm.On("ClearResolvedReferences", desired).Return(desired)
rm.On("ReadOne", ctx, desired).Return(desired, nil).Once()
rm.On("Delete", ctx, desired).Return(desired, nil)
rm.On("ReadOne", ctx, desired).Return(nil, verifyError).Once()

rmf, rd := managedResourceManagerFactoryMocks(desired, desired)
rd.On("IsManaged", desired).Return(true)
rd.On("Delta", desired, desired).Return(ackcompare.NewDelta())

r, _, _ := reconcilerMocks(rmf)

// Set DeletionTimestamp on the meta object
now := metav1.Now()
metaObj.SetDeletionTimestamp(&now)

reconciler := r.(*resourceReconciler)
_, err := reconciler.reconcile(ctx, rm, desired)

// Should return the verification error
require.NotNil(err)
assert.Equal(t, verifyError, err)

rm.AssertCalled(t, "ReadOne", ctx, desired)
rm.AssertCalled(t, "Delete", ctx, desired)
rm.AssertNumberOfCalls(t, "ReadOne", 2)
// MarkUnmanaged should NOT be called since verification failed
rd.AssertNotCalled(t, "MarkUnmanaged", desired)
}