fix(e2e): reduce E2E test flakiness #8480
base: main
Changes from all commits: 2482c81, 84d5792, 4f42248, 91203e3
```diff
@@ -74,7 +74,7 @@ func getClusterKubeClient(ctx context.Context, cluster *armcontainerservice.Mana
 	}, nil
 }
 
-func (k *Kubeclient) WaitUntilPodRunningWithRetry(ctx context.Context, namespace string, labelSelector string, fieldSelector string, maxRetries int) (*corev1.Pod, error) {
+func (k *Kubeclient) WaitUntilPodRunning(ctx context.Context, namespace string, labelSelector string, fieldSelector string) (*corev1.Pod, error) {
 	defer toolkit.LogStepCtxf(ctx, "waiting for pod %s %s in %q namespace", labelSelector, fieldSelector, namespace)()
 	var pod *corev1.Pod
```
```diff
@@ -101,22 +101,6 @@ func (k *Kubeclient) WaitUntilPodRunningWithRetry(ctx context.Context, namespace
 		}
 	}
 
-	// Check for FailedCreatePodSandBox events
-	events, err := k.Typed.CoreV1().Events(pod.Namespace).List(ctx, metav1.ListOptions{FieldSelector: "involvedObject.name=" + pod.Name})
-	if err == nil {
-		for _, event := range events.Items {
-			if event.Reason == "FailedCreatePodSandBox" {
-				maxRetries--
-				sandboxErr := fmt.Errorf("pod %s has FailedCreatePodSandBox event: %s", pod.Name, event.Message)
-				if maxRetries <= 0 {
-					return false, sandboxErr
-				}
-				k.Typed.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{GracePeriodSeconds: to.Ptr(int64(0))})
-				return false, nil // Keep polling
-			}
-		}
-	}
-
 	switch pod.Status.Phase {
 	case corev1.PodFailed:
 		logPodDebugInfo(ctx, k, pod)
```
```diff
@@ -142,10 +126,6 @@ func (k *Kubeclient) WaitUntilPodRunningWithRetry(ctx context.Context, namespace
 	return pod, err
 }
 
-func (k *Kubeclient) WaitUntilPodRunning(ctx context.Context, namespace string, labelSelector string, fieldSelector string) (*corev1.Pod, error) {
-	return k.WaitUntilPodRunningWithRetry(ctx, namespace, labelSelector, fieldSelector, 0)
-}
-
 func (k *Kubeclient) WaitUntilNodeReady(ctx context.Context, t testing.TB, vmssName string) string {
 	defer toolkit.LogStepf(t, "waiting for node %s to be ready", vmssName)()
 	var lastNode *corev1.Node
```
```diff
@@ -199,7 +179,7 @@ func (k *Kubeclient) GetPodNetworkDebugPodForNode(ctx context.Context, kubeNodeN
 	if kubeNodeName == "" {
 		return nil, fmt.Errorf("kubeNodeName must not be empty")
 	}
-	return k.WaitUntilPodRunningWithRetry(ctx, defaultNamespace, fmt.Sprintf("app=%s", podNetworkDebugAppLabel), "spec.nodeName="+kubeNodeName, 3)
+	return k.WaitUntilPodRunning(ctx, defaultNamespace, fmt.Sprintf("app=%s", podNetworkDebugAppLabel), "spec.nodeName="+kubeNodeName)
 }
 
 func logPodDebugInfo(ctx context.Context, kube *Kubeclient, pod *corev1.Pod) {
```

**Author (Contributor):** The retry didn't work. It re-created the pod with the same name, preserving the events.
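A minimal sketch of the failure mode the author describes, assuming the standard client-go API: the removed retry listed core/v1 events with a field selector on `involvedObject.name`, so a pod deleted and re-created under the same name still matched the previous incarnation's `FailedCreatePodSandBox` events. Scoping the lookup to the pod's UID (which core/v1 events accept as a field selector) would avoid that; `eventsForThisIncarnation` is a hypothetical helper, not part of this PR.

```go
package main

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// eventsForThisIncarnation lists events that reference this exact pod object.
// Filtering on involvedObject.uid (rather than involvedObject.name, as the
// removed retry did) means events emitted for an earlier, deleted pod with
// the same name no longer match.
func eventsForThisIncarnation(ctx context.Context, client kubernetes.Interface, pod *corev1.Pod) (*corev1.EventList, error) {
	return client.CoreV1().Events(pod.Namespace).List(ctx, metav1.ListOptions{
		FieldSelector: "involvedObject.uid=" + string(pod.UID),
	})
}
```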
```diff
@@ -10,7 +10,6 @@ import (
 	"github.com/Azure/agentbaker/e2e/config"
 	"github.com/Azure/agentbaker/e2e/toolkit"
 	"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
-	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
```
```diff
@@ -269,15 +268,27 @@ func getIPTablesRulesCompatibleWithEBPFHostRouting() (map[string][]string, []str
 }
 
+// validateWireServerBlocked checks that unprivileged pods cannot reach WireServer.
+//
+// The iptables FORWARD DROP rules blocking pod→WireServer traffic can be transiently
+// absent when kube-proxy or CNI flush/recreate iptables chains during node setup.
+// We resolve the debug pod once up front (outside the retry budget) so that pod
+// scheduling latency doesn't eat into the iptables-check timeout.
+// Wireserver must never be reachable from pods — any successful connection is a
+// security issue, not a transient condition to retry through.
```
**Collaborator:** Don't commit; ensure stability before removing the retry. Otherwise this will keep generating issues, since this test runs with every e2e.
```diff
+//
+// We accept two curl exit codes as evidence of a working block:
+//
+//	28 = operation timeout (FORWARD DROP — packets silently dropped)
+//	7 = couldn't connect (FORWARD REJECT — RST / ICMP unreachable)
+//
+// Any other exit code is suspicious and fails the test with full diagnostics:
+//
+//	0 = wireserver reachable (security regression)
+//	127 = curl missing from debug image (test would otherwise silently bypass)
+//	2/3 = invalid curl args
+//	6 = DNS resolution issue (wireserver IP is literal — should not happen)
+//
+// We do retry transient kube-apiserver exec hiccups, but never on the curl
+// result itself — a single observation of an unexpected exit code is enough
+// to fail loudly.
 func validateWireServerBlocked(ctx context.Context, s *Scenario) {
 	defer toolkit.LogStep(s.T, "validating wireserver is blocked from unprivileged pods")()
 
+	// Resolve the unprivileged debug pod once — this can take 25-30s on cold nodes.
+	// Using the parent context so it has the full scenario timeout, not the short poll timeout.
 	nonHostPod, err := s.Runtime.Cluster.Kube.GetPodNetworkDebugPodForNode(ctx, s.Runtime.VM.KubeName)
 	require.NoError(s.T, err, "failed to get non host debug pod for wireserver validation")
```
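For readers skimming the comment above, here is the exit-code taxonomy as a short Go sketch. `describeCurlExit` is hypothetical and only restates the table; the PR itself uses a plain `allowedExitCodes` map, shown in the next hunk.

```go
package main

// describeCurlExit restates the doc comment's taxonomy: 28 and 7 are the two
// curl exit codes that count as evidence of a working block; everything else
// must fail the test loudly.
func describeCurlExit(code string) (blocked bool, desc string) {
	switch code {
	case "28":
		return true, "operation timeout: packets silently dropped (FORWARD DROP)"
	case "7":
		return true, "couldn't connect: RST / ICMP unreachable (FORWARD REJECT)"
	case "0":
		return false, "wireserver reachable: security regression"
	case "127":
		return false, "curl missing from the debug image"
	case "6":
		return false, "DNS failure: unexpected, since the wireserver IP is a literal"
	case "2", "3":
		return false, "invalid curl arguments"
	default:
		return false, "unexpected exit code: fail with full diagnostics"
	}
}
```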
```diff
@@ -297,30 +308,37 @@ func validateWireServerBlocked(ctx context.Context, s *Scenario) {
 		},
 	}
 
+	allowedExitCodes := map[string]bool{"28": true, "7": true}
+
 	for _, check := range checks {
-		var lastResult *podExecResult
-		err := wait.PollUntilContextTimeout(ctx, 10*time.Second, 1*time.Minute, true, func(ctx context.Context) (bool, error) {
-			execResult, execErr := execOnUnprivilegedPod(ctx, s.Runtime.Cluster.Kube, nonHostPod.Namespace, nonHostPod.Name, check.cmd)
+		var execResult *podExecResult
+		pollErr := wait.PollUntilContextTimeout(ctx, 5*time.Second, 30*time.Second, true, func(ctx context.Context) (bool, error) {
+			r, execErr := execOnUnprivilegedPod(ctx, s.Runtime.Cluster.Kube, nonHostPod.Namespace, nonHostPod.Name, check.cmd)
 			if execErr != nil {
+				s.T.Logf("wireserver check %q: exec error (retrying): %v", check.desc, execErr)
 				return false, nil
 			}
-			lastResult = execResult
-			if lastResult.exitCode == "28" {
-				return true, nil
-			}
-			s.T.Logf("wireserver check %q: expected exit code 28, got %s (retrying)", check.desc, lastResult.exitCode)
-			return false, nil
+			execResult = r
+			return true, nil
 		})
-		if err != nil {
-			s.T.Logf("host IPTABLES: %s", execScriptOnVMForScenario(ctx, s, "sudo iptables -t filter -L FORWARD -v -n --line-numbers").String())
-			if lastResult == nil {
-				require.NoErrorf(s.T, err, "curl to %s did not complete before polling stopped", check.desc)
-			}
-			s.T.Logf("last curl result for %s: %s", check.desc, lastResult.String())
-			assert.Equal(s.T, "28", lastResult.exitCode, "curl to %s expected to fail with timeout, but it didn't after retries", check.desc)
-			s.T.FailNow()
-		}
+		require.NoErrorf(s.T, pollErr, "wireserver check %q: exec failed after retries", check.desc)
+
+		if allowedExitCodes[execResult.exitCode] {
+			continue
+		}
+
+		iptablesFwd := execScriptOnVMForScenario(ctx, s, "sudo iptables -t filter -L FORWARD -v -n --line-numbers").String()
+		iptablesKubeFwd := execScriptOnVMForScenario(ctx, s, "sudo iptables -t filter -L KUBE-FORWARD -v -n --line-numbers 2>/dev/null || echo 'chain not found'").String()
+		iptablesSave := execScriptOnVMForScenario(ctx, s, "sudo iptables-save -t filter 2>/dev/null | head -80").String()
+		conntrack := execScriptOnVMForScenario(ctx, s, "sudo conntrack -L -d 168.63.129.16 2>/dev/null || echo 'conntrack not available'").String()
+		s.T.Fatalf("wireserver check %q: unexpected curl exit code %q (want 28 timeout or 7 refused)\n"+
+			"stdout=%q, stderr=%q\n"+
+			"FORWARD chain:\n%s\n"+
+			"KUBE-FORWARD chain:\n%s\n"+
+			"iptables-save filter:\n%s\n"+
+			"conntrack:\n%s",
+			check.desc, execResult.exitCode, execResult.stdout, execResult.stderr,
+			iptablesFwd, iptablesKubeFwd, iptablesSave, conntrack)
 	}
 }
```
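The new loop separates two concerns the old code conflated: transient exec failures (retried) and the curl verdict (judged exactly once). A generic sketch of that shape, not part of the PR, assuming only `k8s.io/apimachinery/pkg/util/wait` and the poll intervals from the diff:

```go
package main

import (
	"context"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

// observeOnce polls until a single observation succeeds, retrying only
// transient errors from the observation itself. The caller then judges the
// returned value exactly once; a bad verdict is never retried away.
func observeOnce[T any](ctx context.Context, observe func(context.Context) (T, error)) (T, error) {
	var out T
	err := wait.PollUntilContextTimeout(ctx, 5*time.Second, 30*time.Second, true,
		func(ctx context.Context) (bool, error) {
			v, obsErr := observe(ctx)
			if obsErr != nil {
				return false, nil // transient (e.g. apiserver exec hiccup): keep polling
			}
			out = v
			return true, nil // got one observation: stop polling
		})
	return out, err
}
```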