-
Notifications
You must be signed in to change notification settings - Fork 45
improve sandbox access resilience through ServiceFQDN #245
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -200,21 +200,22 @@ func (s *Server) createSandbox(ctx context.Context, dynamicClient dynamic.Interf | |
| sandboxRollbackFunc() | ||
| }() | ||
|
|
||
| // agent-sandbox create pod with same name as sandbox if no warmpool is used | ||
| // so here we try to get pod IP by sandbox name first | ||
| // if warmpool is used, the pod name is stored in sandbox's annotation `agents.x-k8s.io/sandbox-pod-name` | ||
| // https://github.com/kubernetes-sigs/agent-sandbox/blob/3ab7fbcd85ad0d75c6e632ecd14bcaeda5e76e1e/controllers/sandbox_controller.go#L465 | ||
| sandboxPodName := sandbox.Name | ||
| if podName, exists := createdSandbox.Annotations[controllers.SandboxPodNameAnnotation]; exists { | ||
| sandboxPodName = podName | ||
| } | ||
|
|
||
| podIP, err := s.k8sClient.GetSandboxPodIP(ctx, sandbox.Namespace, sandbox.Name, sandboxPodName) | ||
| if err != nil { | ||
| return nil, fmt.Errorf("failed to get sandbox %s/%s pod IP: %v", sandbox.Namespace, sandbox.Name, err) | ||
| // Prefer ServiceFQDN: K8s Endpoints controller keeps DNS in sync when pod is evicted/recreated. | ||
| // Fallback to pod IP for agent-sandbox without ServiceFQDN. | ||
| host := createdSandbox.Status.ServiceFQDN | ||
| if host == "" { | ||
| sandboxPodName := sandbox.Name | ||
| if podName, exists := createdSandbox.Annotations[controllers.SandboxPodNameAnnotation]; exists { | ||
| sandboxPodName = podName | ||
| } | ||
| var err error | ||
| host, err = s.k8sClient.GetSandboxPodIP(ctx, sandbox.Namespace, sandbox.Name, sandboxPodName) | ||
| if err != nil { | ||
| return nil, fmt.Errorf("failed to get sandbox %s/%s pod IP: %v", sandbox.Namespace, sandbox.Name, err) | ||
| } | ||
|
Comment on lines
+212
to
+215
|
||
| } | ||
|
|
||
| storeCacheInfo := buildSandboxInfo(createdSandbox, podIP, sandboxEntry) | ||
| storeCacheInfo := buildSandboxInfo(createdSandbox, host, sandboxEntry) | ||
|
|
||
| response := &types.CreateSandboxResponse{ | ||
| SessionID: sandboxEntry.SessionID, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -37,7 +37,8 @@ func buildSandboxPlaceHolder(sandboxCR *sandboxv1alpha1.Sandbox, entry *sandboxE | |
| } | ||
| } | ||
|
|
||
| func buildSandboxInfo(sandbox *sandboxv1alpha1.Sandbox, podIP string, entry *sandboxEntry) *types.SandboxInfo { | ||
| // buildSandboxInfo builds SandboxInfo. host is ServiceFQDN (preferred, resilient to pod restart) or pod IP. | ||
| func buildSandboxInfo(sandbox *sandboxv1alpha1.Sandbox, host string, entry *sandboxEntry) *types.SandboxInfo { | ||
| createdAt := sandbox.GetCreationTimestamp().Time | ||
| expiresAt := createdAt.Add(DefaultSandboxTTL) | ||
| if sandbox.Spec.Lifecycle.ShutdownTime != nil { | ||
|
|
@@ -48,7 +49,7 @@ func buildSandboxInfo(sandbox *sandboxv1alpha1.Sandbox, podIP string, entry *san | |
| accesses = append(accesses, types.SandboxEntryPoint{ | ||
| Path: port.PathPrefix, | ||
| Protocol: string(port.Protocol), | ||
| Endpoint: net.JoinHostPort(podIP, strconv.Itoa(int(port.Port))), | ||
| Endpoint: net.JoinHostPort(host, strconv.Itoa(int(port.Port))), | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Changing
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. ditto, FQDN is a convenient way to maintain IPs, but it may not meet the needs of the community. |
||
| }) | ||
| } | ||
| return &types.SandboxInfo{ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -57,7 +57,11 @@ const ( | |
| // ownerKindSandboxWarmPool is the owner reference kind for SandboxWarmPool resources | ||
| ownerKindSandboxWarmPool = "SandboxWarmPool" | ||
|
|
||
| agentcubeNamespace = "agentcube" | ||
| // sessionIDLabelKey is the label key for session ID on Sandbox (matches workloadmanager.SessionIdLabelKey) | ||
| sessionIDLabelKey = "runtime.agentcube.io/session-id" | ||
|
Comment on lines
+60
to
+61
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To improve maintainability and avoid potential drift, it's better to use a single source of truth for this label key. You've correctly noted in the comment that this matches Instead of duplicating the constant, consider moving For example, you could add it to const (
// ... other constants
SessionIDLabelKey = "runtime.agentcube.io/session-id"
)Then, you can import |
||
|
|
||
| agentcubeNamespace = "agentcube" | ||
| e2eCodeInterpreterName = "e2e-code-interpreter" | ||
| ) | ||
|
|
||
| var ( | ||
|
|
@@ -792,7 +796,7 @@ func TestCodeInterpreterBasicInvocation(t *testing.T) { | |
| env := newTestEnv(t) | ||
|
|
||
| namespace := agentcubeNamespace | ||
| name := "e2e-code-interpreter" | ||
| name := e2eCodeInterpreterName | ||
|
|
||
| testCases := []struct { | ||
| name string | ||
|
|
@@ -835,7 +839,7 @@ func TestCodeInterpreterFileOperations(t *testing.T) { | |
| env := newTestEnv(t) | ||
|
|
||
| namespace := agentcubeNamespace | ||
| name := "e2e-code-interpreter" | ||
| name := e2eCodeInterpreterName | ||
|
|
||
| // Create a session for file operations | ||
| sessionID, err := env.createCodeInterpreterSession(namespace, name) | ||
|
|
@@ -938,6 +942,69 @@ print("Fibonacci sequence generated") | |
| }) | ||
| } | ||
|
|
||
| // TestCodeInterpreterSandboxAccessAfterPodRestart verifies sandbox access resilience: | ||
| // when the pod is evicted/recreated, requests with the same session still succeed (ServiceFQDN). | ||
| func TestCodeInterpreterSandboxAccessAfterPodRestart(t *testing.T) { | ||
| env := newTestEnv(t) | ||
| ctx, err := newE2ETestContext() | ||
| require.NoError(t, err) | ||
|
|
||
| namespace := agentcubeNamespace | ||
| name := e2eCodeInterpreterName | ||
|
|
||
| sessionID, err := env.createCodeInterpreterSession(namespace, name) | ||
| require.NoError(t, err, "Failed to create code interpreter session") | ||
|
|
||
| t.Cleanup(func() { | ||
| _ = env.deleteCodeInterpreterSession(sessionID) | ||
| }) | ||
|
|
||
| // 1. Invoke once to verify sandbox is ready | ||
| req := &CodeInterpreterExecuteRequest{ | ||
| Command: []string{"echo", "before-pod-restart"}, | ||
| } | ||
| resp, err := env.invokeCodeInterpreter(namespace, name, sessionID, req) | ||
| require.NoError(t, err, "First invoke should succeed") | ||
| require.Contains(t, resp.Stdout, "before-pod-restart") | ||
|
|
||
| // 2. Find sandbox and its pod | ||
| sandbox, err := ctx.getSandboxBySessionID(namespace, sessionID) | ||
| require.NoError(t, err, "Should find sandbox by session ID") | ||
| require.NotNil(t, sandbox) | ||
|
|
||
| pod, err := ctx.getPodByOwner(namespace, "Sandbox", sandbox.Name) | ||
| require.NoError(t, err, "Should find pod owned by sandbox") | ||
| require.NotNil(t, pod) | ||
|
|
||
| t.Logf("Deleting pod %s/%s to simulate eviction", pod.Namespace, pod.Name) | ||
| err = ctx.kubeClient.CoreV1().Pods(namespace).Delete(context.Background(), pod.Name, metav1.DeleteOptions{}) | ||
| require.NoError(t, err, "Failed to delete pod") | ||
|
|
||
| // 3. Wait for new pod to be ready (agent-sandbox reconciles and recreates) | ||
| require.Eventually(t, func() bool { | ||
| newPod, err := ctx.getPodByOwner(namespace, "Sandbox", sandbox.Name) | ||
| if err != nil || newPod == nil { | ||
| return false | ||
| } | ||
| for _, c := range newPod.Status.Conditions { | ||
| if c.Type == corev1.PodReady && c.Status == corev1.ConditionTrue { | ||
| return true | ||
| } | ||
| } | ||
| return false | ||
| }, 3*time.Minute, 5*time.Second, "New pod should become ready after recreation") | ||
|
Comment on lines
+983
to
+995
|
||
|
|
||
| // 4. Invoke again with same session - should succeed via ServiceFQDN | ||
| reqAfter := &CodeInterpreterExecuteRequest{ | ||
| Command: []string{"echo", "after-pod-restart"}, | ||
| } | ||
| respAfter, err := env.invokeCodeInterpreter(namespace, name, sessionID, reqAfter) | ||
| require.NoError(t, err, "Invoke after pod restart should succeed (ServiceFQDN resilience)") | ||
| require.Contains(t, respAfter.Stdout, "after-pod-restart") | ||
|
|
||
| t.Logf("Sandbox access after pod restart verified successfully") | ||
| } | ||
|
|
||
| func (ctx *e2eTestContext) cleanupCodeInterpreter(t *testing.T, namespace, name, yamlPath string) { | ||
| t.Log("Cleaning up code interpreter resources...") | ||
| if err := ctx.deleteYamlFile(yamlPath); err != nil { | ||
|
|
@@ -1311,6 +1378,24 @@ func (ctx *e2eTestContext) getSandboxByOwner(namespace, ownerKind, ownerName str | |
| return found, nil | ||
| } | ||
|
|
||
| // getSandboxBySessionID finds exactly one Sandbox with the given session ID label | ||
| func (ctx *e2eTestContext) getSandboxBySessionID(namespace, sessionID string) (*sandboxv1alpha1.Sandbox, error) { | ||
| sandboxList := &sandboxv1alpha1.SandboxList{} | ||
| err := ctx.ctrlClient.List(context.Background(), sandboxList, | ||
| client.InNamespace(namespace), | ||
| client.MatchingLabels{sessionIDLabelKey: sessionID}) | ||
| if err != nil { | ||
| return nil, fmt.Errorf("failed to list sandboxes: %w", err) | ||
| } | ||
| if len(sandboxList.Items) == 0 { | ||
| return nil, fmt.Errorf("no sandbox found with session ID %s", sessionID) | ||
| } | ||
| if len(sandboxList.Items) > 1 { | ||
| return nil, fmt.Errorf("found multiple sandboxes with session ID %s", sessionID) | ||
| } | ||
| return &sandboxList.Items[0], nil | ||
| } | ||
|
|
||
| // getPodByOwner finds exactly one Pod owned by the specified owner | ||
| func (ctx *e2eTestContext) getPodByOwner(namespace, ownerKind, ownerName string) (*corev1.Pod, error) { | ||
| podList, err := ctx.kubeClient.CoreV1().Pods(namespace).List(context.Background(), metav1.ListOptions{}) | ||
|
|
@@ -1505,7 +1590,7 @@ func TestCodeInterpreterBasicInvocationLoad(t *testing.T) { | |
| env := newTestEnv(t) | ||
|
|
||
| namespace := agentcubeNamespace | ||
| name := "e2e-code-interpreter" | ||
| name := e2eCodeInterpreterName | ||
|
|
||
| // Load test configuration | ||
| const ( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
createdSandbox.Status.ServiceFQDN is preferred here, but upstream agent-sandbox currently hardcodes that field to .svc.cluster.local. If a cluster uses a different DNS domain, this may regress cases that previously worked via pod IP fallback.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Under non-default cluster domains, there is indeed a theoretical risk of regression. Because FQDN actually introduces many uncertainties, if this design is not accepted by the community, I will consider another solution for maintaining IPs. Do you think this solution aligns with the community's design, or is abandoning FQDN more reasonable?
anyway thanks for your review! :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Eventually the upstream should not hard code
cluster.local, by that assumption, should we accept using service fqdn?