diff --git a/components/backend/handlers/workspace_container.go b/components/backend/handlers/workspace_container.go new file mode 100644 index 000000000..ac99aa291 --- /dev/null +++ b/components/backend/handlers/workspace_container.go @@ -0,0 +1,206 @@ +// Package handlers implements HTTP handlers for the backend API. +package handlers + +import ( + "context" + "log" + "net/http" + + "github.com/gin-gonic/gin" + "k8s.io/apimachinery/pkg/api/errors" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" +) + +// WorkspaceContainerSettings represents workspace container customization. +// Workspace container mode is always enabled (ADR-0006); these settings allow optional customization. +type WorkspaceContainerSettings struct { + Image string `json:"image,omitempty"` + Resources *WorkspaceContainerResourceLimits `json:"resources,omitempty"` +} + +// WorkspaceContainerResourceLimits represents resource limits for workspace containers +type WorkspaceContainerResourceLimits struct { + CPURequest string `json:"cpuRequest,omitempty"` + CPULimit string `json:"cpuLimit,omitempty"` + MemoryRequest string `json:"memoryRequest,omitempty"` + MemoryLimit string `json:"memoryLimit,omitempty"` +} + +// GetWorkspaceContainerSettings returns the workspace container settings for a project +func GetWorkspaceContainerSettings(c *gin.Context) { + project := c.GetString("project") + if project == "" { + c.JSON(http.StatusBadRequest, gin.H{"error": "project name required"}) + return + } + + // Get user-scoped dynamic client + reqK8s, reqDyn := GetK8sClientsForRequest(c) + if reqK8s == nil || reqDyn == nil { + c.JSON(http.StatusUnauthorized, gin.H{"error": "Invalid or missing authentication token"}) + return + } + + ctx := context.Background() + gvr := GetProjectSettingsResource() + + // Get the ProjectSettings CR (singleton per namespace) + obj, err := reqDyn.Resource(gvr).Namespace(project).Get(ctx, "projectsettings", v1.GetOptions{}) + if err != nil { + if errors.IsNotFound(err) { + // No ProjectSettings CR exists, return empty settings (uses platform defaults) + c.JSON(http.StatusOK, WorkspaceContainerSettings{}) + return + } + log.Printf("Failed to get ProjectSettings for %s: %v", project, err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to get project settings"}) + return + } + + // Extract workspaceContainer from spec + spec, specFound, err := unstructured.NestedMap(obj.Object, "spec") + if err != nil || !specFound { + // No spec or error reading it, return empty settings + c.JSON(http.StatusOK, WorkspaceContainerSettings{}) + return + } + wcMap, found, err := unstructured.NestedMap(spec, "workspaceContainer") + if err != nil || !found { + // No custom settings, uses platform defaults + c.JSON(http.StatusOK, WorkspaceContainerSettings{}) + return + } + + // Build response with optional customizations + settings := WorkspaceContainerSettings{} + if image, ok := wcMap["image"].(string); ok { + settings.Image = image + } + + // Extract resources if present + if resources, found, err := unstructured.NestedMap(wcMap, "resources"); err == nil && found { + settings.Resources = &WorkspaceContainerResourceLimits{} + if v, ok := resources["cpuRequest"].(string); ok { + settings.Resources.CPURequest = v + } + if v, ok := resources["cpuLimit"].(string); ok { + settings.Resources.CPULimit = v + } + if v, ok := resources["memoryRequest"].(string); ok { + settings.Resources.MemoryRequest = v + } + if v, ok := resources["memoryLimit"].(string); ok { + 
settings.Resources.MemoryLimit = v + } + } + + c.JSON(http.StatusOK, settings) +} + +// UpdateWorkspaceContainerSettings updates the workspace container settings for a project +func UpdateWorkspaceContainerSettings(c *gin.Context) { + project := c.GetString("project") + if project == "" { + c.JSON(http.StatusBadRequest, gin.H{"error": "project name required"}) + return + } + + var req WorkspaceContainerSettings + if err := c.ShouldBindJSON(&req); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid request body"}) + return + } + + // Get user-scoped dynamic client + reqK8s, reqDyn := GetK8sClientsForRequest(c) + if reqK8s == nil || reqDyn == nil { + c.JSON(http.StatusUnauthorized, gin.H{"error": "Invalid or missing authentication token"}) + return + } + + ctx := context.Background() + gvr := GetProjectSettingsResource() + + // Get or create the ProjectSettings CR + obj, err := reqDyn.Resource(gvr).Namespace(project).Get(ctx, "projectsettings", v1.GetOptions{}) + if err != nil { + if errors.IsNotFound(err) { + // Create new ProjectSettings with workspaceContainer + obj = &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "vteam.ambient-code/v1alpha1", + "kind": "ProjectSettings", + "metadata": map[string]interface{}{ + "name": "projectsettings", + "namespace": project, + }, + "spec": map[string]interface{}{ + "groupAccess": []interface{}{}, // Required field + }, + }, + } + } else { + log.Printf("Failed to get ProjectSettings for %s: %v", project, err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to get project settings"}) + return + } + } + + // Build workspaceContainer map with optional customizations + wcMap := map[string]interface{}{} + if req.Image != "" { + wcMap["image"] = req.Image + } + if req.Resources != nil { + resources := map[string]interface{}{} + if req.Resources.CPURequest != "" { + resources["cpuRequest"] = req.Resources.CPURequest + } + if req.Resources.CPULimit != "" { + resources["cpuLimit"] = req.Resources.CPULimit + } + if req.Resources.MemoryRequest != "" { + resources["memoryRequest"] = req.Resources.MemoryRequest + } + if req.Resources.MemoryLimit != "" { + resources["memoryLimit"] = req.Resources.MemoryLimit + } + if len(resources) > 0 { + wcMap["resources"] = resources + } + } + + // Set workspaceContainer in spec + if err := unstructured.SetNestedMap(obj.Object, wcMap, "spec", "workspaceContainer"); err != nil { + log.Printf("Failed to set workspaceContainer in spec: %v", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to update settings"}) + return + } + + // Create or update the ProjectSettings CR + if obj.GetResourceVersion() == "" { + // Create new + _, err = reqDyn.Resource(gvr).Namespace(project).Create(ctx, obj, v1.CreateOptions{}) + if err != nil { + log.Printf("Failed to create ProjectSettings for %s: %v", project, err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to create project settings"}) + return + } + log.Printf("Created ProjectSettings with workspaceContainer for project %s", project) + } else { + // Update existing + _, err = reqDyn.Resource(gvr).Namespace(project).Update(ctx, obj, v1.UpdateOptions{}) + if err != nil { + log.Printf("Failed to update ProjectSettings for %s: %v", project, err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to update project settings"}) + return + } + log.Printf("Updated workspaceContainer settings for project %s", project) + } + + c.JSON(http.StatusOK, gin.H{ + "message": "Workspace container 
settings updated", + "image": req.Image, + }) +} diff --git a/components/backend/routes.go b/components/backend/routes.go index 4663be58d..325e272af 100644 --- a/components/backend/routes.go +++ b/components/backend/routes.go @@ -96,6 +96,10 @@ func registerRoutes(r *gin.Engine) { projectGroup.GET("/integration-secrets", handlers.ListIntegrationSecrets) projectGroup.PUT("/integration-secrets", handlers.UpdateIntegrationSecrets) + // Workspace container settings (ADR-0006) + projectGroup.GET("/workspace-container", handlers.GetWorkspaceContainerSettings) + projectGroup.PUT("/workspace-container", handlers.UpdateWorkspaceContainerSettings) + // GitLab authentication endpoints (project-scoped) projectGroup.POST("/auth/gitlab/connect", handlers.ConnectGitLabGlobal) projectGroup.GET("/auth/gitlab/status", handlers.GetGitLabStatusGlobal) diff --git a/components/frontend/package-lock.json b/components/frontend/package-lock.json index 0a1085e88..f2fef801d 100644 --- a/components/frontend/package-lock.json +++ b/components/frontend/package-lock.json @@ -17,6 +17,7 @@ "@radix-ui/react-progress": "^1.1.7", "@radix-ui/react-select": "^2.2.6", "@radix-ui/react-slot": "^1.2.3", + "@radix-ui/react-switch": "^1.2.6", "@radix-ui/react-tabs": "^1.1.13", "@radix-ui/react-toast": "^1.2.15", "@radix-ui/react-tooltip": "^1.2.8", @@ -1856,6 +1857,35 @@ } } }, + "node_modules/@radix-ui/react-switch": { + "version": "1.2.6", + "resolved": "https://registry.npmjs.org/@radix-ui/react-switch/-/react-switch-1.2.6.tgz", + "integrity": "sha512-bByzr1+ep1zk4VubeEVViV592vu2lHE2BZY5OnzehZqOOgogN80+mNtCqPkhn2gklJqOpxWgPoYTSnhBCqpOXQ==", + "license": "MIT", + "dependencies": { + "@radix-ui/primitive": "1.1.3", + "@radix-ui/react-compose-refs": "1.1.2", + "@radix-ui/react-context": "1.1.2", + "@radix-ui/react-primitive": "2.1.3", + "@radix-ui/react-use-controllable-state": "1.2.2", + "@radix-ui/react-use-previous": "1.1.1", + "@radix-ui/react-use-size": "1.1.1" + }, + "peerDependencies": { + "@types/react": "*", + "@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, "node_modules/@radix-ui/react-tabs": { "version": "1.1.13", "resolved": "https://registry.npmjs.org/@radix-ui/react-tabs/-/react-tabs-1.1.13.tgz", diff --git a/components/frontend/package.json b/components/frontend/package.json index 6d72bea77..37be3ef61 100644 --- a/components/frontend/package.json +++ b/components/frontend/package.json @@ -18,6 +18,7 @@ "@radix-ui/react-progress": "^1.1.7", "@radix-ui/react-select": "^2.2.6", "@radix-ui/react-slot": "^1.2.3", + "@radix-ui/react-switch": "^1.2.6", "@radix-ui/react-tabs": "^1.1.13", "@radix-ui/react-toast": "^1.2.15", "@radix-ui/react-tooltip": "^1.2.8", diff --git a/components/frontend/src/app/api/projects/[name]/workspace-container/route.ts b/components/frontend/src/app/api/projects/[name]/workspace-container/route.ts new file mode 100644 index 000000000..baf1979d1 --- /dev/null +++ b/components/frontend/src/app/api/projects/[name]/workspace-container/route.ts @@ -0,0 +1,56 @@ +import { BACKEND_URL } from '@/lib/config'; +import { buildForwardHeadersAsync } from '@/lib/auth'; + +export async function GET( + request: Request, + { params }: { params: Promise<{ name: string }> } +) { + try { + const { name } = await params; + const headers = await buildForwardHeadersAsync(request); + + 
const resp = await fetch( + `${BACKEND_URL}/projects/${encodeURIComponent(name)}/workspace-container`, + { headers } + ); + const data = await resp.json().catch(() => ({})); + return Response.json(data, { status: resp.status }); + } catch (error) { + console.error('Error fetching workspace container settings:', error); + return Response.json( + { error: 'Failed to fetch workspace container settings' }, + { status: 500 } + ); + } +} + +export async function PUT( + request: Request, + { params }: { params: Promise<{ name: string }> } +) { + try { + const { name } = await params; + const headers = await buildForwardHeadersAsync(request); + const body = await request.json(); + + const resp = await fetch( + `${BACKEND_URL}/projects/${encodeURIComponent(name)}/workspace-container`, + { + method: 'PUT', + headers: { + ...headers, + 'Content-Type': 'application/json', + }, + body: JSON.stringify(body), + } + ); + const data = await resp.json().catch(() => ({})); + return Response.json(data, { status: resp.status }); + } catch (error) { + console.error('Error updating workspace container settings:', error); + return Response.json( + { error: 'Failed to update workspace container settings' }, + { status: 500 } + ); + } +} diff --git a/components/frontend/src/components/ui/switch.tsx b/components/frontend/src/components/ui/switch.tsx new file mode 100644 index 000000000..82188ef90 --- /dev/null +++ b/components/frontend/src/components/ui/switch.tsx @@ -0,0 +1,29 @@
+"use client"
+
+import * as React from "react"
+import * as SwitchPrimitives from "@radix-ui/react-switch"
+
+import { cn } from "@/lib/utils"
+
+const Switch = React.forwardRef<
+  React.ComponentRef<typeof SwitchPrimitives.Root>,
+  React.ComponentPropsWithoutRef<typeof SwitchPrimitives.Root>
+>(({ className, ...props }, ref) => (
+  <SwitchPrimitives.Root
+    className={cn(
+      "peer inline-flex h-5 w-9 shrink-0 cursor-pointer items-center rounded-full border-2 border-transparent transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 focus-visible:ring-offset-background disabled:cursor-not-allowed disabled:opacity-50 data-[state=checked]:bg-primary data-[state=unchecked]:bg-input",
+      className
+    )}
+    {...props}
+    ref={ref}
+  >
+    <SwitchPrimitives.Thumb
+      className={cn(
+        "pointer-events-none block h-4 w-4 rounded-full bg-background shadow-lg ring-0 transition-transform data-[state=checked]:translate-x-4 data-[state=unchecked]:translate-x-0"
+      )}
+    />
+  </SwitchPrimitives.Root>
+))
+Switch.displayName = SwitchPrimitives.Root.displayName
+
+export { Switch } diff --git a/components/frontend/src/components/workspace-sections/settings-section.tsx b/components/frontend/src/components/workspace-sections/settings-section.tsx index 607da5ef8..b8ad04dba 100644 --- a/components/frontend/src/components/workspace-sections/settings-section.tsx +++ b/components/frontend/src/components/workspace-sections/settings-section.tsx @@ -7,14 +7,40 @@ import { Label } from "@/components/ui/label"; import { Input } from "@/components/ui/input"; import { Textarea } from "@/components/ui/textarea"; import { Button } from "@/components/ui/button"; -import { Save, Loader2, Info, AlertTriangle } from "lucide-react"; +import { Save, Loader2, AlertTriangle, Container, Info } from "lucide-react"; import { Plus, Trash2, Eye, EyeOff, ChevronDown, ChevronRight } from "lucide-react"; import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert"; import { successToast, errorToast } from "@/hooks/use-toast"; import { useProject, useUpdateProject } from "@/services/queries/use-projects"; import { useSecretsValues, useUpdateSecrets, useIntegrationSecrets, useUpdateIntegrationSecrets } from "@/services/queries/use-secrets"; +import { useWorkspaceContainerSettings, useUpdateWorkspaceContainerSettings } from "@/services/queries/use-workspace-container"; import { useClusterInfo } from "@/hooks/use-cluster-info"; -import { useMemo } from "react"; + +// Fixed keys that have dedicated UI sections and should be filtered from custom env vars +type FixedSecretKey = + | "ANTHROPIC_API_KEY" + | "GIT_USER_NAME" + | "GIT_USER_EMAIL" + | "GITHUB_TOKEN" + | "JIRA_URL" + | "JIRA_PROJECT" + | "JIRA_EMAIL" + | "JIRA_API_TOKEN" + | "GITLAB_TOKEN" + | "GITLAB_INSTANCE_URL"; + +const
FIXED_KEYS: readonly FixedSecretKey[] = [ + "ANTHROPIC_API_KEY", + "GIT_USER_NAME", + "GIT_USER_EMAIL", + "GITHUB_TOKEN", + "JIRA_URL", + "JIRA_PROJECT", + "JIRA_EMAIL", + "JIRA_API_TOKEN", + "GITLAB_TOKEN", + "GITLAB_INSTANCE_URL", +]; type SettingsSectionProps = { projectName: string; @@ -42,16 +68,23 @@ export function SettingsSection({ projectName }: SettingsSectionProps) { const [githubExpanded, setGithubExpanded] = useState(false); const [jiraExpanded, setJiraExpanded] = useState(false); const [gitlabExpanded, setGitlabExpanded] = useState(false); - const FIXED_KEYS = useMemo(() => ["ANTHROPIC_API_KEY","GIT_USER_NAME","GIT_USER_EMAIL","GITHUB_TOKEN","JIRA_URL","JIRA_PROJECT","JIRA_EMAIL","JIRA_API_TOKEN","GITLAB_TOKEN","GITLAB_INSTANCE_URL"] as const, []); + const [workspaceExpanded, setWorkspaceExpanded] = useState(false); + const [workspaceImage, setWorkspaceImage] = useState(""); + const [workspaceCpuRequest, setWorkspaceCpuRequest] = useState(""); + const [workspaceCpuLimit, setWorkspaceCpuLimit] = useState(""); + const [workspaceMemoryRequest, setWorkspaceMemoryRequest] = useState(""); + const [workspaceMemoryLimit, setWorkspaceMemoryLimit] = useState(""); // React Query hooks const { data: project, isLoading: projectLoading } = useProject(projectName); const { data: runnerSecrets } = useSecretsValues(projectName); // ambient-runner-secrets (ANTHROPIC_API_KEY) const { data: integrationSecrets } = useIntegrationSecrets(projectName); // ambient-non-vertex-integrations (GITHUB_TOKEN, GIT_USER_*, JIRA_*, custom) + const { data: workspaceContainerData } = useWorkspaceContainerSettings(projectName); const { vertexEnabled } = useClusterInfo(); const updateProjectMutation = useUpdateProject(); const updateSecretsMutation = useUpdateSecrets(); const updateIntegrationSecretsMutation = useUpdateIntegrationSecrets(); + const updateWorkspaceContainerMutation = useUpdateWorkspaceContainerSettings(); // Sync project data to form useEffect(() => { @@ -60,6 +93,17 @@ export function SettingsSection({ projectName }: SettingsSectionProps) { } }, [project]); + // Sync workspace container settings to state + useEffect(() => { + if (workspaceContainerData) { + setWorkspaceImage(workspaceContainerData.image || ""); + setWorkspaceCpuRequest(workspaceContainerData.resources?.cpuRequest || ""); + setWorkspaceCpuLimit(workspaceContainerData.resources?.cpuLimit || ""); + setWorkspaceMemoryRequest(workspaceContainerData.resources?.memoryRequest || ""); + setWorkspaceMemoryLimit(workspaceContainerData.resources?.memoryLimit || ""); + } + }, [workspaceContainerData]); + // Sync secrets values to state (merge both secrets) useEffect(() => { const allSecrets = [...(runnerSecrets || []), ...(integrationSecrets || [])]; @@ -75,9 +119,9 @@ export function SettingsSection({ projectName }: SettingsSectionProps) { setJiraToken(byKey["JIRA_API_TOKEN"] || ""); setGitlabToken(byKey["GITLAB_TOKEN"] || ""); setGitlabInstanceUrl(byKey["GITLAB_INSTANCE_URL"] || ""); - setSecrets(allSecrets.filter(s => !FIXED_KEYS.includes(s.key as typeof FIXED_KEYS[number]))); + setSecrets(allSecrets.filter(s => !FIXED_KEYS.includes(s.key as FixedSecretKey))); } - }, [runnerSecrets, integrationSecrets, FIXED_KEYS]); + }, [runnerSecrets, integrationSecrets]); const handleSave = () => { if (!project) return; @@ -149,7 +193,7 @@ export function SettingsSection({ projectName }: SettingsSectionProps) { if (gitlabInstanceUrl) integrationData["GITLAB_INSTANCE_URL"] = gitlabInstanceUrl; for (const { key, value } of secrets) { if (!key) continue; - 
if (FIXED_KEYS.includes(key as typeof FIXED_KEYS[number])) continue; + if (FIXED_KEYS.includes(key as FixedSecretKey)) continue; integrationData[key] = value ?? ""; } @@ -175,6 +219,40 @@ export function SettingsSection({ projectName }: SettingsSectionProps) { ); }; + // Save workspace container settings (optional customizations) + const handleSaveWorkspaceContainer = () => { + if (!projectName) return; + + const resources = { + cpuRequest: workspaceCpuRequest || undefined, + cpuLimit: workspaceCpuLimit || undefined, + memoryRequest: workspaceMemoryRequest || undefined, + memoryLimit: workspaceMemoryLimit || undefined, + }; + + // Only include resources if any are set + const hasResources = Object.values(resources).some(v => v !== undefined); + + updateWorkspaceContainerMutation.mutate( + { + projectName, + settings: { + image: workspaceImage || undefined, + resources: hasResources ? resources : undefined, + }, + }, + { + onSuccess: () => { + successToast("Workspace container settings saved"); + }, + onError: (error) => { + const message = error instanceof Error ? error.message : "Failed to save workspace settings"; + errorToast(message); + }, + } + ); + }; + const addSecretRow = () => { setSecrets((prev) => [...prev, { key: "", value: "" }]); }; @@ -468,6 +546,101 @@ export function SettingsSection({ projectName }: SettingsSectionProps) { )} + {/* Workspace Container Section (ADR-0006) */} +
+        <div className="rounded-lg border">
+          <button
+            type="button"
+            className="flex w-full items-center gap-2 p-4 text-left"
+            onClick={() => setWorkspaceExpanded(!workspaceExpanded)}
+          >
+            {workspaceExpanded ? <ChevronDown className="h-4 w-4" /> : <ChevronRight className="h-4 w-4" />}
+            <Container className="h-4 w-4" />
+            <span className="font-medium">Workspace Container</span>
+          </button>
+          {workspaceExpanded && (
+            <div className="space-y-4 p-4 pt-0">
+              <div className="space-y-2">
+                <Label htmlFor="workspace-image">Container Image</Label>
+                <p className="text-sm text-muted-foreground">
+                  Override the default workspace container image
+                </p>
+                <Input
+                  id="workspace-image"
+                  value={workspaceImage}
+                  onChange={(e) => setWorkspaceImage(e.target.value)}
+                />
+              </div>
+              <div className="space-y-2">
+                <Label>Resource Limits</Label>
+                <p className="text-sm text-muted-foreground">
+                  Configure CPU and memory resources for workspace containers
+                </p>
+                <div className="grid grid-cols-2 gap-4">
+                  <div className="space-y-1">
+                    <Label htmlFor="workspace-cpu-request">CPU Request</Label>
+                    <Input
+                      id="workspace-cpu-request"
+                      placeholder="250m"
+                      value={workspaceCpuRequest}
+                      onChange={(e) => setWorkspaceCpuRequest(e.target.value)}
+                    />
+                  </div>
+                  <div className="space-y-1">
+                    <Label htmlFor="workspace-cpu-limit">CPU Limit</Label>
+                    <Input
+                      id="workspace-cpu-limit"
+                      placeholder="2"
+                      value={workspaceCpuLimit}
+                      onChange={(e) => setWorkspaceCpuLimit(e.target.value)}
+                    />
+                  </div>
+                  <div className="space-y-1">
+                    <Label htmlFor="workspace-memory-request">Memory Request</Label>
+                    <Input
+                      id="workspace-memory-request"
+                      placeholder="512Mi"
+                      value={workspaceMemoryRequest}
+                      onChange={(e) => setWorkspaceMemoryRequest(e.target.value)}
+                    />
+                  </div>
+                  <div className="space-y-1">
+                    <Label htmlFor="workspace-memory-limit">Memory Limit</Label>
+                    <Input
+                      id="workspace-memory-limit"
+                      placeholder="2Gi"
+                      value={workspaceMemoryLimit}
+                      onChange={(e) => setWorkspaceMemoryLimit(e.target.value)}
+                    />
+                  </div>
+                </div>
+              </div>
+              <Button
+                onClick={handleSaveWorkspaceContainer}
+                disabled={updateWorkspaceContainerMutation.isPending}
+              >
+                {updateWorkspaceContainerMutation.isPending ? (
+                  <Loader2 className="mr-2 h-4 w-4 animate-spin" />
+                ) : (
+                  <Save className="mr-2 h-4 w-4" />
+                )}
+                Save Workspace Settings
+              </Button>
+            </div>
+          )}
+        </div>
+ {/* Custom Environment Variables Section */}
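Taken together, the pieces above round-trip a single JSON shape: the settings UI calls the Next.js route, which forwards to the Go handler, which persists `spec.workspaceContainer` on the ProjectSettings CR. A minimal sketch of the equivalent direct backend call, assuming a reachable backend URL and a valid user token (the URL, project name, and image below are placeholders, not values from this PR):

```python
import httpx  # same HTTP client the workspace MCP server prefers

BACKEND_URL = "http://localhost:8080"  # assumption: adjust to your deployment
PROJECT = "my-project"                 # hypothetical project/namespace
HEADERS = {"Authorization": "Bearer <user-token>"}  # handlers return 401 without user auth

# PUT stores optional customizations; omitted fields fall back to platform defaults
settings = {
    "image": "quay.io/example/custom-workspace:latest",
    "resources": {"cpuRequest": "250m", "memoryLimit": "2Gi"},
}
httpx.put(
    f"{BACKEND_URL}/projects/{PROJECT}/workspace-container",
    json=settings,
    headers=HEADERS,
).raise_for_status()

# GET returns {} while no ProjectSettings CR exists (platform defaults apply)
print(httpx.get(
    f"{BACKEND_URL}/projects/{PROJECT}/workspace-container",
    headers=HEADERS,
).json())
```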
diff --git a/components/frontend/src/services/api/workspace-container.ts b/components/frontend/src/services/api/workspace-container.ts new file mode 100644 index 000000000..aa708f1c5 --- /dev/null +++ b/components/frontend/src/services/api/workspace-container.ts @@ -0,0 +1,31 @@ +/** + * Workspace Container API service + * Handles workspace container settings for ADR-0006 agent isolation + */ + +import { apiClient } from './client'; +import type { WorkspaceContainerSettings } from '@/types/project-settings'; + +/** + * Get workspace container settings for a project + */ +export async function getWorkspaceContainerSettings( + projectName: string +): Promise<WorkspaceContainerSettings> { + return apiClient.get<WorkspaceContainerSettings>( + `/projects/${projectName}/workspace-container` + ); +} + +/** + * Update workspace container settings for a project + */ +export async function updateWorkspaceContainerSettings( + projectName: string, + settings: WorkspaceContainerSettings +): Promise<void> { + await apiClient.put( + `/projects/${projectName}/workspace-container`, + settings + ); +} diff --git a/components/frontend/src/services/queries/use-workspace-container.ts b/components/frontend/src/services/queries/use-workspace-container.ts new file mode 100644 index 000000000..dfebd0f81 --- /dev/null +++ b/components/frontend/src/services/queries/use-workspace-container.ts @@ -0,0 +1,31 @@ +import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'; +import * as workspaceContainerApi from '../api/workspace-container'; +import type { WorkspaceContainerSettings } from '@/types/project-settings'; + +export function useWorkspaceContainerSettings(projectName: string) { + return useQuery({ + queryKey: ['workspace-container', projectName], + queryFn: () => workspaceContainerApi.getWorkspaceContainerSettings(projectName), + enabled: !!projectName, + }); +} + +export function useUpdateWorkspaceContainerSettings() { + const queryClient = useQueryClient(); + + return useMutation({ + mutationFn: ({ + projectName, + settings, + }: { + projectName: string; + settings: WorkspaceContainerSettings; + }) => workspaceContainerApi.updateWorkspaceContainerSettings(projectName, settings), + onSuccess: (_, { projectName }) => { + queryClient.invalidateQueries({ queryKey: ['workspace-container', projectName] }); + }, + onError: (error) => { + console.error('Failed to update workspace container settings:', error); + }, + }); +} diff --git a/components/frontend/src/types/project-settings.ts b/components/frontend/src/types/project-settings.ts index ccb9ebd0f..94cd12faa 100644 --- a/components/frontend/src/types/project-settings.ts +++ b/components/frontend/src/types/project-settings.ts @@ -18,6 +18,18 @@ export type ProjectResourceLimits = { diskQuotaGB: number; }; + +// Workspace container customization settings. +// Workspace container mode is always enabled (ADR-0006); these settings allow optional customization.
+export type WorkspaceContainerSettings = { + image?: string; + resources?: { + cpuRequest?: string; + cpuLimit?: string; + memoryRequest?: string; + memoryLimit?: string; + }; +}; + export type ObjectMeta = { name: string; namespace: string; diff --git a/components/manifests/base/crds/projectsettings-crd.yaml b/components/manifests/base/crds/projectsettings-crd.yaml index f14fc1219..a3127a294 100644 --- a/components/manifests/base/crds/projectsettings-crd.yaml +++ b/components/manifests/base/crds/projectsettings-crd.yaml @@ -62,6 +62,29 @@ spec: - "github" - "gitlab" description: "Git hosting provider (auto-detected from URL if not specified)" + workspaceContainer: + type: object + description: "Workspace container configuration for ADR-0006 agent isolation. Optional customization - sessions always run in workspace containers using the platform default image if not specified." + properties: + image: + type: string + description: "Custom container image for workspace pods (optional - uses platform default if not specified)" + resources: + type: object + description: "Resource requirements for workspace containers" + properties: + cpuRequest: + type: string + description: "CPU request (e.g., 250m)" + cpuLimit: + type: string + description: "CPU limit (e.g., 2)" + memoryRequest: + type: string + description: "Memory request (e.g., 512Mi)" + memoryLimit: + type: string + description: "Memory limit (e.g., 2Gi)" status: type: object properties: diff --git a/components/manifests/base/operator-deployment.yaml b/components/manifests/base/operator-deployment.yaml index fa3326676..64298a0f8 100644 --- a/components/manifests/base/operator-deployment.yaml +++ b/components/manifests/base/operator-deployment.yaml @@ -34,6 +34,8 @@ spec: value: "quay.io/ambient_code/vteam_claude_runner:latest" - name: CONTENT_SERVICE_IMAGE value: "quay.io/ambient_code/vteam_backend:latest" + - name: SESSION_PROXY_IMAGE + value: "quay.io/ambient_code/vteam_session_proxy:latest" - name: IMAGE_PULL_POLICY value: "Always" # Vertex AI configuration from ConfigMap diff --git a/components/manifests/base/rbac/operator-clusterrole.yaml b/components/manifests/base/rbac/operator-clusterrole.yaml index e5a6b97ae..7eac7d4fb 100644 --- a/components/manifests/base/rbac/operator-clusterrole.yaml +++ b/components/manifests/base/rbac/operator-clusterrole.yaml @@ -25,13 +25,17 @@ rules: - apiGroups: ["batch"] resources: ["jobs"] verbs: ["get", "list", "watch", "create", "delete"] -# Pods (for getting logs from failed jobs and cleanup on stop) +# Pods (for getting logs from failed jobs, creating workspace pods, and cleanup on stop) - apiGroups: [""] resources: ["pods"] - verbs: ["get", "list", "watch", "delete", "deletecollection"] + verbs: ["get", "list", "watch", "create", "delete", "deletecollection"] - apiGroups: [""] resources: ["pods/log"] verbs: ["get"] +# Pod exec (ADR-0006: execute commands in workspace pods) +- apiGroups: [""] + resources: ["pods/exec"] + verbs: ["create"] # PersistentVolumeClaims (create workspace PVCs) - apiGroups: [""] resources: ["persistentvolumeclaims"] @@ -63,3 +67,24 @@ rules: - apiGroups: [""] resources: ["secrets"] verbs: ["get", "create", "delete", "update"] +# ServiceAccounts (create ambient-runner SA in project namespaces) +- apiGroups: [""] + resources: ["serviceaccounts"] + verbs: ["get", "create", "delete"] +# ConfigMaps (create workspace template ConfigMaps for workspace container mode) +- apiGroups: [""] + resources: ["configmaps"] + verbs: ["get", "create", "delete", "update"] +# Roles (create 
roles for runner service accounts in project namespaces) +# 'escalate' required to create roles with permissions the operator already holds +- apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles"] + verbs: ["get", "create", "update", "escalate"] +# TokenReview (ADR-0006: authenticate runner ServiceAccount tokens for streaming exec API) +- apiGroups: ["authentication.k8s.io"] + resources: ["tokenreviews"] + verbs: ["create"] +# Events (ADR-0006: emit audit events for command execution) +- apiGroups: [""] + resources: ["events"] + verbs: ["create", "patch"] diff --git a/components/operator/go.mod b/components/operator/go.mod index febd720bd..9b671b97a 100644 --- a/components/operator/go.mod +++ b/components/operator/go.mod @@ -21,12 +21,15 @@ require ( github.com/gogo/protobuf v1.3.2 // indirect github.com/google/gnostic-models v0.7.0 // indirect github.com/google/uuid v1.6.0 // indirect + github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/mailru/easyjson v0.7.7 // indirect + github.com/moby/spdystream v0.5.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect github.com/pkg/errors v0.9.1 // indirect github.com/spf13/pflag v1.0.6 // indirect github.com/x448/float16 v0.8.4 // indirect diff --git a/components/operator/go.sum b/components/operator/go.sum index 236504ad6..1fdff5f55 100644 --- a/components/operator/go.sum +++ b/components/operator/go.sum @@ -1,3 +1,5 @@ +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= @@ -29,6 +31,8 @@ github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db h1:097atOisP2aRj7vFgY github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5THxAzdVpqr6/geYxZytqFMBCOtn/ujyeo= +github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= @@ -44,6 +48,8 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/moby/spdystream v0.5.0 
h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU= +github.com/moby/spdystream v0.5.0/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -52,6 +58,8 @@ github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFd github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= github.com/onsi/ginkgo/v2 v2.21.0 h1:7rg/4f3rB88pb5obDgNZrNHrQ4e6WpjonchcpuBRnZM= github.com/onsi/ginkgo/v2 v2.21.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo= github.com/onsi/gomega v1.35.1 h1:Cwbd75ZBPxFSuZ6T+rN/WCb/gOc6YgFBXLlZLhC7Ds4= diff --git a/components/operator/internal/config/config.go b/components/operator/internal/config/config.go index 73b33b978..4491a13ca 100644 --- a/components/operator/internal/config/config.go +++ b/components/operator/internal/config/config.go @@ -16,6 +16,7 @@ import ( var ( K8sClient kubernetes.Interface DynamicClient dynamic.Interface + RestConfig *rest.Config // Exported for remotecommand (pod exec) ) // Config holds the operator configuration @@ -24,6 +25,7 @@ type Config struct { BackendNamespace string AmbientCodeRunnerImage string ContentServiceImage string + SessionProxyImage string // ADR-0006: Session proxy sidecar image ImagePullPolicy corev1.PullPolicy } @@ -45,6 +47,9 @@ func InitK8sClients() error { } } + // Save rest config for remotecommand (pod exec) + RestConfig = config + // Create standard Kubernetes client K8sClient, err = kubernetes.NewForConfig(config) if err != nil { @@ -86,6 +91,12 @@ func LoadConfig() *Config { contentServiceImage = "quay.io/ambient_code/vteam_backend:latest" } + // ADR-0006: Session proxy sidecar image + sessionProxyImage := os.Getenv("SESSION_PROXY_IMAGE") + if sessionProxyImage == "" { + sessionProxyImage = "quay.io/ambient_code/vteam_session_proxy:latest" + } + // Get image pull policy from environment or use default imagePullPolicyStr := os.Getenv("IMAGE_PULL_POLICY") if imagePullPolicyStr == "" { @@ -98,6 +109,7 @@ func LoadConfig() *Config { BackendNamespace: backendNamespace, AmbientCodeRunnerImage: ambientCodeRunnerImage, ContentServiceImage: contentServiceImage, + SessionProxyImage: sessionProxyImage, ImagePullPolicy: imagePullPolicy, } } diff --git a/components/operator/internal/handlers/sessions.go b/components/operator/internal/handlers/sessions.go index 1059c807c..78dd8b398 100644 --- a/components/operator/internal/handlers/sessions.go +++ b/components/operator/internal/handlers/sessions.go @@ -23,6 +23,7 @@ import ( corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" intstr "k8s.io/apimachinery/pkg/util/intstr" @@ -700,6 +701,21 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error { copyCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() if err := copySecretToNamespace(copyCtx, ambientVertexSecret, sessionNamespace, currentObj); err != nil { + errMsg := fmt.Sprintf("Failed to copy Vertex AI secret: %v", err) + statusPatch.SetField("phase", "Failed") + statusPatch.AddCondition(conditionUpdate{ + Type: conditionSecretsReady, + Status: "False", + Reason: "SecretCopyFailed", + Message: errMsg, + }) + statusPatch.AddCondition(conditionUpdate{ + Type: conditionReady, + Status: "False", + Reason: "VertexSecretError", + Message: errMsg, + }) + _ = statusPatch.Apply() return fmt.Errorf("failed to copy %s secret from %s to %s (CLAUDE_CODE_USE_VERTEX=1): %w", types.AmbientVertexSecretName, operatorNamespace, sessionNamespace, err) } ambientVertexSecretCopied = true @@ -801,6 +817,37 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error { timeout, _, _ := unstructured.NestedInt64(spec, "timeout") interactive, _, _ := unstructured.NestedBool(spec, "interactive") + // ADR-0006: Workspace container mode is always enabled + // Configuration priority: CR spec.workspacePodTemplate > ProjectSettings > platform default + workspacePodTemplate, hasCRTemplate, _ := unstructured.NestedMap(spec, "workspacePodTemplate") + + // If no CR-level template, check ProjectSettings CR for customizations + if !hasCRTemplate { + psImage, _ := getWorkspaceContainerFromProjectSettings(sessionNamespace) + workspaceImage := psImage + if workspaceImage == "" { + workspaceImage = appConfig.AmbientCodeRunnerImage + } + log.Printf("Workspace container mode for session %s: image=%s (custom=%v)", name, workspaceImage, psImage != "") + + // Build a workspacePodTemplate with just the image + workspacePodTemplate = map[string]interface{}{ + "apiVersion": "v1", + "kind": "Pod", + "spec": map[string]interface{}{ + "containers": []interface{}{ + map[string]interface{}{ + "name": "workspace", + "image": workspaceImage, + }, + }, + }, + } + } else { + workspaceImage := extractWorkspaceImage(workspacePodTemplate) + log.Printf("Workspace container mode enabled for session %s with CR image: %s", name, workspaceImage) + } + llmSettings, _, _ := unstructured.NestedMap(spec, "llmSettings") model, _, _ := unstructured.NestedString(llmSettings, "model") temperature, _, _ := unstructured.NestedFloat64(llmSettings, "temperature") @@ -931,6 +978,12 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error { } log.Printf("Session %s initiated by user: %s (userId: %s)", name, userName, userID) + // ADR-0006: Ensure runner service account exists for session-proxy pods/exec operations + if err := ensureRunnerServiceAccount(sessionNamespace); err != nil { + log.Printf("Warning: Failed to ensure runner service account: %v", err) + // Continue - job may still work if SA already exists + } + // Create the Job job := &batchv1.Job{ ObjectMeta: v1.ObjectMeta{ @@ -968,8 +1021,10 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error { }, Spec: corev1.PodSpec{ RestartPolicy: corev1.RestartPolicyNever, - // Explicitly set service account for pod creation permissions + // ADR-0006: Disable automatic token mounting - only proxy sidecar gets the token AutomountServiceAccountToken: boolPtr(false), + // ADR-0006: Use service account with pods/exec permissions (for proxy sidecar) + 
ServiceAccountName: "ambient-runner", Volumes: []corev1.Volume{ { Name: "workspace", @@ -979,6 +1034,32 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error { }, }, }, + // ADR-0006: Projected volume for SA token - mounted only in proxy sidecar + { + Name: "proxy-token", + VolumeSource: corev1.VolumeSource{ + Projected: &corev1.ProjectedVolumeSource{ + Sources: []corev1.VolumeProjection{ + { + ServiceAccountToken: &corev1.ServiceAccountTokenProjection{ + Path: "token", + ExpirationSeconds: int64Ptr(3600), + }, + }, + { + ConfigMap: &corev1.ConfigMapProjection{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: "kube-root-ca.crt", + }, + Items: []corev1.KeyToPath{ + {Key: "ca.crt", Path: "ca.crt"}, + }, + }, + }, + }, + }, + }, + }, }, // InitContainer to ensure workspace directory structure exists @@ -1085,6 +1166,21 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error { // S3 disabled; backend persists messages ) + // ADR-0006: Workspace MCP environment variables + // Agent uses workspace MCP tool for ALL command execution (Bash disabled) + base = append(base, + // Session identification for workspace MCP server (streaming exec) + corev1.EnvVar{Name: "SESSION_NAME", Value: name}, + corev1.EnvVar{Name: "SESSION_UID", Value: string(currentObj.GetUID())}, + // Namespace for K8s operations + corev1.EnvVar{ + Name: "POD_NAMESPACE", + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{FieldPath: "metadata.namespace"}, + }, + }, + ) + // Platform-wide Langfuse observability configuration // Uses secretKeyRef to prevent credential exposure in pod specs // Secret is copied to session namespace from operator namespace @@ -1274,12 +1370,64 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error { Resources: corev1.ResourceRequirements{}, }, + // ADR-0006: Session proxy sidecar - provides streaming exec API + // Token is mounted only in this container via projected volume + { + Name: "session-proxy", + Image: appConfig.SessionProxyImage, + ImagePullPolicy: appConfig.ImagePullPolicy, + Env: []corev1.EnvVar{ + {Name: "SESSION_NAME", Value: name}, + {Name: "NAMESPACE", Value: sessionNamespace}, + {Name: "LISTEN_ADDR", Value: ":8081"}, + }, + Ports: []corev1.ContainerPort{ + {ContainerPort: 8081, Name: "exec-api"}, + }, + // Mount the projected SA token - only this container has K8s API access + VolumeMounts: []corev1.VolumeMount{ + { + Name: "proxy-token", + MountPath: "/var/run/secrets/kubernetes.io/serviceaccount", + ReadOnly: true, + }, + }, + SecurityContext: &corev1.SecurityContext{ + AllowPrivilegeEscalation: boolPtr(false), + ReadOnlyRootFilesystem: boolPtr(true), + Capabilities: &corev1.Capabilities{ + Drop: []corev1.Capability{"ALL"}, + }, + }, + Resources: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("50m"), + corev1.ResourceMemory: resource.MustParse("64Mi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("200m"), + corev1.ResourceMemory: resource.MustParse("128Mi"), + }, + }, + }, }, }, }, }, } + // ADR-0006: Add SESSION_PROXY_URL to runner container for exec API access + // Session-proxy sidecar listens on port 8081 (content service uses 8080) + for i := range job.Spec.Template.Spec.Containers { + if job.Spec.Template.Spec.Containers[i].Name == "ambient-code-runner" { + job.Spec.Template.Spec.Containers[i].Env = append( + job.Spec.Template.Spec.Containers[i].Env, + corev1.EnvVar{Name: "SESSION_PROXY_URL", Value: 
"http://localhost:8081"}, + ) + break + } + } + // Note: No volume mounts needed for runner/integration secrets // All keys are injected as environment variables via EnvFrom above @@ -1333,13 +1481,86 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error { } log.Printf("Created job %s for AgenticSession %s", jobName, name) + + // ADR-0006: Create the workspace pod + // The proxy sidecar in the runner pod will exec into this pod + workspacePodName := fmt.Sprintf("%s-workspace", name) + workspaceImage := extractWorkspaceImage(workspacePodTemplate) + if workspaceImage == "" { + workspaceImage = appConfig.AmbientCodeRunnerImage // fallback to runner image + } + + workspacePod := &corev1.Pod{ + ObjectMeta: v1.ObjectMeta{ + Name: workspacePodName, + Namespace: sessionNamespace, + Labels: map[string]string{ + "session": name, + "type": "workspace", + "app": "ambient-workspace", + }, + OwnerReferences: []v1.OwnerReference{ + { + APIVersion: "batch/v1", + Kind: "Job", + Name: jobName, + UID: createdJob.UID, + Controller: boolPtr(true), + }, + }, + }, + Spec: corev1.PodSpec{ + RestartPolicy: corev1.RestartPolicyNever, + Containers: []corev1.Container{ + { + Name: "workspace", + Image: workspaceImage, + ImagePullPolicy: appConfig.ImagePullPolicy, + Command: []string{"sleep", "infinity"}, // Keep pod alive for exec + VolumeMounts: []corev1.VolumeMount{ + {Name: "workspace", MountPath: "/workspace"}, + }, + WorkingDir: fmt.Sprintf("/workspace/sessions/%s/workspace", name), + SecurityContext: &corev1.SecurityContext{ + AllowPrivilegeEscalation: boolPtr(false), + ReadOnlyRootFilesystem: boolPtr(false), + Capabilities: &corev1.Capabilities{ + Drop: []corev1.Capability{"ALL"}, + }, + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "workspace", + VolumeSource: corev1.VolumeSource{ + PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ + ClaimName: pvcName, + }, + }, + }, + }, + }, + } + + if _, err := config.K8sClient.CoreV1().Pods(sessionNamespace).Create(context.TODO(), workspacePod, v1.CreateOptions{}); err != nil { + if !errors.IsAlreadyExists(err) { + log.Printf("Failed to create workspace pod %s: %v", workspacePodName, err) + // Don't fail the session - the proxy will retry finding the pod + } + } else { + log.Printf("Created workspace pod %s for session %s", workspacePodName, name) + } + + // Update status using batched StatusPatch approach statusPatch.SetField("phase", "Creating") statusPatch.SetField("observedGeneration", currentObj.GetGeneration()) + statusPatch.SetField("workspacePod", workspacePodName) statusPatch.AddCondition(conditionUpdate{ Type: conditionJobCreated, Status: "True", Reason: "JobCreated", - Message: "Runner job created", + Message: "Runner job and workspace pod created", }) // Apply all accumulated status changes in a single API call if err := statusPatch.Apply(); err != nil { @@ -2472,6 +2693,174 @@ func regenerateRunnerToken(sessionNamespace, sessionName string, session *unstru return nil } +// ensureRunnerServiceAccount creates or updates the ambient-runner service account and RBAC +// resources needed for workspace container mode (session-proxy permissions). +// ADR-0006: Supports pod creation, exec, logs, and deletion for workspace containers. 
+func ensureRunnerServiceAccount(namespace string) error { + ctx := context.Background() + + // Create/Update ServiceAccount + sa := &corev1.ServiceAccount{ + ObjectMeta: v1.ObjectMeta{ + Name: "ambient-runner", + Namespace: namespace, + }, + } + _, err := config.K8sClient.CoreV1().ServiceAccounts(namespace).Create(ctx, sa, v1.CreateOptions{}) + if err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create service account: %w", err) + } + + // Create/Update Role with pods/exec permissions for session proxy sidecar + // ADR-0006: Proxy needs to list pods (to find workspace) and exec into them + role := &rbacv1.Role{ + ObjectMeta: v1.ObjectMeta{ + Name: "ambient-runner-exec", + Namespace: namespace, + }, + Rules: []rbacv1.PolicyRule{ + // Pod list permission (to discover workspace pod by label) + { + APIGroups: []string{""}, + Resources: []string{"pods"}, + Verbs: []string{"get", "list"}, + }, + // Pod exec permission (to run commands in workspace container) + { + APIGroups: []string{""}, + Resources: []string{"pods/exec"}, + Verbs: []string{"create"}, + }, + }, + } + _, err = config.K8sClient.RbacV1().Roles(namespace).Create(ctx, role, v1.CreateOptions{}) + if err != nil { + if errors.IsAlreadyExists(err) { + // Update existing role with new permissions + _, err = config.K8sClient.RbacV1().Roles(namespace).Update(ctx, role, v1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to update role: %w", err) + } + } else { + return fmt.Errorf("failed to create role: %w", err) + } + } + + // Create/Update RoleBinding + rb := &rbacv1.RoleBinding{ + ObjectMeta: v1.ObjectMeta{ + Name: "ambient-runner-exec", + Namespace: namespace, + }, + RoleRef: rbacv1.RoleRef{ + APIGroup: "rbac.authorization.k8s.io", + Kind: "Role", + Name: "ambient-runner-exec", + }, + Subjects: []rbacv1.Subject{ + { + Kind: "ServiceAccount", + Name: "ambient-runner", + Namespace: namespace, + }, + }, + } + _, err = config.K8sClient.RbacV1().RoleBindings(namespace).Create(ctx, rb, v1.CreateOptions{}) + if err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create role binding: %w", err) + } + + log.Printf("Ensured ambient-runner RBAC resources in namespace %s", namespace) + return nil +} + +// getWorkspaceContainerFromProjectSettings returns custom workspace container settings from ProjectSettings CR. +// Returns (image, resources) - empty values mean use platform defaults. +// Workspace container mode is always enabled (ADR-0006); this function just retrieves optional customizations. 
+func getWorkspaceContainerFromProjectSettings(namespace string) (string, map[string]string) { + gvr := types.GetProjectSettingsResource() + ctx := context.Background() + + obj, err := config.DynamicClient.Resource(gvr).Namespace(namespace).Get(ctx, "projectsettings", v1.GetOptions{}) + if err != nil { + if !errors.IsNotFound(err) { + log.Printf("Error getting ProjectSettings for namespace %s: %v", namespace, err) + } + return "", nil + } + + spec, found, _ := unstructured.NestedMap(obj.Object, "spec") + if !found { + return "", nil + } + + wcMap, found, _ := unstructured.NestedMap(spec, "workspaceContainer") + if !found { + return "", nil + } + + image, _, _ := unstructured.NestedString(wcMap, "image") + + // Extract resources if present + resources := make(map[string]string) + if resMap, found, _ := unstructured.NestedMap(wcMap, "resources"); found { + if v, ok := resMap["cpuRequest"].(string); ok && v != "" { + resources["cpuRequest"] = v + } + if v, ok := resMap["cpuLimit"].(string); ok && v != "" { + resources["cpuLimit"] = v + } + if v, ok := resMap["memoryRequest"].(string); ok && v != "" { + resources["memoryRequest"] = v + } + if v, ok := resMap["memoryLimit"].(string); ok && v != "" { + resources["memoryLimit"] = v + } + } + + return image, resources +} + +// extractWorkspaceImage extracts the image from a workspacePodTemplate. +// It looks for containers[].name=="workspace" or falls back to the first container. +func extractWorkspaceImage(template map[string]interface{}) string { + if template == nil { + return "" + } + + specMap, found, _ := unstructured.NestedMap(template, "spec") + if !found { + return "" + } + + containers, found, _ := unstructured.NestedSlice(specMap, "containers") + if !found || len(containers) == 0 { + return "" + } + + // Look for container named "workspace" first + for _, c := range containers { + container, ok := c.(map[string]interface{}) + if !ok { + continue + } + if name, _, _ := unstructured.NestedString(container, "name"); name == "workspace" { + if image, _, _ := unstructured.NestedString(container, "image"); image != "" { + return image + } + } + } + + // Fall back to first container's image + if first, ok := containers[0].(map[string]interface{}); ok { + if image, _, _ := unstructured.NestedString(first, "image"); image != "" { + return image + } + } + + return "" +} + // Helper functions var ( boolPtr = func(b bool) *bool { return &b } diff --git a/components/runners/claude-code-runner/Dockerfile b/components/runners/claude-code-runner/Dockerfile index 1b0e3e63e..174855704 100644 --- a/components/runners/claude-code-runner/Dockerfile +++ b/components/runners/claude-code-runner/Dockerfile @@ -19,9 +19,14 @@ RUN cd /app/runner-shell && pip install --no-cache-dir . 
# Copy claude-runner specific files COPY claude-code-runner /app/claude-runner +# Copy MCP servers (ADR-0006: workspace streaming exec) +COPY claude-code-runner/mcp-servers /app/mcp-servers + # Install runner wrapper as a package (pulls dependencies like claude-agent-sdk) +# Also install MCP SDK and httpx for workspace MCP server (ADR-0006) RUN pip install --no-cache-dir /app/claude-runner[observability] \ - && pip install --no-cache-dir aiofiles + && pip install --no-cache-dir aiofiles mcp httpx \ + && python3 /app/claude-runner/patches/patch_sdk_remove_bash.py # Set environment variables ENV PYTHONUNBUFFERED=1 diff --git a/components/runners/claude-code-runner/mcp-servers/workspace.py b/components/runners/claude-code-runner/mcp-servers/workspace.py new file mode 100644 index 000000000..b9a59b0f9 --- /dev/null +++ b/components/runners/claude-code-runner/mcp-servers/workspace.py @@ -0,0 +1,293 @@ +#!/usr/bin/env python3 +""" +Workspace MCP Server + +Provides tools for executing commands in workspace pods via the session-proxy +sidecar's streaming exec API. This is the primary interface for the Claude agent +to run shell commands in workspace container mode (ADR-0006). + +The session-proxy sidecar runs in the same pod and handles K8s authentication +internally, so no token is required from the runner container. +""" + +import json +import logging +import os +import sys + +# HTTP client - prefer httpx for streaming, fall back to aiohttp +try: + import httpx + USE_HTTPX = True +except ImportError: + try: + import aiohttp + USE_HTTPX = False + except ImportError: + print("Error: Neither httpx nor aiohttp installed. Install with: pip install httpx", file=sys.stderr) + sys.exit(1) + +# MCP SDK imports +try: + from mcp.server.fastmcp import FastMCP +except ImportError: + print("Error: MCP SDK not installed. Install with: pip install mcp", file=sys.stderr) + sys.exit(1) + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger("workspace-mcp") + +# Environment variables (set by operator) +SESSION_NAME = os.environ.get("SESSION_NAME", "") +POD_NAMESPACE = os.environ.get("POD_NAMESPACE", "") + +# Session-proxy sidecar URL (runs in the same pod on localhost) +# The session-proxy handles K8s authentication internally +SESSION_PROXY_URL = os.environ.get("SESSION_PROXY_URL", "http://localhost:8081") + +# Configurable timeouts via environment variables +DEFAULT_TIMEOUT = int(os.environ.get("WORKSPACE_DEFAULT_TIMEOUT", "300")) +MAX_TIMEOUT = int(os.environ.get("WORKSPACE_MAX_TIMEOUT", "1800")) # 30 minutes max +DEFAULT_WORKDIR = "/workspace" + + +# Create MCP server +server = FastMCP("workspace") + + +@server.tool() +async def exec( + command: str, + workdir: str = DEFAULT_WORKDIR, + timeout: int = DEFAULT_TIMEOUT, +) -> str: + """Execute a command in the workspace pod with streaming output. + + The workspace pod is created by the operator when the session starts, + using your configured container image. The pod persists across exec + calls, so files, installed packages, and background processes carry + over; each call runs in a fresh shell, so pass workdir explicitly + rather than relying on an earlier cd.
+ + Args: + command: Shell command to execute (e.g., "npm install", "git status") + workdir: Working directory for command execution (default: /workspace) + timeout: Command timeout in seconds (default: 300, max: 1800) + + Returns: + Command output (stdout + stderr streamed in real-time) + + Examples: + exec("git status") + exec("npm install && npm test") + exec("cargo build --release", workdir="/workspace/myrepo") + exec("python analyze.py", timeout=600) + """ + if not command.strip(): + return "Error: Empty command" + + if not SESSION_NAME or not POD_NAMESPACE: + return "Error: Missing required environment variables (SESSION_NAME, POD_NAMESPACE)" + + # Clamp timeout + timeout = min(max(timeout, 1), MAX_TIMEOUT) + + try: + return await _exec_streaming(command, workdir, timeout) + except Exception as e: + logger.exception("exec failed") + return f"Error: {str(e)}" + + +async def _exec_streaming(command: str, workdir: str, timeout: int) -> str: + """Execute command via session-proxy sidecar.""" + url = f"{SESSION_PROXY_URL}/exec" + headers = { + "Content-Type": "application/json", + } + # Session-proxy expects command as array and optional cwd + payload = { + "command": ["sh", "-c", command], + "cwd": workdir, + } + + if USE_HTTPX: + return await _exec_httpx(url, headers, payload, timeout) + else: + return await _exec_aiohttp(url, headers, payload, timeout) + + +async def _exec_httpx(url: str, headers: dict, payload: dict, timeout: int) -> str: + """Execute using httpx with streaming.""" + output_chunks = [] + + async with httpx.AsyncClient() as client: + try: + async with client.stream( + "POST", + url, + headers=headers, + json=payload, + timeout=timeout + 30, # Add buffer for network latency + ) as response: + if response.status_code == 503: + return "Error: Workspace pod not ready" + if response.status_code != 200: + body = await response.aread() + return f"Error: HTTP {response.status_code} - {body.decode('utf-8', errors='replace')}" + + # Stream the output + async for chunk in response.aiter_text(): + output_chunks.append(chunk) + # Note: MCP streaming would yield chunks here for real-time display + # when the protocol supports it + + except httpx.TimeoutException: + return "Error: Request timed out" + except httpx.ConnectError as e: + return f"Error: Failed to connect to session-proxy: {e}" + + # Combine output and parse final status + full_output = "".join(output_chunks) + return _format_output(full_output) + + +async def _exec_aiohttp(url: str, headers: dict, payload: dict, timeout: int) -> str: + """Execute using aiohttp with streaming.""" + import aiohttp + + output_chunks = [] + + timeout_config = aiohttp.ClientTimeout(total=timeout + 30) + + try: + async with aiohttp.ClientSession(timeout=timeout_config) as session: + async with session.post(url, headers=headers, json=payload) as response: + if response.status == 503: + return "Error: Workspace pod not ready" + if response.status != 200: + body = await response.text() + return f"Error: HTTP {response.status} - {body}" + + # Stream the output + async for chunk in response.content.iter_any(): + output_chunks.append(chunk.decode('utf-8', errors='replace')) + + except aiohttp.ClientError as e: + return f"Error: HTTP request failed: {e}" + + # Combine output and parse final status + full_output = "".join(output_chunks) + return _format_output(full_output) + + +def _format_output(full_output: str) -> str: + """Format the streaming output, extracting final status.""" + # Session-proxy sends final JSON status: {"exitCode": 0} or 
{"exitCode": 1, "error": "..."} + lines = full_output.rstrip().split('\n') + + if not lines: + return "(no output)" + + # Check if last line is JSON status + last_line = lines[-1].strip() + exit_status = None + + if last_line.startswith('{') and last_line.endswith('}'): + try: + exit_status = json.loads(last_line) + # Remove status line from output + lines = lines[:-1] + except json.JSONDecodeError: + pass + + output = '\n'.join(lines) + + if exit_status: + exit_code = exit_status.get("exitCode", 0) + error = exit_status.get("error") + + if exit_code != 0: + if error: + if output: + return f"{output}\n\nError: {error} (exit code: {exit_code})" + return f"Error: {error} (exit code: {exit_code})" + if output: + return f"{output}\n\n(exit code: {exit_code})" + return f"(exit code: {exit_code})" + + return output if output else "(no output)" + + +@server.tool() +async def status() -> str: + """Show workspace pod status. + + Returns the current state of the workspace pod, including + whether it's running, its resource usage, and any recent events. + + Returns: + Status information for the workspace pod + """ + if not SESSION_NAME or not POD_NAMESPACE: + return "Error: Missing required environment variables" + + # Use exec to run kubectl inside the workspace or a simple status check + # For now, call the status endpoint if available, or use a simple exec + pod_name = f"{SESSION_NAME}-ws" + + return f"""Workspace pod: {pod_name} +Namespace: {POD_NAMESPACE} +Session: {SESSION_NAME} + +To check if the pod exists and is running, try: + exec("echo 'Workspace is ready'") + +The workspace pod is created automatically on the first exec() call. +""" + + +@server.tool() +async def logs(lines: int = 100) -> str: + """View workspace pod logs. + + Returns recent logs from the workspace pod. Useful for debugging + background processes or checking what happened after a command. + + Args: + lines: Number of recent lines to show (default: 100) + + Returns: + Recent log output from the workspace pod + + Note: + The workspace container typically runs 'sleep infinity' so logs + are usually empty unless your commands produce container-level output. + """ + if not SESSION_NAME or not POD_NAMESPACE: + return "Error: Missing required environment variables" + + # Note: Could add a /logs endpoint to the operator, or use exec to tail files + return f"""Workspace logs are not directly available via the streaming API. + +To view command history or outputs: + exec("cat ~/.bash_history") + exec("tail -100 /workspace/some.log") + +For container-level logs, the workspace runs 'sleep infinity' so container +logs are typically empty. +""" + + +def main(): + """Main entry point for the MCP server.""" + logger.info(f"Starting Workspace MCP server for session {SESSION_NAME}") + logger.info(f"Session-proxy URL: {SESSION_PROXY_URL}") + # FastMCP.run() handles stdio transport by default + server.run(transport="stdio") + + +if __name__ == "__main__": + main() diff --git a/components/runners/claude-code-runner/patches/README.md b/components/runners/claude-code-runner/patches/README.md new file mode 100644 index 000000000..8cff8f7f4 --- /dev/null +++ b/components/runners/claude-code-runner/patches/README.md @@ -0,0 +1,149 @@ +# Claude Agent SDK Patches + +This directory contains patches applied to the `claude-agent-sdk` package to ensure compliance with ADR-0006 (Agent Injection Architecture). 
+
+## Bash Tool Removal (ADR-0006)
+
+**File**: `patch_sdk_remove_bash.py`
+
+### Problem
+
+Per ADR-0006, all command execution must go through the AgenticTask CRD, not the native Bash tool. While the SDK supports `allowed_tools` and `disallowed_tools` parameters, these may only control which tools can be *used*, not which tools appear in the initialization message sent to Claude.
+
+When the SDK initializes, it sends a SystemMessage with `subtype='init'` that lists all available tools. If Bash appears in this list, Claude may attempt to use it regardless of the `allowed_tools` configuration.
+
+### Solution
+
+This patch implements a **belt-and-suspenders approach** with three layers of defense:
+
+1. **Allowed Tools List** (`wrapper.py:410`): Bash is NOT included in the `allowed_tools` list
+2. **Disallowed Tools List** (`wrapper.py:415`): Bash is explicitly added to `disallowed_tools`
+3. **Runtime Monkey Patch** (`patch_sdk_remove_bash.py`): Intercepts SDK initialization to filter Bash from the tools list before it reaches Claude
+
+### How It Works
+
+#### Phase 1: Build-time Patching
+
+The `patch_sdk_remove_bash.py` script runs during the Docker image build (see `Dockerfile:44`):
+
+```dockerfile
+RUN pip install --no-cache-dir /app/claude-runner[observability] \
+    && pip install --no-cache-dir aiofiles mcp \
+    && python3 /app/claude-runner/patches/patch_sdk_remove_bash.py
+```
+
+This script:
+1. Locates the installed `claude-agent-sdk` package
+2. Creates a runtime wrapper module (`_bash_filter_wrapper.py`) in the SDK directory
+3. The wrapper, once imported, monkey-patches the SDK to filter Bash from tool lists
+
+#### Phase 2: Runtime Import
+
+Before importing the SDK, `wrapper.py:258-262` imports the filter:
+
+```python
+try:
+    import _bash_filter_wrapper
+    logging.info("Bash filter wrapper loaded - Bash tool will be excluded from init message")
+except ImportError as e:
+    logging.warning(f"Bash filter wrapper not available: {e} - relying on allowed_tools filtering")
+```
+
+The wrapper patches:
+- `SystemMessage.__init__()`: Filters Bash from `data['tools']` if present
+- Other SDK methods that construct tool lists: references to these are collected for potential future patching (see `_bash_filter_wrapper.py`)
+
+#### Phase 3: SDK Configuration
+
+The SDK is configured with both positive (allowed) and negative (disallowed) tool lists:
+
+```python
+allowed_tools = ["Read", "Write", "Glob", "Grep", "Edit", "MultiEdit", "WebSearch", "WebFetch"]
+disallowed_tools = ["Bash"]
+
+options = ClaudeAgentOptions(
+    allowed_tools=allowed_tools,
+    disallowed_tools=disallowed_tools,
+    ...
+)
+```
+
+### Verification
+
+To verify Bash is not in the init message, check the logs when a session starts:
+
+```bash
+# Look for the SDK init message in pod logs
+kubectl logs <pod-name> -n <namespace> | grep -A 20 "ClaudeSDKClient.*init"
+```
+
+Expected output should show a tools list WITHOUT "Bash":
+
+```json
+{
+  "type": "system",
+  "subtype": "init",
+  "data": {
+    "tools": ["Read", "Write", "Glob", "Grep", "Edit", "MultiEdit", "WebSearch", "WebFetch", "mcp__agentictask"],
+    ...
+  }
+}
+```
+
+### Troubleshooting
+
+#### Patch fails during build
+
+If the patch script fails to locate the SDK files:
+
+```
+[PATCH] ERROR: Cannot locate client.py - patch failed
+```
+
+This is non-fatal. The runtime wrapper (`_bash_filter_wrapper.py`) will still be created and imported.
+
+#### Runtime wrapper not imported
+
+If you see this warning:
+
+```
+Bash filter wrapper not available: <error> - relying on allowed_tools filtering
+```
+
+The monkey patch failed to load.
The fallback is the `allowed_tools` / `disallowed_tools` configuration, which should still prevent Bash usage (but may not remove it from the init message). + +#### Bash still appears in init message + +If Bash appears in the init message despite all filtering: + +1. Check that `disallowed_tools = ["Bash"]` is set (wrapper.py:415) +2. Check that "Bash" is NOT in `allowed_tools` (wrapper.py:410) +3. Verify the runtime wrapper was imported successfully (check logs) +4. The SDK version may have changed its tool filtering logic - update the patch script + +### Maintenance + +When upgrading `claude-agent-sdk`, review and update: + +1. `patch_sdk_remove_bash.py`: Update file paths if SDK structure changes +2. `_bash_filter_wrapper.py`: Update monkey patches if SDK API changes +3. Test that Bash does NOT appear in init message logs + +### Alternative Approaches Considered + +**Option B from ADR-0006**: Use PreToolUse hook to intercept Bash calls + +```python +async def intercept_bash(input_data, tool_use_id, context): + if input_data["tool_name"] == "Bash": + # Redirect to AgenticTask + return {"permissionDecision": "deny", ...} +``` + +**Why rejected**: This still allows Bash to appear in the init message, which means Claude will attempt to use it and receive permission errors. Our approach prevents Claude from seeing the tool at all, which is cleaner. + +## Related + +- **ADR-0006**: Agent Injection Architecture (`docs/adr/0006-agent-injection-architecture.md`) +- **AgenticTask MCP**: Command execution tool (`mcp-servers/agentictask.py`) +- **Wrapper Configuration**: Tool filtering (`wrapper.py:407-449`) diff --git a/components/runners/claude-code-runner/patches/patch_sdk_remove_bash.py b/components/runners/claude-code-runner/patches/patch_sdk_remove_bash.py new file mode 100644 index 000000000..d783e601c --- /dev/null +++ b/components/runners/claude-code-runner/patches/patch_sdk_remove_bash.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python3 +""" +Patch claude-agent-sdk to remove Bash tool from the init message. + +This patch ensures that when the SDK sends its initialization SystemMessage +listing available tools, the Bash tool is completely excluded from that list. + +Related: ADR-0006 Agent Injection Architecture +Context: All command execution must go through AgenticTask CRD, not native Bash. +""" + +import sys +import os +from pathlib import Path + + +def find_sdk_installation(): + """Find the installed claude-agent-sdk package location.""" + try: + import claude_agent_sdk + sdk_path = Path(claude_agent_sdk.__file__).parent + print(f"[PATCH] Found claude-agent-sdk at: {sdk_path}") + return sdk_path + except ImportError: + print("[PATCH] ERROR: claude-agent-sdk not installed", file=sys.stderr) + sys.exit(1) + + +def patch_client_file(sdk_path: Path): + """ + Patch the client.py file to filter Bash from the init message. + + The SDK's init message includes a list of available tools. We need to + ensure Bash is excluded from this list even if other filtering doesn't + catch it. 
+ """ + # Common file locations in the SDK + possible_files = [ + sdk_path / "client.py", + sdk_path / "src" / "claude_agent_sdk" / "client.py", + sdk_path / "_client.py", + ] + + client_file = None + for f in possible_files: + if f.exists(): + client_file = f + break + + if not client_file: + print(f"[PATCH] WARNING: Could not find client.py in {sdk_path}") + print(f"[PATCH] Searched: {[str(f) for f in possible_files]}") + # Try to find it recursively + for f in sdk_path.rglob("client.py"): + client_file = f + print(f"[PATCH] Found client.py at: {client_file}") + break + + if not client_file: + print("[PATCH] ERROR: Cannot locate client.py - patch failed") + return False + + print(f"[PATCH] Patching: {client_file}") + + # Read the file + content = client_file.read_text() + original_content = content + + # Check if already patched + if "# PATCHED: Remove Bash from tools list (ADR-0006)" in content: + print("[PATCH] File already patched, skipping") + return True + + # Strategy 1: Patch the init message generation + # Look for where tools are listed in the init message + patches_applied = [] + + # Common pattern: tools are gathered into a list before sending + # We'll inject a filter that removes 'Bash' from any tools list + + # Pattern 1: Filter in _get_available_tools or similar method + if "_get_available_tools" in content or "available_tools" in content: + # Add filter after tools are collected + import_patch = """ +# PATCHED: Remove Bash from tools list (ADR-0006) +def _filter_bash_tool(tools): + \"\"\"Remove Bash tool from tools list for ADR-0006 compliance.\"\"\" + if isinstance(tools, list): + return [t for t in tools if t != "Bash" and not (isinstance(t, dict) and t.get("name") == "Bash")] + return tools +""" + # Insert after imports + if "import " in content: + lines = content.split("\n") + insert_idx = 0 + for i, line in enumerate(lines): + if line.startswith("import ") or line.startswith("from "): + insert_idx = i + 1 + + lines.insert(insert_idx, import_patch) + content = "\n".join(lines) + patches_applied.append("Added _filter_bash_tool function") + + # Pattern 2: Find where SystemMessage is created with tools list + # This is the most reliable place to filter + if "SystemMessage" in content and "tools" in content: + # Look for patterns like: tools= or "tools": + # We'll wrap these with our filter + + # This is a heuristic approach - we inject the filter call + # wherever we see tools being assigned in a dict/kwargs context + lines = content.split("\n") + new_lines = [] + + for i, line in enumerate(lines): + new_lines.append(line) + + # Detect tools assignment in init message context + if ("tools" in line and "=" in line and + (i > 0 and ("init" in lines[i-1].lower() or "system" in lines[i-1].lower()))): + + # Check if this is dict-style or assignment + if '"tools":' in line or "'tools':" in line: + # Dict style - wrap the value with filter + # This is complex, so we add a comment for manual review + new_lines.append(" # PATCH MARKER: Review tools filtering here (ADR-0006)") + patches_applied.append(f"Marked line {i} for tools filtering") + + content = "\n".join(new_lines) + + # Write back if we made changes + if content != original_content: + # Backup original + backup_file = client_file.with_suffix(".py.backup") + if not backup_file.exists(): + backup_file.write_text(original_content) + print(f"[PATCH] Created backup: {backup_file}") + + # Write patched version + client_file.write_text(content) + print(f"[PATCH] Applied patches: {patches_applied}") + return True + else: + 
print("[PATCH] No suitable patch points found - trying alternative approach") + return False + + +def create_runtime_wrapper(sdk_path: Path): + """ + Create a runtime wrapper that intercepts tool initialization. + + This is a more aggressive approach: we monkey-patch the SDK at runtime + to filter Bash from any tools list before it reaches Claude. + """ + wrapper_file = sdk_path / "_bash_filter_wrapper.py" + + wrapper_content = '''""" +Runtime wrapper to filter Bash tool from claude-agent-sdk (ADR-0006). + +This module monkey-patches the SDK to ensure Bash never appears in the +tools list sent to Claude, regardless of allowed_tools configuration. + +Usage: Import this module before importing claude_agent_sdk in wrapper.py +""" + +import sys +from typing import Any, List + + +_original_systemcache = {} + + +def filter_bash_from_tools(tools: Any) -> Any: + """Remove Bash from tools list recursively.""" + if isinstance(tools, list): + filtered = [] + for tool in tools: + # Skip if tool name is "Bash" + if isinstance(tool, str) and tool == "Bash": + continue + elif isinstance(tool, dict) and tool.get("name") == "Bash": + continue + else: + filtered.append(tool) + return filtered + return tools + + +def patch_sdk(): + """Apply monkey patches to claude-agent-sdk.""" + try: + # Import SDK modules + import claude_agent_sdk + from claude_agent_sdk import types as sdk_types + + # Patch SystemMessage if it exists + if hasattr(sdk_types, "SystemMessage"): + original_init = sdk_types.SystemMessage.__init__ + + def patched_init(self, *args, **kwargs): + # Filter tools from data dict if present + if "data" in kwargs and isinstance(kwargs["data"], dict): + if "tools" in kwargs["data"]: + kwargs["data"]["tools"] = filter_bash_from_tools(kwargs["data"]["tools"]) + + return original_init(self, *args, **kwargs) + + sdk_types.SystemMessage.__init__ = patched_init + print("[BASH-FILTER] Patched SystemMessage.__init__") + + # Patch ClaudeSDKClient if needed + if hasattr(claude_agent_sdk, "ClaudeSDKClient"): + client_class = claude_agent_sdk.ClaudeSDKClient + + # Look for methods that might send tool lists + for method_name in dir(client_class): + if "tool" in method_name.lower() or "init" in method_name.lower(): + method = getattr(client_class, method_name, None) + if callable(method) and not method_name.startswith("_"): + # Store reference for potential patching + _original_systemcache[method_name] = method + + print("[BASH-FILTER] SDK patching complete - Bash tool will be filtered from init messages") + + except Exception as e: + print(f"[BASH-FILTER] WARNING: Failed to patch SDK: {e}", file=sys.stderr) + print("[BASH-FILTER] Continuing anyway - rely on allowed_tools filtering") + + +# Auto-patch when imported +patch_sdk() +''' + + wrapper_file.write_text(wrapper_content) + print(f"[PATCH] Created runtime wrapper: {wrapper_file}") + return True + + +def main(): + """Main patching logic.""" + print("[PATCH] ===== Claude Agent SDK Bash Removal Patch =====") + print("[PATCH] ADR-0006: Removing Bash tool from SDK init message") + print() + + sdk_path = find_sdk_installation() + + # Try direct file patching first + if patch_client_file(sdk_path): + print("[PATCH] ✓ Direct file patching succeeded") + else: + print("[PATCH] ⚠ Direct file patching incomplete") + + # Always create runtime wrapper as fallback + if create_runtime_wrapper(sdk_path): + print("[PATCH] ✓ Runtime wrapper created") + + print() + print("[PATCH] ===== Patching Complete =====") + print("[PATCH] To use the runtime wrapper, add to wrapper.py:") + 
print("[PATCH] sys.path.insert(0, '')") + print("[PATCH] import _bash_filter_wrapper # Before claude_agent_sdk import") + print() + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/components/runners/claude-code-runner/wrapper.py b/components/runners/claude-code-runner/wrapper.py index a2058d47f..bd738d40f 100644 --- a/components/runners/claude-code-runner/wrapper.py +++ b/components/runners/claude-code-runner/wrapper.py @@ -221,6 +221,13 @@ async def _run_claude_agent_sdk(self, prompt: str): logging.info(f" CLOUD_ML_REGION: {os.environ.get('CLOUD_ML_REGION')}") # NOW we can safely import the SDK with the correct environment set + # IMPORTANT: Import Bash filter BEFORE SDK to monkey-patch tool filtering (ADR-0006) + try: + import _bash_filter_wrapper + logging.info("Bash filter wrapper loaded - Bash tool will be excluded from init message") + except ImportError as e: + logging.warning(f"Bash filter wrapper not available: {e} - relying on allowed_tools filtering") + from claude_agent_sdk import ClaudeSDKClient, ClaudeAgentOptions from observability import ObservabilityManager @@ -361,12 +368,31 @@ async def _run_claude_agent_sdk(self, prompt: str): # Load MCP server configuration from .mcp.json if present mcp_servers = self._load_mcp_config(cwd_path) - # Build allowed_tools list with MCP server - allowed_tools = ["Read","Write","Bash","Glob","Grep","Edit","MultiEdit","WebSearch","WebFetch"] + if mcp_servers is None: + mcp_servers = {} + + # Build allowed_tools list + # ADR-0006: All command execution goes through workspace MCP streaming exec + # Bash tool is disabled - agent uses workspace MCP tools instead + allowed_tools = ["Read", "Write", "Glob", "Grep", "Edit", "MultiEdit", "WebSearch", "WebFetch"] + logging.info("ADR-0006: Bash DISABLED, using workspace MCP for all command execution") + + # Explicitly disallow Bash tool (belt-and-suspenders with monkey patch) + # This ensures SDK filtering is applied even if the runtime wrapper fails + disallowed_tools: list[str] = ["Bash"] + + # ADR-0006: Add workspace MCP server for streaming command execution + # Uses operator's streaming exec API for real-time output + mcp_servers["workspace"] = { + "command": "python3", + "args": ["/app/mcp-servers/workspace.py"], + } + if mcp_servers: # Add permissions for all tools from each MCP server for server_name in mcp_servers.keys(): - allowed_tools.append(f"mcp__{server_name}") + if f"mcp__{server_name}" not in allowed_tools: + allowed_tools.append(f"mcp__{server_name}") logging.info(f"MCP tool permissions granted for servers: {list(mcp_servers.keys())}") # Build comprehensive workspace context system prompt @@ -374,7 +400,7 @@ async def _run_claude_agent_sdk(self, prompt: str): repos_cfg=repos_cfg, workflow_name=derived_name if active_workflow_url else None, artifacts_path="artifacts", - ambient_config=ambient_config + ambient_config=ambient_config, ) system_prompt_config = { "type": "text", @@ -387,6 +413,7 @@ async def _run_claude_agent_sdk(self, prompt: str): cwd=cwd_path, permission_mode="acceptEdits", allowed_tools=allowed_tools, + disallowed_tools=disallowed_tools if disallowed_tools else None, mcp_servers=mcp_servers, setting_sources=["project"], system_prompt=system_prompt_config @@ -1866,7 +1893,49 @@ async def handle_message(self, message: dict): def _build_workspace_context_prompt(self, repos_cfg, workflow_name, artifacts_path, ambient_config): """Generate comprehensive system prompt describing workspace layout.""" - prompt = "You are Claude Code working in a 
structured development workspace.\n\n" + prompt = """You are Claude Code working in a structured development workspace. + +## Command Execution (ADR-0006) + +All shell commands execute in an isolated workspace pod via the `workspace` MCP tools. +The Bash tool is not available - use these MCP tools instead: + +**Available tools:** + +- `mcp__workspace__exec(command, workdir, timeout)` - Execute command in workspace pod + - command: Shell command to run (e.g., "npm install", "git status") + - workdir: Working directory (default: /workspace) + - timeout: Timeout in seconds (default: 300, max: 1800) + - State persists across calls (environment, background processes) + - Output streams in real-time + +- `mcp__workspace__status()` - Show workspace pod status + - Returns pod name, namespace, and session info + +- `mcp__workspace__logs(lines)` - View workspace pod logs + - lines: Number of recent lines to show (default: 100) + +**Examples:** +``` +# Install dependencies +mcp__workspace__exec("npm install") + +# Run tests +mcp__workspace__exec("npm test") + +# Build in specific directory +mcp__workspace__exec("cargo build --release", workdir="/workspace/myrepo") + +# Long-running command with extended timeout +mcp__workspace__exec("make all", timeout=600) + +# Check workspace status +mcp__workspace__status() +``` + +The workspace pod shares the same PVC as this session, so file changes are visible to both. + +""" # Current working directory if workflow_name: diff --git a/components/runners/session-proxy/Dockerfile b/components/runners/session-proxy/Dockerfile new file mode 100644 index 000000000..e0bf97847 --- /dev/null +++ b/components/runners/session-proxy/Dockerfile @@ -0,0 +1,16 @@ +# Session Proxy - ADR-0006 streaming exec sidecar +FROM golang:1.21-alpine AS builder + +WORKDIR /app +COPY go.mod go.sum ./ +RUN go mod download + +COPY . . +RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o session-proxy . 
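+
+# Runtime stage: the statically linked binary (CGO_ENABLED=0) needs only CA
+# certificates to talk to the Kubernetes API over TLS.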
+ +FROM alpine:3.19 +RUN apk --no-cache add ca-certificates +COPY --from=builder /app/session-proxy /usr/local/bin/session-proxy + +USER 1000:1000 +ENTRYPOINT ["/usr/local/bin/session-proxy"] diff --git a/components/runners/session-proxy/go.mod b/components/runners/session-proxy/go.mod new file mode 100644 index 000000000..a1b859be8 --- /dev/null +++ b/components/runners/session-proxy/go.mod @@ -0,0 +1,49 @@ +module github.com/ambient-code/platform/components/runners/session-proxy + +go 1.21 + +require ( + k8s.io/api v0.29.0 + k8s.io/apimachinery v0.29.0 + k8s.io/client-go v0.29.0 +) + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/emicklei/go-restful/v3 v3.11.0 // indirect + github.com/go-logr/logr v1.3.0 // indirect + github.com/go-openapi/jsonpointer v0.19.6 // indirect + github.com/go-openapi/jsonreference v0.20.2 // indirect + github.com/go-openapi/swag v0.22.3 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/golang/protobuf v1.5.3 // indirect + github.com/google/gnostic-models v0.6.8 // indirect + github.com/google/gofuzz v1.2.0 // indirect + github.com/google/uuid v1.3.0 // indirect + github.com/gorilla/websocket v1.5.0 // indirect + github.com/josharian/intern v1.0.0 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/mailru/easyjson v0.7.7 // indirect + github.com/moby/spdystream v0.2.0 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect + golang.org/x/net v0.17.0 // indirect + golang.org/x/oauth2 v0.10.0 // indirect + golang.org/x/sys v0.13.0 // indirect + golang.org/x/term v0.13.0 // indirect + golang.org/x/text v0.13.0 // indirect + golang.org/x/time v0.3.0 // indirect + google.golang.org/appengine v1.6.7 // indirect + google.golang.org/protobuf v1.31.0 // indirect + gopkg.in/inf.v0 v0.9.1 // indirect + gopkg.in/yaml.v2 v2.4.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/klog/v2 v2.110.1 // indirect + k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 // indirect + k8s.io/utils v0.0.0-20230726121419-3b25d923346b // indirect + sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect + sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect + sigs.k8s.io/yaml v1.3.0 // indirect +) diff --git a/components/runners/session-proxy/go.sum b/components/runners/session-proxy/go.sum new file mode 100644 index 000000000..217cef408 --- /dev/null +++ b/components/runners/session-proxy/go.sum @@ -0,0 +1,163 @@ +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= +github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/go-logr/logr v1.3.0 h1:2y3SDp0ZXuc6/cjLSZ+Q3ir+QB9T/iG5yYRXqsagWSY= 
+github.com/go-logr/logr v1.3.0/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE= +github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= +github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= +github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= +github.com/go-openapi/swag v0.22.3 h1:yMBqmnQ0gyZvEb/+KzuWZOXgllrXT4SADYbvDaXHv/g= +github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= +github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= +github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= +github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= +github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= +github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec= +github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= +github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= +github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc= +github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= +github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= 
+github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= +github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/moby/spdystream v0.2.0 h1:cjW1zVyyoiM0T7b6UoySUFqzXMoqRckQtXwGPiBhOM8= +github.com/moby/spdystream v0.2.0/go.mod h1:f7i0iNDQJ059oMTcWxx8MA/zKFIuD/lY+0GqbN2Wy8c= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= +github.com/onsi/ginkgo/v2 v2.13.0 h1:0jY9lJquiL8fcf3M4LAXN5aMlS/b2BV86HFFPCPMgE4= +github.com/onsi/ginkgo/v2 v2.13.0/go.mod h1:TE309ZR8s5FsKKpuB1YAQYBzCaAfUgatB/xlT/ETL/o= +github.com/onsi/gomega v1.29.0 h1:KIA/t2t5UBzoirT4H9tsML45GEbo3ouUnBHsCfD2tVg= +github.com/onsi/gomega v1.29.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= +github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod 
h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= +golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= +golang.org/x/oauth2 v0.10.0 h1:zHCpF2Khkwy4mMB4bv0U37YtJdTGW8jI0glAApi0Kh8= +golang.org/x/oauth2 v0.10.0/go.mod h1:kTpgurOux7LqtuxjuyZa4Gj2gdezIt/jQtGnNFfypQI= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= +golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.13.0 h1:bb+I9cTfFazGW51MZqBVmZy7+JEJMouUHTUSKVQLBek= +golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= +golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.12.0 h1:YW6HUoUmYBpwSgyaGaZq1fHjrBjX1rlpZ54T6mu2kss= +golang.org/x/tools v0.12.0/go.mod h1:Sc0INKfu04TlqNoRA1hgpFZbhYXHPr4V5DzpSBTPqQM= +golang.org/x/xerrors 
v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= +google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= +google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +k8s.io/api v0.29.0 h1:NiCdQMY1QOp1H8lfRyeEf8eOwV6+0xA6XEE44ohDX2A= +k8s.io/api v0.29.0/go.mod h1:sdVmXoz2Bo/cb77Pxi71IPTSErEW32xa4aXwKH7gfBA= +k8s.io/apimachinery v0.29.0 h1:+ACVktwyicPz0oc6MTMLwa2Pw3ouLAfAon1wPLtG48o= +k8s.io/apimachinery v0.29.0/go.mod h1:eVBxQ/cwiJxH58eK/jd/vAk4mrxmVlnpBH5J2GbMeis= +k8s.io/client-go v0.29.0 h1:KmlDtFcrdUzOYrBhXHgKw5ycWzc3ryPX5mQe0SkG3y8= +k8s.io/client-go v0.29.0/go.mod h1:yLkXH4HKMAywcrD82KMSmfYg2DlE8mepPR4JGSo5n38= +k8s.io/klog/v2 v2.110.1 h1:U/Af64HJf7FcwMcXyKm2RPM22WZzyR7OSpYj5tg3cL0= +k8s.io/klog/v2 v2.110.1/go.mod h1:YGtd1984u+GgbuZ7e08/yBuAfKLSO0+uR1Fhi6ExXjo= +k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 h1:aVUu9fTY98ivBPKR9Y5w/AuzbMm96cd3YHRTU83I780= +k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00/go.mod h1:AsvuZPBlUDVuCdzJ87iajxtXuR9oktsTctW/R9wwouA= +k8s.io/utils v0.0.0-20230726121419-3b25d923346b h1:sgn3ZU783SCgtaSJjpcVVlRqd6GSnlTLKgpAAttJvpI= +k8s.io/utils v0.0.0-20230726121419-3b25d923346b/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= +sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= +sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= +sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= +sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= +sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8= diff --git 
a/components/runners/session-proxy/main.go b/components/runners/session-proxy/main.go new file mode 100644 index 000000000..11bdb8f30 --- /dev/null +++ b/components/runners/session-proxy/main.go @@ -0,0 +1,45 @@ +// session-proxy is a sidecar that provides a streaming exec API for workspace containers. +// It listens on localhost and executes commands in the workspace pod via kubectl exec. +// ADR-0006: The proxy holds the SA token; the runner container has no K8s API access. +package main + +import ( + "log" + "os" + + "github.com/ambient-code/platform/components/runners/session-proxy/pkg/proxy" +) + +func main() { + config := proxy.Config{ + SessionName: os.Getenv("SESSION_NAME"), + Namespace: os.Getenv("NAMESPACE"), + ListenAddr: getEnvOrDefault("LISTEN_ADDR", ":8080"), + } + + if config.SessionName == "" { + log.Fatal("SESSION_NAME environment variable is required") + } + if config.Namespace == "" { + log.Fatal("NAMESPACE environment variable is required") + } + + log.Printf("Starting session-proxy for session %s in namespace %s", config.SessionName, config.Namespace) + log.Printf("Listening on %s", config.ListenAddr) + + server, err := proxy.New(config) + if err != nil { + log.Fatalf("Failed to create proxy: %v", err) + } + + if err := server.Start(); err != nil { + log.Fatalf("Failed to start proxy: %v", err) + } +} + +func getEnvOrDefault(key, defaultValue string) string { + if v := os.Getenv(key); v != "" { + return v + } + return defaultValue +} diff --git a/components/runners/session-proxy/pkg/proxy/handler.go b/components/runners/session-proxy/pkg/proxy/handler.go new file mode 100644 index 000000000..37a0cf84d --- /dev/null +++ b/components/runners/session-proxy/pkg/proxy/handler.go @@ -0,0 +1,273 @@ +// Package proxy implements the streaming exec API for workspace containers. +// +// The proxy runs as a sidecar in the runner pod and handles kubectl exec +// operations to workspace pods, providing streaming command execution over HTTP. 
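+//
+// Wire format, as implemented by handleExec below: the client POSTs a JSON body
+// and reads a chunked text/plain stream; once the command exits, the proxy writes
+// a "---EXIT---" delimiter line followed by a single JSON status line.
+//
+//	POST /exec
+//	{"command": ["sh", "-c", "npm install"], "cwd": "/workspace"}
+//
+//	...streamed stdout/stderr chunks...
+//	---EXIT---
+//	{"exitCode": 0}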
+package proxy + +import ( + "context" + "encoding/json" + "fmt" + "io" + "log" + "net/http" + "os" + "strconv" + "sync/atomic" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/remotecommand" +) + +// Rate limiting configuration (can be overridden via environment variables) +var ( + // MaxConcurrentExecs limits the number of concurrent exec operations + MaxConcurrentExecs = getEnvInt("MAX_CONCURRENT_EXECS", 10) + // ExecTimeout is the default timeout for exec operations + ExecTimeout = time.Duration(getEnvInt("EXEC_TIMEOUT_SECONDS", 600)) * time.Second +) + +// getEnvInt reads an integer from environment variable with a default fallback +func getEnvInt(key string, defaultVal int) int { + if val := os.Getenv(key); val != "" { + if i, err := strconv.Atoi(val); err == nil && i > 0 { + return i + } + } + return defaultVal +} + +// Config holds proxy configuration +type Config struct { + SessionName string + Namespace string + ListenAddr string +} + +// ExecRequest is the request body for /exec +type ExecRequest struct { + Command []string `json:"command"` + Cwd string `json:"cwd,omitempty"` +} + +// ExecResponse is sent as the final message after command completion +type ExecResponse struct { + ExitCode int `json:"exitCode"` + Error string `json:"error,omitempty"` +} + +// SessionProxy implements the streaming exec API +type SessionProxy struct { + config Config + clientset *kubernetes.Clientset + restConfig *rest.Config + activeExecs int32 // Atomic counter for concurrent exec operations +} + +// New creates a new SessionProxy +func New(config Config) (*SessionProxy, error) { + restConfig, err := rest.InClusterConfig() + if err != nil { + return nil, fmt.Errorf("failed to create in-cluster config: %w", err) + } + + clientset, err := kubernetes.NewForConfig(restConfig) + if err != nil { + return nil, fmt.Errorf("failed to create kubernetes client: %w", err) + } + + return &SessionProxy{ + config: config, + clientset: clientset, + restConfig: restConfig, + }, nil +} + +// Start starts the HTTP server +func (p *SessionProxy) Start() error { + mux := http.NewServeMux() + mux.HandleFunc("/exec", p.handleExec) + mux.HandleFunc("/health", p.handleHealth) + + server := &http.Server{ + Addr: p.config.ListenAddr, + Handler: mux, + ReadTimeout: 10 * time.Second, + WriteTimeout: 0, // No timeout for streaming responses + } + + return server.ListenAndServe() +} + +// handleHealth returns 200 OK for health checks +func (p *SessionProxy) handleHealth(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + w.Write([]byte("ok")) +} + +// handleExec handles streaming exec requests +func (p *SessionProxy) handleExec(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + + // Rate limiting: check concurrent exec count + current := atomic.AddInt32(&p.activeExecs, 1) + defer atomic.AddInt32(&p.activeExecs, -1) + + if int(current) > MaxConcurrentExecs { + http.Error(w, fmt.Sprintf("too many concurrent exec requests (max: %d)", MaxConcurrentExecs), http.StatusTooManyRequests) + return + } + + // Parse request + var req ExecRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + http.Error(w, fmt.Sprintf("invalid request: %v", err), http.StatusBadRequest) + return + } + + if len(req.Command) == 0 { + http.Error(w, 
"command is required", http.StatusBadRequest) + return + } + + // Create context with timeout + ctx, cancel := context.WithTimeout(r.Context(), ExecTimeout) + defer cancel() + + // Find workspace pod by label + workspacePod, err := p.findWorkspacePod(ctx) + if err != nil { + http.Error(w, fmt.Sprintf("failed to find workspace pod: %v", err), http.StatusServiceUnavailable) + return + } + + log.Printf("Executing command in pod %s: %v", workspacePod, req.Command) + + // Build the command with optional cwd + cmd := req.Command + if req.Cwd != "" { + // Wrap command to run in specified directory + cmd = []string{"sh", "-c", fmt.Sprintf("cd %q && exec \"$@\"", req.Cwd), "--"} + cmd = append(cmd, req.Command...) + } + + // Set up streaming response + w.Header().Set("Content-Type", "text/plain; charset=utf-8") + w.Header().Set("X-Content-Type-Options", "nosniff") + w.Header().Set("Transfer-Encoding", "chunked") + w.WriteHeader(http.StatusOK) + + // Flush headers immediately + if f, ok := w.(http.Flusher); ok { + f.Flush() + } + + // Execute command with streaming output (using context with timeout) + exitCode, execErr := p.execInPod(ctx, workspacePod, cmd, w) + + // Send final response as JSON on a new line + resp := ExecResponse{ExitCode: exitCode} + if execErr != nil { + resp.Error = execErr.Error() + } + + // Write a delimiter and the final JSON response + w.Write([]byte("\n---EXIT---\n")) + json.NewEncoder(w).Encode(resp) + + if f, ok := w.(http.Flusher); ok { + f.Flush() + } +} + +// findWorkspacePod finds the workspace pod for this session by label selector +func (p *SessionProxy) findWorkspacePod(ctx context.Context) (string, error) { + labelSelector := fmt.Sprintf("session=%s,type=workspace", p.config.SessionName) + + pods, err := p.clientset.CoreV1().Pods(p.config.Namespace).List(ctx, metav1.ListOptions{ + LabelSelector: labelSelector, + }) + if err != nil { + return "", fmt.Errorf("failed to list pods: %w", err) + } + + // Find a running pod + for _, pod := range pods.Items { + if pod.Status.Phase == corev1.PodRunning { + return pod.Name, nil + } + } + + if len(pods.Items) == 0 { + return "", fmt.Errorf("no workspace pod found with selector %q", labelSelector) + } + + return "", fmt.Errorf("workspace pod exists but is not running (phase: %s)", pods.Items[0].Status.Phase) +} + +// execInPod executes a command in the workspace pod and streams output +func (p *SessionProxy) execInPod(ctx context.Context, podName string, cmd []string, output io.Writer) (int, error) { + req := p.clientset.CoreV1().RESTClient().Post(). + Resource("pods"). + Name(podName). + Namespace(p.config.Namespace). + SubResource("exec"). 
+ VersionedParams(&corev1.PodExecOptions{ + Container: "workspace", + Command: cmd, + Stdin: false, + Stdout: true, + Stderr: true, + TTY: false, + }, scheme.ParameterCodec) + + exec, err := remotecommand.NewSPDYExecutor(p.restConfig, "POST", req.URL()) + if err != nil { + return -1, fmt.Errorf("failed to create executor: %w", err) + } + + // Create a streaming writer that flushes after each write + streamWriter := &flushWriter{w: output} + + err = exec.StreamWithContext(ctx, remotecommand.StreamOptions{ + Stdout: streamWriter, + Stderr: streamWriter, + }) + + if err != nil { + // Try to extract exit code from error + if exitErr, ok := err.(interface{ ExitStatus() int }); ok { + return exitErr.ExitStatus(), nil + } + return -1, err + } + + return 0, nil +} + +// flushWriter wraps an io.Writer and flushes after each write if possible +type flushWriter struct { + w io.Writer +} + +func (fw *flushWriter) Write(p []byte) (n int, err error) { + n, err = fw.w.Write(p) + if err != nil { + return n, err + } + + // Flush if the underlying writer supports it + if f, ok := fw.w.(http.Flusher); ok { + f.Flush() + } + + return n, nil +} diff --git a/docs/adr/0006-agent-injection-architecture.md b/docs/adr/0006-agent-injection-architecture.md new file mode 100644 index 000000000..ae4ad5a97 --- /dev/null +++ b/docs/adr/0006-agent-injection-architecture.md @@ -0,0 +1,252 @@ +# ADR-0006: Workspace Container Architecture + +**Status**: Pending +**Date**: 2025-11-24 (Updated: 2025-12-04) +**Deciders**: Platform Architecture Team +**Related**: [ADR-0001 Kubernetes Native Architecture](0001-kubernetes-native-architecture.md) + +## Context + +Agents need access to tools specific to their task: compilers, test frameworks, language runtimes. Previously, the agent ran directly in the runner container with limited, fixed tooling. + +This ADR introduces **workspace containers**: separate pods where agents execute commands, with user-configurable container images. + +## Decision + +### Workspace Containers + +Each agentic session runs with two pods: + +1. **Runner Pod**: Contains the Claude Code agent and a session proxy sidecar. The agent has no direct shell access—all command execution goes through the proxy. +2. **Workspace Pod**: A user-configurable container where commands actually execute. Shares a PVC with the runner for file access. + +```mermaid +graph LR + subgraph "Runner Pod" + Runner[Runner Container
<br/>Claude Code Agent]
+        Proxy[Session Proxy<br/>Sidecar]
+    end
+    subgraph "Workspace Pod"
+        Workspace[Workspace Container<br/>User's Image]
+    end
+    PVC[(Shared PVC<br/>/workspace)]
+
+    Runner -->|localhost:8080| Proxy
+    Proxy -->|kubectl exec| Workspace
+    Runner -.->|reads/writes| PVC
+    Workspace -.->|reads/writes| PVC
+```
+
+By default, the workspace pod uses a base development image (`quay.io/ambient_code/vteam_workspace`). Users can customize this per-project via ProjectSettings.
+
+### Configuration
+
+Users configure workspace containers in the ProjectSettings UI under "Workspace Container". The **Container Image** field accepts a custom image with the required tooling (e.g., `node:20`, `rust:1.75`, `python:3.12`); leaving it empty uses the platform default. The **Resource Limits** field allows setting CPU and memory requests/limits for the workspace pod.
+
+The ProjectSettings CR schema:
+
+```yaml
+apiVersion: vteam.ambient-code/v1alpha1
+kind: ProjectSettings
+metadata:
+  name: projectsettings
+  namespace: my-project
+spec:
+  workspaceContainer:
+    image: "node:20-slim"    # Optional - defaults to the platform base image
+    resources:               # Optional
+      cpuRequest: "500m"
+      cpuLimit: "2"
+      memoryRequest: "512Mi"
+      memoryLimit: "4Gi"
+```
+
+### Session Proxy Sidecar
+
+The session proxy is a lightweight Go binary that runs as a sidecar container within the runner pod. It listens on localhost:8080 and provides an HTTP streaming endpoint for command execution. The agent's MCP workspace tool makes streaming HTTP calls to the proxy, which uses kubectl exec to run commands in the workspace pod and streams output back in real-time.
+
+```mermaid
+sequenceDiagram
+    participant Runner as Runner Container
+    participant Proxy as Session Proxy
+    participant Workspace as Workspace Pod
+
+    Runner->>Proxy: POST /exec (streaming)<br/>{command: "npm i", cwd: "/workspace"}
{command: "npm i", cwd: "/workspace"} + + Proxy->>Workspace: kubectl exec -- sh -c "npm i" + + Workspace-->>Proxy: stdout: "npm WARN..." + Proxy-->>Runner: chunk: "npm WARN..." + + Workspace-->>Proxy: stdout: "added 50 packages" + Proxy-->>Runner: chunk: "added 50 packages" + + Workspace-->>Proxy: exit code: 0 + Proxy-->>Runner: {exit: 0} +``` + +**Why a sidecar instead of the operator?** Keeping the exec API within the runner pod eliminates a network hop to the operator and avoids exposing an HTTP endpoint on the operator. The operator remains a pure watch-based controller. The proxy holds the credentials for pod exec—the runner container has no Kubernetes API access. + +### Why Streaming + +For long-running commands (npm install, cargo build, pytest), users need real-time feedback: + +| Approach | Latency | Streaming | UX | +|----------|---------|-----------|-----| +| CRD polling | ~200ms + poll interval | No | Wait for completion, then see all output | +| Direct streaming | ~10ms | Yes | See output as it happens | + +The MCP protocol supports streaming tool responses, enabling the agent to observe build output, test results, and errors in real-time. + +### Pod Isolation + +The runner and workspace run as separate pods. Within the runner pod, the agent container and proxy sidecar share a network namespace (allowing localhost communication) but have separate filesystem namespaces. The agent cannot directly access the proxy's credentials or make Kubernetes API calls. + +**Session Isolation**: The proxy discovers its workspace pod by label selector (`session=,type=workspace`), not by user-provided pod names. This prevents an agent from targeting another session's workspace even if it could somehow influence the proxy's requests. + +**Privilege Separation**: The runner pod disables automatic service account token mounting (`automountServiceAccountToken: false`). The proxy sidecar alone receives the token via a projected volume mount, giving it `pods/exec` permission scoped to workspace pods in the namespace. The runner container has no token and cannot make Kubernetes API calls. The workspace pod has no special permissions. + +### Disabling Native Bash + +The agent MUST NOT have access to the native Bash tool. This prevents bypassing the proxy, direct execution in the runner container, and running kubectl or other cluster tools. 
+
+The runner wrapper configures allowed tools explicitly, excluding Bash:
+
+```python
+allowed_tools = ["Read", "Write", "Glob", "Grep", "Edit", "MultiEdit", "WebSearch", "WebFetch"]
+# Bash is NOT in the list - all execution goes through the workspace MCP tool
+```
+
+### Security Model
+
+| Layer | Enforcement |
+|-------|-------------|
+| **Network isolation** | Proxy listens only on localhost; no external network access to exec API |
+| **Session isolation** | Proxy discovers workspace by label; cannot target other sessions |
+| **Token isolation** | Only proxy sidecar receives SA token via projected volume |
+| **Privilege separation** | Runner has no K8s API access; proxy has only `pods/exec` |
+| **No Bash** | Agent cannot execute commands outside the proxy workflow |
+
+### Threat Mitigations
+
+| Threat | Mitigation |
+|--------|------------|
+| Agent creates arbitrary pods | Runner has no SA token; proxy SA only has `pods/exec` |
+| Agent accesses other session's workspace | Proxy uses label selector; validates namespace match |
+| Agent escalates privileges in workspace | SecurityContext enforces non-root, dropped capabilities |
+| Agent steals proxy token | Token mounted only in proxy container, not runner |
+| Agent bypasses proxy | No Bash tool; no kubectl binary in runner container |
+
+### Transparent Version Upgrades
+
+The operator handles platform upgrades transparently without disrupting running sessions. When the operator deployment is upgraded with new workspace container images or session proxy versions, the system maintains continuity by allowing running sessions to continue with their current images while new sessions automatically receive the updated components.
+
+This separation of concerns is made possible by the pod-based architecture. Each AgenticSession creates its own runner and workspace pods with image references resolved at creation time. Upgrading the operator deployment updates the default image references in the operator's configuration but does not affect pods that already exist in the cluster. A running session's runner pod continues executing with the workspace container image and proxy sidecar version that were current when the session started.
+
+The workspace pod and runner pod are separate entities, enabling independent lifecycle management. In theory, workspace container images could be upgraded without touching the runner pod, though this would require additional controller logic to detect configuration changes and recreate only the workspace pod. The current implementation treats both pods as immutable—upgrading either component means creating a new session.
+
+Session proxy upgrades require restarting the runner pod because the proxy runs as a sidecar container within that pod. Kubernetes does not support in-place container updates; the entire pod must be replaced. However, this limitation has minimal impact because AgenticSessions are typically short-lived (minutes to hours, not days). Long-running interactive sessions could theoretically be migrated gracefully by checkpointing state and recreating the pod, but this complexity is not justified given current usage patterns. Users can always resume work by starting a new session if an upgrade interrupts a running one.
+
+The operator tracks component versions through pod annotations and labels. Each runner and workspace pod receives annotations indicating which operator version created it and which image versions are in use. This metadata enables debugging (identifying which sessions run old vs. new images), monitoring (tracking rollout progress as old sessions complete and new ones start), and potential future migration logic (triggering graceful session termination when a deprecated image version reaches end-of-life).
+
+Version tracking also prevents accidental downgrades. If an operator deployment rolls back to an earlier version, new sessions will use older images, but the operator will not attempt to "fix" newer sessions by restarting them with downgraded images. Each session's image references are immutable once created, stored in the pod spec rather than dynamically resolved.
+
+This approach balances operational simplicity with user experience. Platform administrators can deploy operator upgrades at any time without coordinating with active users. Running sessions continue uninterrupted, and users benefit from improvements when they start their next session. The upgrade window is effectively zero—there is no maintenance downtime where new sessions cannot be created.
+
+## Alternatives Considered
+
+### AgenticTask CRD (Tekton-inspired)
+
+Create a Custom Resource for each command, with the operator reconciling execution.
+
+**Rejected** because it provides no streaming (the agent must poll for final results), adds significant latency (CR creation + etcd write + reconciliation adds ~200ms+ overhead), and causes CR pollution (completed tasks accumulate, requiring TTL/GC logic).
+
+### Namespace-per-Session
+
+Create an ephemeral namespace for each AgenticSession.
+
+**Deferred**: Adds ~200-500ms namespace creation overhead. Current pod-level isolation is sufficient. May revisit for stricter multi-tenancy requirements—see Future Directions.
+
+### Operator Exec API
+
+Have the operator expose an HTTP endpoint that the runner calls to execute commands in the workspace.
+
+**Rejected** because it adds a network hop and latency, requires the operator to expose an HTTP API (moving beyond pure watch-based design), and creates a single point of failure for all command execution across the cluster.
+
+### Direct kubectl exec from Runner
+
+Give the runner SA `pods/exec` permission and exec directly into a separate workspace pod.
+
+**Rejected** because the runner could exec into any pod in the namespace, and it would require mounting a service account token, expanding the attack surface.
+
+## Consequences
+
+### Positive
+
+This architecture delivers real-time feedback through streaming output for builds, tests, and other long-running commands. The localhost HTTP call within the pod provides minimal latency. The operator remains a pure watch-based controller with no HTTP API exposure. Each session is self-contained—the proxy sidecar requires no cluster-wide coordination. Since there are no command CRs, no garbage collection is needed.
+
+### Negative
+
+The sidecar adds resource overhead to each runner pod (though the proxy is lightweight). The proxy requires a service account with `pods/exec` permission, adding RBAC complexity. In-flight commands are lost if the pod restarts, though this is inherent to any streaming approach.
+
+## Implementation Status
+
+| Phase | Status |
+|-------|--------|
+| Session proxy sidecar | In Progress |
+| Runner pod with sidecar spec | In Progress |
+| Proxy SA and RBAC | In Progress |
+| MCP workspace tools | In Progress |
+| Hardening (rate limits, resource limits) | Pending |
+
+## Future Directions
+
+### Agent-Managed Namespaces
+
Version tracking also prevents accidental downgrades. If an operator deployment rolls back to an earlier version, new sessions will use the older images, but the operator will not attempt to "fix" newer sessions by restarting them with downgraded images. Each session's image references are immutable once created, stored in the pod spec rather than resolved dynamically.

This approach balances operational simplicity with user experience. Platform administrators can deploy operator upgrades at any time without coordinating with active users. Running sessions continue uninterrupted, and users benefit from improvements when they start their next session. The upgrade window is effectively zero—there is no maintenance downtime during which new sessions cannot be created.

## Alternatives Considered

### AgenticTask CRD (Tekton-inspired)

Create a Custom Resource for each command, with the operator reconciling execution.

**Rejected** because it provides no streaming (the agent must poll for final results), adds significant latency (CR creation, etcd write, and reconciliation add ~200ms+ of overhead), and causes CR pollution (completed tasks accumulate, requiring TTL/GC logic).

### Namespace-per-Session

Create an ephemeral namespace for each AgenticSession.

**Deferred**: adds ~200-500ms of namespace creation overhead, and current pod-level isolation is sufficient. May be revisited for stricter multi-tenancy requirements—see Future Directions.

### Operator Exec API

Have the operator expose an HTTP endpoint that the runner calls to execute commands in the workspace.

**Rejected** because it adds a network hop and latency, requires the operator to expose an HTTP API (moving beyond a pure watch-based design), and creates a single point of failure for all command execution across the cluster.

### Direct kubectl exec from Runner

Give the runner SA `pods/exec` permission and exec directly into a separate workspace pod.

**Rejected** because the runner could then exec into any pod in the namespace, and mounting a service account token into the runner expands the attack surface.

## Consequences

### Positive

This architecture delivers real-time feedback through streaming output for builds, tests, and other long-running commands. The localhost HTTP call within the pod adds minimal latency. The operator remains a pure watch-based controller with no HTTP API exposure. Each session is self-contained—the proxy sidecar requires no cluster-wide coordination. And since there are no command CRs, no garbage collection is needed.

### Negative

The sidecar adds resource overhead to each runner pod (though the proxy is lightweight). The proxy requires a service account with `pods/exec` permission, adding RBAC complexity. In-flight commands are lost if the pod restarts, though this is inherent to any streaming approach.

## Implementation Status

| Phase | Status |
|-------|--------|
| Session proxy sidecar | In Progress |
| Runner pod with sidecar spec | In Progress |
| Proxy SA and RBAC | In Progress |
| MCP workspace tools | In Progress |
| Hardening (rate limits, resource limits) | Pending |

## Future Directions

### Agent-Managed Namespaces

The current architecture restricts agents to executing commands in a single workspace container. A natural evolution would allow agents to create and manage their own Kubernetes namespaces with full infrastructure capabilities:

```mermaid
graph TB
    subgraph "Project Namespace"
        Session[Session Pod]
        Operator[Operator]
    end

    subgraph "Agent Namespace (ephemeral)"
        WS[Workspace Pod]
        DB[(Database Pod)]
        Cache[Redis Pod]
        API[API Service Pod]
    end

    Session -->|requests via CR| Operator
    Operator -->|creates/manages| WS
    Operator -.->|future: create/manage| DB
    Operator -.->|future: create/manage| Cache
    Operator -.->|future: create/manage| API
```

**Use cases** include running integration tests that need a database, cache, and API services; deploying and testing microservices architectures; and spinning up ephemeral preview environments.

**Design considerations**: The system would enforce strict resource quotas (CPU, memory, pods, services) on each agent namespace, along with network policies that isolate it from other namespaces while allowing controlled egress. Automatic namespace deletion when the session ends (TTL/cleanup) ensures resources don't accumulate. Pods within the agent namespace could discover and communicate with each other, and scoped secrets would handle database credentials and API keys.

**Security model**: The operator creates the namespace with predefined RBAC and quotas. The agent receives a scoped ServiceAccount within that namespace but cannot modify namespace-level resources such as quotas or network policies. All resources carry an OwnerReference to the AgenticSession for cleanup.

This approach maintains the security boundary (the agent never directly touches the host cluster) while enabling more sophisticated workloads.

### Devcontainer Auto-Detection

Many repositories include a `.devcontainer/devcontainer.json` file that specifies the development environment—container image, features, extensions, and post-create commands. A natural enhancement would have the operator automatically detect and honor this configuration when creating workspace containers.

When a session starts, the operator would check the cloned repository for `.devcontainer/devcontainer.json` (or `.devcontainer.json` at the root). If found, the operator extracts the `image` field (or builds from the `dockerFile`/`build.dockerfile` reference) and applies relevant configuration such as environment variables, mount points, and post-create commands. This eliminates the need for users to manually configure workspace containers in ProjectSettings when the repository already defines its environment; a sketch of the detection step appears at the end of this section.

**Precedence**: Explicit ProjectSettings configuration would override devcontainer detection, allowing users to force a specific image when needed. Repositories without a devcontainer configuration fall back to the platform default.

**Scope limitations**: Not all devcontainer features translate directly to Kubernetes. VS Code extensions, port forwarding rules, and certain lifecycle hooks would be ignored or adapted. The operator would support a subset focused on the container image, environment variables, and initialization commands.

**Security considerations**: The operator must validate devcontainer configurations before applying them. Arbitrary image references from untrusted repositories pose supply-chain risks, so organizations may want to restrict allowed registries or require image allowlisting. Post-create commands execute in the workspace container's security context, inheriting all existing sandboxing.
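
A minimal sketch of that detection step, under stated assumptions: plain JSON parsing (real devcontainer files may contain JSONC comments, which would need a more forgiving parser), only the `image` field handled, and all names illustrative:

```go
package operator // illustrative package name

import (
	"encoding/json"
	"os"
	"path/filepath"
)

// devcontainerConfig models only the subset of devcontainer.json read here.
type devcontainerConfig struct {
	Image string `json:"image"`
}

// detectWorkspaceImage checks the two spec-defined locations in a cloned
// repository and returns the declared image reference, if any.
func detectWorkspaceImage(repoRoot string) (string, bool) {
	for _, rel := range []string{
		filepath.Join(".devcontainer", "devcontainer.json"),
		".devcontainer.json",
	} {
		data, err := os.ReadFile(filepath.Join(repoRoot, rel))
		if err != nil {
			continue // file absent; try the next location
		}
		var cfg devcontainerConfig
		if err := json.Unmarshal(data, &cfg); err != nil {
			continue // unparseable (e.g., JSONC); fall back to defaults
		}
		if cfg.Image != "" {
			return cfg.Image, true
		}
	}
	return "", false // no devcontainer image; use the platform default
}
```

The precedence rule above would wrap this call: the operator consults detection only when ProjectSettings does not configure an explicit image.
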
## References

- [Kubernetes Pod Security](https://kubernetes.io/docs/concepts/security/pod-security-standards/)
- [Projected Volumes](https://kubernetes.io/docs/concepts/storage/projected-volumes/)
- [Server-Sent Events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events)
- [Dev Container Specification](https://containers.dev/implementors/json_reference/)