diff --git a/architecture/gateway-single-node.md b/architecture/gateway-single-node.md index f46a50d2..6389c728 100644 --- a/architecture/gateway-single-node.md +++ b/architecture/gateway-single-node.md @@ -185,7 +185,7 @@ For the target daemon (local or remote): After the container starts: -1. **Clean stale nodes**: `clean_stale_nodes()` finds `NotReady` nodes via `kubectl get nodes` and deletes them. This is needed when a container is recreated but reuses the persistent volume -- k3s registers a new node (using the container ID as hostname) while old node entries persist in etcd. Non-fatal on error; returns the count of removed nodes. +1. **Clean stale nodes**: `clean_stale_nodes()` finds nodes whose name does not match the current node identity -- the deterministic k3s `--node-name`, falling back to the container hostname for older cluster images built before that change -- and deletes them. That node name is derived from the gateway name but normalized to a Kubernetes-safe lowercase form so existing gateway names that contain `_`, `.`, or uppercase characters still produce a valid node identity. This cleanup is needed when a container is recreated but reuses the persistent volume -- old node entries can persist in etcd. Non-fatal on error; returns the count of removed nodes. 2. **Push local images** (optional, local deploy only): If `OPENSHELL_PUSH_IMAGES` is set, the comma-separated image refs are exported from the local Docker daemon as a single tar, uploaded into the container via `docker put_archive`, and imported into containerd via `ctr images import` in the `k8s.io` namespace. After import, `kubectl rollout restart deployment/openshell openshell` is run, followed by `kubectl rollout status --timeout=180s` to wait for completion. See `crates/openshell-bootstrap/src/push.rs`. 3. **Wait for gateway health**: `wait_for_gateway_ready()` polls the Docker HEALTHCHECK status up to 180 times, 2 seconds apart (6 min total). A background task streams container logs during this wait. Failure modes: - Container exits during polling: error includes recent log lines. 
diff --git a/architecture/gateway.md b/architecture/gateway.md index 39f97c8c..72574410 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -501,7 +501,7 @@ The Helm chart template is at `deploy/helm/openshell/templates/statefulset.yaml` `SandboxClient` (`crates/openshell-server/src/sandbox/mod.rs`) manages `agents.x-k8s.io/v1alpha1/Sandbox` CRDs. -- **Create**: Translates a `Sandbox` proto into a Kubernetes `DynamicObject` with labels (`openshell.ai/sandbox-id`, `openshell.ai/managed-by: openshell`) and a spec that includes the pod template, environment variables, and gateway-required env vars (`OPENSHELL_SANDBOX_ID`, `OPENSHELL_ENDPOINT`, `OPENSHELL_SSH_LISTEN_ADDR`, etc.). +- **Create**: Translates a `Sandbox` proto into a Kubernetes `DynamicObject` with labels (`openshell.ai/sandbox-id`, `openshell.ai/managed-by: openshell`) and a spec that includes the pod template, environment variables, and gateway-required env vars (`OPENSHELL_SANDBOX_ID`, `OPENSHELL_ENDPOINT`, `OPENSHELL_SSH_LISTEN_ADDR`, etc.). When callers do not provide custom `volumeClaimTemplates`, the server injects a default `workspace` PVC and mounts it at `/sandbox` so the default sandbox home/workdir survives pod rescheduling. - **Delete**: Calls the Kubernetes API to delete the CRD by name. Returns `false` if already gone (404). - **Pod IP resolution**: `agent_pod_ip()` fetches the agent pod and reads `status.podIP`. diff --git a/crates/openshell-bootstrap/src/constants.rs b/crates/openshell-bootstrap/src/constants.rs index 74e381fd..eee9000d 100644 --- a/crates/openshell-bootstrap/src/constants.rs +++ b/crates/openshell-bootstrap/src/constants.rs @@ -13,11 +13,66 @@ pub const SERVER_CLIENT_CA_SECRET_NAME: &str = "openshell-server-client-ca"; pub const CLIENT_TLS_SECRET_NAME: &str = "openshell-client-tls"; /// K8s secret holding the SSH handshake HMAC secret (shared by gateway and sandbox pods). 
pub const SSH_HANDSHAKE_SECRET_NAME: &str = "openshell-ssh-handshake"; +const NODE_NAME_PREFIX: &str = "openshell-"; +const NODE_NAME_FALLBACK_SUFFIX: &str = "gateway"; +const KUBERNETES_MAX_NAME_LEN: usize = 253; pub fn container_name(name: &str) -> String { format!("openshell-cluster-{name}") } +/// Deterministic k3s node name derived from the gateway name. +/// +/// k3s defaults to using the container hostname (= Docker container ID) as +/// the node name. When the container is recreated (e.g. after an image +/// upgrade), the container ID changes, creating a new k3s node. The +/// `clean_stale_nodes` function then deletes PVCs whose backing PVs have +/// node affinity for the old node — wiping the server database and any +/// sandbox persistent volumes. +/// +/// By passing a deterministic `--node-name` to k3s, the node identity +/// survives container recreation, and PVCs are never orphaned. +/// +/// Gateway names allow Docker-friendly separators and uppercase characters, +/// but Kubernetes node names must be DNS-safe. Normalize the gateway name into +/// a lowercase RFC 1123 DNS-safe name (truncated to the 253-char subdomain limit) +/// so previously accepted names such as +/// `prod_us` or `Prod.US` still deploy successfully. 
+pub fn node_name(name: &str) -> String { + format!("{NODE_NAME_PREFIX}{}", normalize_node_name_suffix(name)) +} + +fn normalize_node_name_suffix(name: &str) -> String { + let mut normalized = String::with_capacity(name.len()); + let mut last_was_separator = false; + + for ch in name.chars() { + if ch.is_ascii_alphanumeric() { + normalized.push(ch.to_ascii_lowercase()); + last_was_separator = false; + } else if !last_was_separator { + normalized.push('-'); + last_was_separator = true; + } + } + + let mut normalized = normalized.trim_matches('-').to_string(); + if normalized.is_empty() { + normalized.push_str(NODE_NAME_FALLBACK_SUFFIX); + } + + let max_suffix_len = KUBERNETES_MAX_NAME_LEN.saturating_sub(NODE_NAME_PREFIX.len()); + if normalized.len() > max_suffix_len { + normalized.truncate(max_suffix_len); + normalized.truncate(normalized.trim_end_matches('-').len()); + } + + if normalized.is_empty() { + normalized.push_str(NODE_NAME_FALLBACK_SUFFIX); + } + + normalized +} + pub fn volume_name(name: &str) -> String { format!("openshell-cluster-{name}") } @@ -25,3 +80,33 @@ pub fn volume_name(name: &str) -> String { pub fn network_name(name: &str) -> String { format!("openshell-cluster-{name}") } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn node_name_normalizes_uppercase_and_underscores() { + assert_eq!(node_name("Prod_US"), "openshell-prod-us"); + } + + #[test] + fn node_name_collapses_and_trims_separator_runs() { + assert_eq!(node_name("._Prod..__-Gateway-."), "openshell-prod-gateway"); + } + + #[test] + fn node_name_falls_back_when_gateway_name_has_no_alphanumerics() { + assert_eq!(node_name("...___---"), "openshell-gateway"); + } + + #[test] + fn node_name_truncates_to_kubernetes_name_limit() { + let gateway_name = "A".repeat(400); + let node_name = node_name(&gateway_name); + + assert!(node_name.len() <= KUBERNETES_MAX_NAME_LEN); + assert!(node_name.starts_with(NODE_NAME_PREFIX)); + assert!(node_name.ends_with('a')); + } +} diff --git 
a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs index d9aaed7f..be086e53 100644 --- a/crates/openshell-bootstrap/src/docker.rs +++ b/crates/openshell-bootstrap/src/docker.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use crate::RemoteOptions; -use crate::constants::{container_name, network_name, volume_name}; +use crate::constants::{container_name, network_name, node_name, volume_name}; use crate::image::{self, DEFAULT_IMAGE_REPO_BASE, DEFAULT_REGISTRY, parse_image_ref}; use bollard::API_DEFAULT_VERSION; use bollard::Docker; @@ -482,6 +482,7 @@ pub async fn ensure_container( registry_username: Option<&str>, registry_token: Option<&str>, device_ids: &[String], + resume: bool, ) -> Result { let container_name = container_name(name); @@ -491,25 +492,34 @@ pub async fn ensure_container( .await { Ok(info) => { - // Container exists — verify it is using the expected image. - // Resolve the desired image ref to its content-addressable ID so we - // can compare against the container's image field (which Docker - // stores as an ID). - let desired_id = docker - .inspect_image(image_ref) - .await - .ok() - .and_then(|img| img.id); + // On resume we always reuse the existing container — the persistent + // volume holds k3s etcd state, and recreating the container with + // different env vars would cause the entrypoint to rewrite the + // HelmChart manifest, triggering a Helm upgrade that changes the + // StatefulSet image reference while the old pod still runs with the + // previous image. Reusing the container avoids this entirely. + // + // On a non-resume path we check whether the image changed and + // recreate only when necessary. 
+ let reuse = if resume { + true + } else { + let desired_id = docker + .inspect_image(image_ref) + .await + .ok() + .and_then(|img| img.id); - let container_image_id = info.image; + let container_image_id = info.image.clone(); - let image_matches = match (&desired_id, &container_image_id) { - (Some(desired), Some(current)) => desired == current, - _ => false, + match (&desired_id, &container_image_id) { + (Some(desired), Some(current)) => desired == current, + _ => false, + } }; - if image_matches { - // The container exists with the correct image, but its network + if reuse { + // The container exists and should be reused. Its network // attachment may be stale. When the gateway is resumed after a // container kill, `ensure_network` destroys and recreates the // Docker network (giving it a new ID). The stopped container @@ -543,8 +553,8 @@ pub async fn ensure_container( tracing::info!( "Container {} exists but uses a different image (container={}, desired={}), recreating", container_name, - container_image_id.as_deref().map_or("unknown", truncate_id), - desired_id.as_deref().map_or("unknown", truncate_id), + info.image.as_deref().map_or("unknown", truncate_id), + image_ref, ); let _ = docker.stop_container(&container_name, None).await; @@ -684,6 +694,11 @@ pub async fn ensure_container( format!("REGISTRY_HOST={registry_host}"), format!("REGISTRY_INSECURE={registry_insecure}"), format!("IMAGE_REPO_BASE={image_repo_base}"), + // Deterministic k3s node name so the node identity survives container + // recreation (e.g. after an image upgrade). Without this, k3s uses + // the container ID as the hostname/node name, which changes on every + // container recreate and triggers stale-node PVC cleanup. 
+ format!("OPENSHELL_NODE_NAME={}", node_name(name)), ]; if let Some(endpoint) = registry_endpoint { env_vars.push(format!("REGISTRY_ENDPOINT={endpoint}")); @@ -753,6 +768,14 @@ pub async fn ensure_container( let config = ContainerCreateBody { image: Some(image_ref.to_string()), + // Set the container hostname to the deterministic node name. + // k3s uses the container hostname as its default node name. Without + // this, Docker defaults to the container ID (first 12 hex chars), + // which changes on every container recreation and can cause + // `clean_stale_nodes` to delete the wrong node on resume. The + // hostname persists across container stop/start cycles, ensuring a + // stable node identity. + hostname: Some(node_name(name)), cmd: Some(cmd), env, exposed_ports: Some(exposed_ports), diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index b569cabe..8ce10703 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -314,6 +314,7 @@ where // idempotent and will reuse the volume, create a container if needed, // and start it) let mut resume = false; + let mut resume_container_exists = false; if let Some(existing) = check_existing_gateway(&target_docker, &name).await? 
{ if recreate { log("[status] Removing existing gateway".to_string()); @@ -321,39 +322,51 @@ where } else if existing.container_running { log("[status] Gateway is already running".to_string()); resume = true; + resume_container_exists = true; } else { log("[status] Resuming gateway from existing state".to_string()); resume = true; + resume_container_exists = existing.container_exists; } } - // Ensure the image is available on the target Docker daemon - if remote_opts.is_some() { - log("[status] Downloading gateway".to_string()); - let on_log_clone = Arc::clone(&on_log); - let progress_cb = move |msg: String| { - if let Ok(mut f) = on_log_clone.lock() { - f(msg); - } - }; - image::pull_remote_image( - &target_docker, - &image_ref, - registry_username.as_deref(), - registry_token.as_deref(), - progress_cb, - ) - .await?; - } else { - // Local deployment: ensure image exists (pull if needed) - log("[status] Downloading gateway".to_string()); - ensure_image( - &target_docker, - &image_ref, - registry_username.as_deref(), - registry_token.as_deref(), - ) - .await?; + // Ensure the image is available on the target Docker daemon. + // When both the container and volume exist we can skip the pull entirely + // — the container already references a valid local image. This avoids + // failures when the original image tag (e.g. a local-only + // `openshell/cluster:dev`) is not available from the default registry. + // + // When only the volume survives (container was removed), we still need + // the image to recreate the container, so the pull must happen. 
+ let need_image = !resume || !resume_container_exists; + if need_image { + if remote_opts.is_some() { + log("[status] Downloading gateway".to_string()); + let on_log_clone = Arc::clone(&on_log); + let progress_cb = move |msg: String| { + if let Ok(mut f) = on_log_clone.lock() { + f(msg); + } + }; + image::pull_remote_image( + &target_docker, + &image_ref, + registry_username.as_deref(), + registry_token.as_deref(), + progress_cb, + ) + .await?; + } else { + // Local deployment: ensure image exists (pull if needed) + log("[status] Downloading gateway".to_string()); + ensure_image( + &target_docker, + &image_ref, + registry_username.as_deref(), + registry_token.as_deref(), + ) + .await?; + } } // All subsequent operations use the target Docker (remote or local) @@ -444,6 +457,7 @@ where registry_username.as_deref(), registry_token.as_deref(), &device_ids, + resume, ) .await?; let port = actual_port; diff --git a/crates/openshell-bootstrap/src/runtime.rs b/crates/openshell-bootstrap/src/runtime.rs index 2a10b265..0f9a96e6 100644 --- a/crates/openshell-bootstrap/src/runtime.rs +++ b/crates/openshell-bootstrap/src/runtime.rs @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -use crate::constants::{KUBECONFIG_PATH, container_name}; +use crate::constants::{KUBECONFIG_PATH, container_name, node_name}; use bollard::Docker; use bollard::container::LogOutput; use bollard::exec::CreateExecOptions; @@ -385,11 +385,19 @@ pub async fn clean_stale_nodes(docker: &Docker, name: &str) -> Result { let container_name = container_name(name); let mut stale_nodes: Vec = Vec::new(); + // Determine the current node name. With the deterministic `--node-name` + // entrypoint change the k3s node is `openshell-{gateway}`. However, older + // cluster images (built before that change) still use the container hostname + // (= Docker container ID) as the node name. 
We must handle both: + // + // 1. If the expected deterministic name appears in the node list, use it. + // 2. Otherwise fall back to the container hostname (old behaviour). + // + // This ensures backward compatibility during upgrades where the bootstrap + // CLI is newer than the cluster image. + let deterministic_node = node_name(name); + for attempt in 1..=MAX_ATTEMPTS { - // List ALL node names and the container's own hostname. Any node that - // is not the current container is stale — we cannot rely on the Ready - // condition because k3s may not have marked the old node NotReady yet - // when this runs shortly after container start. let (output, exit_code) = exec_capture_with_exit( docker, &container_name, @@ -406,16 +414,27 @@ pub async fn clean_stale_nodes(docker: &Docker, name: &str) -> Result { .await?; if exit_code == 0 { - // Determine the current node name (container hostname). - let (hostname_out, _) = - exec_capture_with_exit(docker, &container_name, vec!["hostname".to_string()]) - .await?; - let current_hostname = hostname_out.trim().to_string(); - - stale_nodes = output + let all_nodes: Vec<&str> = output .lines() .map(str::trim) - .filter(|l| !l.is_empty() && *l != current_hostname) + .filter(|l| !l.is_empty()) + .collect(); + + // Pick the current node identity: prefer the deterministic name, + // fall back to the container hostname for older cluster images. + let current_node = if all_nodes.contains(&deterministic_node.as_str()) { + deterministic_node.clone() + } else { + // Older cluster image without --node-name: read hostname. 
+ let (hostname_out, _) = + exec_capture_with_exit(docker, &container_name, vec!["hostname".to_string()]) + .await?; + hostname_out.trim().to_string() + }; + + stale_nodes = all_nodes + .into_iter() + .filter(|n| *n != current_node) .map(ToString::to_string) .collect(); break; diff --git a/crates/openshell-server/src/sandbox/mod.rs b/crates/openshell-server/src/sandbox/mod.rs index 3dca6649..c5e9a833 100644 --- a/crates/openshell-server/src/sandbox/mod.rs +++ b/crates/openshell-server/src/sandbox/mod.rs @@ -34,6 +34,44 @@ const SANDBOX_MANAGED_VALUE: &str = "openshell"; const GPU_RESOURCE_NAME: &str = "nvidia.com/gpu"; const GPU_RESOURCE_QUANTITY: &str = "1"; +// --------------------------------------------------------------------------- +// Default workspace persistence (temporary — will be replaced by snapshotting) +// --------------------------------------------------------------------------- +// Every sandbox pod gets a PVC-backed `/sandbox` directory so that user data +// (installed packages, files, dotfiles) survives pod rescheduling across +// gateway stop/start cycles. An init container seeds the PVC with the +// image's original `/sandbox` contents on first use so that the Python venv, +// skills, and shell config are not lost when the empty PVC is mounted. +// +// NOTE: This PVC + init-container approach is a stopgap. It has known +// limitations: image upgrades don't propagate into existing PVCs, the init +// copy adds first-start latency, and the full /sandbox directory is +// duplicated on disk. The plan is to replace this with proper container +// snapshotting so that only the diff from the base image is persisted. + +/// Volume name used for the workspace PVC in the pod spec. +const WORKSPACE_VOLUME_NAME: &str = "workspace"; + +/// Mount path for the workspace PVC in the **agent** container. This shadows +/// the image's `/sandbox` directory — the init container copies the image +/// contents into the PVC before the agent starts. 
+const WORKSPACE_MOUNT_PATH: &str = "/sandbox"; + +/// Mount path for the workspace PVC in the **init** container. A temporary +/// path so the init container can see the image's original `/sandbox` and +/// copy it into the PVC. +const WORKSPACE_INIT_MOUNT_PATH: &str = "/workspace-pvc"; + +/// Name of the init container that seeds the workspace PVC. +const WORKSPACE_INIT_CONTAINER_NAME: &str = "workspace-init"; + +/// Default storage request for the workspace PVC. +const WORKSPACE_DEFAULT_STORAGE: &str = "2Gi"; + +/// Sentinel file written by the init container after copying the image's +/// `/sandbox` contents. Subsequent pod starts skip the copy. +const WORKSPACE_SENTINEL: &str = ".workspace-initialized"; + #[derive(Clone)] pub struct SandboxClient { client: Client, @@ -733,6 +771,107 @@ fn apply_supervisor_sideload(pod_template: &mut serde_json::Value) { } } +/// Apply workspace persistence transforms to an already-built pod template. +/// +/// This injects: +/// 1. A volume mount on the agent container at `/sandbox`. +/// 2. An init container (same image) that seeds the PVC with the image's +/// original `/sandbox` contents on first use. +/// +/// The PVC volume itself is **not** added here — the Sandbox CRD controller +/// automatically creates a volume for each entry in `volumeClaimTemplates` +/// (following the StatefulSet convention). Adding one here would create a +/// duplicate volume name and fail pod validation. +/// +/// The init container mounts the PVC at a temporary path so it can still see +/// the image's `/sandbox` directory. It checks for a sentinel file and skips +/// the copy if the PVC was already initialised. +fn apply_workspace_persistence(pod_template: &mut serde_json::Value, image: &str) { + let Some(spec) = pod_template.get_mut("spec").and_then(|v| v.as_object_mut()) else { + return; + }; + + // 1. 
Add workspace volume mount to the agent container + let containers = spec.get_mut("containers").and_then(|v| v.as_array_mut()); + if let Some(containers) = containers { + let mut target_index = None; + for (i, c) in containers.iter().enumerate() { + if c.get("name").and_then(|v| v.as_str()) == Some("agent") { + target_index = Some(i); + break; + } + } + let index = target_index.unwrap_or(0); + + if let Some(container) = containers.get_mut(index).and_then(|v| v.as_object_mut()) { + let volume_mounts = container + .entry("volumeMounts") + .or_insert_with(|| serde_json::json!([])) + .as_array_mut(); + if let Some(volume_mounts) = volume_mounts { + volume_mounts.push(serde_json::json!({ + "name": WORKSPACE_VOLUME_NAME, + "mountPath": WORKSPACE_MOUNT_PATH + })); + } + } + } + + // 2. Add the init container that seeds the PVC from the image + let init_containers = spec + .entry("initContainers") + .or_insert_with(|| serde_json::json!([])) + .as_array_mut(); + if let Some(init_containers) = init_containers { + // The init container mounts the PVC at a temp path so it can still + // read the image's original /sandbox contents. It copies them into + // the PVC only when the sentinel file is absent. + // + // The inner `[ -d ... ]` guard handles custom images that don't have + // a /sandbox directory — the copy is skipped but the sentinel is + // still written so subsequent starts are instant. + let copy_cmd = format!( + "if [ ! -f {WORKSPACE_INIT_MOUNT_PATH}/{WORKSPACE_SENTINEL} ]; then \ + if [ -d {WORKSPACE_MOUNT_PATH} ]; then \ + cp -a {WORKSPACE_MOUNT_PATH}/. 
{WORKSPACE_INIT_MOUNT_PATH}/; \ + fi && \ + touch {WORKSPACE_INIT_MOUNT_PATH}/{WORKSPACE_SENTINEL}; \ + fi" + ); + + init_containers.push(serde_json::json!({ + "name": WORKSPACE_INIT_CONTAINER_NAME, + "image": image, + "command": ["sh", "-c", copy_cmd], + "securityContext": { "runAsUser": 0 }, + "volumeMounts": [{ + "name": WORKSPACE_VOLUME_NAME, + "mountPath": WORKSPACE_INIT_MOUNT_PATH + }] + })); + } +} + +/// Build the default `volumeClaimTemplates` array for sandbox pods. +/// +/// Provides a single PVC named "workspace" that backs the `/sandbox` +/// directory. The init container seeds it from the image on first use. +fn default_workspace_volume_claim_templates() -> serde_json::Value { + serde_json::json!([{ + "metadata": { + "name": WORKSPACE_VOLUME_NAME + }, + "spec": { + "accessModes": ["ReadWriteOnce"], + "resources": { + "requests": { + "storage": WORKSPACE_DEFAULT_STORAGE + } + } + } + }]) +} + #[allow(clippy::too_many_arguments)] fn sandbox_to_k8s_spec( spec: Option<&SandboxSpec>, @@ -748,6 +887,18 @@ fn sandbox_to_k8s_spec( host_gateway_ip: &str, ) -> serde_json::Value { let mut root = serde_json::Map::new(); + + // Determine early whether the user provided custom volumeClaimTemplates. + // When they haven't, we inject a default workspace VCT and corresponding + // init container + volume mount so sandbox data persists. We need this + // flag before building the podTemplate because the workspace persistence + // transforms are applied inside sandbox_template_to_k8s. 
+ let user_has_vct = spec + .and_then(|s| s.template.as_ref()) + .and_then(|t| struct_to_json(&t.volume_claim_templates)) + .is_some(); + let inject_workspace = !user_has_vct; + if let Some(spec) = spec { if !spec.log_level.is_empty() { root.insert("logLevel".to_string(), serde_json::json!(spec.log_level)); @@ -775,6 +926,7 @@ fn sandbox_to_k8s_spec( &spec.environment, client_tls_secret_name, host_gateway_ip, + inject_workspace, ), ); if !template.agent_socket.is_empty() { @@ -789,6 +941,15 @@ fn sandbox_to_k8s_spec( } } + // Inject the default workspace volumeClaimTemplate when the user didn't + // provide their own. + if inject_workspace { + root.insert( + "volumeClaimTemplates".to_string(), + default_workspace_volume_claim_templates(), + ); + } + // podTemplate is required by the Kubernetes CRD - ensure it's always present if !root.contains_key("podTemplate") { let empty_env = std::collections::HashMap::new(); @@ -809,6 +970,7 @@ fn sandbox_to_k8s_spec( spec_env, client_tls_secret_name, host_gateway_ip, + inject_workspace, ), ); } @@ -833,6 +995,7 @@ fn sandbox_template_to_k8s( spec_environment: &std::collections::HashMap, client_tls_secret_name: &str, host_gateway_ip: &str, + inject_workspace: bool, ) -> serde_json::Value { // The supervisor binary is always side-loaded from the node filesystem // via a hostPath volume, regardless of which sandbox image is used. @@ -959,6 +1122,13 @@ fn sandbox_template_to_k8s( // Always side-load the supervisor binary from the node filesystem apply_supervisor_sideload(&mut result); + // Inject workspace persistence (init container + PVC volume mount) so + // that /sandbox data survives pod rescheduling. Skipped when the user + // provides custom volumeClaimTemplates to avoid conflicts. 
+ if inject_workspace { + apply_workspace_persistence(&mut result, image); + } + result } @@ -1631,6 +1801,7 @@ mod tests { &std::collections::HashMap::new(), "", "", + true, ); assert_eq!( @@ -1664,6 +1835,7 @@ mod tests { &std::collections::HashMap::new(), "", "", + true, ); assert_eq!( @@ -1693,6 +1865,7 @@ mod tests { &std::collections::HashMap::new(), "", "", + true, ); assert_eq!( @@ -1735,6 +1908,7 @@ mod tests { &std::collections::HashMap::new(), "", "", + true, ); let limits = &pod_template["spec"]["containers"][0]["resources"]["limits"]; @@ -1761,6 +1935,7 @@ mod tests { &std::collections::HashMap::new(), "", "172.17.0.1", + true, ); let host_aliases = pod_template["spec"]["hostAliases"] @@ -1791,6 +1966,7 @@ mod tests { &std::collections::HashMap::new(), "", "", + true, ); assert!( @@ -1816,6 +1992,7 @@ mod tests { &std::collections::HashMap::new(), "my-tls-secret", "", + true, ); let volumes = pod_template["spec"]["volumes"] @@ -1831,4 +2008,151 @@ mod tests { "TLS secret volume must use mode 0400 to prevent sandbox user from reading the private key" ); } + + // ----------------------------------------------------------------------- + // Workspace persistence tests + // ----------------------------------------------------------------------- + + #[test] + fn workspace_persistence_injects_init_container_volume_and_mount() { + let mut pod_template = serde_json::json!({ + "spec": { + "containers": [{ + "name": "agent", + "image": "openshell/sandbox:latest" + }] + } + }); + + apply_workspace_persistence(&mut pod_template, "openshell/sandbox:latest"); + + // Init container + let init_containers = pod_template["spec"]["initContainers"] + .as_array() + .expect("initContainers should exist"); + assert_eq!(init_containers.len(), 1); + assert_eq!(init_containers[0]["name"], WORKSPACE_INIT_CONTAINER_NAME); + assert_eq!(init_containers[0]["image"], "openshell/sandbox:latest"); + assert_eq!(init_containers[0]["securityContext"]["runAsUser"], 0); + + // Init container 
mounts PVC at temp path, not /sandbox + let init_mounts = init_containers[0]["volumeMounts"] + .as_array() + .expect("init volumeMounts should exist"); + assert_eq!(init_mounts.len(), 1); + assert_eq!(init_mounts[0]["name"], WORKSPACE_VOLUME_NAME); + assert_eq!(init_mounts[0]["mountPath"], WORKSPACE_INIT_MOUNT_PATH); + + // Agent container mounts PVC at /sandbox + let agent_mounts = pod_template["spec"]["containers"][0]["volumeMounts"] + .as_array() + .expect("agent volumeMounts should exist"); + let workspace_mount = agent_mounts + .iter() + .find(|m| m["name"] == WORKSPACE_VOLUME_NAME) + .expect("workspace mount should exist on agent container"); + assert_eq!(workspace_mount["mountPath"], WORKSPACE_MOUNT_PATH); + + // The PVC volume is NOT created by apply_workspace_persistence — the + // Sandbox CRD controller adds it from the volumeClaimTemplates. + // Verify we did not inject one (which would cause a duplicate). + let has_pvc_vol = pod_template["spec"]["volumes"] + .as_array() + .map_or(false, |vols| { + vols.iter().any(|v| v["name"] == WORKSPACE_VOLUME_NAME) + }); + assert!( + !has_pvc_vol, + "apply_workspace_persistence must NOT add a PVC volume (the CRD controller does that)" + ); + } + + #[test] + fn workspace_persistence_uses_same_image_as_agent() { + let mut pod_template = serde_json::json!({ + "spec": { + "containers": [{ + "name": "agent", + "image": "my-custom-image:v2" + }] + } + }); + + apply_workspace_persistence(&mut pod_template, "my-custom-image:v2"); + + let init_image = pod_template["spec"]["initContainers"][0]["image"] + .as_str() + .expect("init container should have image"); + assert_eq!( + init_image, "my-custom-image:v2", + "init container must use the same image as the agent container" + ); + } + + #[test] + fn workspace_init_command_checks_sentinel() { + let mut pod_template = serde_json::json!({ + "spec": { + "containers": [{ + "name": "agent", + "image": "img:latest" + }] + } + }); + + apply_workspace_persistence(&mut pod_template, 
"img:latest"); + + let cmd = pod_template["spec"]["initContainers"][0]["command"] + .as_array() + .expect("command should be an array"); + let script = cmd[2].as_str().expect("third element should be the script"); + assert!( + script.contains(WORKSPACE_SENTINEL), + "init script must check for sentinel file" + ); + assert!( + script.contains("cp -a"), + "init script must copy image contents" + ); + } + + #[test] + fn workspace_persistence_skipped_when_inject_workspace_false() { + let pod_template = sandbox_template_to_k8s( + &SandboxTemplate::default(), + false, + "openshell/sandbox:latest", + "", + "sandbox-id", + "sandbox-name", + "https://gateway.example.com", + "0.0.0.0:2222", + "secret", + 300, + &std::collections::HashMap::new(), + "", + "", + false, // user provided custom VCTs + ); + + // No init container should be present + assert!( + pod_template["spec"]["initContainers"].is_null() + || pod_template["spec"]["initContainers"] + .as_array() + .is_none_or(|a| a.is_empty()), + "workspace init container must NOT be present when inject_workspace is false" + ); + + // No workspace volume mount on agent + let has_workspace_mount = pod_template["spec"]["containers"][0]["volumeMounts"] + .as_array() + .map_or(false, |mounts| { + mounts.iter().any(|m| m["name"] == WORKSPACE_VOLUME_NAME) + }); + assert!( + !has_workspace_mount, + "workspace mount must NOT be present when inject_workspace is false" + ); + } } diff --git a/deploy/docker/cluster-entrypoint.sh b/deploy/docker/cluster-entrypoint.sh index 367665db..3ab2f7b7 100644 --- a/deploy/docker/cluster-entrypoint.sh +++ b/deploy/docker/cluster-entrypoint.sh @@ -559,8 +559,26 @@ fi # routing to settle first. wait_for_default_route +# --------------------------------------------------------------------------- +# Deterministic k3s node name +# --------------------------------------------------------------------------- +# By default k3s uses the container hostname (= Docker container ID) as the +# node name. 
When the container is recreated (e.g. after an image upgrade), +# the container ID changes, registering a new k3s node. The bootstrap code +# then deletes PVCs whose backing PVs have node affinity for the old node — +# wiping the server database and any sandbox persistent volumes. +# +# OPENSHELL_NODE_NAME is set by the bootstrap code to a deterministic value +# derived from the gateway name, so the node identity survives container +# recreation and PVCs are never orphaned. +NODE_NAME_ARG="" +if [ -n "${OPENSHELL_NODE_NAME:-}" ]; then + NODE_NAME_ARG="--node-name=${OPENSHELL_NODE_NAME}" + echo "Using deterministic k3s node name: ${OPENSHELL_NODE_NAME}" +fi + # Execute k3s with explicit resolv-conf passed as a kubelet arg. # k3s v1.35.2+ no longer accepts --resolv-conf as a top-level server flag; # it must be passed via --kubelet-arg instead. # shellcheck disable=SC2086 -exec /bin/k3s "$@" --kubelet-arg=resolv-conf="$RESOLV_CONF" $EXTRA_KUBELET_ARGS +exec /bin/k3s "$@" $NODE_NAME_ARG --kubelet-arg=resolv-conf="$RESOLV_CONF" $EXTRA_KUBELET_ARGS