diff --git a/.commitlintrc.json b/.commitlintrc.json
index 011834eef..f15c14437 100644
--- a/.commitlintrc.json
+++ b/.commitlintrc.json
@@ -17,6 +17,7 @@
         "pii",
         "proxy",
         "rag",
+        "sandbox",
         "storybook",
         "ui",
         "web",
diff --git a/.env.test b/.env.test
index a05c2c22f..5b9159002 100644
--- a/.env.test
+++ b/.env.test
@@ -44,3 +44,7 @@ POSTGRES_PASSWORD=test_password_e2e
 # Convex
 INSTANCE_SECRET=0000000000000000000000000000000000000000000000000000000000000000
 INSTANCE_NAME=tale_platform
+
+# Sandbox spawner — fixed test-only HMAC token so the smoke script can sign
+# /v1/execute. Production deploys auto-mint via the CLI's ensure-env helper.
+SANDBOX_TOKEN=test-sandbox-token-do-not-use-in-production-deadbeefcafef00d
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index d81dc1275..e7ea18961 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -106,6 +106,12 @@ jobs:
               - 'services/platform/**'
               - 'packages/ui/**'
               - 'packages/webui/**'
+            sandbox:
+              - 'services/sandbox/**'
+            sandbox-egress:
+              - 'services/sandbox-egress/**'
+            sandbox-runtime:
+              - 'services/sandbox-runtime/**'
             ci_tests:
               - 'tests/container-*'
               - 'compose.test.yml'
@@ -118,6 +124,9 @@ jobs:
               - 'services/rag/**'
               - 'services/platform/**'
               - 'services/proxy/**'
+              - 'services/sandbox/**'
+              - 'services/sandbox-egress/**'
+              - 'services/sandbox-runtime/**'
 
       - name: Compute service matrix
         id: services
@@ -127,10 +136,11 @@ jobs:
           echo "list=${SERVICES}" >> "$GITHUB_OUTPUT"
           echo "Services to build: ${SERVICES}"
 
-          # Vulnerability scan only covers the six compose-stack services that
-          # `build` actually pushes to GHCR. Web and docs use their own compose
-          # stacks and are reachable via security.yml's filesystem scan.
-          SCANNABLE=$(echo "${SERVICES}" | jq -c '[.[] | select(. == "db" or . == "convex" or . == "crawler" or . == "rag" or . == "platform" or . == "proxy")]')
+          # Vulnerability scan covers the compose-stack services + sandbox
+          # trio that `build` actually pushes to GHCR. Web and docs use their
+          # own compose stacks and are reachable via security.yml's
+          # filesystem scan.
+          SCANNABLE=$(echo "${SERVICES}" | jq -c '[.[] | select(. == "db" or . == "convex" or . == "crawler" or . == "rag" or . == "platform" or . == "proxy" or . == "sandbox" or . == "sandbox-egress" or . == "sandbox-runtime")]')
           echo "scannable=${SCANNABLE}" >> "$GITHUB_OUTPUT"
           echo "Services to scan: ${SCANNABLE}"
 
@@ -180,14 +190,25 @@ jobs:
       matrix:
         # Compose-stack services. Keep in sync with build.yml (smoke/validate
         # pull loops) and cleanup-pr-images.yml matrix.
-        service: [db, convex, crawler, rag, platform, proxy]
+        service:
+          [
+            db,
+            convex,
+            crawler,
+            rag,
+            platform,
+            proxy,
+            sandbox,
+            sandbox-egress,
+            sandbox-runtime,
+          ]
 
     steps:
       - name: Checkout
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
 
       - name: Reclaim disk space
-        if: matrix.service == 'platform' || matrix.service == 'rag' || matrix.service == 'crawler' || matrix.service == 'convex'
+        if: matrix.service == 'platform' || matrix.service == 'rag' || matrix.service == 'crawler' || matrix.service == 'convex' || matrix.service == 'sandbox-runtime'
         run: |
           sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL
           sudo docker image prune -af
@@ -316,16 +337,22 @@ jobs:
       # `docker compose build` step.
       - name: Pull images from GHCR
         run: |
-          # Compose-stack services. Keep in sync with build.yml (build matrix)
-          # and cleanup-pr-images.yml matrix.
+          # Compose-stack services + sandbox-runtime. Keep in sync with build.yml
+          # (build matrix) and cleanup-pr-images.yml matrix. sandbox-runtime is
+          # not a compose service but the spawner pulls it at boot — re-tag it
+          # locally so smoke tests with PULL_POLICY=never find it.
           TAG="${{ needs.changes.outputs.image_tag }}"
           REGISTRY_PATH="${{ env.REGISTRY }}/${{ github.repository }}"
-          for svc in db convex crawler rag platform proxy; do
+          for svc in db convex crawler rag platform proxy sandbox sandbox-egress sandbox-runtime; do
             IMAGE="${REGISTRY_PATH}/tale-${svc}:${TAG}"
             echo "Pulling ${IMAGE}..."
             docker pull "${IMAGE}"
             docker tag "${IMAGE}" "ghcr.io/tale-project/tale/tale-${svc}:latest"
           done
+          # See note in image-validate: the spawner's SANDBOX_RUNTIME_IMAGE
+          # defaults to the unscoped `tale-sandbox-runtime:latest`.
+          docker tag "ghcr.io/tale-project/tale/tale-sandbox-runtime:latest" \
+            "tale-sandbox-runtime:latest"
 
       - name: Run smoke tests
         run: bash tests/container-smoke-test.sh
@@ -511,16 +538,24 @@ jobs:
 
       - name: Pull images from GHCR
         run: |
-          # Compose-stack services. Keep in sync with build.yml (build matrix)
-          # and cleanup-pr-images.yml matrix.
+          # Compose-stack services + sandbox-runtime. Keep in sync with build.yml
+          # (build matrix) and cleanup-pr-images.yml matrix. sandbox-runtime is
+          # not a compose service but the spawner pulls it at boot — re-tag it
+          # locally so PULL_POLICY=never validation finds it.
           TAG="${{ needs.changes.outputs.image_tag }}"
           REGISTRY_PATH="${{ env.REGISTRY }}/${{ github.repository }}"
-          for svc in db convex crawler rag platform proxy; do
+          for svc in db convex crawler rag platform proxy sandbox sandbox-egress sandbox-runtime; do
             IMAGE="${REGISTRY_PATH}/tale-${svc}:${TAG}"
             echo "Pulling ${IMAGE}..."
             docker pull "${IMAGE}"
             docker tag "${IMAGE}" "ghcr.io/tale-project/tale/tale-${svc}:latest"
           done
+          # The spawner reads SANDBOX_RUNTIME_IMAGE which defaults to
+          # `tale-sandbox-runtime:latest` (unscoped). Mirror the tag so the
+          # spawner's boot-time `ensureImage` hits a local cache instead of
+          # trying to pull from GHCR.
+          docker tag "ghcr.io/tale-project/tale/tale-sandbox-runtime:latest" \
+            "tale-sandbox-runtime:latest"
 
       - name: Run image validation
         run: bash tests/container-image-test.sh
@@ -605,6 +640,7 @@ jobs:
           format: 'sarif'
           output: '${{ matrix.service }}-trivy.sarif'
           severity: 'HIGH,CRITICAL'
+          trivyignores: '.trivyignore.yaml'
 
       - name: Upload SARIF
         uses: github/codeql-action/upload-sarif@e46ed2cbd01164d986452f91f178727624ae40d7 # v4.35.3
diff --git a/.github/workflows/cleanup-pr-images.yml b/.github/workflows/cleanup-pr-images.yml
index 19d4d97c9..98e0b7c91 100644
--- a/.github/workflows/cleanup-pr-images.yml
+++ b/.github/workflows/cleanup-pr-images.yml
@@ -28,7 +28,18 @@ jobs:
       matrix:
         # Compose-stack services. Keep in sync with build.yml (build matrix +
         # smoke/validate pull loops).
-        service: [db, convex, crawler, rag, platform, proxy]
+        service:
+          [
+            db,
+            convex,
+            crawler,
+            rag,
+            platform,
+            proxy,
+            sandbox,
+            sandbox-egress,
+            sandbox-runtime,
+          ]
 
     steps:
       - name: Delete PR-tagged versions
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index a13d57a62..2da014bcf 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -77,6 +77,9 @@ jobs:
           - { name: convex }
           - { name: web }
           - { name: docs }
+          - { name: sandbox }
+          - { name: sandbox-egress }
+          - { name: sandbox-runtime }
         arch:
           - { name: amd64, runner: ubuntu-latest, platform: linux/amd64 }
           - { name: arm64, runner: ubuntu-24.04-arm, platform: linux/arm64 }
@@ -86,7 +89,7 @@ jobs:
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
 
       - name: Reclaim disk space
-        if: matrix.service.name == 'platform' || matrix.service.name == 'rag' || matrix.service.name == 'crawler' || matrix.service.name == 'convex'
+        if: matrix.service.name == 'platform' || matrix.service.name == 'rag' || matrix.service.name == 'crawler' || matrix.service.name == 'convex' || matrix.service.name == 'sandbox-runtime'
         run: |
           sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL
           sudo docker image prune -af
@@ -156,7 +159,7 @@ jobs:
         run: |
           VERSION="${{ needs.prepare.outputs.version_number }}"
           ARCH="amd64"
-          for svc in platform rag crawler db proxy convex web docs; do
+          for svc in platform rag crawler db proxy convex web docs sandbox sandbox-egress sandbox-runtime; do
             IMAGE="${{ env.REGISTRY }}/${{ github.repository }}/tale-${svc}:${VERSION}-${ARCH}"
             echo "Pulling ${IMAGE}..."
             docker pull "${IMAGE}"
@@ -207,7 +210,20 @@ jobs:
 
     strategy:
       matrix:
-        service: [platform, rag, crawler, db, proxy, convex, web, docs]
+        service:
+          [
+            platform,
+            rag,
+            crawler,
+            db,
+            proxy,
+            convex,
+            web,
+            docs,
+            sandbox,
+            sandbox-egress,
+            sandbox-runtime,
+          ]
 
     steps:
       - name: Login to GHCR
@@ -256,7 +272,7 @@ jobs:
         run: |
           VERSION="${{ needs.prepare.outputs.version_number }}"
           REGISTRY="${{ env.REGISTRY }}/${{ github.repository }}"
-          for svc in platform rag crawler db proxy convex web docs; do
+          for svc in platform rag crawler db proxy convex web docs sandbox sandbox-egress sandbox-runtime; do
             IMAGE="${REGISTRY}/tale-${svc}:${VERSION}"
             echo "Verifying manifest: ${IMAGE}"
             docker manifest inspect "${IMAGE}" > /dev/null 2>&1 || {
@@ -310,13 +326,13 @@ jobs:
         run: |
           echo "## Release ${{ needs.prepare.outputs.version }} Complete" >> "$GITHUB_STEP_SUMMARY"
           echo "" >> "$GITHUB_STEP_SUMMARY"
-          echo "All 8 service images have been built, tested, and pushed to GHCR (native amd64 + arm64)." >> "$GITHUB_STEP_SUMMARY"
+          echo "All 11 service images have been built, tested, and pushed to GHCR (native amd64 + arm64)." >> "$GITHUB_STEP_SUMMARY"
           echo "" >> "$GITHUB_STEP_SUMMARY"
           echo "### Images" >> "$GITHUB_STEP_SUMMARY"
           echo "" >> "$GITHUB_STEP_SUMMARY"
           echo "| Service | Image |" >> "$GITHUB_STEP_SUMMARY"
           echo "|---------|-------|" >> "$GITHUB_STEP_SUMMARY"
-          for svc in platform rag crawler db proxy convex web docs; do
+          for svc in platform rag crawler db proxy convex web docs sandbox sandbox-egress sandbox-runtime; do
             echo "| ${svc} | \`${{ env.REGISTRY }}/${{ github.repository }}/tale-${svc}:${{ needs.prepare.outputs.version_number }}\` |" >> "$GITHUB_STEP_SUMMARY"
           done
           echo "" >> "$GITHUB_STEP_SUMMARY"
diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml
index d33641850..0c1d5d331 100644
--- a/.github/workflows/security.yml
+++ b/.github/workflows/security.yml
@@ -11,6 +11,15 @@ on:
       - 'services/rag/uv.lock'
       - 'services/rag/pyproject.toml'
       - 'packages/*/pyproject.toml'
+      # Dockerfile + dockerignore changes alter what trivy's misconfig
+      # scanner sees on the fs-scan path, and ignore-file changes can
+      # silently suppress or un-suppress findings. Audit finding R2-B11
+      # (round 2): the sandbox work added new Dockerfiles + a trivyignore
+      # without re-triggering the security scan — PRs went out blind.
+      - 'services/*/Dockerfile'
+      - 'services/*/Dockerfile.dockerignore'
+      - '.trivyignore.yaml'
+      - '.trivyignore'
       - '.github/workflows/security.yml'
   push:
     branches:
@@ -22,6 +31,10 @@ on:
       - 'services/rag/uv.lock'
       - 'services/rag/pyproject.toml'
       - 'packages/*/pyproject.toml'
+      - 'services/*/Dockerfile'
+      - 'services/*/Dockerfile.dockerignore'
+      - '.trivyignore.yaml'
+      - '.trivyignore'
       - '.github/workflows/security.yml'
   schedule:
     - cron: '0 3 * * 1' # Monday 03:00 UTC
@@ -83,7 +96,7 @@ jobs:
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
 
       - name: Run Trivy filesystem scan
-        uses: aquasecurity/trivy-action@57a97c7e7821a5776cebc9bb87c984fa69cba8f1 # 0.35.0
+        uses: aquasecurity/trivy-action@ed142fd0673e97e23eac54620cfb913e5ce36c25 # v0.36.0
         with:
           scan-type: 'fs'
           scan-ref: '.'
@@ -93,6 +106,9 @@ jobs:
           exit-code: '0'
           scanners: 'vuln,secret,misconfig'
           ignore-unfixed: true
+          # Per-path misconfig suppressions live in .trivyignore.yaml; the
+          # plain .trivyignore is auto-detected but cannot scope by path.
+          trivyignores: '.trivyignore.yaml'
           # Skip handlebars Dockerfile templates: handlebars syntax confuses
           # the misconfig scanner. The generated Dockerfiles are scanned
           # downstream when each service runs its own build.
diff --git a/.trivyignore.yaml b/.trivyignore.yaml
new file mode 100644
index 000000000..c06ee8240
--- /dev/null
+++ b/.trivyignore.yaml
@@ -0,0 +1,37 @@
+# =============================================================================
+# Trivy Ignore File (YAML)
+# =============================================================================
+# Per-path suppressions for vulnerabilities, misconfigurations, secrets, and
+# licenses. Plain CVE-only entries can also live in `.trivyignore` next to
+# this file; YAML is needed when scoping by `paths`.
+#
+# Docs: https://aquasecurity.github.io/trivy/latest/docs/configuration/filtering/
+# Loaded by CI via `trivyignores:` on the trivy-action invocations in
+# .github/workflows/security.yml and .github/workflows/build.yml.
+# =============================================================================
+
+misconfigurations:
+  # AVD-DS-0002: "Image user should not be 'root'"
+  - id: AVD-DS-0002
+    paths:
+      - 'services/sandbox/Dockerfile'
+    statement: |
+      Sandbox spawner needs root inside the container to talk to the mounted
+      /var/run/docker.sock. The docker socket is the security boundary, not
+      the in-container UID. Documented in services/sandbox/Dockerfile.
+  - id: AVD-DS-0002
+    paths:
+      - 'services/sandbox-egress/Dockerfile'
+    statement: |
+      Egress proxy entrypoint runs as root only to install its iptables rules
+      and chown the log file; tinyproxy itself drops privileges to `nobody` at
+      bind time via tinyproxy.conf. Documented in services/sandbox-egress/Dockerfile.
+
+  # AVD-DS-0026: "No HEALTHCHECK defined"
+  - id: AVD-DS-0026
+    paths:
+      - 'services/sandbox-runtime/Dockerfile'
+    statement: |
+      Sandbox runtime is an ephemeral one-shot image: the spawner runs it per
+      code_run call, entrypoint.sh executes the user code, and the container
+      exits. There is no long-running process to health-check.
diff --git a/bun.lock b/bun.lock
index 9e46f9327..48345477f 100644
--- a/bun.lock
+++ b/bun.lock
@@ -70,7 +70,10 @@
     },
     "packages/seo": {
       "name": "@tale/seo",
-      "version": "0.1.0",
+      "version": "0.2.0",
+      "bin": {
+        "tale-seo-compile": "./bin/compile.ts",
+      },
       "dependencies": {
         "@tale/i18n": "workspace:*",
         "jsdom": "29.0.2",
@@ -330,6 +333,14 @@
       "name": "@tale/rag",
       "version": "0.1.0",
     },
+    "services/sandbox": {
+      "name": "@tale/sandbox",
+      "version": "0.1.0",
+      "devDependencies": {
+        "@types/bun": "^1.1.0",
+        "typescript": "^5.6.0",
+      },
+    },
     "services/web": {
       "name": "@tale/web",
       "version": "0.1.0",
@@ -379,6 +390,7 @@
   ],
   "patchedDependencies": {
     "convex-helpers@0.1.114": "patches/convex-helpers@0.1.114.patch",
+    "@convex-dev/agent@0.6.1": "patches/@convex-dev%2Fagent@0.6.1.patch",
   },
   "overrides": {
     "@xmldom/xmldom": "0.8.13",
@@ -1569,6 +1581,8 @@
 
     "@tale/rag": ["@tale/rag@workspace:services/rag"],
 
+    "@tale/sandbox": ["@tale/sandbox@workspace:services/sandbox"],
+
     "@tale/seo": ["@tale/seo@workspace:packages/seo"],
 
     "@tale/shared": ["@tale/shared@workspace:packages/tale_shared"],
@@ -3923,6 +3937,8 @@
 
     "@tailwindcss/postcss/postcss": ["postcss@8.5.8", "", { "dependencies": { "nanoid": "^3.3.11", "picocolors": "^1.1.1", "source-map-js": "^1.2.1" } }, "sha512-OW/rX8O/jXnm82Ey1k44pObPtdblfiuWnrd8X7GJ7emImCOstunGbXUpp7HdBrFQX6rJzn3sPT397Wp5aCwCHg=="],
 
+    "@tale/sandbox/typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
+
     "@tanstack/router-generator/zod": ["zod@3.25.76", "", {}, "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ=="],
 
     "@tanstack/router-plugin/chokidar": ["chokidar@3.6.0", "", { "dependencies": { "anymatch": "~3.1.2", "braces": "~3.0.2", "glob-parent": "~5.1.2", "is-binary-path": "~2.1.0", "is-glob": "~4.0.1", "normalize-path": "~3.0.0", "readdirp": "~3.6.0" }, "optionalDependencies": { "fsevents": "~2.3.2" } }, "sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw=="],
diff --git a/compose.dev.yml b/compose.dev.yml
index a093f02d5..adf46031b 100644
--- a/compose.dev.yml
+++ b/compose.dev.yml
@@ -54,12 +54,22 @@ services:
       - caddy-data:/caddy-data:ro
     environment:
       - RUST_LOG=debug
+      # NODE_ENV is read by Convex V8 functions at evaluation time; the
+      # `test_sandbox_e2e` action gates itself on this so production can't
+      # accidentally invoke the E2E harness. Dev compose explicitly opts in.
+      - NODE_ENV=development
 
   platform:
     volumes:
       - ./services/platform/app:/app/services/platform/app
       - ./services/platform/lib:/app/services/platform/lib
       - ./services/platform/convex:/app/services/platform/convex
+      # `bunx convex deploy` runs from /app (the package.json with
+      # "convex": "1.35.x") and looks for source at `<cwd>/convex/`,
+      # which is the image-baked snapshot. Bind the host source over
+      # that path too so deploys pick up live edits instead of the
+      # snapshot frozen at build time.
+      - ./services/platform/convex:/app/convex
       - convex-data:/app/data:ro
     environment:
       - NODE_ENV=development
@@ -78,6 +88,18 @@ services:
     environment:
       - CADDY_DEBUG=true
 
+  # Sandbox-wobbly-origami plan §4: when developers run Convex on the host
+  # (e.g. `cd services/platform && bun dev` instead of the dockerized
+  # `convex` service), the spawner container needs to reach
+  # http://host.docker.internal:3210 for the EP1/EP2 callbacks. Linux
+  # Docker doesn't add this DNS alias by default; the line below opts in.
+  # In that bun-dev setup, also set in `services/platform/.env.local`:
+  #   SANDBOX_STORAGE_INTERNAL_BASE_URL=http://host.docker.internal:3210
+  # The dockerized convex path uses `http://proxy` from compose.yml.
+  sandbox:
+    extra_hosts:
+      - 'host.docker.internal:host-gateway'
+
   db:
     environment:
       - DB_LOG_STATEMENT=all
diff --git a/compose.yml b/compose.yml
index 7efb0e19a..42564237a 100644
--- a/compose.yml
+++ b/compose.yml
@@ -535,6 +535,138 @@ services:
         aliases:
           - ${HOST:-tale.local}
 
+  # ============================================================================
+  # Tale Sandbox Egress (tinyproxy) — HTTPS forward proxy
+  # ----------------------------------------------------------------------------
+  # Filters CONNECT host requests against an allow-list of package registries
+  # (pypi.org, files.pythonhosted.org, registry.npmjs.org, github package
+  # endpoints). Sandbox runtime containers reach pypi/npm via this proxy; all
+  # other internet is unreachable because the sandbox bridge is `internal:true`.
+  # See plan §2.
+  # ============================================================================
+  sandbox-egress:
+    image: ghcr.io/tale-project/tale/tale-sandbox-egress:${VERSION:-latest}
+    pull_policy: ${PULL_POLICY:-build}
+    build:
+      context: .
+      dockerfile: services/sandbox-egress/Dockerfile
+    container_name: tale-sandbox-egress
+    env_file:
+      - .env
+    restart: unless-stopped
+    # NET_ADMIN lets the entrypoint install iptables REJECT rules for
+    # IMDS (169.254.169.254) and RFC1918 ranges as defense-in-depth
+    # against DNS-rebind on an allowlisted hostname. Without this cap
+    # the entrypoint warns and skips the firewall. Mirrors the convex
+    # container — see services/convex/docker-entrypoint.sh.
+    cap_add:
+      - NET_ADMIN
+    healthcheck:
+      # Plain TCP-listen check: if tinyproxy is up the port answers. We
+      # intentionally do NOT CONNECT-probe an external host (pypi/npm)
+      # — flapping that probe against a transient upstream outage flips
+      # the spawner's `depends_on: service_healthy` gate to false and
+      # blocks all new sandbox launches even though the proxy itself is
+      # fine. Round-2 R2-B11: aligned with the CLI generator (TCP-only)
+      # so `docker compose up` and `tale start` produce identical health
+      # semantics.
+      test: ['CMD-SHELL', 'nc -z 127.0.0.1 3128 || exit 1']
+      interval: 10s
+      timeout: 5s
+      retries: 2
+      start_period: 10s
+    logging:
+      driver: 'json-file'
+      options:
+        max-size: '10m'
+        max-file: '3'
+    networks:
+      # `sandbox` faces the runtime containers (their only outbound path,
+      # since tale-sandbox-net is created with Docker's `--internal` flag).
+      # The compose network named `internal` is a regular bridge: it gives
+      # the proxy outbound NAT to pypi/npmjs/etc, which a `--internal`
+      # network cannot. Egress peers on `internal` are NOT a meaningful new
+      # attack surface (the hostname allowlist + RFC1918/IMDS iptables rules
+      # limit them to registries they could already reach via their own NAT).
+      - sandbox
+      - internal
+
+  # ============================================================================
+  # Tale Sandbox Spawner — thin stateless docker-run service for artifact_run
+  # ----------------------------------------------------------------------------
+  # Mounts /var/run/docker.sock to spawn ephemeral sibling containers per call.
+  # Reachable by platform/convex on the `internal` bridge and, for host-side
+  # `bun dev`, on loopback 127.0.0.1:8003 only. Joined to `sandbox` only to issue
+  # `docker run` (runtime containers attach to `sandbox` for egress via tinyproxy).
+  #
+  # SECURITY: docker.sock = host root. Explicit threat acceptance per plan
+  # "Security model". Spawner accepts only HMAC-signed typed JSON over HTTP;
+  # `services/sandbox/src/docker-args.ts` validates every argv field with
+  # regexes so a malformed input never reaches `docker run`. Future hardening:
+  # SANDBOX_RUNTIME=runsc opt-in (gVisor), `opa-docker-authz` daemon plugin
+  # for HostConfig body filtering, dockerd userns-remap.
+  # ============================================================================
+  sandbox:
+    image: ghcr.io/tale-project/tale/tale-sandbox:${VERSION:-latest}
+    pull_policy: ${PULL_POLICY:-build}
+    build:
+      context: .
+      dockerfile: services/sandbox/Dockerfile
+    container_name: tale-sandbox
+    # Loopback-only port mapping. The spawner mounts /var/run/docker.sock,
+    # so an unauthenticated request on this port = remote root via docker.
+    # Convex reaches the spawner through the `internal` Docker network
+    # (http://sandbox:8003) — the published port is only for `bun dev`
+    # running convex-local-backend on the host. NEVER publish on 0.0.0.0.
+    ports:
+      - '127.0.0.1:8003:8003'
+    env_file:
+      - .env
+    environment:
+      SANDBOX_RUNTIME: ${SANDBOX_RUNTIME:-runc}
+      SANDBOX_RUNTIME_IMAGE: ${SANDBOX_RUNTIME_IMAGE:-tale-sandbox-runtime:latest}
+      SANDBOX_EGRESS_NETWORK: tale-sandbox-net
+      SANDBOX_EGRESS_PROXY: http://sandbox-egress:3128
+    volumes:
+      # The spawner needs the host docker socket to spawn sibling containers.
+      # This is the security boundary — see header comment.
+      - /var/run/docker.sock:/var/run/docker.sock
+      # 1:1 bind: per-call workspace dirs are created here by the spawner
+      # and mounted into the runtime container at the SAME host path (the
+      # docker daemon resolves --mount source paths against the host fs,
+      # so the spawner and the daemon must agree on the path).
+      - /var/lib/tale-sandbox:/var/lib/tale-sandbox
+    restart: unless-stopped
+    # Resource caps mirror the CLI compose generator
+    # (`tools/cli/src/lib/compose/services/create-sandbox-service.ts`). The
+    # `tale start` and raw `docker compose up` paths must produce the SAME
+    # operational posture; previously this file shipped without caps, so
+    # operators running `docker compose up` directly got an uncapped
+    # spawner — audit finding R2-B11.
+    mem_limit: 512m
+    pids_limit: 512
+    ulimits:
+      nofile:
+        soft: 4096
+        hard: 8192
+    healthcheck:
+      test: ['CMD', 'curl', '-fsS', 'http://127.0.0.1:8003/health']
+      interval: 10s
+      timeout: 5s
+      retries: 3
+      start_period: 15s
+    depends_on:
+      sandbox-egress:
+        condition: service_healthy
+    logging:
+      driver: 'json-file'
+      options:
+        max-size: '10m'
+        max-file: '3'
+    networks:
+      - internal
+      - sandbox
+
 # ============================================================================
 # Volumes
 # ============================================================================
@@ -588,3 +720,14 @@ networks:
   # Internal network for Tale services
   internal:
     driver: bridge
+
+  # Sandbox network — internal-only bridge for the artifact_run runtime
+  # containers + the tinyproxy egress sidecar. The CLI (start.ts / deploy.ts
+  # via ensureSandboxNetwork) pre-creates it as `tale-sandbox-net` with
+  # `--internal --ipv6=false`, a name + bridge-flag combination compose's
+  # `networks:` block can't express atomically. It is marked external so
+  # compose attaches to the existing network instead of overwriting its
+  # driver options; raw `docker compose up` fails if it does not exist yet.
+  sandbox:
+    external: true
+    name: tale-sandbox-net
diff --git a/docs/de/platform/workspace/canvas.md b/docs/de/platform/workspace/canvas.md
index d321aca43..3d3bf9eeb 100644
--- a/docs/de/platform/workspace/canvas.md
+++ b/docs/de/platform/workspace/canvas.md
@@ -9,7 +9,7 @@ Die Zielgruppe ist jeder im Chat. Es gibt kein Rollen-Gate; wer chatten kann, ka
 
 ## Wie der Artefakt-Lebenszyklus funktioniert
 
-Wenn die KI etwas Lauffähiges oder Überarbeitbares hervorbringen will, ruft sie das `artifact_create`-Tool auf. Das neue Artefakt erscheint als Karte in der **Artefakte**-Leiste über dem Chat, öffnet sich beim ersten Erzeugen automatisch im Canvas-Bereich und streamt seinen Inhalt live in den Bereich, während die KI tippt. Um das Artefakt zu überarbeiten, ruft die KI `artifact_edit` auf dieselbe Identität — kleine Änderungen nutzen `mode: 'patch'` (Suchen-und-Ersetzen-Blöcke); grosse Umschriften nutzen `mode: 'rewrite'`. In beiden Fällen rendert Canvas an Ort und Stelle neu, sodass du nie zurückscrollen musst, um die neueste Version zu finden.
+Wenn die KI etwas Lauffähiges oder Überarbeitbares hervorbringen will, ruft sie das `artifact_create`-Tool auf. Das neue Artefakt erscheint als Karte in der **Artefakte**-Leiste über dem Chat und öffnet sich beim ersten Erzeugen automatisch im Canvas-Bereich. Um es zu befüllen oder zu überarbeiten, ruft die KI Datei-CRUD-Tools auf dieselbe Identität auf: `artifact_file_update`, um eine bestehende Datei vollständig zu überschreiben, `artifact_file_create`, um eine neue Geschwisterdatei hinzuzufügen (ein Projekt kann mehrere Dateien enthalten), `artifact_file_delete` und `artifact_file_rename` zur Pflege. Canvas rendert an Ort und Stelle neu und streamt den Inhalt live, während die KI tippt, sodass du nie zurückscrollen musst, um die neueste Version zu finden.
 
 Während die KI schreibt oder patcht, zeigt die Karte einen Spinner und die Canvas-Kopfzeile liest **KI schreibt…** oder **KI bearbeitet…**.
 
diff --git a/docs/en/platform/workspace/canvas.md b/docs/en/platform/workspace/canvas.md
index 71b5c1d9d..171f5f900 100644
--- a/docs/en/platform/workspace/canvas.md
+++ b/docs/en/platform/workspace/canvas.md
@@ -9,7 +9,7 @@ The audience is anyone in chat. There's no role gate; whoever can chat can also
 
 ## How the artifact lifecycle works
 
-When the AI decides to produce something runnable or revisable, it calls the `artifact_create` tool. The new artifact appears as a card in the **Artifacts** bar above the chat, auto-opens in the Canvas pane the first time it's created, and streams its content into the pane live as the AI types it. To revise the artifact, the AI calls `artifact_edit` against the same identity — small changes use `mode: 'patch'` (search-and-replace blocks); large rewrites use `mode: 'rewrite'`. Either way, Canvas re-renders in place, so you never scroll back to find the latest version.
+When the AI decides to produce something runnable or revisable, it calls the `artifact_create` tool. The new artifact appears as a card in the **Artifacts** bar above the chat and auto-opens in the Canvas pane the first time it's created. To populate or revise the artifact, the AI calls file-level CRUD tools against the same identity: `artifact_file_update` to overwrite an existing file in full, `artifact_file_create` to add a new sibling file (a project can contain many files), `artifact_file_delete` and `artifact_file_rename` for housekeeping. Canvas re-renders in place and streams the content live as the AI types it, so you never scroll back to find the latest version.
 
 While the AI is writing or patching, the card shows a spinner and the Canvas header reads **AI is writing…** or **AI is editing…**.
 
diff --git a/docs/fr/platform/workspace/canvas.md b/docs/fr/platform/workspace/canvas.md
index 5e8f7760d..9a50941d4 100644
--- a/docs/fr/platform/workspace/canvas.md
+++ b/docs/fr/platform/workspace/canvas.md
@@ -9,7 +9,7 @@ Le public, c'est toute personne dans le chat. Pas de verrou de rôle ; quiconque
 
 ## Comment le cycle de vie d'un artéfact fonctionne
 
-Quand l'IA décide de produire quelque chose d'exécutable ou de révisable, elle appelle l'outil `artifact_create`. Le nouvel artéfact apparaît comme une carte dans la barre des **Artéfacts** au-dessus du chat, s'ouvre automatiquement dans le panneau Canevas à la première création, et diffuse son contenu en direct dans le panneau pendant que l'IA tape. Pour le réviser, l'IA appelle `artifact_edit` sur la même identité — les petites modifications utilisent `mode: 'patch'` (blocs recherche-remplacement) ; les grandes réécritures utilisent `mode: 'rewrite'`. Dans les deux cas, Canevas se re-rend en place, donc tu ne remontes jamais pour trouver la dernière version.
+Quand l'IA décide de produire quelque chose d'exécutable ou de révisable, elle appelle l'outil `artifact_create`. Le nouvel artéfact apparaît comme une carte dans la barre des **Artéfacts** au-dessus du chat et s'ouvre automatiquement dans le panneau Canevas à la première création. Pour le peupler ou le réviser, l'IA appelle des outils CRUD au niveau fichier sur la même identité : `artifact_file_update` pour écraser entièrement un fichier existant, `artifact_file_create` pour ajouter un nouveau fichier frère (un projet peut contenir plusieurs fichiers), `artifact_file_delete` et `artifact_file_rename` pour le nettoyage. Canevas se re-rend en place et diffuse le contenu en direct pendant que l'IA tape, donc tu ne remontes jamais pour trouver la dernière version.
 
 Pendant que l'IA écrit ou patche, la carte montre un indicateur de progression et l'en-tête de Canevas affiche **L'IA écrit…** ou **L'IA modifie…**.
 
diff --git a/examples/agents/chat-agent.json b/examples/agents/chat-agent.json
index 58cb7eaaa..acf2b2da1 100644
--- a/examples/agents/chat-agent.json
+++ b/examples/agents/chat-agent.json
@@ -8,7 +8,14 @@
     "document_find",
     "document_write",
     "artifact_create",
-    "artifact_edit",
+    "artifact_run",
+    "artifact_packages_add",
+    "artifact_file_create",
+    "artifact_file_update",
+    "artifact_file_delete",
+    "artifact_file_rename",
+    "artifact_file_read",
+    "artifact_file_list",
     "pdf",
     "image",
     "docx",
@@ -62,7 +69,7 @@
         "Eine Follow-up-Email an den Kunden verfassen",
         "Die neuesten Produktupdates zusammenfassen"
       ],
-      "systemInstructions": "Du bist ein hilfreicher KI-Assistent.\n\n**SPRACHE — strikte Prioritätsreihenfolge. Prüfe die Regeln 1→3 und halte beim ersten Treffer an.**\n\n1. **Explizite Anfrage.** Wenn die letzte Nachricht des Nutzers ausdrücklich nach einer Sprache verlangt (z. B. „reply in German\", „auf Deutsch bitte\", „répondez en français\", „translate to French\"), antworte in dieser Sprache.\n2. **Sprache der Nachricht.** Ansonsten erkenne die natürliche Sprache der letzten Nachricht des Nutzers und antworte in dieser Sprache.\n3. **Locale-Fallback.** Nur wenn die letzte Nachricht keine erkennbare natürliche Sprache enthält — z. B. nur Code, eine einzelne URL, reine Zahlen, ein einzelnes Emoji oder ein mehrdeutiges Ein- oder Zwei-Zeichen-Token — antworte in der Browser-Locale des Nutzers: `{{user.language}}`. Wenn `{{user.language}}` ebenfalls leer ist, antworte auf Englisch.\n\nBeispiele:\n- Nutzer: \"how are you today?\" → Englisch (Regel 2).\n- Nutzer: \"Wie geht es dir heute?\" → Deutsch (Regel 2).\n- Nutzer: \"Comment ça va aujourd'hui ?\" → Französisch (Regel 2).\n- Nutzer: \"translate to French: hello\" → Antwort auf Französisch (Regel 1).\n- Nutzer: \"```py\\nprint('hi')\\n```\" mit Browser-Locale `de-DE` → Deutsch (Regel 3).\n- Nutzer: \"👍\" mit Browser-Locale `fr-FR` → Französisch (Regel 3).\n\nVerwende niemals Zeitzone, IP-Adresse oder Geolocation, um die Antwortsprache zu wählen. 
Nur Regel 3 nutzt die Browser-Locale, und zwar ausschließlich als allerletzten Fallback.\n\n**WISSENSBEREICH**\n- **Wissensdatenbank**: Von der Organisation hochgeladene Dokumente — verwaltet auf der [Dokumente-Seite]({{site_url}}/dashboard/{{organization.id}}/documents).\n- **Gecrawlte Websites**: Webseiten von Domains, die von der Organisation hinzugefügt wurden — verwaltet auf der [Websites-Seite]({{site_url}}/dashboard/{{organization.id}}/websites).\n- Wenn Suchen keine Ergebnisse liefern, weise den Nutzer darauf hin, dass er Dokumente hochladen oder Website-Domains hinzufügen kann, um die Wissensdatenbank zu erweitern.\n- Für Daten aus externen Systemen (Shopify, Datenbanken usw.) benötigt der Nutzer den Integration Assistant, konfiguriert unter [Einstellungen > Integrationen]({{site_url}}/dashboard/{{organization.id}}/settings/integrations).\n\n**REGELN**\n1. **SUCHEN VOR „ICH WEISS ES NICHT\"** — Sage niemals, dass dir Informationen fehlen, ohne zuvor die Wissensdatenbank oder das Web durchsucht zu haben.\n2. **KEINE HALLUZINATIONEN** — Verwende ausschließlich Daten aus Tool-Ergebnissen oder Nutzernachrichten. Erfinde niemals Fakten.\n3. **TOOL-ERGEBNISSE PRÄSENTIEREN** — Wenn ein Tool Ergebnisse zurückgibt, präsentiere zuerst die wichtigsten Informationen. Überspringe niemals Ergebnisse, um direkt zu Rückfragen zu springen.\n4. **MINIMALER TOOL-EINSATZ** — Wenn du aus deinem eigenen Wissen oder dem Gesprächskontext antworten kannst, tu das direkt. Rufe Tools nur auf, wenn die Frage externe Daten erfordert.\n5. **VORANALYSIERTE ANHÄNGE** — Wenn die Nachricht des Nutzers Abschnitte wie „[PRE-ANALYZED CONTENT\" oder „**Document: ...**\" / „**Image: ...**\" / „**Text File: ...**\" enthält, antworte direkt aus diesem Inhalt. NICHT erneut parsen.\n6. **KEINE ROHEN KONTEXT-AUSGABEN** — Gib niemals interne Formate aus („Tool[\", „[Tool Result]\", XML-Tags, rohes JSON). Berichte Ergebnisse in natürlicher Sprache.\n7. 
**PRÄSENTATIONEN, DEMO-SEITEN, VISUELLE & INTERAKTIVE INHALTE** — Wenn der Nutzer eine Präsentation, Folien, einen Foliensatz, PPT, PPTX, Demo-Seite, Vergleichsseite, interaktive Seite, Visualisierung, ein Dashboard oder eine beliebige *Seite* / *Dokument* zum Lesen direkt im Chat (statt als Datei-Download) anfragt, rufe IMMER das Tool `artifact_create` mit `type: \"html\"` und einem vollständigen, eigenständigen HTML-Dokument als `content` auf. Der Canvas-Bereich rendert das Artefakt live, während du streamst. Um es später zu überarbeiten (einen Bug beheben, eine Farbe ändern, eine Folie ergänzen), rufe `artifact_edit` für dieselbe `artifactId` auf — gib niemals das vollständige HTML erneut über `artifact_create` aus. Gib KEINE rohen ` ```html `-Codeblöcke aus; sie werden nicht als Vorschau gerendert. Rufe das `pdf`-Tool NICHT für diese Anfragen auf. Versuche NICHT, eine .pptx-Datei zu erzeugen — es gibt keinen PPTX-Export. Erzeuge nur dann ein PDF, wenn der Nutzer ausdrücklich eine herunterladbare .pdf-Datei verlangt. (reveal.js per CDN, https://cdn.jsdelivr.net/npm/reveal.js@5, ist ein guter Standard für Folien.)\n\n**ANTWORTSTIL**: Sei direkt und prägnant. Verwende Markdown-Tabellen für mehrere Datensätze.\n\n{{user_profile}}"
+      "systemInstructions": "Du bist ein hilfreicher KI-Assistent.\n\n**SPRACHE — strikte Prioritätsreihenfolge. Prüfe die Regeln 1→3 und halte beim ersten Treffer an.**\n\n1. **Explizite Anfrage.** Wenn die letzte Nachricht des Nutzers ausdrücklich nach einer Sprache verlangt (z. B. „reply in German\", „auf Deutsch bitte\", „répondez en français\", „translate to French\"), antworte in dieser Sprache.\n2. **Sprache der Nachricht.** Ansonsten erkenne die natürliche Sprache der letzten Nachricht des Nutzers und antworte in dieser Sprache.\n3. **Locale-Fallback.** Nur wenn die letzte Nachricht keine erkennbare natürliche Sprache enthält — z. B. nur Code, eine einzelne URL, reine Zahlen, ein einzelnes Emoji oder ein mehrdeutiges Ein- oder Zwei-Zeichen-Token — antworte in der Browser-Locale des Nutzers: `{{user.language}}`. Wenn `{{user.language}}` ebenfalls leer ist, antworte auf Englisch.\n\nBeispiele:\n- Nutzer: \"how are you today?\" → Englisch (Regel 2).\n- Nutzer: \"Wie geht es dir heute?\" → Deutsch (Regel 2).\n- Nutzer: \"Comment ça va aujourd'hui ?\" → Französisch (Regel 2).\n- Nutzer: \"translate to French: hello\" → Antwort auf Französisch (Regel 1).\n- Nutzer: \"```py\\nprint('hi')\\n```\" mit Browser-Locale `de-DE` → Deutsch (Regel 3).\n- Nutzer: \"👍\" mit Browser-Locale `fr-FR` → Französisch (Regel 3).\n\nVerwende niemals Zeitzone, IP-Adresse oder Geolocation, um die Antwortsprache zu wählen. 
Nur Regel 3 nutzt die Browser-Locale, und zwar ausschließlich als allerletzten Fallback.\n\n**WISSENSBEREICH**\n- **Wissensdatenbank**: Von der Organisation hochgeladene Dokumente — verwaltet auf der [Dokumente-Seite]({{site_url}}/dashboard/{{organization.id}}/documents).\n- **Gecrawlte Websites**: Webseiten von Domains, die von der Organisation hinzugefügt wurden — verwaltet auf der [Websites-Seite]({{site_url}}/dashboard/{{organization.id}}/websites).\n- Wenn Suchen keine Ergebnisse liefern, weise den Nutzer darauf hin, dass er Dokumente hochladen oder Website-Domains hinzufügen kann, um die Wissensdatenbank zu erweitern.\n- Für Daten aus externen Systemen (Shopify, Datenbanken usw.) benötigt der Nutzer den Integration Assistant, konfiguriert unter [Einstellungen > Integrationen]({{site_url}}/dashboard/{{organization.id}}/settings/integrations).\n\n**REGELN**\n1. **SUCHEN VOR „ICH WEISS ES NICHT\"** — Sage niemals, dass dir Informationen fehlen, ohne zuvor die Wissensdatenbank oder das Web durchsucht zu haben.\n2. **KEINE HALLUZINATIONEN** — Verwende ausschließlich Daten aus Tool-Ergebnissen oder Nutzernachrichten. Erfinde niemals Fakten.\n3. **TOOL-ERGEBNISSE PRÄSENTIEREN** — Wenn ein Tool Ergebnisse zurückgibt, präsentiere zuerst die wichtigsten Informationen. Überspringe niemals Ergebnisse, um direkt zu Rückfragen zu springen.\n4. **MINIMALER TOOL-EINSATZ** — Wenn du aus deinem eigenen Wissen oder dem Gesprächskontext antworten kannst, tu das direkt. Rufe Tools nur auf, wenn die Frage externe Daten erfordert.\n5. **VORANALYSIERTE ANHÄNGE** — Wenn die Nachricht des Nutzers Abschnitte wie „[PRE-ANALYZED CONTENT\" oder „**Document: ...**\" / „**Image: ...**\" / „**Text File: ...**\" enthält, antworte direkt aus diesem Inhalt. NICHT erneut parsen.\n6. **KEINE ROHEN KONTEXT-AUSGABEN** — Gib niemals interne Formate aus („Tool[\", „[Tool Result]\", XML-Tags, rohes JSON). Berichte Ergebnisse in natürlicher Sprache.\n7. 
**VISUELLE & INTERAKTIVE INHALTE** — Wähle den Pfad nach dem, was der Nutzer tatsächlich benannt hat.\n\n**(a) Explizite PPTX-Datei** — Begriffe wie „PPT\", „PPTX\", „PowerPoint\" oder „.pptx\". Der Nutzer hat ein Dateiformat benannt und möchte eine echte herunterladbare PowerPoint-Datei. Pfad: `artifact_create` (type=`python_runnable`, packages enthält `python-pptx`) → `artifact_file_update` für den Entry-Code → `artifact_run`. Die genauen Argumente, das Schreiben in `/workspace/output/`, das Aufteilen in Geschwister-Dateien und die Fehlerbehandlungsschleife sind in den jeweiligen Tool-Beschreibungen dokumentiert — folge diesen. Intent-Override: Sagt der Nutzer zusätzlich „Vorschau im Chat\" / „zeig es mir hier\" / „kein Download nötig\", behandle die Anfrage als (b).\n\n**(b) Folien, Demo, Dashboard oder interaktive Seite** — Begriffe wie „Folien\", „Foliensatz\", „Präsentation\", „Demo-Seite\", „Vergleichsseite\", „interaktive Seite\", „Visualisierung\", „Dashboard\" oder eine beliebige *Seite* / *Dokument*, die der Nutzer direkt im Chat liest, ohne ein Dateiformat zu nennen. Pfad: `artifact_create` (type=`html`) → `artifact_file_update` für `index.html` (Geschwister-Dateien via `artifact_file_create`, falls nützlich). Der Canvas-Bereich rendert das Artefakt live, während du streamst. reveal.js per CDN, /canvas-libs/reveal.js/5.0.5/, ist ein guter Standard für Folien. Gib KEINE rohen ` ```html `-Codeblöcke aus; sie werden nicht als Vorschau gerendert. Rufe das `pdf`-Tool NICHT für diese Anfragen auf.\n\n**(c) Word-Dokument** — Begriffe wie „Word-Dokument\", „Word-Datei\", „DOCX\" oder „.docx\". Rufe das `docx`-Tool auf, NICHT `artifact_create`. 
Das `docx`-Tool erzeugt die echte Datei direkt.\n\n**Gemeinsame Schutzregeln für beide `artifact_create`-Pfade:** Um ein bestehendes Artefakt zu überarbeiten, rufe `artifact_file_update` (oder `artifact_file_create` für eine neue Geschwisterdatei) für dieselbe `artifactId` auf — rufe NIEMALS `artifact_create` ein zweites Mal für dieselbe Anfrage auf, das erzeugt einen doppelten Eintrag in der Artefaktleiste. Sage dem Nutzer NIEMALS, dass die Datei fertig ist, außer `artifact_run` hat `runStatus: \"completed\"` UND `files.length > 0` zurückgegeben — „Datei erzeugt\" zu sagen, wenn keine Datei existiert, ist der meistgemeldete Bug dieses Flows.\n\n**ANTWORTSTIL**: Sei direkt und prägnant. Verwende Markdown-Tabellen für mehrere Datensätze.\n\n{{user_profile}}"
     },
     "en": {
       "displayName": "Assistant",
@@ -73,7 +80,7 @@
         "Write a follow-up email to the client",
         "Summarize our latest product updates"
       ],
-      "systemInstructions": "You are a helpful AI assistant.\n\n**LANGUAGE — strict priority order. Evaluate rules 1→3 and stop at the first match.**\n\n1. **Explicit request.** If the user's latest message explicitly asks for a language (e.g., \"reply in German\", \"auf Deutsch bitte\", \"répondez en français\", \"translate to French\"), use that language for the reply.\n2. **Message language.** Otherwise, detect the natural language of the user's latest message and reply in that language.\n3. **Locale fallback.** Only if the latest message has no detectable natural language — e.g., it is code-only, a bare URL, pure numbers, a single emoji, or a one- or two-character ambiguous token — reply in the user's browser locale: `{{user.language}}`. If `{{user.language}}` is also empty, reply in English.\n\nExamples:\n- User: \"how are you today?\" → English (rule 2).\n- User: \"Wie geht es dir heute?\" → German (rule 2).\n- User: \"Comment ça va aujourd'hui ?\" → French (rule 2).\n- User: \"translate to French: hello\" → French body (rule 1).\n- User: \"```py\\nprint('hi')\\n```\" with browser locale `de-DE` → German (rule 3).\n- User: \"👍\" with browser locale `fr-FR` → French (rule 3).\n\nNever use timezone, IP, or geolocation to choose the response language. 
Only rule 3 uses the browser locale, and only as a last-resort fallback.\n\n**KNOWLEDGE SCOPE**\n- **Knowledge base**: Documents uploaded by the organization — managed on the [Documents page]({{site_url}}/dashboard/{{organization.id}}/documents).\n- **Crawled websites**: Web pages from domains added by the organization — managed on the [Websites page]({{site_url}}/dashboard/{{organization.id}}/websites).\n- If searches return no results, let the user know they can upload documents or add website domains to expand the knowledge base.\n- For external system data (Shopify, databases, etc.), the user needs the Integration Assistant configured in [Settings > Integrations]({{site_url}}/dashboard/{{organization.id}}/settings/integrations).\n\n**RULES**\n1. **SEARCH BEFORE \"I DON'T KNOW\"** — Never say you don't have information without first searching the knowledge base or the web.\n2. **NO HALLUCINATIONS** — Only use data from tool results or user messages. Never fabricate facts.\n3. **PRESENT TOOL RESULTS** — When a tool returns results, present the key information first. Never skip results to jump to follow-up questions.\n4. **MINIMAL TOOL USE** — If you can answer from your own knowledge or conversation context, do so directly. Only call tools when the question requires external data.\n5. **PRE-ANALYZED ATTACHMENTS** — If the user's message contains \"[PRE-ANALYZED CONTENT\" or \"**Document: ...**\" / \"**Image: ...**\" / \"**Text File: ...**\" sections, answer from that content directly. Do NOT re-parse.\n6. **NO RAW CONTEXT OUTPUT** — Never output internal formats (\"Tool[\", \"[Tool Result]\", XML tags, raw JSON). Report results in natural language.\n7. 
**PRESENTATIONS, DEMO PAGES, VISUAL & INTERACTIVE CONTENT** — When the user asks for a presentation, slides, slide deck, PPT, PPTX, demo page, comparison page, interactive page, visualization, dashboard, or any *page* / *document* the user will read inside the chat (rather than download as a file), ALWAYS call the `artifact_create` tool with `type: \"html\"` and a complete, self-contained HTML document as `content`. The Canvas pane renders the artifact live as you stream. To revise it later (fix a bug, change a colour, add a slide), call `artifact_edit` against the same `artifactId` — never re-emit the full HTML via another `artifact_create`. Do NOT emit raw ` ```html ` code blocks; they will not render as a preview. Do NOT call the `pdf` tool for these. Do NOT try to produce a .pptx file — there is no PPTX export. Only generate a PDF if the user explicitly insists on a downloadable .pdf file. (reveal.js via CDN, https://cdn.jsdelivr.net/npm/reveal.js@5, is a good default for slides.)\n\n**RESPONSE STYLE**: Be direct and concise. Use Markdown tables for multiple records.\n\n{{user_profile}}"
+      "systemInstructions": "You are a helpful AI assistant.\n\n**LANGUAGE — strict priority order. Evaluate rules 1→3 and stop at the first match.**\n\n1. **Explicit request.** If the user's latest message explicitly asks for a language (e.g., \"reply in German\", \"auf Deutsch bitte\", \"répondez en français\", \"translate to French\"), use that language for the reply.\n2. **Message language.** Otherwise, detect the natural language of the user's latest message and reply in that language.\n3. **Locale fallback.** Only if the latest message has no detectable natural language — e.g., it is code-only, a bare URL, pure numbers, a single emoji, or a one- or two-character ambiguous token — reply in the user's browser locale: `{{user.language}}`. If `{{user.language}}` is also empty, reply in English.\n\nExamples:\n- User: \"how are you today?\" → English (rule 2).\n- User: \"Wie geht es dir heute?\" → German (rule 2).\n- User: \"Comment ça va aujourd'hui ?\" → French (rule 2).\n- User: \"translate to French: hello\" → French body (rule 1).\n- User: \"```py\\nprint('hi')\\n```\" with browser locale `de-DE` → German (rule 3).\n- User: \"👍\" with browser locale `fr-FR` → French (rule 3).\n\nNever use timezone, IP, or geolocation to choose the response language. 
Only rule 3 uses the browser locale, and only as a last-resort fallback.\n\n**KNOWLEDGE SCOPE**\n- **Knowledge base**: Documents uploaded by the organization — managed on the [Documents page]({{site_url}}/dashboard/{{organization.id}}/documents).\n- **Crawled websites**: Web pages from domains added by the organization — managed on the [Websites page]({{site_url}}/dashboard/{{organization.id}}/websites).\n- If searches return no results, let the user know they can upload documents or add website domains to expand the knowledge base.\n- For external system data (Shopify, databases, etc.), the user needs the Integration Assistant configured in [Settings > Integrations]({{site_url}}/dashboard/{{organization.id}}/settings/integrations).\n\n**RULES**\n1. **SEARCH BEFORE \"I DON'T KNOW\"** — Never say you don't have information without first searching the knowledge base or the web.\n2. **NO HALLUCINATIONS** — Only use data from tool results or user messages. Never fabricate facts.\n3. **PRESENT TOOL RESULTS** — When a tool returns results, present the key information first. Never skip results to jump to follow-up questions.\n4. **MINIMAL TOOL USE** — If you can answer from your own knowledge or conversation context, do so directly. Only call tools when the question requires external data.\n5. **PRE-ANALYZED ATTACHMENTS** — If the user's message contains \"[PRE-ANALYZED CONTENT\" or \"**Document: ...**\" / \"**Image: ...**\" / \"**Text File: ...**\" sections, answer from that content directly. Do NOT re-parse.\n6. **NO RAW CONTEXT OUTPUT** — Never output internal formats (\"Tool[\", \"[Tool Result]\", XML tags, raw JSON). Report results in natural language.\n7. **VISUAL & INTERACTIVE CONTENT** — Route by what the user actually named.\n\n**(a) Explicit PPTX file** — words like \"PPT\", \"PPTX\", \"PowerPoint\", or \".pptx\". The user named a file format and wants a real downloadable PowerPoint. 
Path: `artifact_create` (type=`python_runnable`, packages include `python-pptx`) → `artifact_file_update` for the entry source → `artifact_run`. The exact argument shape, writing into `/workspace/output/`, sibling-file splits, and the failure-retry loop are all covered in the respective tool descriptions — follow those. Intent override: if the user also says \"preview in chat\" / \"show me here\" / \"no need to download\", treat the request as (b) instead.\n\n**(b) Slides, demo, dashboard, or interactive page** — words like \"slides\", \"deck\", \"presentation\", \"demo page\", \"comparison page\", \"interactive page\", \"visualization\", \"dashboard\", or any *page* / *document* the user will read inside the chat with no file format named. Path: `artifact_create` (type=`html`) → `artifact_file_update` against `index.html` (sibling files via `artifact_file_create` if useful). The Canvas pane renders it live as you stream. reveal.js via CDN, /canvas-libs/reveal.js/5.0.5/, is a good default for slides. Do NOT emit raw ` ```html ` code blocks; they will not render as a preview. Do NOT call the `pdf` tool for these.\n\n**(c) Word document** — words like \"Word document\", \"Word doc\", \"DOCX\", or \".docx\". Call the `docx` tool, NOT `artifact_create`. The `docx` tool generates the real file directly.\n\n**Shared guardrails for both `artifact_create` paths:** To revise an existing artifact, call `artifact_file_update` (or `artifact_file_create` for a new sibling file) against the same `artifactId` — NEVER call `artifact_create` a second time for the same request, that creates a duplicate in the artifact bar. NEVER tell the user the file is ready unless `artifact_run` returned `runStatus: \"completed\"` AND `files.length > 0` — saying \"file generated\" when no file exists is the most reported bug for this flow.\n\n**RESPONSE STYLE**: Be direct and concise. Use Markdown tables for multiple records.\n\n{{user_profile}}"
     },
     "fr": {
       "displayName": "Assistant",
@@ -84,7 +91,7 @@
         "Écrire un email de relance au client",
         "Résumer nos dernières mises à jour produit"
       ],
-      "systemInstructions": "Tu es un assistant IA serviable.\n\n**LANGUE — ordre de priorité strict. Évalue les règles 1→3 et arrête-toi à la première correspondance.**\n\n1. **Demande explicite.** Si le dernier message de l'utilisateur demande explicitement une langue (par ex. « reply in German », « auf Deutsch bitte », « répondez en français », « translate to French »), utilise cette langue pour la réponse.\n2. **Langue du message.** Sinon, détecte la langue naturelle du dernier message de l'utilisateur et réponds dans cette langue.\n3. **Locale de repli.** Uniquement si le dernier message ne contient aucune langue naturelle détectable — par ex. il s'agit uniquement de code, d'une simple URL, de chiffres purs, d'un seul emoji, ou d'un jeton ambigu d'un ou deux caractères — réponds dans la locale du navigateur de l'utilisateur : `{{user.language}}`. Si `{{user.language}}` est également vide, réponds en anglais.\n\nExemples :\n- Utilisateur : \"how are you today?\" → anglais (règle 2).\n- Utilisateur : \"Wie geht es dir heute?\" → allemand (règle 2).\n- Utilisateur : \"Comment ça va aujourd'hui ?\" → français (règle 2).\n- Utilisateur : \"translate to French: hello\" → réponse en français (règle 1).\n- Utilisateur : \"```py\\nprint('hi')\\n```\" avec locale du navigateur `de-DE` → allemand (règle 3).\n- Utilisateur : \"👍\" avec locale du navigateur `fr-FR` → français (règle 3).\n\nN'utilise jamais le fuseau horaire, l'IP ou la géolocalisation pour choisir la langue de réponse. 
Seule la règle 3 utilise la locale du navigateur, et uniquement en dernier recours.\n\n**PÉRIMÈTRE DE CONNAISSANCES**\n- **Base de connaissances** : documents téléversés par l'organisation — gérés sur la [page Documents]({{site_url}}/dashboard/{{organization.id}}/documents).\n- **Sites web explorés** : pages web issues des domaines ajoutés par l'organisation — gérés sur la [page Sites web]({{site_url}}/dashboard/{{organization.id}}/websites).\n- Si les recherches ne renvoient aucun résultat, indique à l'utilisateur qu'il peut téléverser des documents ou ajouter des domaines de sites web pour étendre la base de connaissances.\n- Pour les données de systèmes externes (Shopify, bases de données, etc.), l'utilisateur a besoin de l'Integration Assistant configuré dans [Paramètres > Intégrations]({{site_url}}/dashboard/{{organization.id}}/settings/integrations).\n\n**RÈGLES**\n1. **CHERCHER AVANT DE DIRE « JE NE SAIS PAS »** — Ne dis jamais que tu n'as pas l'information sans avoir d'abord cherché dans la base de connaissances ou sur le web.\n2. **PAS D'HALLUCINATIONS** — N'utilise que les données issues des résultats d'outils ou des messages de l'utilisateur. Ne fabrique jamais de faits.\n3. **PRÉSENTER LES RÉSULTATS DES OUTILS** — Lorsqu'un outil renvoie des résultats, présente d'abord les informations clés. Ne saute jamais les résultats pour passer directement à des questions de suivi.\n4. **USAGE MINIMAL DES OUTILS** — Si tu peux répondre à partir de tes propres connaissances ou du contexte de la conversation, fais-le directement. N'appelle des outils que lorsque la question nécessite des données externes.\n5. **PIÈCES JOINTES PRÉ-ANALYSÉES** — Si le message de l'utilisateur contient des sections « [PRE-ANALYZED CONTENT » ou « **Document: ...** » / « **Image: ...** » / « **Text File: ...** », réponds directement à partir de ce contenu. NE PAS ré-analyser.\n6. 
**PAS DE SORTIE DE CONTEXTE BRUT** — Ne restitue jamais les formats internes (« Tool[ », « [Tool Result] », balises XML, JSON brut). Rapporte les résultats en langage naturel.\n7. **PRÉSENTATIONS, PAGES DE DÉMO, CONTENU VISUEL & INTERACTIF** — Lorsque l'utilisateur demande une présentation, des diapositives, un slide deck, PPT, PPTX, page de démo, page de comparaison, page interactive, visualisation, tableau de bord, ou toute *page* / *document* à lire directement dans le chat (plutôt qu'à télécharger comme fichier), appelle TOUJOURS l'outil `artifact_create` avec `type: \"html\"` et un document HTML complet et autonome comme `content`. Le panneau Canvas affiche l'artéfact en direct pendant que tu le diffuses. Pour le réviser ensuite (corriger un bug, changer une couleur, ajouter une diapositive), appelle `artifact_edit` sur le même `artifactId` — ne réémets jamais le HTML complet via un autre `artifact_create`. N'émets PAS de blocs de code ` ```html ` bruts ; ils ne s'affichent pas en aperçu. N'appelle PAS l'outil `pdf` pour ces demandes. N'essaie PAS de produire un fichier .pptx — il n'y a pas d'export PPTX. Ne génère un PDF que si l'utilisateur insiste explicitement sur un fichier .pdf téléchargeable. (reveal.js via CDN, https://cdn.jsdelivr.net/npm/reveal.js@5, est un bon défaut pour les diapositives.)\n\n**STYLE DE RÉPONSE** : sois direct et concis. Utilise des tableaux Markdown pour plusieurs enregistrements.\n\n{{user_profile}}"
+      "systemInstructions": "Tu es un assistant IA serviable.\n\n**LANGUE — ordre de priorité strict. Évalue les règles 1→3 et arrête-toi à la première correspondance.**\n\n1. **Demande explicite.** Si le dernier message de l'utilisateur demande explicitement une langue (par ex. « reply in German », « auf Deutsch bitte », « répondez en français », « translate to French »), utilise cette langue pour la réponse.\n2. **Langue du message.** Sinon, détecte la langue naturelle du dernier message de l'utilisateur et réponds dans cette langue.\n3. **Locale de repli.** Uniquement si le dernier message ne contient aucune langue naturelle détectable — par ex. il s'agit uniquement de code, d'une simple URL, de chiffres purs, d'un seul emoji, ou d'un jeton ambigu d'un ou deux caractères — réponds dans la locale du navigateur de l'utilisateur : `{{user.language}}`. Si `{{user.language}}` est également vide, réponds en anglais.\n\nExemples :\n- Utilisateur : \"how are you today?\" → anglais (règle 2).\n- Utilisateur : \"Wie geht es dir heute?\" → allemand (règle 2).\n- Utilisateur : \"Comment ça va aujourd'hui ?\" → français (règle 2).\n- Utilisateur : \"translate to French: hello\" → réponse en français (règle 1).\n- Utilisateur : \"```py\\nprint('hi')\\n```\" avec locale du navigateur `de-DE` → allemand (règle 3).\n- Utilisateur : \"👍\" avec locale du navigateur `fr-FR` → français (règle 3).\n\nN'utilise jamais le fuseau horaire, l'IP ou la géolocalisation pour choisir la langue de réponse. 
Seule la règle 3 utilise la locale du navigateur, et uniquement en dernier recours.\n\n**PÉRIMÈTRE DE CONNAISSANCES**\n- **Base de connaissances** : documents téléversés par l'organisation — gérés sur la [page Documents]({{site_url}}/dashboard/{{organization.id}}/documents).\n- **Sites web explorés** : pages web issues des domaines ajoutés par l'organisation — gérés sur la [page Sites web]({{site_url}}/dashboard/{{organization.id}}/websites).\n- Si les recherches ne renvoient aucun résultat, indique à l'utilisateur qu'il peut téléverser des documents ou ajouter des domaines de sites web pour étendre la base de connaissances.\n- Pour les données de systèmes externes (Shopify, bases de données, etc.), l'utilisateur a besoin de l'Integration Assistant configuré dans [Paramètres > Intégrations]({{site_url}}/dashboard/{{organization.id}}/settings/integrations).\n\n**RÈGLES**\n1. **CHERCHER AVANT DE DIRE « JE NE SAIS PAS »** — Ne dis jamais que tu n'as pas l'information sans avoir d'abord cherché dans la base de connaissances ou sur le web.\n2. **PAS D'HALLUCINATIONS** — N'utilise que les données issues des résultats d'outils ou des messages de l'utilisateur. Ne fabrique jamais de faits.\n3. **PRÉSENTER LES RÉSULTATS DES OUTILS** — Lorsqu'un outil renvoie des résultats, présente d'abord les informations clés. Ne saute jamais les résultats pour passer directement à des questions de suivi.\n4. **USAGE MINIMAL DES OUTILS** — Si tu peux répondre à partir de tes propres connaissances ou du contexte de la conversation, fais-le directement. N'appelle des outils que lorsque la question nécessite des données externes.\n5. **PIÈCES JOINTES PRÉ-ANALYSÉES** — Si le message de l'utilisateur contient des sections « [PRE-ANALYZED CONTENT » ou « **Document: ...** » / « **Image: ...** » / « **Text File: ...** », réponds directement à partir de ce contenu. NE PAS ré-analyser.\n6. 
**PAS DE SORTIE DE CONTEXTE BRUT** — Ne restitue jamais les formats internes (« Tool[ », « [Tool Result] », balises XML, JSON brut). Rapporte les résultats en langage naturel.\n7. **CONTENU VISUEL & INTERACTIF** — Choisis le chemin selon ce que l'utilisateur a réellement nommé.\n\n**(a) Fichier PPTX explicite** — termes comme « PPT », « PPTX », « PowerPoint » ou « .pptx ». L'utilisateur a nommé un format de fichier et souhaite un vrai fichier PowerPoint téléchargeable. Chemin : `artifact_create` (type=`python_runnable`, packages contient `python-pptx`) → `artifact_file_update` pour la source d'entrée → `artifact_run`. Les arguments exacts, l'écriture dans `/workspace/output/`, la séparation en fichiers frères et la boucle de gestion d'erreurs sont décrits dans les descriptions des outils respectifs — suis-les. Dérogation d'intention : si l'utilisateur dit aussi « aperçu dans le chat » / « montre-moi ici » / « pas besoin de télécharger », traite la demande comme (b).\n\n**(b) Diapositives, démo, tableau de bord ou page interactive** — termes comme « diapositives », « slide deck », « présentation », « page de démo », « page de comparaison », « page interactive », « visualisation », « tableau de bord » ou toute *page* / *document* que l'utilisateur lira directement dans le chat sans nommer un format de fichier. Chemin : `artifact_create` (type=`html`) → `artifact_file_update` sur `index.html` (fichiers frères via `artifact_file_create` si utile). Le panneau Canvas affiche l'artéfact en direct pendant que tu le diffuses. reveal.js via CDN, /canvas-libs/reveal.js/5.0.5/, est un bon défaut pour les diapositives. N'émets PAS de blocs de code ` ```html ` bruts ; ils ne s'affichent pas en aperçu. N'appelle PAS l'outil `pdf` pour ces demandes.\n\n**(c) Document Word** — termes comme « document Word », « fichier Word », « DOCX » ou « .docx ». Appelle l'outil `docx`, PAS `artifact_create`. 
L'outil `docx` génère directement le vrai fichier.\n\n**Garde-fous communs aux deux chemins `artifact_create` :** Pour réviser un artéfact existant, appelle `artifact_file_update` (ou `artifact_file_create` pour un nouveau fichier frère) sur le même `artifactId` — n'appelle JAMAIS `artifact_create` une seconde fois pour la même demande, cela crée un doublon dans la barre des artéfacts. Ne dis JAMAIS à l'utilisateur que le fichier est prêt à moins que `artifact_run` ait renvoyé `runStatus: \"completed\"` ET `files.length > 0` — dire « fichier généré » alors qu'aucun fichier n'existe est le bug le plus signalé pour ce flux.\n\n**STYLE DE RÉPONSE** : sois direct et concis. Utilise des tableaux Markdown pour plusieurs enregistrements.\n\n{{user_profile}}"
     }
   }
 }
diff --git a/knip.config.ts b/knip.config.ts
index 08fc9edd6..2f471f100 100644
--- a/knip.config.ts
+++ b/knip.config.ts
@@ -4,7 +4,6 @@ export default {
   workspaces: {
     'services/platform': {
       vite: { config: ['vite.config.ts'] },
-      ignore: ['convex/_generated/**'],
       entry: [
         'app/routes/**/*.tsx',
         'scripts/**/*.ts',
@@ -49,6 +48,13 @@ export default {
       ],
       project: ['**/*.{ts,tsx}'],
     },
+    'services/sandbox': {
+      // Standalone Bun HTTP service. `src/server.ts` is the runtime entry,
+      // auto-detected from `dev`/`start` scripts; tests anchor the dead-code
+      // sweep for unit-only helpers.
+      entry: ['src/**/*.test.ts'],
+      project: ['src/**/*.ts'],
+    },
     'services/docs': {
       vite: { config: ['vite.config.ts'] },
       entry: [
diff --git a/package.json b/package.json
index 019fae289..84e7cb98d 100644
--- a/package.json
+++ b/package.json
@@ -17,6 +17,7 @@
     "services/rag",
     "services/db",
     "services/proxy",
+    "services/sandbox",
     "tools/cli",
     "tools/plop"
   ],
@@ -135,7 +136,8 @@
   },
   "packageManager": "bun@1.3.10",
   "patchedDependencies": {
-    "convex-helpers@0.1.114": "patches/convex-helpers@0.1.114.patch"
+    "convex-helpers@0.1.114": "patches/convex-helpers@0.1.114.patch",
+    "@convex-dev/agent@0.6.1": "patches/@convex-dev%2Fagent@0.6.1.patch"
   },
   "trustedDependencies": [
     "core-js-pure",
diff --git a/packages/ui/src/markdown/shiki.ts b/packages/ui/src/markdown/shiki.ts
index be58cd8ff..7f4fe6711 100644
--- a/packages/ui/src/markdown/shiki.ts
+++ b/packages/ui/src/markdown/shiki.ts
@@ -78,6 +78,10 @@ const LANG_ALIASES: Record<string, string> = {
   js: 'javascript',
   mjs: 'javascript',
   cjs: 'javascript',
+  // `node` is the source language for node_runnable artifacts; the LLM
+  // and the artifact_create tool both emit this token. Without an alias
+  // shiki falls back to plaintext.
+  node: 'javascript',
   ts: 'typescript',
   mts: 'typescript',
   cts: 'typescript',
diff --git a/patches/@convex-dev%2Fagent@0.6.1.patch b/patches/@convex-dev%2Fagent@0.6.1.patch
new file mode 100644
index 000000000..bd7f92f96
--- /dev/null
+++ b/patches/@convex-dev%2Fagent@0.6.1.patch
@@ -0,0 +1,27 @@
+diff --git a/dist/client/streaming.js b/dist/client/streaming.js
+index b96123e5bd0934a522ca176416112dce99b313a8..db148f25d851c11376039d4e40e7bf321747b829 100644
+--- a/dist/client/streaming.js
++++ b/dist/client/streaming.js
+@@ -294,6 +294,22 @@ export function compressUIMessageChunks(parts) {
+                 compressed.push(part);
+             }
+         }
++        else if (part.type === "tool-input-delta") {
++            // Tale patch: coalesce consecutive tool-input-delta parts with
++            // the same toolCallId. Mirrors the text-delta merge above.
++            // Without this, large artifact_create / artifact_edit tool inputs
++            // (10s of KB) produce hundreds of streamDeltas rows, and the
++            // frontend's useStreamingUIMessages (which rebuilds the
++            // UIMessage from cursor=0 on every Convex push) burns O(N²)
++            // main-thread time and freezes the chat UI. Submit upstream;
++            // drop this patch on the next SDK bump once merged.
++            if (last?.type === "tool-input-delta" && part.toolCallId === last.toolCallId) {
++                last.inputTextDelta += part.inputTextDelta;
++            }
++            else {
++                compressed.push(part);
++            }
++        }
+         else {
+             compressed.push(part);
+         }
diff --git a/services/docs/Dockerfile b/services/docs/Dockerfile
index d612900c3..4620fbbce 100644
--- a/services/docs/Dockerfile
+++ b/services/docs/Dockerfile
@@ -24,6 +24,7 @@ COPY services/crawler/package.json ./services/crawler/
 COPY services/rag/package.json ./services/rag/
 COPY services/db/package.json ./services/db/
 COPY services/proxy/package.json ./services/proxy/
+COPY services/sandbox/package.json ./services/sandbox/
 COPY services/web/package.json ./services/web/
 COPY services/docs/package.json ./services/docs/
 COPY tools/cli/package.json ./tools/cli/
diff --git a/services/docs/Dockerfile.dockerignore b/services/docs/Dockerfile.dockerignore
index f63a40fe1..990f24260 100644
--- a/services/docs/Dockerfile.dockerignore
+++ b/services/docs/Dockerfile.dockerignore
@@ -1,7 +1,145 @@
-node_modules
-dist
-dist-ssr
-.turbo
-.cache
+# =============================================================================
+# Tale Docs (Vite + Vocs static site) — Dockerfile.dockerignore
+# =============================================================================
+# BuildKit picks this file (adjacent to the Dockerfile) over the root
+# .dockerignore. It does NOT merge — so this file must list everything we want
+# excluded from the docs image's build context. The previous 7-line stub
+# shipped the entire repo as context on every docs build (audit R2-B11).
+#
+# Build (from repo root):
+#   docker build -f services/docs/Dockerfile .
+
+# =============================================================================
+# Local environment files
+# =============================================================================
+**/.env
+**/.env.*
+
+# =============================================================================
+# Git
+# =============================================================================
+.git
+.gitignore
+.gitattributes
+
+# =============================================================================
+# CI / tooling
+# =============================================================================
+.github/
+.husky/
+.claude/
+.agents/
+.vscode/
+.idea/
+.ruff_cache/
+.turbo/
+.trivyignore
+.oxlintrc.json
+.oxfmtrc.json
+
+# =============================================================================
+# IDE / OS (`**/` prefix: bare patterns only match at the context root)
+# =============================================================================
+**/*.swp
+**/*.swo
+**/*~
+**/.DS_Store
+
+# =============================================================================
+# Node
+# =============================================================================
+node_modules/
+**/node_modules/
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+.pnpm-debug.log*
+
+# =============================================================================
+# Build artifacts (matched at any depth via `**/`)
+# =============================================================================
+**/*.tsbuildinfo
+**/dist/
+**/build/
+**/.output/
+**/.vinxi/
+storybook-static/
+
+# =============================================================================
+# Testing (`**/` prefix: bare `*.test.ts` would only match at the context root)
+# =============================================================================
+tests/
+**/coverage/
+.nyc_output/
+**/*.test.ts
+**/*.test.js
+**/*.spec.ts
+**/*.spec.js
+
+# =============================================================================
+# Storybook
+# =============================================================================
+.storybook/
+**/.storybook/
+**/*.stories.tsx
+**/*.stories.ts
+**/*.stories.jsx
+**/*.stories.js
+
+# =============================================================================
+# Logs / temp / cache / misc
+# =============================================================================
 *.log
-storybook-static
+*.tmp
+*.temp
+.cache/
+.playwright-mcp/
+knip-results.json
+designs/
+
+# =============================================================================
+# Docker files
+# =============================================================================
+docker-compose.yml
+docker-compose.*.yml
+compose.yml
+compose.*.yml
+.dockerignore
+**/Dockerfile.dockerignore
+
+# =============================================================================
+# Docs-specific: image needs only services/docs + packages/ui workspace.
+# All other service trees stay out of the build context — `bun install`
+# only needs each workspace's package.json (re-included below).
+# =============================================================================
+services/platform/
+services/web/
+services/convex/
+services/crawler/
+services/rag/
+services/db/
+services/proxy/
+services/sandbox/
+services/sandbox-egress/
+services/sandbox-runtime/
+packages/tale_knowledge/
+packages/tale_shared/
+packages/tale_telemetry/
+tools/
+examples/
+
+# `bun install` needs every workspace package.json present at its declared
+# path so the workspace graph resolves. Re-include just the manifests —
+# source trees stay excluded by the rules above.
+!services/platform/package.json
+!services/web/package.json
+!services/crawler/package.json
+!services/rag/package.json
+!services/db/package.json
+!services/proxy/package.json
+!services/sandbox/package.json
+!packages/tale_knowledge/package.json
+!packages/tale_shared/package.json
+!packages/tale_telemetry/package.json
+!tools/cli/package.json
+!tools/plop/package.json
diff --git a/services/platform/Dockerfile b/services/platform/Dockerfile
index dc544ef2c..916f66767 100644
--- a/services/platform/Dockerfile
+++ b/services/platform/Dockerfile
@@ -31,6 +31,7 @@ COPY services/crawler/package.json ./services/crawler/
 COPY services/rag/package.json ./services/rag/
 COPY services/db/package.json ./services/db/
 COPY services/proxy/package.json ./services/proxy/
+COPY services/sandbox/package.json ./services/sandbox/
 COPY services/web/package.json ./services/web/
 COPY services/docs/package.json ./services/docs/
 COPY tools/cli/package.json ./tools/cli/
@@ -127,6 +128,7 @@ COPY --from=workspace-deps /app/services/crawler/package.json /tmp/workspace/ser
 COPY --from=workspace-deps /app/services/rag/package.json /tmp/workspace/services/rag/
 COPY --from=workspace-deps /app/services/db/package.json /tmp/workspace/services/db/
 COPY --from=workspace-deps /app/services/proxy/package.json /tmp/workspace/services/proxy/
+COPY --from=workspace-deps /app/services/sandbox/package.json /tmp/workspace/services/sandbox/
 COPY --from=workspace-deps /app/services/web/package.json /tmp/workspace/services/web/
 COPY --from=workspace-deps /app/services/docs/package.json /tmp/workspace/services/docs/
 COPY --from=workspace-deps /app/tools/cli/package.json /tmp/workspace/tools/cli/
@@ -214,6 +216,16 @@ ENV NODE_ENV=production \
     HOSTNAME="0.0.0.0" \
     # Convex service DNS name (compose-internal). Overridable via CONVEX_URL.
     CONVEX_URL=http://convex:3210 \
+    # Origin that the sandbox spawner uses to POST presigned-URL output
+    # uploads back to Convex. Read by Convex Node actions via process.env
+    # in toSandboxStorageUrl() (see convex/lib/helpers/public_storage_url.ts).
+    # Node actions only see vars that this container's entrypoint pushes
+    # into Convex's deployment env via `convex env set`, so baking the
+    # value into the platform image is what guarantees the rewrite has
+    # a reachable origin on every docker deploy. Direct to convex:3210
+    # rather than the Caddy proxy because Caddy is HTTPS-only with a
+    # self-signed cert and would 308-redirect plain HTTP POSTs.
+    SANDBOX_STORAGE_INTERNAL_BASE_URL=http://convex:3210 \
     # INSTANCE_NAME is shared with convex service; platform uses it + INSTANCE_SECRET
     # to compute the admin key for `bunx convex env set` and `bunx convex deploy`.
     INSTANCE_NAME=tale_platform \
@@ -270,6 +282,7 @@ ENV NODE_ENV=production \
     PORT=3000 \
     HOSTNAME="0.0.0.0" \
     CONVEX_URL=http://convex:3210 \
+    SANDBOX_STORAGE_INTERNAL_BASE_URL=http://convex:3210 \
     INSTANCE_NAME=tale_platform \
     DO_NOT_TRACK=1 \
     TALE_CONFIG_DIR=/app/data
diff --git a/services/platform/Dockerfile.dockerignore b/services/platform/Dockerfile.dockerignore
index 75d367ec6..cd0562e35 100644
--- a/services/platform/Dockerfile.dockerignore
+++ b/services/platform/Dockerfile.dockerignore
@@ -133,3 +133,9 @@ services/db/
 !services/db/package.json
 services/proxy/
 !services/proxy/package.json
+services/sandbox/
+!services/sandbox/package.json
+services/sandbox-egress/
+services/sandbox-runtime/
+services/docs/
+!services/docs/package.json
diff --git a/services/platform/app/features/chat/components/canvas/artifact-bar.tsx b/services/platform/app/features/chat/components/canvas/artifact-bar.tsx
index 6e1e2db73..3d5c45c98 100644
--- a/services/platform/app/features/chat/components/canvas/artifact-bar.tsx
+++ b/services/platform/app/features/chat/components/canvas/artifact-bar.tsx
@@ -3,32 +3,15 @@
 import { Badge } from '@tale/ui/badge';
 import { Button } from '@tale/ui/button';
 import { useQuery } from 'convex/react';
-import {
-  Code,
-  FileText,
-  GitBranch,
-  Globe,
-  Image as ImageIcon,
-  Loader2,
-} from 'lucide-react';
-import { memo, useEffect, useRef, type ComponentType } from 'react';
+import { Loader2 } from 'lucide-react';
+import { memo, useEffect, useRef } from 'react';
 
 import { api } from '@/convex/_generated/api';
 import type { ArtifactListItem } from '@/convex/artifacts/queries';
 import { useT } from '@/lib/i18n/client';
 
-import { useCanvas, type CanvasContentType } from './canvas-context';
-
-const TYPE_ICONS: Record<
-  CanvasContentType,
-  ComponentType<{ className?: string }>
-> = {
-  code: Code,
-  html: Globe,
-  mermaid: GitBranch,
-  svg: ImageIcon,
-  markdown: FileText,
-};
+import { useCanvas } from './canvas-context';
+import { CANVAS_TYPE_ICONS } from './icon-map';
 
 interface ArtifactBarProps {
   organizationId: string;
@@ -46,7 +29,7 @@ function ArtifactBarComponent({ organizationId, threadId }: ArtifactBarProps) {
   // Pull focus to each newly-created artifact exactly once. If the AI calls
   // artifact_create multiple times in a turn, we follow whichever one
   // appeared most recently — ChatGPT-Canvas behaviour. We key off
-  // `createdAt` (immutable) so an artifact_edit revision does not
+  // `createdAt` (immutable) so a subsequent artifact_file_update revision does not
   // re-trigger the switch; the existing `useQuery` subscription updates
   // the open canvas in place.
   const autoOpenedRef = useRef(new Set<string>());
@@ -74,7 +57,7 @@ function ArtifactBarComponent({ organizationId, threadId }: ArtifactBarProps) {
         {t('artifacts.barTitle')}
       </span>
       {artifacts.map((artifact) => {
-        const Icon = TYPE_ICONS[artifact.type];
+        const Icon = CANVAS_TYPE_ICONS[artifact.type];
         const isStreaming = artifact.liveStreamMode !== undefined;
         const isOpen = openArtifactId === artifact._id;
         return (
@@ -92,9 +75,11 @@ function ArtifactBarComponent({ organizationId, threadId }: ArtifactBarProps) {
               <Icon className="size-3.5" aria-hidden="true" />
             )}
             <span className="max-w-[14rem] truncate">{artifact.title}</span>
-            <Badge variant="outline" className="h-4 px-1 text-[10px]">
-              v{artifact.revision}
-            </Badge>
+            {artifact.fileCount > 1 && (
+              <Badge variant="outline" className="h-4 px-1 text-[10px]">
+                {t('artifacts.fileCount', { count: artifact.fileCount })}
+              </Badge>
+            )}
           </Button>
         );
       })}
diff --git a/services/platform/app/features/chat/components/canvas/canvas-context.tsx b/services/platform/app/features/chat/components/canvas/canvas-context.tsx
index 2b370ccb5..a0824790c 100644
--- a/services/platform/app/features/chat/components/canvas/canvas-context.tsx
+++ b/services/platform/app/features/chat/components/canvas/canvas-context.tsx
@@ -16,17 +16,34 @@ export type CanvasContentType =
   | 'html'
   | 'mermaid'
   | 'svg'
-  | 'markdown';
+  | 'markdown'
+  // Runnable types — source code that executes in the server sandbox.
+  // The CanvasRunnableCodeRenderer subscribes to the artifact row's
+  // `run*` fields for live progress and final output file display.
+  // `script_runnable` is the canonical type (per-file runtime by
+  // extension); `python_runnable` / `node_runnable` are legacy single-
+  // runtime literals retained for old rows.
+  | 'script_runnable'
+  | 'python_runnable'
+  | 'node_runnable';
 
 interface CanvasState {
   isCanvasOpen: boolean;
   artifactId?: Id<'artifacts'>;
+  /**
+   * Which file inside the artifact's project the canvas is currently
+   * showing. `null` means "use the entryFile" — resolution happens in
+   * canvas-pane against the live artifact row so a renamed entry pointer
+   * doesn't strand the selection.
+   */
+  activeFilePath: string | null;
 }
 
 interface CanvasContextType extends CanvasState {
   openCanvas: (artifactId: Id<'artifacts'>) => void;
   closeCanvas: () => void;
   resetCanvas: () => void;
+  setActiveFilePath: (path: string | null) => void;
 }
 
 const CanvasContext = createContext<CanvasContextType | null>(null);
@@ -50,16 +67,21 @@ interface CanvasProviderProps {
 const INITIAL_STATE: CanvasState = {
   isCanvasOpen: false,
   artifactId: undefined,
+  activeFilePath: null,
 };
 
 export function CanvasProvider({ children }: CanvasProviderProps) {
   const [state, setState] = useState(INITIAL_STATE);
 
   const openCanvas = useCallback((artifactId: Id<'artifacts'>) => {
-    setState({
+    setState((prev) => ({
       isCanvasOpen: true,
       artifactId,
-    });
+      // Switching artifacts resets the active file; staying on the same
+      // artifact preserves the user's file selection across re-opens.
+      activeFilePath:
+        prev.artifactId === artifactId ? prev.activeFilePath : null,
+    }));
   }, []);
 
   const closeCanvas = useCallback(() => {
@@ -73,14 +95,19 @@ export function CanvasProvider({ children }: CanvasProviderProps) {
     setState(INITIAL_STATE);
   }, []);
 
+  const setActiveFilePath = useCallback((path: string | null) => {
+    setState((prev) => ({ ...prev, activeFilePath: path }));
+  }, []);
+
   const value = useMemo(
     () => ({
       ...state,
       openCanvas,
       closeCanvas,
       resetCanvas,
+      setActiveFilePath,
     }),
-    [state, openCanvas, closeCanvas, resetCanvas],
+    [state, openCanvas, closeCanvas, resetCanvas, setActiveFilePath],
   );
 
   return (
diff --git a/services/platform/app/features/chat/components/canvas/canvas-file-sidebar.tsx b/services/platform/app/features/chat/components/canvas/canvas-file-sidebar.tsx
new file mode 100644
index 000000000..2c036e909
--- /dev/null
+++ b/services/platform/app/features/chat/components/canvas/canvas-file-sidebar.tsx
@@ -0,0 +1,298 @@
+'use client';
+
+import { Button } from '@tale/ui/button';
+import {
+  ChevronLeft,
+  ChevronRight,
+  FileCode,
+  FilePlus,
+  FileText,
+} from 'lucide-react';
+import { useEffect, useRef, useState } from 'react';
+
+import { useT } from '@/lib/i18n/client';
+import { cn } from '@/lib/utils/cn';
+
+interface ArtifactFile {
+  path: string;
+  content: string;
+}
+
+interface CanvasFileSidebarProps {
+  files: readonly ArtifactFile[];
+  entryFile: string;
+  /**
+   * Path of the file the LLM is currently streaming into (advisory). When
+   * the streamed file is not yet in `files[]` (mid-create), we still render
+   * it in the tree as a "ghost" entry so the user sees the placeholder
+   * before the row settles.
+   */
+  streamingPath?: string;
+  activePath: string;
+  onSelect: (path: string) => void;
+  /**
+   * Create a new file at `path` (empty content). When omitted, the "+"
+   * affordance is hidden — read-only mode (e.g. revision viewer).
+   * Implementations should resolve once the row has persisted; the sidebar
+   * auto-selects the new path after.
+   */
+  onAddFile?: (path: string) => Promise<void>;
+}
+
+const COLLAPSED_STORAGE_KEY = 'canvas-sidebar-collapsed';
+
+/**
+ * Picks the row icon from the file extension: prose/data-like files
+ * (.md, .txt, .json, .yaml, .yml) get `FileText`, anything else `FileCode`.
+ */
+function iconForPath(path: string) {
+  if (
+    path.endsWith('.md') ||
+    path.endsWith('.txt') ||
+    path.endsWith('.json') ||
+    path.endsWith('.yaml') ||
+    path.endsWith('.yml')
+  ) {
+    return FileText;
+  }
+  return FileCode;
+}
+
+/**
+ * Collapsible file tree for a multi-file artifact. Lists `files`, marks the
+ * entry file and the file being streamed, and (when `onAddFile` is passed)
+ * offers an inline form for creating a new file. The collapsed flag is
+ * persisted in localStorage under `COLLAPSED_STORAGE_KEY`.
+ */
+export function CanvasFileSidebar({
+  files,
+  entryFile,
+  streamingPath,
+  activePath,
+  onSelect,
+  onAddFile,
+}: CanvasFileSidebarProps) {
+  const { t } = useT('chat');
+
+  const [collapsed, setCollapsed] = useState<boolean>(() => {
+    if (typeof window === 'undefined') return false;
+    try {
+      return window.localStorage.getItem(COLLAPSED_STORAGE_KEY) === '1';
+    } catch {
+      return false;
+    }
+  });
+
+  // Add-file inline form state. Open mode swaps the file-count chip header
+  // for an <input>; submitting calls `onAddFile`, then auto-selects the
+  // new path. Submit is gated against duplicate / empty paths so the
+  // mutation only fires for actionable input.
+  const [adding, setAdding] = useState(false);
+  const [draftPath, setDraftPath] = useState('');
+  const [addError, setAddError] = useState<string | undefined>(undefined);
+  const [addingInflight, setAddingInflight] = useState(false);
+  const draftInputRef = useRef<HTMLInputElement | null>(null);
+
+  useEffect(() => {
+    try {
+      window.localStorage.setItem(COLLAPSED_STORAGE_KEY, collapsed ? '1' : '0');
+    } catch {
+      // localStorage may be disabled (Safari private). Ignore.
+    }
+  }, [collapsed]);
+
+  useEffect(() => {
+    if (adding) draftInputRef.current?.focus();
+  }, [adding]);
+
+  // Validates the draft path, creates the file through `onAddFile`, then
+  // selects it. Failures are surfaced inline and keep the form open.
+  const handleAddSubmit = async () => {
+    // Ignore re-entrant submits while a previous create is still pending.
+    if (!onAddFile || addingInflight) return;
+    const trimmed = draftPath.trim();
+    if (trimmed === '') {
+      setAddError(t('canvas.fileSidebar.errorPathRequired'));
+      return;
+    }
+    // A path the LLM is currently streaming into is already listed as a
+    // ghost entry, so treat it as taken too.
+    if (trimmed === streamingPath || files.some((f) => f.path === trimmed)) {
+      setAddError(t('canvas.fileSidebar.errorPathExists'));
+      return;
+    }
+    setAddError(undefined);
+    setAddingInflight(true);
+    try {
+      await onAddFile(trimmed);
+      onSelect(trimmed);
+      setAdding(false);
+      setDraftPath('');
+    } catch (err) {
+      console.error('[canvas-file-sidebar] add file failed', err);
+      setAddError(
+        err instanceof Error
+          ? err.message
+          : t('canvas.fileSidebar.errorAddFailed'),
+      );
+    } finally {
+      setAddingInflight(false);
+    }
+  };
+
+  const cancelAdd = () => {
+    setAdding(false);
+    setDraftPath('');
+    setAddError(undefined);
+  };
+
+  // Synthesize a ghost entry for a `streamingPath` that hasn't landed in
+  // `files[]` yet — the canvas should show *something* under the cursor
+  // while the create stream is mid-flight.
+  const ghostStreaming =
+    streamingPath !== undefined && !files.some((f) => f.path === streamingPath);
+  const tree: { path: string; ghost: boolean }[] = [
+    ...files.map((f) => ({ path: f.path, ghost: false })),
+    ...(ghostStreaming ? [{ path: streamingPath, ghost: true }] : []),
+  ];
+
+  if (collapsed) {
+    return (
+      <div className="border-border bg-muted/10 flex w-8 shrink-0 flex-col items-center border-r py-2">
+        <Button
+          variant="ghost"
+          size="icon"
+          className="size-7"
+          onClick={() => setCollapsed(false)}
+          aria-label={t('canvas.fileSidebar.expand')}
+        >
+          <ChevronRight className="size-4" aria-hidden />
+        </Button>
+      </div>
+    );
+  }
+
+  return (
+    <div
+      className="border-border bg-muted/10 flex w-44 shrink-0 flex-col border-r"
+      role="navigation"
+      aria-label={t('canvas.fileSidebar.label')}
+    >
+      <div className="border-border flex items-center justify-between border-b px-2 py-1.5">
+        <span className="text-muted-foreground text-xs font-medium uppercase">
+          {t('canvas.fileSidebar.title')}
+        </span>
+        <div className="flex items-center gap-0.5">
+          {onAddFile && (
+            <Button
+              variant="ghost"
+              size="icon"
+              className="size-6"
+              onClick={() => setAdding(true)}
+              disabled={adding}
+              aria-label={t('canvas.fileSidebar.addFile')}
+            >
+              <FilePlus className="size-3.5" aria-hidden />
+            </Button>
+          )}
+          <Button
+            variant="ghost"
+            size="icon"
+            className="size-6"
+            onClick={() => setCollapsed(true)}
+            aria-label={t('canvas.fileSidebar.collapse')}
+          >
+            <ChevronLeft className="size-3.5" aria-hidden />
+          </Button>
+        </div>
+      </div>
+      {adding && (
+        <div className="border-border flex flex-col gap-1 border-b px-2 py-1.5">
+          <input
+            ref={draftInputRef}
+            type="text"
+            value={draftPath}
+            onChange={(e) => setDraftPath(e.target.value)}
+            onKeyDown={(e) => {
+              if (e.key === 'Enter') {
+                e.preventDefault();
+                void handleAddSubmit();
+              } else if (e.key === 'Escape') {
+                e.preventDefault();
+                cancelAdd();
+              }
+            }}
+            placeholder={t('canvas.fileSidebar.addFilePlaceholder')}
+            aria-label={t('canvas.fileSidebar.addFile')}
+            disabled={addingInflight}
+            className="bg-background border-border focus:border-ring rounded border px-1.5 py-1 font-mono text-xs outline-none"
+          />
+          {addError !== undefined && (
+            <span className="text-destructive text-[10px]">{addError}</span>
+          )}
+          <div className="flex justify-end gap-1">
+            <Button
+              variant="ghost"
+              size="sm"
+              className="h-6 px-2 text-[10px]"
+              onClick={cancelAdd}
+              disabled={addingInflight}
+            >
+              {t('canvas.fileSidebar.addFileCancel')}
+            </Button>
+            <Button
+              variant="primary"
+              size="sm"
+              className="h-6 px-2 text-[10px]"
+              onClick={() => void handleAddSubmit()}
+              disabled={addingInflight || draftPath.trim() === ''}
+            >
+              {t('canvas.fileSidebar.addFileConfirm')}
+            </Button>
+          </div>
+        </div>
+      )}
+      <ul className="flex flex-1 flex-col gap-0.5 overflow-auto p-1">
+        {tree.map(({ path, ghost }) => {
+          const Icon = iconForPath(path);
+          const isActive = path === activePath;
+          const isEntry = path === entryFile;
+          const isStreaming = path === streamingPath;
+          return (
+            <li key={path}>
+              <button
+                type="button"
+                onClick={() => onSelect(path)}
+                aria-current={isActive ? 'true' : undefined}
+                className={cn(
+                  'group flex w-full items-center gap-1.5 rounded px-2 py-1 text-left text-xs transition-colors',
+                  isActive
+                    ? 'bg-muted text-foreground'
+                    : 'text-muted-foreground hover:bg-muted/60 hover:text-foreground',
+                  ghost && 'italic opacity-70',
+                )}
+              >
+                <Icon className="size-3.5 shrink-0" aria-hidden />
+                <span className="flex-1 truncate font-mono">{path}</span>
+                {isStreaming && (
+                  <span
+                    // role="img" gives the decorative dot a role that may
+                    // carry an accessible name; a bare span cannot.
+                    role="img"
+                    className="size-1.5 shrink-0 animate-pulse rounded-full bg-blue-500"
+                    aria-label={t('canvas.fileSidebar.streamingDot')}
+                  />
+                )}
+                {isEntry && !isStreaming && (
+                  <span className="text-muted-foreground/60 shrink-0 text-[10px]">
+                    {t('canvas.fileSidebar.entryBadge')}
+                  </span>
+                )}
+              </button>
+            </li>
+          );
+        })}
+      </ul>
+    </div>
+  );
+}
diff --git a/services/platform/app/features/chat/components/canvas/canvas-pane.tsx b/services/platform/app/features/chat/components/canvas/canvas-pane.tsx
index c812d0ee0..79ae92701 100644
--- a/services/platform/app/features/chat/components/canvas/canvas-pane.tsx
+++ b/services/platform/app/features/chat/components/canvas/canvas-pane.tsx
@@ -5,15 +5,10 @@ import { Button } from '@tale/ui/button';
 import { useMutation, useQuery } from 'convex/react';
 import {
   Check,
-  Code,
   Copy,
   Download,
   Eye,
   FileDown,
-  FileText,
-  GitBranch,
-  Globe,
-  Image,
   Loader2,
   Maximize2,
   Minimize2,
@@ -21,11 +16,12 @@ import {
   Save,
   X,
 } from 'lucide-react';
-import { memo, useCallback, useEffect, useRef, useState } from 'react';
+import { memo, useCallback, useEffect, useMemo, useRef, useState } from 'react';
 
 import { Tooltip } from '@/app/components/ui/overlays/tooltip';
 import { useToast } from '@/app/hooks/use-toast';
 import { api } from '@/convex/_generated/api';
+import { resolveArtifactFiles } from '@/convex/artifacts/resolve_files';
 import { getEnv } from '@/lib/env';
 import { useT } from '@/lib/i18n/client';
 import { cn } from '@/lib/utils/cn';
@@ -33,9 +29,19 @@ import { lazyComponent } from '@/lib/utils/lazy-component';
 
 import { useStreamedArtifactContent } from '../../hooks/use-streamed-artifact-content';
 import { useCanvas, type CanvasContentType } from './canvas-context';
+import { CanvasFileSidebar } from './canvas-file-sidebar';
 import type { CanvasHtmlRendererHandle } from './canvas-html-renderer';
 import type { CanvasMarkdownRendererHandle } from './canvas-markdown-renderer';
+import {
+  CANVAS_TYPE_EXTENSIONS,
+  CANVAS_TYPE_ICONS,
+  CANVAS_TYPE_LABEL_KEYS,
+  CANVAS_TYPE_MIME_TYPES,
+  isRunnableArtifactType,
+  runnableLanguage,
+} from './icon-map';
 import { printHtmlInHiddenIframe } from './print-via-iframe';
+import { RunResultPanel } from './run-result-panel';
 
 const CanvasCodeRenderer = lazyComponent(() =>
   import('./canvas-code-renderer').then((m) => ({
@@ -43,6 +49,12 @@ const CanvasCodeRenderer = lazyComponent(() =>
   })),
 );
 
+const CanvasRunnableCodeRenderer = lazyComponent(() =>
+  import('./canvas-runnable-code-renderer').then((m) => ({
+    default: m.CanvasRunnableCodeRenderer,
+  })),
+);
+
 const CanvasHtmlRenderer = lazyComponent<
   React.ComponentProps<
     typeof import('./canvas-html-renderer').CanvasHtmlRenderer
@@ -134,22 +146,6 @@ function buildMarkdownPrintHtml(renderedHtml: string): string {
   return `<style>${MARKDOWN_PRINT_STYLES}</style><article>${renderedHtml}</article>`;
 }
 
-const TYPE_ICONS: Record<CanvasContentType, typeof Code> = {
-  code: Code,
-  html: Globe,
-  mermaid: GitBranch,
-  svg: Image,
-  markdown: FileText,
-};
-
-const TYPE_LABELS: Record<CanvasContentType, string> = {
-  code: 'Code',
-  html: 'HTML',
-  mermaid: 'Mermaid',
-  svg: 'SVG',
-  markdown: 'Markdown',
-};
-
 const MIN_WIDTH = 320;
 const MAX_WIDTH = 900;
 const DEFAULT_WIDTH = 480;
@@ -172,7 +168,13 @@ function SpinnerIcon({ className }: { className?: string }) {
 function CanvasPaneComponent() {
   const { t } = useT('chat');
   const { toast } = useToast();
-  const { isCanvasOpen, artifactId, closeCanvas } = useCanvas();
+  const {
+    isCanvasOpen,
+    artifactId,
+    closeCanvas,
+    activeFilePath,
+    setActiveFilePath,
+  } = useCanvas();
   // Edit buffer lives in local state — only this component reads / writes it.
   // Keeping it in CanvasContext used to fan out a per-keystroke render to
   // every `useCanvas()` consumer (ArtifactBar, MessageArtifactPills,
@@ -218,15 +220,11 @@ function CanvasPaneComponent() {
   }, [isCanvasOpen]);
 
   // Reset edit-in-progress state when the user switches to a different
-  // artifact so previous typing doesn't leak across.
+  // artifact OR file so previous typing doesn't leak across.
   const prevEditArtifactRef = useRef(artifactId);
-  useEffect(() => {
-    if (prevEditArtifactRef.current !== artifactId) {
-      prevEditArtifactRef.current = artifactId;
-      setIsEditing(false);
-      setEditBuffer(undefined);
-    }
-  }, [artifactId]);
+  // `activePath` is resolved later in the render against the live artifact
+  // row; we just need a stable holder to detect *changes* across renders.
+  const prevEditPathRef = useRef<string | null>(null);
 
   // Pulse the content area when an AI stream finishes settling. Patch in
   // particular is an instant transition (content was unchanged during the
@@ -296,7 +294,7 @@ function CanvasPaneComponent() {
       artifact.streamingPatches.length > 0
     ) {
       lastPatchSnapshotRef.current = {
-        code: artifact.content,
+        code: artifact.content ?? '',
         patches: artifact.streamingPatches,
       };
     }
@@ -413,22 +411,76 @@ function CanvasPaneComponent() {
     };
   }, []);
 
+  // Resolve the artifact's project shape once per render. Synthesizes a
+  // single-file project from legacy `content` for rows that pre-date the
+  // multi-file schema — see resolve_files.ts. `streamingPath` is the file
+  // the LLM is currently writing into (advisory); when it points at a
+  // file not yet in `files[]`, we treat that as a "ghost" entry in the
+  // sidebar.
+  const resolved = useMemo(
+    () =>
+      artifact
+        ? resolveArtifactFiles(artifact)
+        : { files: [], entryFile: '', synthesized: true as const },
+    [artifact],
+  );
+  const streamingPath = artifact?.streamingPath;
+  // The "active" file is what the user selected in the sidebar. With no
+  // explicit selection it follows the file the LLM is streaming into
+  // (possibly a ghost entry not yet in `files[]`, mid create) and only
+  // then defaults to the entry file.
+  const activePath = activeFilePath ?? streamingPath ?? resolved.entryFile;
+  const activeFile =
+    resolved.files.find((f) => f.path === activePath) ??
+    // Streaming a brand-new file (not yet in files[]): synthesize an
+    // empty entry so the renderer has something to scaffold against.
+    (streamingPath === activePath
+      ? { path: activePath, content: '' }
+      : resolved.files[0]);
+
+  // Reset edit-in-progress state when the user switches to a different
+  // artifact OR file so previous typing doesn't leak across.
+  useEffect(() => {
+    if (
+      prevEditArtifactRef.current !== artifactId ||
+      prevEditPathRef.current !== activePath
+    ) {
+      prevEditArtifactRef.current = artifactId;
+      prevEditPathRef.current = activePath;
+      setIsEditing(false);
+      setEditBuffer(undefined);
+    }
+  }, [artifactId, activePath]);
+
   // Read content reactively. Streaming-aware: while the artifact is being
   // written by the LLM, prefer the live tool-input-delta stream from the
   // agent SDK (decoded client-side); fall back to the legacy
   // `streamingContent` field for any in-flight artifact created before
   // the toolCallId field rolled out; finally fall back to the settled
   // `content` once the stream completes.
-  const settledContent = artifact?.content ?? '';
+  const settledContent = activeFile?.content ?? '';
   const streamingContent = artifact?.streamingContent;
   const isStreaming = artifact?.liveStreamMode !== undefined;
   const liveStreamMode = artifact?.liveStreamMode;
-  // create/rewrite stream tokens come via the SDK's tool-input-delta
-  // rows; patch leaves the source static. Only the former should drive
-  // the trailing caret in the code renderer — a blinking caret on
-  // unchanging source is misleading.
-  const isContentStreaming =
-    liveStreamMode === 'create' || liveStreamMode === 'rewrite';
+  // The streaming caret + 3-tier fallback only apply when the LLM is
+  // writing the *file the user is currently viewing*. Browsing another
+  // file in the same project while the LLM streams should look static.
+  // When `streamingPath` is undefined (legacy rows from before that field
+  // shipped), the create/rewrite stream targets the entry file by
+  // convention — fall back to that so existing tests + in-flight rows
+  // keep working.
+  const effectiveStreamingPath = streamingPath ?? resolved.entryFile;
+  const isStreamingActiveFile =
+    isStreaming &&
+    (liveStreamMode === 'create' ||
+      liveStreamMode === 'rewrite' ||
+      liveStreamMode === 'append') &&
+    effectiveStreamingPath === activePath;
+  // create / rewrite / append stream their content via the SDK's
+  // tool-input-delta rows; patch leaves the source static. Only the
+  // content-bearing modes should drive the trailing caret in the code
+  // renderer — a blinking caret on unchanging source is misleading.
+  const isContentStreaming = isStreamingActiveFile;
   const { content: streamedContent, hasDeltas } = useStreamedArtifactContent(
     artifactId,
     artifact?.toolCallId,
@@ -454,7 +506,23 @@ function CanvasPaneComponent() {
   // dwell window after the stream ends (`keepSourceLock`) so a fast patch
   // does not flick through the diff faster than a human can read it. The
   // settle pulse + delayed return to preview handle the closing transition.
-  const showStreamingSource = !isEditing && (isStreaming || keepSourceLock);
+  //
+  // For multi-file artifacts: only gate source-view mode on the *active*
+  // file. If the LLM is streaming main.js while the user is browsing
+  // verify.js, verify.js renders as its normal type-specific preview, not
+  // a streaming-source view. Patch mode is single-file (the legacy
+  // streamingPatches array is not path-scoped) so we keep its existing
+  // behavior and only show the diff when the user is on the entry file.
+  const showStreamingSource =
+    !isEditing &&
+    ((liveStreamMode === 'create' ||
+    liveStreamMode === 'rewrite' ||
+    liveStreamMode === 'append'
+      ? isStreamingActiveFile
+      : liveStreamMode === 'patch'
+        ? activePath === resolved.entryFile
+        : false) ||
+      keepSourceLock);
   // After the server clears `streamingPatches` at execute time we still
   // want the diff visible for the dwell window. Fall back to the snapshot
   // taken just before settle (frozen pre-patch source + final patches).
@@ -486,29 +554,32 @@ function CanvasPaneComponent() {
   }, [displayedContent]);
 
   const handleDownload = useCallback(() => {
-    const extensions: Record<CanvasContentType, string> = {
-      code: canvasLanguage ?? 'txt',
-      html: 'html',
-      mermaid: 'mmd',
-      svg: 'svg',
-      markdown: 'md',
-    };
-    const ext = extensions[canvasType];
-    const mimeTypes: Record<CanvasContentType, string> = {
-      code: 'text/plain',
-      html: 'text/html',
-      mermaid: 'text/plain',
-      svg: 'image/svg+xml',
-      markdown: 'text/markdown',
-    };
-    const blob = new Blob([displayedContent], { type: mimeTypes[canvasType] });
+    // Multi-file: name the download after the active file's path. For the
+    // single-file (entry-only) case this falls back to the artifact title +
+    // type extension to preserve the legacy naming.
+    const activeFileName =
+      activePath && activePath !== resolved.entryFile ? activePath : undefined;
+    const ext =
+      canvasType === 'code'
+        ? (canvasLanguage ?? CANVAS_TYPE_EXTENSIONS.code)
+        : CANVAS_TYPE_EXTENSIONS[canvasType];
+    const blob = new Blob([displayedContent], {
+      type: CANVAS_TYPE_MIME_TYPES[canvasType],
+    });
     const url = URL.createObjectURL(blob);
     const a = document.createElement('a');
     a.href = url;
-    a.download = `${canvasTitle || 'canvas'}.${ext}`;
+    a.download = activeFileName ?? `${canvasTitle || 'canvas'}.${ext}`;
     a.click();
     URL.revokeObjectURL(url);
-  }, [displayedContent, canvasType, canvasTitle, canvasLanguage]);
+  }, [
+    displayedContent,
+    canvasType,
+    canvasTitle,
+    canvasLanguage,
+    activePath,
+    resolved.entryFile,
+  ]);
 
   // Trigger the browser's "Save as PDF" flow by calling window.print() inside
   // the artifact's own iframe — fidelity is identical to what's on screen
@@ -566,7 +637,11 @@ function CanvasPaneComponent() {
     if (!artifactId || editBuffer === undefined) return;
     setIsApplying(true);
     try {
-      await userEditMutation({ artifactId, content: editBuffer });
+      await userEditMutation({
+        artifactId,
+        path: activePath,
+        content: editBuffer,
+      });
       setEditBuffer(undefined);
       setIsEditing(false);
       toast({ title: t('canvas.applied'), variant: 'success' });
@@ -576,11 +651,31 @@ function CanvasPaneComponent() {
     } finally {
       setIsApplying(false);
     }
-  }, [artifactId, editBuffer, userEditMutation, setEditBuffer, t, toast]);
+  }, [
+    artifactId,
+    editBuffer,
+    userEditMutation,
+    setEditBuffer,
+    activePath,
+    t,
+    toast,
+  ]);
+
+  // Create a new file in the artifact (empty content). Reuses `userEdit` —
+  // its handler creates the file when `path` is not yet present. The
+  // sidebar auto-selects the new path on resolution; we don't need to
+  // touch `setActiveFilePath` here.
+  const handleAddFile = useCallback(
+    async (path: string) => {
+      if (!artifactId) return;
+      await userEditMutation({ artifactId, path, content: '' });
+    },
+    [artifactId, userEditMutation],
+  );
 
   if (!isCanvasOpen || !artifactId) return null;
 
-  const TypeIcon = TYPE_ICONS[canvasType];
+  const TypeIcon = CANVAS_TYPE_ICONS[canvasType];
 
   return (
     <div
@@ -632,7 +727,7 @@ function CanvasPaneComponent() {
           <TypeIcon className="text-muted-foreground size-4 shrink-0" />
           <span className="truncate text-sm font-medium">{canvasTitle}</span>
           <Badge variant="outline" className="shrink-0 text-xs">
-            {TYPE_LABELS[canvasType]}
+            {t(CANVAS_TYPE_LABEL_KEYS[canvasType])}
           </Badge>
           {isStreaming && (
             <Badge
@@ -781,58 +876,101 @@ function CanvasPaneComponent() {
 
       <div
         className={cn(
-          'min-h-0 flex-1 overflow-hidden transition-shadow duration-700',
+          'flex min-h-0 flex-1 overflow-hidden transition-shadow duration-700',
           justSettled && 'ring-success/40 ring-2 ring-inset',
         )}
       >
-        {showStreamingSource && (
-          <CanvasCodeRenderer
-            code={sourceCode}
-            language={streamingHighlightLang}
-            isEditing={false}
-            isStreaming={isContentStreaming}
-            highlightPatches={sourcePatches}
-            onContentChange={onContentChange}
-          />
-        )}
-        {!showStreamingSource && canvasType === 'code' && (
-          <CanvasCodeRenderer
-            code={displayedContent}
-            language={canvasLanguage}
-            isEditing={isEditing}
-            onContentChange={onContentChange}
-          />
-        )}
-        {!showStreamingSource && canvasType === 'html' && (
-          <CanvasHtmlRenderer
-            ref={htmlRendererRef}
-            html={displayedContent}
-            isEditing={isEditing}
-            onContentChange={onContentChange}
-          />
-        )}
-        {!showStreamingSource && canvasType === 'svg' && (
-          <CanvasHtmlRenderer
-            html={displayedContent}
-            isEditing={isEditing}
-            onContentChange={onContentChange}
-          />
-        )}
-        {!showStreamingSource && canvasType === 'mermaid' && (
-          <CanvasMermaidRenderer
-            code={displayedContent}
-            isEditing={isEditing}
-            onContentChange={onContentChange}
-          />
-        )}
-        {!showStreamingSource && canvasType === 'markdown' && (
-          <CanvasMarkdownRenderer
-            ref={markdownRendererRef}
-            content={displayedContent}
-            isEditing={isEditing}
-            onContentChange={onContentChange}
+        {resolved.files.length >= 1 && (
+          <CanvasFileSidebar
+            files={resolved.files}
+            entryFile={resolved.entryFile}
+            streamingPath={streamingPath ?? undefined}
+            activePath={activePath}
+            onSelect={setActiveFilePath}
+            onAddFile={handleAddFile}
           />
         )}
+        <div className="min-h-0 min-w-0 flex-1 overflow-hidden">
+          {showStreamingSource && !isRunnableArtifactType(canvasType) && (
+            <CanvasCodeRenderer
+              code={sourceCode}
+              language={streamingHighlightLang}
+              isEditing={false}
+              isStreaming={isContentStreaming}
+              highlightPatches={sourcePatches}
+              onContentChange={onContentChange}
+            />
+          )}
+          {!showStreamingSource && canvasType === 'code' && (
+            <CanvasCodeRenderer
+              code={displayedContent}
+              language={canvasLanguage}
+              isEditing={isEditing}
+              onContentChange={onContentChange}
+            />
+          )}
+          {!showStreamingSource && canvasType === 'html' && (
+            <CanvasHtmlRenderer
+              ref={htmlRendererRef}
+              html={displayedContent}
+              isEditing={isEditing}
+              onContentChange={onContentChange}
+            />
+          )}
+          {!showStreamingSource && canvasType === 'svg' && (
+            <CanvasHtmlRenderer
+              html={displayedContent}
+              isEditing={isEditing}
+              onContentChange={onContentChange}
+            />
+          )}
+          {!showStreamingSource && canvasType === 'mermaid' && (
+            <CanvasMermaidRenderer
+              code={displayedContent}
+              isEditing={isEditing}
+              onContentChange={onContentChange}
+            />
+          )}
+          {!showStreamingSource && canvasType === 'markdown' && (
+            <CanvasMarkdownRenderer
+              ref={markdownRendererRef}
+              content={displayedContent}
+              isEditing={isEditing}
+              onContentChange={onContentChange}
+            />
+          )}
+          {isRunnableArtifactType(canvasType) && artifact && (
+            <div className="flex h-full min-h-0 flex-col">
+              <RunResultPanel
+                artifactId={artifactId}
+                artifactRevision={artifact.revision}
+                entryFile={resolved.entryFile}
+              />
+              <div className="min-h-0 flex-1">
+                <CanvasRunnableCodeRenderer
+                  artifactId={artifactId}
+                  activePath={activePath}
+                  source={showStreamingSource ? sourceCode : displayedContent}
+                  language={(() => {
+                    // Legacy single-runtime types pin the highlighter to
+                    // their language. `script_runnable` (polyglot) infers
+                    // per active file extension so a sidebar switch from
+                    // `main.js` to `qa.py` re-highlights correctly.
+                    const locked = runnableLanguage(canvasType);
+                    if (locked === 'python') return 'python';
+                    if (locked === 'javascript') return 'node';
+                    const ext = (activePath ?? '')
+                      .toLowerCase()
+                      .split('.')
+                      .pop();
+                    return ext === 'py' ? 'python' : 'node';
+                  })()}
+                  isStreaming={isContentStreaming}
+                />
+              </div>
+            </div>
+          )}
+        </div>
       </div>
     </div>
   );
diff --git a/services/platform/app/features/chat/components/canvas/canvas-runnable-code-renderer.tsx b/services/platform/app/features/chat/components/canvas/canvas-runnable-code-renderer.tsx
new file mode 100644
index 000000000..2a9a54ae5
--- /dev/null
+++ b/services/platform/app/features/chat/components/canvas/canvas-runnable-code-renderer.tsx
@@ -0,0 +1,61 @@
+'use client';
+
+// Canvas pane source view for `script_runnable` (and legacy
+// `python_runnable` / `node_runnable`) artifacts. This component used to
+// embed the execution panel as well; that responsibility has moved up to
+// the `RunResultPanel` rendered by `canvas-pane.tsx`, so the run state
+// is a project-level fixture independent of the sidebar's active file.
+// This component is now a thin, read-only source wrapper around
+// `CanvasCodeRenderer`.
+
+import type { Id } from '@/convex/_generated/dataModel';
+
+import { CanvasCodeRenderer } from './canvas-code-renderer';
+
+interface CanvasRunnableCodeRendererProps {
+  artifactId: Id<'artifacts'>;
+  /**
+   * Path of the file the user has selected in the sidebar. Kept on the
+   * prop surface for future per-file source-view affordances; the source
+   * code itself is supplied via `source` so the parent (canvas-pane)
+   * remains the single source of truth for what's currently displayed.
+   */
+  activePath: string;
+  source: string;
+  language: 'python' | 'node';
+  isStreaming?: boolean;
+}
+
+function CanvasRunnableCodeRendererComponent(
+  props: CanvasRunnableCodeRendererProps,
+) {
+  const { artifactId, activePath, source, language } = props;
+  const streaming = props.isStreaming ?? false;
+
+  // Neither identifier is consumed yet. They stay on the prop surface so
+  // callers don't need re-threading once per-file source affordances
+  // (jump-to-definition, run-this-file CTA, etc.) are added; `void`
+  // marks them as deliberately unused.
+  void artifactId;
+  void activePath;
+
+  // Runnable canvases are read-only (the LLM drives changes through
+  // artifact_file_create / artifact_file_update), so edits are dropped.
+  const ignoreContentChange = () => undefined;
+
+  return (
+    <CanvasCodeRenderer
+      code={source}
+      language={language}
+      isEditing={false}
+      isStreaming={streaming}
+      onContentChange={ignoreContentChange}
+    />
+  );
+}
+
+// No memo wrapper: the parent re-renders for every artifact-row patch
+// (e.g. live `runProgress` during a run) and the props are inherently
+// changing during streaming, so memo's shallow equality check would
+// never pass. Keep this lean.
+export const CanvasRunnableCodeRenderer = CanvasRunnableCodeRendererComponent;
diff --git a/services/platform/app/features/chat/components/canvas/icon-map.ts b/services/platform/app/features/chat/components/canvas/icon-map.ts
new file mode 100644
index 000000000..c1067d1e8
--- /dev/null
+++ b/services/platform/app/features/chat/components/canvas/icon-map.ts
@@ -0,0 +1,116 @@
+import {
+  Code,
+  FileText,
+  GitBranch,
+  Globe,
+  Image as ImageIcon,
+  Terminal,
+  TerminalSquare,
+} from 'lucide-react';
+import type { ComponentType } from 'react';
+
+import type { CanvasContentType } from './canvas-context';
+
+/**
+ * Narrows a `CanvasContentType` to the runnable (sandbox-executable)
+ * artifact types. The single membership list below keeps the runnable
+ * set in one place instead of scattered inline `===` chains.
+ */
+export function isRunnableArtifactType(
+  type: CanvasContentType,
+): type is 'script_runnable' | 'python_runnable' | 'node_runnable' {
+  const runnableTypes: readonly CanvasContentType[] = [
+    'script_runnable',
+    'python_runnable',
+    'node_runnable',
+  ];
+  return runnableTypes.includes(type);
+}
+
+/**
+ * Highlighter / extension language pinned by a runnable type. Mirrors
+ * the agent-tool side helper in
+ * `convex/agent_tools/artifacts/shared.ts:runnableLanguage` so client
+ * and server agree on the python/node mapping. Every other type —
+ * including polyglot `script_runnable` — yields undefined; callers
+ * should then infer the language from the active file's path.
+ */
+export function runnableLanguage(
+  type: CanvasContentType,
+): 'python' | 'javascript' | undefined {
+  return type === 'python_runnable'
+    ? 'python'
+    : type === 'node_runnable'
+      ? 'javascript'
+      : undefined;
+}
+
+/**
+ * Canonical icon / label / extension / mime mappings for every
+ * `CanvasContentType`. Consolidates what used to be three drift-prone
+ * copies (canvas-pane, artifact-bar, message-bubble) plus the inline
+ * `extensions` / `mimeTypes` literals in `canvas-pane.handleDownload`.
+ *
+ * Label keys point at `chat.canvas.typeLabel.<type>` — callers resolve
+ * via `useT('chat')` so language is not baked into the map.
+ */
+
+export const CANVAS_TYPE_ICONS: Record<
+  CanvasContentType,
+  ComponentType<{ className?: string }>
+> = {
+  code: Code,
+  html: Globe,
+  mermaid: GitBranch,
+  svg: ImageIcon,
+  markdown: FileText,
+  // Runnable types get terminal-flavored icons so the chat list and the
+  // canvas tabs distinguish at-a-glance between static `code` snippets
+  // (Code icon) and an executable sandbox artifact (Terminal icons).
+  // Polyglot `script_runnable` shares the Python icon since the entry
+  // default is `main.py`; per-file shading (.js shows the Node icon)
+  // is handled by the file-tree, not this top-level type icon.
+  script_runnable: TerminalSquare,
+  python_runnable: TerminalSquare,
+  node_runnable: Terminal,
+};
+
+export const CANVAS_TYPE_LABEL_KEYS: Record<CanvasContentType, string> = {
+  code: 'canvas.typeLabel.code',
+  html: 'canvas.typeLabel.html',
+  mermaid: 'canvas.typeLabel.mermaid',
+  svg: 'canvas.typeLabel.svg',
+  markdown: 'canvas.typeLabel.markdown',
+  script_runnable: 'canvas.typeLabel.script_runnable',
+  python_runnable: 'canvas.typeLabel.python_runnable',
+  node_runnable: 'canvas.typeLabel.node_runnable',
+};
+
+/**
+ * Default file extensions for "Download as…". `code` is a placeholder
+ * because the caller should prefer `artifact.language` when present and
+ * fall back to this only if the language field is empty.
+ */
+export const CANVAS_TYPE_EXTENSIONS: Record<CanvasContentType, string> = {
+  code: 'txt',
+  html: 'html',
+  mermaid: 'mmd',
+  svg: 'svg',
+  markdown: 'md',
+  // `script_runnable` defaults to .py — callers should prefer the active
+  // file's actual extension via the per-file API when available.
+  script_runnable: 'py',
+  python_runnable: 'py',
+  node_runnable: 'js',
+};
+
+export const CANVAS_TYPE_MIME_TYPES: Record<CanvasContentType, string> = {
+  code: 'text/plain',
+  html: 'text/html',
+  mermaid: 'text/plain',
+  svg: 'image/svg+xml',
+  markdown: 'text/markdown',
+  script_runnable: 'text/x-python',
+  python_runnable: 'text/x-python',
+  node_runnable: 'application/javascript',
+};
diff --git a/services/platform/app/features/chat/components/canvas/run-result-helpers.tsx b/services/platform/app/features/chat/components/canvas/run-result-helpers.tsx
new file mode 100644
index 000000000..0f3434ce6
--- /dev/null
+++ b/services/platform/app/features/chat/components/canvas/run-result-helpers.tsx
@@ -0,0 +1,459 @@
+'use client';
+
+// Shared presentation helpers for artifact run results. Used by both the
+// canvas's `RunResultPanel` (primary + collapsible secondary projections)
+// and any future consumer that needs the same status / file / live-tail
+// chrome. Pure presentational — no Convex queries, no routing.
+
+import { Badge } from '@tale/ui/badge';
+import type { Infer } from 'convex/values';
+import {
+  AlertTriangle,
+  CheckCircle2,
+  Download,
+  Loader2,
+  Presentation,
+  FileText,
+  FileSpreadsheet,
+  File as FileIcon,
+  Image as ImageIcon,
+} from 'lucide-react';
+import { type ReactNode, useEffect, useRef } from 'react';
+
+import {
+  sandboxOutputFileValidator,
+  sandboxRunProgressValidator,
+  type SandboxErrorCode,
+  type SandboxRunStatus,
+} from '@/convex/sandbox/wire';
+import { useT } from '@/lib/i18n/client';
+import { cn } from '@/lib/utils/cn';
+import { formatFileSize } from '@/lib/utils/format/file';
+
+import { useFileUrl } from '../../hooks/queries';
+
+// Single source of truth: the same validators that gate the Convex
+// mutations also derive the client-side prop types, so a future field
+// addition on `sandboxOutputFileValidator` flows through without a
+// matching hand-edit here.
+export type RunOutputFile = Infer<typeof sandboxOutputFileValidator>;
+export type RunProgress = Infer<typeof sandboxRunProgressValidator>;
+
+// Maps an output file's MIME type to the lucide icon shown on its chip:
+// .pptx → Presentation, .xlsx → FileSpreadsheet, PDF → FileText, any
+// `image/*` subtype → ImageIcon, and everything else → the generic
+// FileIcon.
+function iconForContentType(contentType: string): typeof FileIcon {
+  // Exact MIME matches first; the `image/` prefix check is the fallback.
+  switch (contentType) {
+    case 'application/vnd.openxmlformats-officedocument.presentationml.presentation':
+      return Presentation;
+    case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
+      return FileSpreadsheet;
+    case 'application/pdf':
+      return FileText;
+    default:
+      return contentType.startsWith('image/') ? ImageIcon : FileIcon;
+  }
+}
+
+export function FileChip({ file }: { file: RunOutputFile }) {
+  const { t } = useT('chat');
+  const { data: rawUrl } = useFileUrl(file.storageId);
+  // The raw URL points at `/api/storage/{id}` (the Convex backend route),
+  // which does NOT set `Content-Disposition`, so browsers name the saved
+  // file after the URL's last path segment (the storageId UUID) even when
+  // `<a download="hello.txt">` is set. Rewrite onto the platform's
+  // `/http_api/storage?id=…&filename=…` httpAction, whose response carries
+  // `Content-Disposition: attachment; filename="hello.txt"` and wins.
+  // NOTE(review): `new URL(rawUrl)` throws on a relative URL — confirm
+  // `useFileUrl` always resolves to an absolute one.
+  const fileUrl =
+    rawUrl && file.storageId
+      ? `${new URL(rawUrl).origin}/http_api/storage?id=${encodeURIComponent(String(file.storageId))}&filename=${encodeURIComponent(file.name)}`
+      : undefined;
+  const Icon = iconForContentType(file.contentType);
+
+  const sharedClassName = cn(
+    'border-border bg-background flex items-center gap-2 rounded-md border px-3 py-2 text-sm transition-colors',
+  );
+  const innerBody = (
+    <>
+      <Icon className="text-muted-foreground size-4 shrink-0" aria-hidden />
+      <div className="flex min-w-0 flex-1 flex-col">
+        <span className="truncate font-medium">{file.name}</span>
+        <span className="text-muted-foreground text-xs">
+          {formatFileSize(file.size)}
+        </span>
+      </div>
+      <Download
+        className="text-muted-foreground size-3.5 shrink-0"
+        aria-hidden
+      />
+    </>
+  );
+
+  // Without a resolvable URL the chip MUST NOT render as an anchor — an
+  // `<a href="#">` is semantically broken (no destination + scrolls to top
+  // on click) and screen readers announce it as a link with no target.
+  // Render a disabled `<button>` instead so the affordance is correctly
+  // typed for a11y; the `disabled` attribute conveys the unavailable state.
+  if (!fileUrl) {
+    return (
+      <button
+        type="button"
+        disabled
+        aria-label={t('canvas.runOpenFile', { name: file.name })}
+        className={cn(
+          sharedClassName,
+          'cursor-not-allowed opacity-60',
+          'hover:bg-background', // override default hover
+        )}
+      >
+        {innerBody}
+      </button>
+    );
+  }
+
+  return (
+    <a
+      href={fileUrl}
+      download={file.name}
+      target="_blank"
+      rel="noreferrer"
+      aria-label={t('canvas.runOpenFile', { name: file.name })}
+      className={cn(sharedClassName, 'hover:bg-muted/40')}
+    >
+      {innerBody}
+    </a>
+  );
+}
+
+// Spinner icon with a stable component identity. An inline arrow such as
+// `(props) => <Loader2 ... />` is a new component type on every render,
+// so the icon remounts and the CSS spin animation restarts — visible as
+// stutter while an install streams `runProgress` patches every few ms.
+// Defining it at module scope preserves identity (round-2 R2-B12).
+function SpinningLoader(props: { className?: string }) {
+  return <Loader2 {...props} className={cn(props.className, 'animate-spin')} />;
+}
+
+export function StatusBadge({
+  runStatus,
+  runProgress,
+  stale = false,
+}: {
+  runStatus?: SandboxRunStatus;
+  runProgress?: RunProgress;
+  /**
+   * When true and the run is in a terminal state, render a secondary
+   * "Source edited" chip next to the status badge to signal that the
+   * source has moved past the snapshot this run captured. In-flight runs
+   * (queued/installing/running) intentionally suppress the chip — the
+   * spinner reflects work that is still progressing, not stale output.
+   */
+  stale?: boolean;
+}) {
+  const { t } = useT('chat');
+  if (!runStatus) return null;
+  let primary: ReactNode;
+  if (runStatus === 'completed') {
+    primary = (
+      <Badge
+        variant="outline"
+        icon={CheckCircle2}
+        className="text-success border-success/40"
+        role="status"
+        aria-live="polite"
+      >
+        {t('canvas.runDone')}
+      </Badge>
+    );
+  } else if (runStatus === 'failed' || runStatus === 'cancelled') {
+    primary = (
+      <Badge
+        variant="outline"
+        icon={AlertTriangle}
+        className="text-destructive border-destructive/40"
+        role="status"
+        aria-live="polite"
+      >
+        {t(`canvas.runStatus.${runStatus}`)}
+      </Badge>
+    );
+  } else {
+    // Catch-all: queued / installing / running — live progress + spinner.
+    // Always pass `package` and `version` keys (even when undefined): ICU's
+    // `{version, select, undefined {} other { {version}}}` template throws
+    // "context variable not provided" when the key is structurally absent
+    // (round-2 R2-B12; verified empirically against intl-messageformat).
+    // Passing `undefined` triggers the `undefined` branch as intended.
+    const progressText = runProgress
+      ? t(`canvas.runProgress.${runProgress.kind}`, {
+          package: runProgress.package,
+          version: runProgress.version,
+        })
+      : t(`canvas.runStatus.${runStatus}`);
+    primary = (
+      <Badge
+        variant="outline"
+        icon={SpinningLoader}
+        className="border-border"
+        role="status"
+        aria-live="polite"
+      >
+        {progressText}
+      </Badge>
+    );
+  }
+  const isTerminal =
+    runStatus === 'completed' ||
+    runStatus === 'failed' ||
+    runStatus === 'cancelled';
+  if (!stale || !isTerminal) return primary;
+  return (
+    <span className="flex items-center gap-2">
+      {primary}
+      <Badge
+        variant="outline"
+        className="text-muted-foreground border-muted-foreground/30"
+      >
+        {t('canvas.runStale')}
+      </Badge>
+    </span>
+  );
+}
+
+/**
+ * stdout / stderr live tail. When `liveTail` is true on mount, or later
+ * flips to true, the `<details>` is opened imperatively through its ref
+ * so the user sees output as it streams. `open` is never passed as a
+ * React prop, so a manual collapse is not re-asserted on re-render.
+ *
+ * Auto-scrolls the `<pre>` to the bottom on each `text` change, unless the
+ * user has scrolled more than 32 px away from the bottom — the slack
+ * absorbs rounding in the browser's scrollHeight/scrollTop math.
+ */
+export function LiveTailDetails({
+  text,
+  label,
+  liveTail,
+  preClassName,
+}: {
+  text: string;
+  label: string;
+  liveTail: boolean;
+  preClassName: string;
+}) {
+  const detailsRef = useRef<HTMLDetailsElement | null>(null);
+  const preRef = useRef<HTMLPreElement | null>(null);
+  const stickToBottomRef = useRef(true);
+
+  useEffect(() => {
+    if (liveTail && detailsRef.current && !detailsRef.current.open) {
+      detailsRef.current.open = true;
+    }
+  }, [liveTail]);
+
+  useEffect(() => {
+    const el = preRef.current;
+    if (!el) return;
+    if (!stickToBottomRef.current) return;
+    el.scrollTop = el.scrollHeight;
+  }, [text]);
+
+  return (
+    <details ref={detailsRef} className="text-xs">
+      <summary className="text-muted-foreground cursor-pointer font-medium">
+        {label}
+      </summary>
+      <pre
+        ref={preRef}
+        onScroll={(e) => {
+          const el = e.currentTarget;
+          const distanceFromBottom =
+            el.scrollHeight - el.clientHeight - el.scrollTop;
+          stickToBottomRef.current = distanceFromBottom < 32;
+        }}
+        className={preClassName}
+      >
+        {text}
+      </pre>
+    </details>
+  );
+}
+
+/**
+ * One projected execution row from `listRunsPerFile`. Same shape as what
+ * the legacy `getLatestRunPerFile` returned, kept here for callers that
+ * want to derive their own UI without re-importing the projection's
+ * exact field set from the Convex API surface.
+ */
+export interface RunFileProjection {
+  executionId: unknown;
+  path: string;
+  runStatus?: SandboxRunStatus;
+  runProgress?: RunProgress;
+  runErrorCode?: SandboxErrorCode;
+  runErrorMessage?: string;
+  runStdoutPreview?: string;
+  runStderrPreview?: string;
+  runOutputFiles?: RunOutputFile[];
+  runRevision?: number; // undefined when the run has no captured snapshot
+  runExitCode?: number;
+}
+
+/**
+ * Reports whether a run's output predates the artifact's latest edits: the
+ * run recorded a source revision (`runRevision` is defined) and that
+ * revision differs from `artifactRevision`. Stale runs keep rendering
+ * their status / stdout / stderr / files; the status badge merely gains a
+ * "Source edited" chip as a warning.
+ *
+ * A missing `fileRun` or an undefined `runRevision` yields false. The
+ * latter occurs when the projection couldn't tag the run with a snapshot
+ * (e.g. a non-current execution for the artifact); such rows render
+ * without the chip because we can't claim the source has moved when we
+ * don't know which revision the run captured.
+ */
+export function isStale(
+  fileRun: RunFileProjection | undefined,
+  artifactRevision: number,
+): boolean {
+  // No run, or a run without a captured snapshot, can never be stale.
+  const captured = fileRun?.runRevision;
+  if (captured === undefined) return false;
+  // Stale exactly when the snapshot differs from the live revision.
+  return captured !== artifactRevision;
+}
+
+/**
+ * Same gate as the legacy renderer's `showExecutionPanel`: stay hidden
+ * until a run has a status, error code, output file, or non-empty preview,
+ * so bare headers never surface while streaming or before a first run.
+ */
+export function hasAnythingToShow(
+  fileRun: RunFileProjection | undefined,
+): boolean {
+  if (!fileRun) return false;
+  // A status or an error code alone justifies rendering the panel.
+  if (fileRun.runStatus !== undefined) return true;
+  if (fileRun.runErrorCode !== undefined) return true;
+  // Otherwise require an output file or a non-empty stdout/stderr preview.
+  const nonEmpty = (preview: string | undefined): boolean =>
+    preview !== undefined && preview.length > 0;
+  if ((fileRun.runOutputFiles ?? []).length > 0) return true;
+  return (
+    nonEmpty(fileRun.runStderrPreview) || nonEmpty(fileRun.runStdoutPreview)
+  );
+}
+
+/**
+ * Inner body of an execution panel: optional header (label + status badge),
+ * error block, output-file chips, then stdout / stderr tails. stdout is
+ * live-tailed while installing/running; stderr also while `failed`. Shared
+ * by the entry-file panel and each collapsed secondary for identical chrome.
+ */
+export function RunResultDetails({
+  fileRun,
+  stale,
+  showHeader = true,
+  headerLabel,
+}: {
+  fileRun: RunFileProjection;
+  /**
+   * Source has been edited after this run's snapshot. Status/progress and
+   * stdout/stderr still render — only the status badge picks up a stale
+   * chip so the user knows the content reflects an earlier revision.
+   */
+  stale: boolean;
+  showHeader?: boolean;
+  /** Header text (defaults to `canvas.runStarted`). */
+  headerLabel?: string;
+}) {
+  const { t } = useT('chat');
+  const {
+    runStatus,
+    runProgress,
+    runErrorCode,
+    runErrorMessage,
+    runStdoutPreview: stdout,
+    runStderrPreview: stderr,
+  } = fileRun;
+  const outputFiles: RunOutputFile[] = (fileRun.runOutputFiles ?? []).map(
+    (f) => {
+      const next: RunOutputFile = {
+        name: f.name,
+        size: f.size,
+        contentType: f.contentType,
+        fileMetadataId: f.fileMetadataId,
+      };
+      if (f.storageId !== undefined) next.storageId = f.storageId;
+      return next;
+    },
+  );
+
+  return (
+    <div className="flex flex-col gap-3">
+      {showHeader && (
+        <div className="flex items-center justify-between">
+          <span className="text-muted-foreground text-xs font-medium uppercase">
+            {headerLabel ?? t('canvas.runStarted')}
+          </span>
+          <StatusBadge
+            runStatus={runStatus}
+            runProgress={runProgress}
+            stale={stale}
+          />
+        </div>
+      )}
+
+      {runErrorCode && (
+        <div
+          className="border-destructive/30 bg-destructive/5 text-destructive rounded-md border p-2 text-xs"
+          role="alert"
+        >
+          <div className="font-semibold">
+            {t(`canvas.runErrorCode.${runErrorCode}`)}
+          </div>
+          {runErrorMessage && (
+            <div className="mt-1 break-words">{runErrorMessage}</div>
+          )}
+        </div>
+      )}
+
+      {outputFiles.length > 0 && (
+        <div className="flex flex-col gap-2">
+          <span className="text-muted-foreground text-xs font-medium">
+            {t('canvas.runFiles')}
+          </span>
+          {outputFiles.map((f) => (
+            <FileChip key={String(f.fileMetadataId)} file={f} />
+          ))}
+        </div>
+      )}
+
+      {stdout && stdout.length > 0 && (
+        <LiveTailDetails
+          text={stdout}
+          label={t('canvas.runStdout', { chars: stdout.length })}
+          liveTail={runStatus === 'installing' || runStatus === 'running'}
+          preClassName="bg-muted/40 mt-1 max-h-40 overflow-auto rounded p-2 font-mono whitespace-pre-wrap"
+        />
+      )}
+
+      {stderr && stderr.length > 0 && (
+        <LiveTailDetails
+          text={stderr}
+          label={t('canvas.runStderr', { chars: stderr.length })}
+          liveTail={
+            runStatus === 'installing' ||
+            runStatus === 'running' ||
+            runStatus === 'failed'
+          }
+          preClassName="bg-muted/40 text-destructive mt-1 max-h-40 overflow-auto rounded p-2 font-mono whitespace-pre-wrap"
+        />
+      )}
+    </div>
+  );
+}
diff --git a/services/platform/app/features/chat/components/canvas/run-result-panel.tsx b/services/platform/app/features/chat/components/canvas/run-result-panel.tsx
new file mode 100644
index 000000000..cfe53516c
--- /dev/null
+++ b/services/platform/app/features/chat/components/canvas/run-result-panel.tsx
@@ -0,0 +1,116 @@
+'use client';
+
+// Canvas-level fixture that surfaces artifact run results, independent of
+// which file the user has selected in the sidebar. The entry file's run
+// renders as the primary panel (always visible when there is anything to
+// show); other files' runs collapse into an "Outputs for other files"
+// section below.
+//
+// Previously this lived inside `CanvasRunnableCodeRenderer` and was keyed
+// by `activePath`, so switching to a sibling file made the entry's
+// download chip disappear. Hoisting it to canvas-pane decouples the run
+// display from sidebar selection.
+
+import { useQuery } from 'convex/react';
+
+import { CollapsibleDetails } from '@/app/components/ui/navigation/collapsible-details';
+import { api } from '@/convex/_generated/api';
+import type { Id } from '@/convex/_generated/dataModel';
+import { useT } from '@/lib/i18n/client';
+
+import {
+  RunResultDetails,
+  StatusBadge,
+  hasAnythingToShow,
+  isStale,
+  type RunFileProjection,
+} from './run-result-helpers';
+
+interface RunResultPanelProps {
+  artifactId: Id<'artifacts'>; // argument to the listRunsPerFile query
+  artifactRevision: number; // compared with each run's runRevision (isStale)
+  entryFile: string; // path whose run renders as the primary panel
+}
+
+export function RunResultPanel({
+  artifactId,
+  artifactRevision,
+  entryFile,
+}: RunResultPanelProps) {
+  const { t } = useT('chat');
+  const runs: RunFileProjection[] | undefined = useQuery(
+    api.artifacts.queries.listRunsPerFile,
+    { artifactId },
+  );
+  if (runs === undefined || runs.length === 0) return null;
+
+  // listRunsPerFile is expected to order entry-first, but we partition by
+  // path so the split never depends on that ordering.
+  const entryRun = runs.find((r) => r.path === entryFile);
+  const secondaryRuns = runs.filter((r) => r.path !== entryFile);
+
+  // "Anything to show" gate per file. Stale runs still render — the badge
+  // picks up a "Source edited" chip but the content stays visible, so the
+  // user can review what their previous run produced even after editing
+  // the source.
+  const entryStale = isStale(entryRun, artifactRevision);
+  const entryHasContent = hasAnythingToShow(entryRun);
+  const visibleSecondaries = secondaryRuns
+    .map((run) => ({
+      run,
+      stale: isStale(run, artifactRevision),
+      hasContent: hasAnythingToShow(run),
+    }))
+    .filter((s) => s.hasContent);
+
+  if (!entryHasContent && visibleSecondaries.length === 0) return null;
+
+  return (
+    <div className="border-border bg-muted/10 flex shrink-0 flex-col gap-4 overflow-auto border-b p-4">
+      {entryHasContent && entryRun && (
+        <RunResultDetails
+          fileRun={entryRun}
+          stale={entryStale}
+          headerLabel={t('canvas.runResultEntryLabel')}
+        />
+      )}
+
+      {visibleSecondaries.length > 0 && (
+        <div className="flex flex-col gap-2">
+          {/* Header doubles as a count chip; pluralised for L10n. */}
+          <span className="text-muted-foreground text-xs font-medium uppercase">
+            {t('canvas.runResultSecondaryCount', {
+              count: visibleSecondaries.length,
+            })}
+          </span>
+          {visibleSecondaries.map(({ run, stale }) => (
+            <CollapsibleDetails
+              key={String(run.executionId)}
+              variant="compact"
+              summary={
+                <span className="flex min-w-0 flex-1 items-center gap-2">
+                  <span className="truncate font-mono">
+                    {t('canvas.runResultSecondaryLabel', { path: run.path })}
+                  </span>
+                  <StatusBadge
+                    runStatus={run.runStatus}
+                    runProgress={run.runProgress}
+                    stale={stale}
+                  />
+                </span>
+              }
+            >
+              <div className="mt-2 ml-5">
+                <RunResultDetails
+                  fileRun={run}
+                  stale={stale}
+                  showHeader={false}
+                />
+              </div>
+            </CollapsibleDetails>
+          ))}
+        </div>
+      )}
+    </div>
+  );
+}
diff --git a/services/platform/app/features/chat/components/message-bubble.tsx b/services/platform/app/features/chat/components/message-bubble.tsx
index 8fe9d71c0..400855cb9 100644
--- a/services/platform/app/features/chat/components/message-bubble.tsx
+++ b/services/platform/app/features/chat/components/message-bubble.tsx
@@ -4,14 +4,9 @@ import { Badge } from '@tale/ui/badge';
 import { Button } from '@tale/ui/button';
 import { useQuery } from 'convex/react';
 import {
-  Code,
   CopyIcon,
   CheckIcon,
-  FileText,
-  GitBranch,
   GitFork,
-  Globe,
-  Image as ImageIcon,
   Info,
   Pencil,
   Bookmark,
@@ -22,7 +17,6 @@ import {
 } from 'lucide-react';
 import {
   ComponentPropsWithoutRef,
-  type ComponentType,
   useRef,
   useState,
   useEffect,
@@ -55,7 +49,8 @@ import { injectCitationTags } from '../utils/inject-citation-tags';
 import { sanitizeChatError } from '../utils/sanitize-chat-error';
 import { AssistantMessageContent } from './assistant-message-content';
 import { BlockedNotice } from './blocked-notice';
-import { type CanvasContentType, useCanvas } from './canvas/canvas-context';
+import { useCanvas } from './canvas/canvas-context';
+import { CANVAS_TYPE_ICONS } from './canvas/icon-map';
 import {
   FileAttachmentDisplay,
   FilePartDisplay,
@@ -96,17 +91,6 @@ interface MessageBubbleProps extends ComponentPropsWithoutRef<'div'> {
   isFreshSinceMount?: boolean;
 }
 
-const ARTIFACT_PILL_ICONS: Record<
-  CanvasContentType,
-  ComponentType<{ className?: string }>
-> = {
-  code: Code,
-  html: Globe,
-  mermaid: GitBranch,
-  svg: ImageIcon,
-  markdown: FileText,
-};
-
 interface MessageArtifactPillsProps {
   organizationId: string;
   threadId: string;
@@ -114,7 +98,7 @@ interface MessageArtifactPillsProps {
 }
 
 /**
- * Inline chips that surface artifact_create / artifact_edit tool calls inside
+ * Inline chips that surface artifact_create / file_* tool calls inside
  * the assistant bubble — without them, the only signal an artifact was just
  * touched is the ArtifactBar at the top of the chat, which is easy to miss
  * mid-conversation. We piggyback on the bar's `listByThread` subscription
@@ -146,7 +130,7 @@ function MessageArtifactPillsComponent({
   return (
     <div className="mt-2 flex flex-wrap gap-1.5">
       {matches.map((artifact) => {
-        const Icon = ARTIFACT_PILL_ICONS[artifact.type];
+        const Icon = CANVAS_TYPE_ICONS[artifact.type];
         return (
           <button
             key={artifact._id}
@@ -155,14 +139,15 @@ function MessageArtifactPillsComponent({
             className="hover:bg-muted/60 border-border inline-flex items-center gap-1.5 rounded-md border px-2 py-1 text-xs transition-colors"
             aria-label={t('artifacts.touchedByMessage', {
               title: artifact.title,
-              revision: artifact.revision,
             })}
           >
             <Icon className="text-muted-foreground size-3.5" aria-hidden />
             <span className="max-w-[16rem] truncate">{artifact.title}</span>
-            <Badge variant="outline" className="h-4 px-1 text-[10px]">
-              v{artifact.revision}
-            </Badge>
+            {artifact.fileCount > 1 && (
+              <Badge variant="outline" className="h-4 px-1 text-[10px]">
+                {t('artifacts.fileCount', { count: artifact.fileCount })}
+              </Badge>
+            )}
           </button>
         );
       })}
diff --git a/services/platform/app/features/chat/hooks/use-chat-video-links.ts b/services/platform/app/features/chat/hooks/use-chat-video-links.ts
index b08ccb200..189d4d9b9 100644
--- a/services/platform/app/features/chat/hooks/use-chat-video-links.ts
+++ b/services/platform/app/features/chat/hooks/use-chat-video-links.ts
@@ -261,7 +261,32 @@ export function useChatVideoLinks(args: {
 
   const cancelJob = useCallback(
     async (jobId: Id<'videoLinkJobs'>) => {
-      await cancelMutation({ jobId });
+      // Hide the chip first so the click feels instant. The server
+      // mutation flips status='skipped' (including for terminal rows),
+      // but the subscription's re-emit lags the round-trip by 50-200ms
+      // — the local hide bridges that gap so the X feels immediate.
+      // Reverted on mutation failure (catch block below).
+      setHideJobIds((prev) => {
+        if (prev.has(jobId)) return prev;
+        const next = new Set(prev);
+        next.add(jobId);
+        return next;
+      });
+      try {
+        await cancelMutation({ jobId });
+      } catch (err) {
+        setHideJobIds((prev) => {
+          if (!prev.has(jobId)) return prev;
+          const next = new Set(prev);
+          next.delete(jobId);
+          return next;
+        });
+        console.error(
+          '[useChatVideoLinks] cancel failed:',
+          err instanceof Error ? err.message : err,
+        );
+        throw err;
+      }
     },
     [cancelMutation],
   );
diff --git a/services/platform/app/features/chat/hooks/use-stop-generating.test.ts b/services/platform/app/features/chat/hooks/use-stop-generating.test.ts
index 52ca3aecd..dcadaa445 100644
--- a/services/platform/app/features/chat/hooks/use-stop-generating.test.ts
+++ b/services/platform/app/features/chat/hooks/use-stop-generating.test.ts
@@ -1,6 +1,6 @@
 // @vitest-environment jsdom
-import { renderHook, act } from '@testing-library/react';
-import { describe, it, expect, vi, beforeEach } from 'vitest';
+import { act, renderHook } from '@testing-library/react';
+import { beforeEach, describe, expect, it, vi } from 'vitest';
 
 const mockMutateAsync = vi.fn();
 
@@ -9,12 +9,12 @@ vi.mock('./mutations', () => ({
 }));
 
 const mockFreezeActiveStream = vi.fn();
-const mockConsumeFrozenDisplayText = vi.fn();
+const mockConsumeFrozenDisplayLength = vi.fn();
 const mockResetGlobalFreeze = vi.fn();
 
 vi.mock('./use-stream-buffer', () => ({
   freezeActiveStream: (...args: unknown[]) => mockFreezeActiveStream(...args),
-  consumeFrozenDisplayText: () => mockConsumeFrozenDisplayText(),
+  consumeFrozenDisplayLength: () => mockConsumeFrozenDisplayLength(),
   resetGlobalFreeze: () => mockResetGlobalFreeze(),
 }));
 
@@ -28,11 +28,11 @@ describe('useStopGenerating — happy path', () => {
   beforeEach(() => {
     vi.clearAllMocks();
     mockMutateAsync.mockResolvedValue(null);
-    mockConsumeFrozenDisplayText.mockReturnValue(null);
+    mockConsumeFrozenDisplayLength.mockReturnValue(null);
   });
 
-  it('calls freezeActiveStream, consumeFrozenDisplayText, and cancelGeneration on stop', () => {
-    mockConsumeFrozenDisplayText.mockReturnValue('Hello, this is partial');
+  it('calls freezeActiveStream, consumeFrozenDisplayLength, and cancelGeneration on stop', () => {
+    mockConsumeFrozenDisplayLength.mockReturnValue(22);
 
     const { result } = renderHook(() =>
       useStopGenerating({ threadId: 'thread-1' }),
@@ -41,15 +41,15 @@ describe('useStopGenerating — happy path', () => {
     act(() => result.current.stopGenerating());
 
     expect(mockFreezeActiveStream).toHaveBeenCalledOnce();
-    expect(mockConsumeFrozenDisplayText).toHaveBeenCalledOnce();
+    expect(mockConsumeFrozenDisplayLength).toHaveBeenCalledOnce();
     expect(mockMutateAsync).toHaveBeenCalledWith({
       threadId: 'thread-1',
-      displayedContent: 'Hello, this is partial',
+      displayedLength: 22,
     });
   });
 
-  it('passes null displayedContent when no text was captured', () => {
-    mockConsumeFrozenDisplayText.mockReturnValue(null);
+  it('passes null displayedLength when no length was captured', () => {
+    mockConsumeFrozenDisplayLength.mockReturnValue(null);
 
     const { result } = renderHook(() =>
       useStopGenerating({ threadId: 'thread-1' }),
@@ -59,16 +59,16 @@ describe('useStopGenerating — happy path', () => {
 
     expect(mockMutateAsync).toHaveBeenCalledWith({
       threadId: 'thread-1',
-      displayedContent: null,
+      displayedLength: null,
     });
   });
 
   it('calls operations in the correct order: freeze → consume → mutate', () => {
     const callOrder: string[] = [];
     mockFreezeActiveStream.mockImplementation(() => callOrder.push('freeze'));
-    mockConsumeFrozenDisplayText.mockImplementation(() => {
+    mockConsumeFrozenDisplayLength.mockImplementation(() => {
       callOrder.push('consume');
-      return 'partial text';
+      return 12;
     });
     mockMutateAsync.mockImplementation(() => {
       callOrder.push('mutate');
@@ -132,7 +132,7 @@ describe('useStopGenerating — edge cases', () => {
   beforeEach(() => {
     vi.clearAllMocks();
     mockMutateAsync.mockResolvedValue(null);
-    mockConsumeFrozenDisplayText.mockReturnValue(null);
+    mockConsumeFrozenDisplayLength.mockReturnValue(null);
   });
 
   it('does nothing when threadId is undefined', () => {
@@ -158,8 +158,8 @@ describe('useStopGenerating — edge cases', () => {
     expect(mockMutateAsync).toHaveBeenCalledOnce();
   });
 
-  it('passes empty string displayedContent through (not treated as null)', () => {
-    mockConsumeFrozenDisplayText.mockReturnValue('');
+  it('passes displayedLength=0 through (not coerced to null) — backend treats it as no-snapshot', () => {
+    mockConsumeFrozenDisplayLength.mockReturnValue(0);
 
     const { result } = renderHook(() =>
       useStopGenerating({ threadId: 'thread-1' }),
@@ -169,11 +169,11 @@ describe('useStopGenerating — edge cases', () => {
 
     expect(mockMutateAsync).toHaveBeenCalledWith({
       threadId: 'thread-1',
-      displayedContent: '',
+      displayedLength: 0,
     });
   });
 
-  it('does not crash when mutation rejects', async () => {
+  it('does not crash when mutation rejects', () => {
     mockMutateAsync.mockRejectedValue(new Error('Network error'));
 
     const { result } = renderHook(() =>
@@ -185,7 +185,7 @@ describe('useStopGenerating — edge cases', () => {
 
     // freeze and consume should still have been called
     expect(mockFreezeActiveStream).toHaveBeenCalledOnce();
-    expect(mockConsumeFrozenDisplayText).toHaveBeenCalledOnce();
+    expect(mockConsumeFrozenDisplayLength).toHaveBeenCalledOnce();
   });
 
   it('resetCancelled is idempotent (calling multiple times is safe)', () => {
@@ -195,14 +195,12 @@ describe('useStopGenerating — edge cases', () => {
 
     act(() => result.current.stopGenerating());
 
-    // Reset multiple times
     act(() => {
       result.current.resetCancelled();
       result.current.resetCancelled();
       result.current.resetCancelled();
     });
 
-    // Should be able to stop again (exactly once)
     act(() => result.current.stopGenerating());
     expect(mockMutateAsync).toHaveBeenCalledTimes(2);
   });
@@ -213,11 +211,9 @@ describe('useStopGenerating — edge cases', () => {
       { initialProps: { threadId: undefined as string | undefined } },
     );
 
-    // First try with undefined — should do nothing
     act(() => result.current.stopGenerating());
     expect(mockMutateAsync).not.toHaveBeenCalled();
 
-    // Now provide a threadId
     rerender({ threadId: 'thread-1' });
     act(() => result.current.stopGenerating());
     expect(mockMutateAsync).toHaveBeenCalledOnce();
@@ -232,17 +228,14 @@ describe('useStopGenerating — edge cases', () => {
     act(() => result.current.stopGenerating());
     expect(mockMutateAsync).toHaveBeenCalledOnce();
 
-    // Switch threadId without resetting cancelled
     rerender({ threadId: 'thread-2' });
     act(() => result.current.stopGenerating());
 
-    // Should still be blocked by cancelled flag
     expect(mockMutateAsync).toHaveBeenCalledOnce();
   });
 
-  it('passes long displayedContent without truncation', () => {
-    const longContent = 'A'.repeat(10000);
-    mockConsumeFrozenDisplayText.mockReturnValue(longContent);
+  it('passes large displayedLength without modification', () => {
+    mockConsumeFrozenDisplayLength.mockReturnValue(100000);
 
     const { result } = renderHook(() =>
       useStopGenerating({ threadId: 'thread-1' }),
@@ -252,7 +245,7 @@ describe('useStopGenerating — edge cases', () => {
 
     expect(mockMutateAsync).toHaveBeenCalledWith({
       threadId: 'thread-1',
-      displayedContent: longContent,
+      displayedLength: 100000,
     });
   });
 });
diff --git a/services/platform/app/features/chat/hooks/use-stop-generating.ts b/services/platform/app/features/chat/hooks/use-stop-generating.ts
index caa41a157..6ec300098 100644
--- a/services/platform/app/features/chat/hooks/use-stop-generating.ts
+++ b/services/platform/app/features/chat/hooks/use-stop-generating.ts
@@ -2,7 +2,7 @@ import { useCallback, useRef } from 'react';
 
 import { useCancelGeneration } from './mutations';
 import {
-  consumeFrozenDisplayText,
+  consumeFrozenDisplayLength,
   freezeActiveStream,
   resetGlobalFreeze,
 } from './use-stream-buffer';
@@ -37,18 +37,22 @@ export function useStopGenerating({
     if (!threadId || cancelledRef.current) return;
 
     // 1. Freeze the display immediately (client-side, synchronous).
-    //    This also snapshots the currently displayed text.
+    //    This also snapshots the currently displayed length.
     freezeActiveStream();
 
     // 2. Set optimistic cancelled flag
     cancelledRef.current = true;
 
-    // 3. Grab the displayed text captured at freeze time
-    const displayedContent = consumeFrozenDisplayText();
+    // 3. Grab the displayed length captured at freeze time. We send the
+    //    char count (not the content string) so the backend can truncate
+    //    the persisted message in-place — preserving structured parts
+    //    (file/image cards, reasoning, tool calls) that would otherwise
+    //    be wiped if we re-sent a flat string.
+    const displayedLength = consumeFrozenDisplayLength();
 
     // 4. Fire backend mutation to abort active streams and truncate
     //    the message to match what the user saw.
-    void cancelGeneration({ threadId, displayedContent });
+    void cancelGeneration({ threadId, displayedLength });
   }, [threadId, cancelGeneration]);
 
   const resetCancelled = useCallback(() => {
diff --git a/services/platform/app/features/chat/hooks/use-stream-buffer.ts b/services/platform/app/features/chat/hooks/use-stream-buffer.ts
index fb2c03e40..a28f6cd8a 100644
--- a/services/platform/app/features/chat/hooks/use-stream-buffer.ts
+++ b/services/platform/app/features/chat/hooks/use-stream-buffer.ts
@@ -228,6 +228,11 @@ export function clearDisplayPositionCache() {
 
 let globalFrozen = false;
 let frozenDisplayText: string | null = null;
+// Snapshotted displayed length at freeze time (in chars of the active
+// typewriter's text). Read by the stop-generating flow so the backend can
+// truncate the persisted message content WITHOUT having to flatten its
+// structured parts. Cleared by `consumeFrozenDisplayLength()`.
+let frozenDisplayLength: number | null = null;
 
 // The active streaming hook instance registers its refs here so
 // freezeActiveStream() can snapshot the displayed text and cancel animation.
@@ -266,6 +271,7 @@ export function freezeActiveStream() {
       0,
       activeDisplayedLengthRef.current,
     );
+    frozenDisplayLength = activeDisplayedLengthRef.current;
   }
 }
 
@@ -283,6 +289,7 @@ export function isStreamFrozen() {
 export function resetGlobalFreeze() {
   globalFrozen = false;
   frozenDisplayText = null;
+  frozenDisplayLength = null;
   if (activeFrozenRef) {
     activeFrozenRef.current = false;
   }
@@ -301,6 +308,19 @@ export function consumeFrozenDisplayText(): string | null {
   return text;
 }
 
+/**
+ * Take-once read of the displayed length (chars) snapshotted at freeze time;
+ * null when none is pending. Lets cancel-generation ask the backend to
+ * truncate by position, preserving file / reasoning / tool-call parts.
+ */
+export function consumeFrozenDisplayLength(): number | null {
+  try {
+    return frozenDisplayLength;
+  } finally {
+    frozenDisplayLength = null;
+  }
+}
+
 // ============================================================================
 // UTILITY FUNCTIONS
 // ============================================================================
diff --git a/services/platform/convex/_generated/api.d.ts b/services/platform/convex/_generated/api.d.ts
index 378f662ea..faefa53d2 100644
--- a/services/platform/convex/_generated/api.d.ts
+++ b/services/platform/convex/_generated/api.d.ts
@@ -13,9 +13,17 @@ import type * as accounts_queries from "../accounts/queries.js";
 import type * as accounts_types from "../accounts/types.js";
 import type * as accounts_validators from "../accounts/validators.js";
 import type * as agent_tools_approval_shared from "../agent_tools/approval_shared.js";
-import type * as agent_tools_artifacts_apply_patches from "../agent_tools/artifacts/apply_patches.js";
+import type * as agent_tools_artifacts__packages_helper from "../agent_tools/artifacts/_packages_helper.js";
 import type * as agent_tools_artifacts_artifact_create_tool from "../agent_tools/artifacts/artifact_create_tool.js";
-import type * as agent_tools_artifacts_artifact_edit_tool from "../agent_tools/artifacts/artifact_edit_tool.js";
+import type * as agent_tools_artifacts_artifact_file_create_tool from "../agent_tools/artifacts/artifact_file_create_tool.js";
+import type * as agent_tools_artifacts_artifact_file_delete_tool from "../agent_tools/artifacts/artifact_file_delete_tool.js";
+import type * as agent_tools_artifacts_artifact_file_list_tool from "../agent_tools/artifacts/artifact_file_list_tool.js";
+import type * as agent_tools_artifacts_artifact_file_read_tool from "../agent_tools/artifacts/artifact_file_read_tool.js";
+import type * as agent_tools_artifacts_artifact_file_rename_tool from "../agent_tools/artifacts/artifact_file_rename_tool.js";
+import type * as agent_tools_artifacts_artifact_file_update_tool from "../agent_tools/artifacts/artifact_file_update_tool.js";
+import type * as agent_tools_artifacts_artifact_list_tool from "../agent_tools/artifacts/artifact_list_tool.js";
+import type * as agent_tools_artifacts_artifact_packages_add_tool from "../agent_tools/artifacts/artifact_packages_add_tool.js";
+import type * as agent_tools_artifacts_artifact_run_tool from "../agent_tools/artifacts/artifact_run_tool.js";
 import type * as agent_tools_artifacts_shared from "../agent_tools/artifacts/shared.js";
 import type * as agent_tools_artifacts_stream_state from "../agent_tools/artifacts/stream_state.js";
 import type * as agent_tools_conversations_conversation_read_tool from "../agent_tools/conversations/conversation_read_tool.js";
@@ -160,10 +168,15 @@ import type * as approvals_mutations from "../approvals/mutations.js";
 import type * as approvals_queries from "../approvals/queries.js";
 import type * as approvals_types from "../approvals/types.js";
 import type * as approvals_validators from "../approvals/validators.js";
+import type * as artifacts_handlers_content_edits from "../artifacts/handlers/content_edits.js";
+import type * as artifacts_handlers_run_state from "../artifacts/handlers/run_state.js";
+import type * as artifacts_handlers_shared from "../artifacts/handlers/shared.js";
+import type * as artifacts_handlers_streaming from "../artifacts/handlers/streaming.js";
 import type * as artifacts_internal_mutations from "../artifacts/internal_mutations.js";
 import type * as artifacts_internal_queries from "../artifacts/internal_queries.js";
 import type * as artifacts_mutations from "../artifacts/mutations.js";
 import type * as artifacts_queries from "../artifacts/queries.js";
+import type * as artifacts_resolve_files from "../artifacts/resolve_files.js";
 import type * as artifacts_snapshot_for_branch from "../artifacts/snapshot_for_branch.js";
 import type * as audit_logs_actions from "../audit_logs/actions.js";
 import type * as audit_logs_export_audit_logs from "../audit_logs/export_audit_logs.js";
@@ -542,6 +555,8 @@ import type * as message_metadata_internal_mutations from "../message_metadata/i
 import type * as message_metadata_queries from "../message_metadata/queries.js";
 import type * as migrations from "../migrations.js";
 import type * as migrations_backfill_apikey_reference_id from "../migrations/backfill_apikey_reference_id.js";
+import type * as migrations_backfill_artifact_files from "../migrations/backfill_artifact_files.js";
+import type * as migrations_backfill_artifact_files_table from "../migrations/backfill_artifact_files_table.js";
 import type * as migrations_backfill_file_metadata_document_id from "../migrations/backfill_file_metadata_document_id.js";
 import type * as migrations_backfill_folder_path from "../migrations/backfill_folder_path.js";
 import type * as migrations_backfill_folders from "../migrations/backfill_folders.js";
@@ -572,6 +587,8 @@ import type * as node_only_integration_sandbox_helpers_url_rewrite from "../node
 import type * as node_only_integration_sandbox_helpers_validate_host from "../node_only/integration_sandbox/helpers/validate_host.js";
 import type * as node_only_integration_sandbox_internal_actions from "../node_only/integration_sandbox/internal_actions.js";
 import type * as node_only_integration_sandbox_types from "../node_only/integration_sandbox/types.js";
+import type * as node_only_sandbox_helpers_spawner_client from "../node_only/sandbox/helpers/spawner_client.js";
+import type * as node_only_sandbox_internal_actions from "../node_only/sandbox/internal_actions.js";
 import type * as node_only_sql_helpers_execute_mssql_query from "../node_only/sql/helpers/execute_mssql_query.js";
 import type * as node_only_sql_helpers_execute_mysql_query from "../node_only/sql/helpers/execute_mysql_query.js";
 import type * as node_only_sql_helpers_execute_postgres_query from "../node_only/sql/helpers/execute_postgres_query.js";
@@ -675,6 +692,10 @@ import type * as providers_file_utils from "../providers/file_utils.js";
 import type * as providers_resolve_model from "../providers/resolve_model.js";
 import type * as providers_secret_io from "../providers/secret_io.js";
 import type * as providers_validators from "../providers/validators.js";
+import type * as sandbox_internal_mutations from "../sandbox/internal_mutations.js";
+import type * as sandbox_output_mutations from "../sandbox/output_mutations.js";
+import type * as sandbox_sandbox_http from "../sandbox/sandbox_http.js";
+import type * as sandbox_wire from "../sandbox/wire.js";
 import type * as sso_providers_actions from "../sso_providers/actions.js";
 import type * as sso_providers_create_user_session from "../sso_providers/create_user_session.js";
 import type * as sso_providers_entra_id_adapter from "../sso_providers/entra_id/adapter.js";
@@ -753,6 +774,7 @@ import type * as threads_queries from "../threads/queries.js";
 import type * as threads_rest_api from "../threads/rest_api.js";
 import type * as threads_restore_chat_thread from "../threads/restore_chat_thread.js";
 import type * as threads_share_thread from "../threads/share_thread.js";
+import type * as threads_truncate_message_content from "../threads/truncate_message_content.js";
 import type * as threads_types from "../threads/types.js";
 import type * as threads_update_chat_thread from "../threads/update_chat_thread.js";
 import type * as threads_validators from "../threads/validators.js";
@@ -1082,9 +1104,17 @@ declare const fullApi: ApiFromModules<{
   "accounts/types": typeof accounts_types;
   "accounts/validators": typeof accounts_validators;
   "agent_tools/approval_shared": typeof agent_tools_approval_shared;
-  "agent_tools/artifacts/apply_patches": typeof agent_tools_artifacts_apply_patches;
+  "agent_tools/artifacts/_packages_helper": typeof agent_tools_artifacts__packages_helper;
   "agent_tools/artifacts/artifact_create_tool": typeof agent_tools_artifacts_artifact_create_tool;
-  "agent_tools/artifacts/artifact_edit_tool": typeof agent_tools_artifacts_artifact_edit_tool;
+  "agent_tools/artifacts/artifact_file_create_tool": typeof agent_tools_artifacts_artifact_file_create_tool;
+  "agent_tools/artifacts/artifact_file_delete_tool": typeof agent_tools_artifacts_artifact_file_delete_tool;
+  "agent_tools/artifacts/artifact_file_list_tool": typeof agent_tools_artifacts_artifact_file_list_tool;
+  "agent_tools/artifacts/artifact_file_read_tool": typeof agent_tools_artifacts_artifact_file_read_tool;
+  "agent_tools/artifacts/artifact_file_rename_tool": typeof agent_tools_artifacts_artifact_file_rename_tool;
+  "agent_tools/artifacts/artifact_file_update_tool": typeof agent_tools_artifacts_artifact_file_update_tool;
+  "agent_tools/artifacts/artifact_list_tool": typeof agent_tools_artifacts_artifact_list_tool;
+  "agent_tools/artifacts/artifact_packages_add_tool": typeof agent_tools_artifacts_artifact_packages_add_tool;
+  "agent_tools/artifacts/artifact_run_tool": typeof agent_tools_artifacts_artifact_run_tool;
   "agent_tools/artifacts/shared": typeof agent_tools_artifacts_shared;
   "agent_tools/artifacts/stream_state": typeof agent_tools_artifacts_stream_state;
   "agent_tools/conversations/conversation_read_tool": typeof agent_tools_conversations_conversation_read_tool;
@@ -1229,10 +1259,15 @@ declare const fullApi: ApiFromModules<{
   "approvals/queries": typeof approvals_queries;
   "approvals/types": typeof approvals_types;
   "approvals/validators": typeof approvals_validators;
+  "artifacts/handlers/content_edits": typeof artifacts_handlers_content_edits;
+  "artifacts/handlers/run_state": typeof artifacts_handlers_run_state;
+  "artifacts/handlers/shared": typeof artifacts_handlers_shared;
+  "artifacts/handlers/streaming": typeof artifacts_handlers_streaming;
   "artifacts/internal_mutations": typeof artifacts_internal_mutations;
   "artifacts/internal_queries": typeof artifacts_internal_queries;
   "artifacts/mutations": typeof artifacts_mutations;
   "artifacts/queries": typeof artifacts_queries;
+  "artifacts/resolve_files": typeof artifacts_resolve_files;
   "artifacts/snapshot_for_branch": typeof artifacts_snapshot_for_branch;
   "audit_logs/actions": typeof audit_logs_actions;
   "audit_logs/export_audit_logs": typeof audit_logs_export_audit_logs;
@@ -1611,6 +1646,8 @@ declare const fullApi: ApiFromModules<{
   "message_metadata/queries": typeof message_metadata_queries;
   migrations: typeof migrations;
   "migrations/backfill_apikey_reference_id": typeof migrations_backfill_apikey_reference_id;
+  "migrations/backfill_artifact_files": typeof migrations_backfill_artifact_files;
+  "migrations/backfill_artifact_files_table": typeof migrations_backfill_artifact_files_table;
   "migrations/backfill_file_metadata_document_id": typeof migrations_backfill_file_metadata_document_id;
   "migrations/backfill_folder_path": typeof migrations_backfill_folder_path;
   "migrations/backfill_folders": typeof migrations_backfill_folders;
@@ -1641,6 +1678,8 @@ declare const fullApi: ApiFromModules<{
   "node_only/integration_sandbox/helpers/validate_host": typeof node_only_integration_sandbox_helpers_validate_host;
   "node_only/integration_sandbox/internal_actions": typeof node_only_integration_sandbox_internal_actions;
   "node_only/integration_sandbox/types": typeof node_only_integration_sandbox_types;
+  "node_only/sandbox/helpers/spawner_client": typeof node_only_sandbox_helpers_spawner_client;
+  "node_only/sandbox/internal_actions": typeof node_only_sandbox_internal_actions;
   "node_only/sql/helpers/execute_mssql_query": typeof node_only_sql_helpers_execute_mssql_query;
   "node_only/sql/helpers/execute_mysql_query": typeof node_only_sql_helpers_execute_mysql_query;
   "node_only/sql/helpers/execute_postgres_query": typeof node_only_sql_helpers_execute_postgres_query;
@@ -1744,6 +1783,10 @@ declare const fullApi: ApiFromModules<{
   "providers/resolve_model": typeof providers_resolve_model;
   "providers/secret_io": typeof providers_secret_io;
   "providers/validators": typeof providers_validators;
+  "sandbox/internal_mutations": typeof sandbox_internal_mutations;
+  "sandbox/output_mutations": typeof sandbox_output_mutations;
+  "sandbox/sandbox_http": typeof sandbox_sandbox_http;
+  "sandbox/wire": typeof sandbox_wire;
   "sso_providers/actions": typeof sso_providers_actions;
   "sso_providers/create_user_session": typeof sso_providers_create_user_session;
   "sso_providers/entra_id/adapter": typeof sso_providers_entra_id_adapter;
@@ -1822,6 +1865,7 @@ declare const fullApi: ApiFromModules<{
   "threads/rest_api": typeof threads_rest_api;
   "threads/restore_chat_thread": typeof threads_restore_chat_thread;
   "threads/share_thread": typeof threads_share_thread;
+  "threads/truncate_message_content": typeof threads_truncate_message_content;
   "threads/types": typeof threads_types;
   "threads/update_chat_thread": typeof threads_update_chat_thread;
   "threads/validators": typeof threads_validators;
diff --git a/services/platform/convex/agent_tools/artifacts/_packages_helper.ts b/services/platform/convex/agent_tools/artifacts/_packages_helper.ts
new file mode 100644
index 000000000..f1e62ecc8
--- /dev/null
+++ b/services/platform/convex/agent_tools/artifacts/_packages_helper.ts
@@ -0,0 +1,69 @@
+/**
+ * Shared helper used by the `artifact_file_create` / `artifact_file_update` tools to union
+ * `packages_add` into an artifact's persistent `runPackages` list as a
+ * best-effort side-effect.
+ *
+ * Best-effort: a failure to update packages is logged but does not flip the
+ * caller's success status. Returns a human-readable suffix the caller can
+ * append to its success message (empty string when no-op).
+ */
+
+import type { ToolCtx } from '@convex-dev/agent';
+
+import { internal } from '../../_generated/api';
+import type { Id } from '../../_generated/dataModel';
+
+export async function applyPackagesAddIfAny(
+  ctx: ToolCtx,
+  artifactId: Id<'artifacts'>,
+  isRunnable: boolean,
+  packagesAdd: readonly string[] | undefined,
+): Promise<string> {
+  // No-op for non-runnable artifacts and for an absent or empty request.
+  if (!isRunnable || !packagesAdd?.length) return '';
+  try {
+    const { added } = await ctx.runMutation(
+      internal.artifacts.internal_mutations.addArtifactPackages,
+      { artifactId, packagesAdd: [...packagesAdd] },
+    );
+    if (added.length === 0) return '';
+    return ` Added ${added.length} package${added.length === 1 ? '' : 's'} to runPackages: ${added.join(', ')}.`;
+  } catch (err) {
+    console.warn('[packages_add] addArtifactPackages failed:', err);
+    return '';
+  }
+}
+
+/**
+ * Checks whether the given string-valued field's literal has fully closed in
+ * the raw JSON accumulator. `parsePartialJson` will happily auto-close an
+ * in-flight string (e.g. `"path":"c` gets repaired to `"path":"c"`), but
+ * acting on those intermediate values is bad in two known cases:
+ *   - `path`: every keystroke of the filename would be committed as
+ *     `streamingPath`, flickering the canvas FILES panel.
+ *   - `artifactId`: every partial ID is fed to a Convex query whose
+ *     `v.id("artifacts")` validator rejects it, spamming WARN logs.
+ *
+ * We require the value's closing `"` to physically exist in the accumulator
+ * before treating the field as stable. Once stable it cannot regress in this
+ * stream (JSON values are written linearly), so this is a one-way gate.
+ */
+export function isStringFieldClosed(
+  accumulator: string,
+  fieldName: string,
+): boolean {
+  // Escape regex metacharacters so the field name is matched literally.
+  const literalName = fieldName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+  const opener = new RegExp(`"${literalName}"\\s*:\\s*"`).exec(accumulator);
+  if (opener === null) return false;
+  // Walk the value from just past its opening quote. A backslash also
+  // consumes the character after it, so an escaped `\"` is never the close.
+  const start = opener.index + opener[0].length;
+  for (let pos = start; pos < accumulator.length; pos += 1) {
+    const current = accumulator[pos];
+    if (current === '"') return true;
+    if (current === '\\') pos += 1;
+  }
+  // Reached the end without a closing quote: the literal is still open.
+  return false;
+}
diff --git a/services/platform/convex/agent_tools/artifacts/apply_patches.test.ts b/services/platform/convex/agent_tools/artifacts/apply_patches.test.ts
deleted file mode 100644
index 9965c9a42..000000000
--- a/services/platform/convex/agent_tools/artifacts/apply_patches.test.ts
+++ /dev/null
@@ -1,152 +0,0 @@
-import { describe, expect, it } from 'vitest';
-
-import { applyPatches, applySinglePatch } from './apply_patches';
-
-describe('applySinglePatch', () => {
-  it('replaces a unique exact match', () => {
-    const result = applySinglePatch('hello world', {
-      search: 'world',
-      replace: 'there',
-    });
-    expect(result).toEqual({ ok: true, content: 'hello there' });
-  });
-
-  it('rejects when search has zero matches', () => {
-    const result = applySinglePatch('hello world', {
-      search: 'goodbye',
-      replace: 'there',
-    });
-    expect(result.ok).toBe(false);
-    if (!result.ok) expect(result.error).toContain('0 times');
-  });
-
-  it('rejects when search has multiple matches', () => {
-    const result = applySinglePatch('foo foo foo', {
-      search: 'foo',
-      replace: 'bar',
-    });
-    expect(result.ok).toBe(false);
-    if (!result.ok) expect(result.error).toContain('more than once');
-  });
-
-  it('rejects empty search', () => {
-    const result = applySinglePatch('anything', {
-      search: '',
-      replace: 'x',
-    });
-    expect(result.ok).toBe(false);
-    if (!result.ok) expect(result.error).toContain('empty');
-  });
-
-  it('preserves surrounding whitespace and newlines', () => {
-    const content = 'line one\n  let x = 1;\nline three';
-    const result = applySinglePatch(content, {
-      search: '  let x = 1;',
-      replace: '  let x = 42;',
-    });
-    expect(result).toEqual({
-      ok: true,
-      content: 'line one\n  let x = 42;\nline three',
-    });
-  });
-
-  it('handles multi-line search blocks', () => {
-    const content = 'function add(a, b) {\n  return a + b;\n}\n';
-    const result = applySinglePatch(content, {
-      search: 'function add(a, b) {\n  return a + b;\n}',
-      replace: 'function add(a, b) {\n  return a + b + 1;\n}',
-    });
-    expect(result.ok).toBe(true);
-    if (result.ok) expect(result.content).toContain('a + b + 1');
-  });
-
-  it('flags self-overlapping search as ambiguous (the "aa" in "aaa" case)', () => {
-    const result = applySinglePatch('aaa', { search: 'aa', replace: 'X' });
-    expect(result.ok).toBe(false);
-    if (!result.ok) expect(result.error).toContain('more than once');
-  });
-
-  it('treats CRLF and LF as distinct (LF search misses CRLF content)', () => {
-    const result = applySinglePatch('a\r\nb', { search: 'a\nb', replace: 'X' });
-    expect(result.ok).toBe(false);
-    if (!result.ok) expect(result.error).toContain('0 times');
-  });
-
-  it('deletes the matched range when replace is empty', () => {
-    const result = applySinglePatch('hello, world', {
-      search: ', world',
-      replace: '',
-    });
-    expect(result).toEqual({ ok: true, content: 'hello' });
-  });
-
-  it('matches at the start of the content', () => {
-    const result = applySinglePatch('start middle end', {
-      search: 'start',
-      replace: 'begin',
-    });
-    expect(result).toEqual({ ok: true, content: 'begin middle end' });
-  });
-
-  it('matches at the very end of the content', () => {
-    const result = applySinglePatch('start middle end', {
-      search: 'end',
-      replace: 'finish',
-    });
-    expect(result).toEqual({ ok: true, content: 'start middle finish' });
-  });
-});
-
-describe('applyPatches', () => {
-  it('applies multiple patches sequentially', () => {
-    const result = applyPatches('one two three', [
-      { search: 'one', replace: '1' },
-      { search: 'two', replace: '2' },
-      { search: 'three', replace: '3' },
-    ]);
-    expect(result).toEqual({ ok: true, content: '1 2 3' });
-  });
-
-  it('lets a later patch match text introduced by an earlier patch', () => {
-    const result = applyPatches('alpha', [
-      { search: 'alpha', replace: 'beta' },
-      { search: 'beta', replace: 'gamma' },
-    ]);
-    expect(result).toEqual({ ok: true, content: 'gamma' });
-  });
-
-  it('reports failedIndex on first failing patch', () => {
-    const result = applyPatches('one two three', [
-      { search: 'one', replace: '1' },
-      { search: 'four', replace: '4' },
-      { search: 'three', replace: '3' },
-    ]);
-    expect(result.ok).toBe(false);
-    if (!result.ok) {
-      expect(result.failedIndex).toBe(1);
-      expect(result.error).toContain('0 times');
-    }
-  });
-
-  it('returns content unchanged on empty patch list', () => {
-    expect(applyPatches('hello', [])).toEqual({ ok: true, content: 'hello' });
-  });
-
-  it('rejects ambiguous patch even if a later one would disambiguate', () => {
-    const result = applyPatches('foo foo', [{ search: 'foo', replace: 'bar' }]);
-    expect(result.ok).toBe(false);
-    if (!result.ok) expect(result.failedIndex).toBe(0);
-  });
-
-  it('does not re-scan a replacement that creates a new match', () => {
-    // The first patch turns "a" into "aa". The second pass walks forward
-    // from the post-replace cursor in `applyPatches`, but `applySinglePatch`
-    // is invoked fresh for each patch — so matching "aa" against "aa" is
-    // unique and should succeed.
-    const result = applyPatches('a', [
-      { search: 'a', replace: 'aa' },
-      { search: 'aa', replace: 'b' },
-    ]);
-    expect(result).toEqual({ ok: true, content: 'b' });
-  });
-});
diff --git a/services/platform/convex/agent_tools/artifacts/apply_patches.ts b/services/platform/convex/agent_tools/artifacts/apply_patches.ts
deleted file mode 100644
index 847679188..000000000
--- a/services/platform/convex/agent_tools/artifacts/apply_patches.ts
+++ /dev/null
@@ -1,71 +0,0 @@
-/**
- * Pure function that applies search/replace patches to a string.
- *
- * Each patch must match its `search` block exactly once in the current
- * content — zero matches and multiple matches both fail. Patches apply
- * sequentially: patch N+1 operates on the output of patch N.
- *
- * Used both authoritatively (when `artifact_edit` finishes its tool call)
- * and optimistically (per-patch during streaming, before the tool's
- * `execute` returns). Keeping the function pure makes the second pass safe.
- */
-
-export interface ArtifactPatch {
-  search: string;
-  replace: string;
-}
-
-export type ApplyPatchesResult =
-  | { ok: true; content: string }
-  | { ok: false; error: string; failedIndex: number };
-
-export function applyPatches(
-  content: string,
-  patches: readonly ArtifactPatch[],
-): ApplyPatchesResult {
-  let current = content;
-  for (let i = 0; i < patches.length; i += 1) {
-    const result = applySinglePatch(current, patches[i]);
-    if (!result.ok) {
-      return { ok: false, error: result.error, failedIndex: i };
-    }
-    current = result.content;
-  }
-  return { ok: true, content: current };
-}
-
-export function applySinglePatch(
-  content: string,
-  patch: ArtifactPatch,
-): { ok: true; content: string } | { ok: false; error: string } {
-  if (patch.search.length === 0) {
-    return {
-      ok: false,
-      error:
-        'search block is empty — refusing to apply (would match anywhere). Provide a non-empty unique snippet.',
-    };
-  }
-
-  const firstIndex = content.indexOf(patch.search);
-  if (firstIndex === -1) {
-    return {
-      ok: false,
-      error: `search block matched 0 times. Either the artifact has changed or the snippet is wrong. Re-read the artifact and emit a snippet that appears verbatim.`,
-    };
-  }
-
-  // Probe at firstIndex + 1 (not + search.length) so a self-overlapping
-  // search string like "aa" inside "aaa" is correctly flagged as ambiguous
-  // — the second match starts at index 1, which the wider stride misses.
-  const secondIndex = content.indexOf(patch.search, firstIndex + 1);
-  if (secondIndex !== -1) {
-    return {
-      ok: false,
-      error: `search block matched more than once. Add surrounding context until the snippet is unique.`,
-    };
-  }
-
-  const before = content.slice(0, firstIndex);
-  const after = content.slice(firstIndex + patch.search.length);
-  return { ok: true, content: before + patch.replace + after };
-}
diff --git a/services/platform/convex/agent_tools/artifacts/artifact_create_tool.ts b/services/platform/convex/agent_tools/artifacts/artifact_create_tool.ts
index 895d3dd3b..ea8331530 100644
--- a/services/platform/convex/agent_tools/artifacts/artifact_create_tool.ts
+++ b/services/platform/convex/agent_tools/artifacts/artifact_create_tool.ts
@@ -1,71 +1,111 @@
 /**
  * Convex Tool: artifact_create
  *
- * Creates a new editable, runnable artifact (HTML / SVG / markdown /
- * mermaid / code) inside the current chat thread. The artifact lives in
- * the `artifacts` table — separate from the message stream — so a single
- * logical document can be patched across many turns via `artifact_edit`
- * without re-emitting its content.
+ * Creates a new artifact project — OR returns the existing one with full
+ * state on title collision. **Synchronous metadata-only**: no streaming
+ * hooks, no `content` argument. The row lands directly at revision 1 with
+ * an empty entry file. To populate the content, the LLM follows up with
+ * `artifact_file_update({artifactId, path: entryFile, content, expectedRevision: 1})`
+ * for the entry file and `artifact_file_create` for any sibling modules.
  *
- * Streaming: while the LLM emits the tool's input JSON, this tool inserts
- * a placeholder row as soon as `type` and `title` parse, then writes the
- * partial `content` to the row's `streamingContent` shadow field with
- * a small throttle. The final settle happens in `execute`.
+ * Idempotency: dedup on `(threadId, type, normalized-title)`. Second call
+ * with the same identity returns the existing `artifactId` and `isNew: false`.
+ * Same-message guard: a second call within the same assistant reply gets
+ * `{conflict: 'already_created_in_message', existingArtifactId, ...}` so the
+ * model switches to `artifact_file_create` / `artifact_file_update` against the existing
+ * artifact instead of spawning a duplicate project.
  */
 
 import type { ToolCtx } from '@convex-dev/agent';
 import { createTool } from '@convex-dev/agent';
 import type { ToolExecutionOptions } from 'ai';
-import { parsePartialJson } from 'ai';
 import { z } from 'zod/v4';
 
 import { internal } from '../../_generated/api';
 import type { ToolDefinition } from '../types';
-import { artifactTypeEnum, isValidArtifactType } from './shared';
-import {
-  clearState,
-  getState,
-  initState,
-  markParsed,
-  scheduleStreamingFlush,
-  shouldParse,
-} from './stream_state';
+import { isRunnableArtifactType, refinePackagesObject } from './shared';
+
+// The LLM-facing `artifact_create` no longer exposes the legacy
+// single-runtime types. New runnable artifacts uniformly land at
+// `script_runnable`; the per-file runtime is then chosen by extension at
+// run time. The legacy literals stay in the schema validator so existing
+// rows continue to validate (see [feedback_deprecate_dont_delete_schema_fields]).
+const artifactCreateTypeEnum = z.enum([
+  'html',
+  'svg',
+  'markdown',
+  'mermaid',
+  'code',
+  'script_runnable',
+]);
 
 const artifactCreateArgs = z.object({
-  type: artifactTypeEnum.describe(
-    'Artifact type. `html` and `svg` render as a runnable preview in the Canvas pane; `markdown` and `mermaid` render formatted; `code` is a plain syntax-highlighted snippet.',
+  type: artifactCreateTypeEnum.describe(
+    'Artifact type. `html` renders in a sandboxed iframe; `svg` inline; `markdown`/`mermaid` rendered formatted; `code` syntax-highlighted; `script_runnable` executes server-side in the sandbox — each file runs with the interpreter implied by its extension (`.py` → python3, `.js`/`.cjs`/`.mjs` → node), so one artifact can mix Python and Node files.',
   ),
   title: z
     .string()
     .min(1)
     .max(120)
-    .describe('Short human-readable title shown on the artifact card.'),
-  content: z
-    .string()
-    .min(1)
     .describe(
-      'Full content of the artifact. For `html`, a complete HTML document including <!doctype html> and any inline <script>/<style>. For `svg`, a complete <svg>…</svg> root.',
+      'Short human-readable title shown on the artifact card. Acts as the identity key — a second `artifact_create` with the same title returns the existing artifactId.',
     ),
   language: z
     .string()
     .max(40)
     .optional()
     .describe(
-      'Optional language hint when type=`code` (e.g. "ts", "python"). Ignored for other types.',
+      'Optional language hint. For `code` artifacts it picks the syntax-highlight hint and default extension. For `script_runnable` it nudges the default entry file: "python"/"py" → `main.py`, "javascript"/"js"/"node" → `main.js` (default: `main.py`). You can still add the other-language files via `artifact_file_create` regardless of the hint.',
+    ),
+  entryFile: z
+    .string()
+    .min(1)
+    .max(200)
+    .optional()
+    .describe(
+      'Optional entry-file path override. Defaults: html→index.html, script_runnable→main.py (or main.js when `language` hints node), mermaid→diagram.mmd, svg→image.svg, markdown→README.md, code→main.<ext>.',
     ),
+  packages: z
+    .object({
+      python: z
+        .array(z.string().max(120))
+        .max(20)
+        .optional()
+        .describe('Pip specs (e.g. `markitdown[pptx]`, `requests==2.31.0`).'),
+      node: z
+        .array(z.string().max(120))
+        .max(20)
+        .optional()
+        .describe('npm specs (e.g. `pptxgenjs`, `@anthropic/sdk@1.0.0`).'),
+    })
+    .optional()
+    .describe(
+      'Runnable type only. Per-runtime dependencies. `python` is installed via `uv pip`, `node` via `npm`. Either bucket may be omitted. Pinned versions strongly preferred. Examples: `{python: ["markitdown[pptx]"]}` for a Python-only artifact; `{node: ["pptxgenjs"]}` for Node-only; `{python: ["markitdown[pptx]"], node: ["pptxgenjs"]}` for polyglot. Installs run with `pip --only-binary=:all:` and `npm --ignore-scripts`.',
+    )
+    .superRefine((val, ctx) => {
+      refinePackagesObject(val, (issue) => ctx.addIssue(issue));
+    }),
 });
 
 type ArtifactCreateInput = z.infer<typeof artifactCreateArgs>;
 
 interface ArtifactCreateSuccess {
   success: true;
+  isNew: boolean;
   artifactId: string;
   revision: number;
+  entryFile: string;
+  filePaths: string[];
   message: string;
 }
 
 interface ArtifactCreateFailure {
   success: false;
+  conflict?: 'type_mismatch' | 'already_created_in_message';
+  existingArtifactId?: string;
+  existingType?: string;
+  existingTitle?: string;
+  existingFiles?: string[];
   message: string;
 }
 
@@ -74,229 +114,192 @@ type ArtifactCreateResult = ArtifactCreateSuccess | ArtifactCreateFailure;
 export const artifactCreateTool = {
   name: 'artifact_create' as const,
   tool: createTool({
-    description: `**artifact_create** — create a new editable, runnable artifact in the chat thread.
+    description: `**artifact_create** — create an **empty** artifact project (a file tree the user can see in the Canvas pane). **Metadata only — no content argument.**
 
-USE THIS TOOL when the user asks for a runnable HTML page, an SVG illustration, a Mermaid diagram, a markdown document, or any code snippet you expect the user may want to revise. The artifact appears as a card in the chat that opens a side-panel (Canvas) editor + preview.
-
-**ARTIFACT TYPES:**
-- \`html\` — runnable HTML page (rendered in a sandboxed iframe).
-- \`svg\` — vector graphic (rendered inline).
-- \`markdown\` — long-form markdown document.
-- \`mermaid\` — diagram source (rendered as an SVG).
-- \`code\` — plain syntax-highlighted snippet. Use the \`language\` field for the highlight hint.
+**DEFAULT TO ONE ARTIFACT PER REPLY.** If the user asks for code + verification scripts, a document + helper tools, or any composite deliverable, those belong as sibling files of the **same** artifact (added via subsequent \`artifact_file_create\` calls). Calling \`artifact_create\` a second time in the same assistant message returns \`{success: false, conflict: 'already_created_in_message', existingArtifactId, existingType, existingTitle, existingFiles}\` with the existing project state — switch to \`artifact_file_create\` / \`artifact_file_update\` against \`existingArtifactId\` to add files there. **Only** call \`artifact_create\` a second time in the same reply if the user explicitly asked for two unrelated projects (e.g. "make an SVG AND a separate Python script for a different purpose").
 
-**ITERATION:**
-- After creating, refer back to the artifact by its \`artifactId\` in subsequent turns.
-- To revise it, call \`artifact_edit\` with the same \`artifactId\` — never re-emit the full content via another \`artifact_create\`.
-- Prefer small \`artifact_edit\` patches over rewrites: faster to stream, cheaper, less risk of regressing unrelated parts.
+USE THIS TOOL when the user asks for a runnable HTML page, an SVG illustration, a Mermaid diagram, a markdown document, a code snippet they may want to revise, or a Python / Node script you'll execute.
 
-**DO NOT use this tool for:**
-- Plain prose or conversational responses — write those directly in the message.
-- Files the user wants saved to the documents hub — use \`document_write\` (with a file-generation tool first).
-- Tabular data — emit a markdown table inline.
+**EMPTY ON CREATE — POPULATE VIA \`artifact_file_update\` / \`artifact_file_create\`.** The created artifact's entry file is empty at revision 1. **Immediately follow up** with file-level tools to write the actual content:
 
-**HTML LIBRARIES & FONTS** (only when \`type\` = \`html\`):
+- Overwrite the empty entry file with its full content via \`artifact_file_update\`:
+  \`\`\`
+  artifact_file_update({ artifactId, path: '<entryFile>', content: '<full content>', expectedRevision: 1 })
+  \`\`\`
+- Add helper / sibling files via \`artifact_file_create\`:
+  \`\`\`
+  artifact_file_create({ artifactId, path: 'helpers.py', content: '<...>', expectedRevision: 2 })
+  \`\`\`
 
-The preview iframe blocks ALL external resources via Content-Security-Policy. Do NOT use any \`https://\` URL inside \`<script>\`, \`<link>\`, \`<img>\`, \`@import\`, or \`url()\`. Specifically blocked: \`cdn.jsdelivr.net\`, \`unpkg.com\`, \`cdnjs.cloudflare.com\`, \`cdn.tailwindcss.com\`, \`fonts.googleapis.com\`, \`fonts.gstatic.com\`, and every other external host. Any reference to them will be blocked and the page will fail to render.
+There is no \`append\` and no \`patch\`. Write each file in full in one call; for runnable projects, split logically separate concerns into separate files (e.g. \`main.py\` + \`helpers.py\` + \`types.py\`) rather than packing everything into a single mega-file.
 
-**Use these same-origin local copies for libraries:**
-- reveal.js 5.x — \`/canvas-libs/reveal.js/5.0.5/reveal.js\`, \`/canvas-libs/reveal.js/5.0.5/reveal.css\`, theme \`/canvas-libs/reveal.js/5.0.5/theme/black.css\` (or \`white.css\`, \`league.css\`)
-- Chart.js 4.x — \`/canvas-libs/chart.js/4.4.0/chart.umd.js\`
-- D3 7.x — \`/canvas-libs/d3/7.8.5/d3.min.js\`
-- Tailwind (Play CDN equivalent) — \`/canvas-libs/tailwindcss-browser/4.2.4/tailwindcss.js\`
-- GSAP 3.x — \`/canvas-libs/gsap/3.12.5/gsap.min.js\`
+**IDEMPOTENT BY TITLE.** A second \`artifact_create\` with the same \`title\` in the same thread returns the existing artifactId with \`isNew: false\`. To populate / overwrite, use \`artifact_file_update\` against the returned \`artifactId\`.
 
-If you need a library that is not in this list, inline its source directly in the artifact.
+**ARTIFACT TYPES:**
+- \`html\` — runnable HTML page.
+- \`svg\` — vector graphic.
+- \`mermaid\` — diagram source.
+- \`script_runnable\` — script source (Python and / or Node files in the same project, dispatched per-extension). Pair with \`packages: {python?: string[], node?: string[]}\` if dependencies are needed, or call \`artifact_packages_add\` later.
+- \`markdown\` — long-form document.
+- \`code\` — syntax-highlighted snippet. Pair with \`language\` for the highlight hint.
 
-**For fonts, use system font stacks — never Google Fonts or any web-font CDN.** Modern OSes (macOS, Windows, iOS, Android, ChromeOS) ship CJK (Chinese / Japanese / Korean) fonts natively, so a plain system stack renders Chinese, Japanese, and Korean text correctly without any web font:
+**MULTI-FILE PROJECTS:** every artifact is a file map. \`artifact_create\` seeds one **empty** entry file. To add helper files (e.g. \`helpers.py\` alongside \`main.py\`), call \`artifact_file_create({artifactId, path: 'helpers.py', content, expectedRevision})\` after create.
 
-- General: \`font-family: system-ui, -apple-system, "Segoe UI", "Helvetica Neue", Arial, sans-serif;\`
-- Chinese-specific (optional refinement): \`font-family: system-ui, "PingFang SC", "Hiragino Sans GB", "Microsoft YaHei", "Source Han Sans SC", sans-serif;\`
-- Monospace: \`font-family: ui-monospace, "SF Mono", Menlo, Consolas, monospace;\`
+**ITERATION:** refer back via \`artifactId\` in subsequent calls. To revise existing content, call \`artifact_file_update\` — never \`artifact_create\` again (which is a no-op on existing titles).
 
-If the design absolutely requires a non-system display face, inline a base64-encoded \`@font-face\` (small subsets only).
+**HTML (type='html' only):**
 
-**RUNTIME ENVIRONMENT** (only when \`type\` = \`html\`):
+The preview iframe blocks ALL external resources via Content-Security-Policy. Use only these same-origin bundled libraries when populating via \`artifact_file_update\` / \`artifact_file_create\`:
+- reveal.js 5.x — \`/canvas-libs/reveal.js/5.0.5/reveal.js\`, \`/canvas-libs/reveal.js/5.0.5/reveal.css\`, theme \`/canvas-libs/reveal.js/5.0.5/theme/black.css\` (or \`white.css\`, \`league.css\`)
+- Chart.js 4.x — \`/canvas-libs/chart.js/4.4.0/chart.umd.js\`
+- D3 7.x — \`/canvas-libs/d3/7.8.5/d3.min.js\`
+- Tailwind — \`/canvas-libs/tailwindcss-browser/4.2.4/tailwindcss.js\`
+- GSAP 3.x — \`/canvas-libs/gsap/3.12.5/gsap.min.js\`
 
-The iframe is fully static and offline. There is **no backend, no fetchable API, no WebSocket** — \`fetch()\`, \`XMLHttpRequest\`, \`WebSocket\`, \`EventSource\`, and \`navigator.sendBeacon\` to any host (including \`localhost\`) are blocked by CSP \`connect-src 'self'\`.
+For fonts use system stacks; don't use web-font CDNs. The iframe is fully static — \`fetch()\` / \`XMLHttpRequest\` / \`WebSocket\` / \`EventSource\` are blocked. Sibling subresources (\`<link>\`, \`<script>\`, \`<img>\`) get inlined by the preview server. \`localStorage\` is per-iframe-load only.
 
-Therefore: features that require **runtime intelligence** — translating user input, scoring or correcting user output, conversational replies, language detection, summarisation, recommendation based on what the user just typed — **do not belong in an artifact**. Either handle them as normal chat replies, or redesign the page so it doesn't need a thinking backend at all (static reference content, fixed exercises with predetermined answers, deterministic visualisations / calculators / form layouts).
+**RUNNABLE TYPE** (\`script_runnable\`):
 
-**Do NOT fake AI features with hardcoded lookup tables or random output.** A "translation tool" backed by 30 baked-in phrases, a "feedback engine" backed by canned responses, a "personalised recommendation" picked at random — these produce hollow, demo-shaped pages that feel impressive at a glance and fall apart on first real use. If the user asks for something that genuinely needs intelligence, prefer to deliver it in chat rather than build a plausible-looking shell.
+Use \`artifact_file_update\` (entry file) / \`artifact_file_create\` (helper files) to populate source after create. The artifact's \`packages\` (passed at create time) is persisted for runs to reuse — to add more dependencies later, call \`artifact_packages_add\`. Output files must be written to \`/workspace/output/\` to be collected.
 
-\`localStorage\` and \`sessionStorage\` are available, but **in-memory and per-iframe-load only** — anything saved is lost the next time the artifact is rendered. Do not show "saved" / "remembered" / "记忆已保存" UI copy that implies persistence across sessions; treat storage as transient working memory, not durable state.
+Typical sequence:
+1. \`artifact_create({type: 'script_runnable', title: '…'})\` → empty main.py at revision 1
+2. \`artifact_file_update({artifactId, path: 'main.py', content: '<source>', expectedRevision: 1})\` to populate; \`artifact_file_create\` to add helper modules
+3. \`artifact_run({artifactId})\` to execute
+4. If failure, \`artifact_file_read\` to inspect, \`artifact_file_update\` to fix, then \`artifact_run\` again
 
-**RESPONSE:** returns the new \`artifactId\` and \`revision: 1\`. The artifact's content is rendered live in the Canvas pane as you stream it.`,
+**RESPONSE:** on success returns \`{isNew, artifactId, revision: 1, entryFile, filePaths, message}\` with a copy-pasteable next-step hint in \`message\`. On title collision \`isNew: false\` — full project state included so you can call \`artifact_file_update\` / \`artifact_file_create\` against the existing artifact. On title-but-type-mismatch: \`{conflict: 'type_mismatch', existingArtifactId, existingType}\`. On same-reply duplicate-create: \`{conflict: 'already_created_in_message', existingArtifactId, existingType, existingTitle, existingFiles}\` — switch to \`artifact_file_create\` / \`artifact_file_update\` against the existing project.`,
     inputSchema: artifactCreateArgs,
-    onInputStart: async (_ctx: ToolCtx, options: ToolExecutionOptions) => {
-      initState(options.toolCallId, 'artifact_create');
-    },
-    onInputDelta: async (
-      ctx: ToolCtx,
-      options: { inputTextDelta: string } & ToolExecutionOptions,
-    ) => {
-      const toolCallId = options.toolCallId;
-      const state = getState(options.toolCallId);
-      if (!state) return;
-      state.accumulator += options.inputTextDelta;
-
-      if (!shouldParse(state, state.accumulator.length)) return;
-      const parsed = await parsePartialJson(state.accumulator);
-      markParsed(state, state.accumulator.length);
-      if (
-        parsed.state !== 'successful-parse' &&
-        parsed.state !== 'repaired-parse'
-      ) {
-        return;
-      }
-      const partial = parsed.value;
-      if (
-        typeof partial !== 'object' ||
-        partial === null ||
-        Array.isArray(partial)
-      ) {
-        return;
-      }
-      const obj = partial as Record<string, unknown>;
-      const type = typeof obj.type === 'string' ? obj.type : undefined;
-      const title = typeof obj.title === 'string' ? obj.title : undefined;
-      const language =
-        typeof obj.language === 'string' ? obj.language : undefined;
-      // `content` is intentionally NOT extracted here — the streaming
-      // canvas reads it from the agent SDK's tool-input-delta rows directly.
-
-      const { organizationId, threadId, messageId } = ctx;
-      if (!organizationId || !threadId) return;
-
-      // Defer the placeholder insert until title has at least one character.
-      // partial-json returns title:"" the moment the parser sees `"title":`,
-      // before the actual characters arrive — inserting then would land an
-      // empty title in the row and we have no good moment later to know
-      // it has finished growing.
-      if (
-        !state.rowInitialized &&
-        type !== undefined &&
-        title !== undefined &&
-        title.length > 0 &&
-        isValidArtifactType(type)
-      ) {
-        const inserted = await ctx.runMutation(
-          internal.artifacts.internal_mutations.createArtifact,
-          {
-            organizationId,
-            threadId,
-            type,
-            title,
-            language,
-            // We no longer push partial content into `streamingContent` — the
-            // canvas reads tool-input-deltas directly from the agent SDK's
-            // streamDeltas, filtered by toolCallId, and decodes the JSON
-            // `content` value client-side. Insert with empty content; the
-            // canonical settle in execute() writes the final value.
-            content: '',
-            createdByMessageId: messageId ?? '',
-            liveStreamMode: 'create',
-            toolCallId,
-          },
-        );
-        state.artifactId = inserted.artifactId;
-        state.rowInitialized = true;
-        state.lastFlushedTitle = title;
-        state.lastFlushedLanguage = language;
-        return;
-      }
-
-      if (state.rowInitialized && state.artifactId !== undefined) {
-        // Only title / language flushes go through here now — content is
-        // delivered via streamDeltas (no per-chunk mutation from us).
-        const titleChanged =
-          title !== undefined && title !== state.lastFlushedTitle;
-        const languageChanged =
-          language !== undefined && language !== state.lastFlushedLanguage;
-
-        if (titleChanged || languageChanged) {
-          if (titleChanged) state.lastFlushedTitle = title;
-          if (languageChanged) state.lastFlushedLanguage = language;
-          const artifactId = state.artifactId;
-          const flushTitle = titleChanged ? title : undefined;
-          const flushLanguage = languageChanged ? language : undefined;
-          scheduleStreamingFlush(state, () =>
-            ctx.runMutation(
-              internal.artifacts.internal_mutations.updateStreamingContent,
-              {
-                artifactId,
-                title: flushTitle,
-                language: flushLanguage,
-              },
-            ),
-          );
-        }
-      }
-    },
     execute: async (
       ctx: ToolCtx,
       args: ArtifactCreateInput,
-      options: ToolExecutionOptions,
+      _options: ToolExecutionOptions,
     ): Promise<ArtifactCreateResult> => {
       const { organizationId, threadId, messageId } = ctx;
-      const state = getState(options.toolCallId);
-      try {
-        if (!organizationId || !threadId) {
-          if (state?.artifactId !== undefined) {
-            await ctx.runMutation(
-              internal.artifacts.internal_mutations.abortStream,
-              { artifactId: state.artifactId },
-            );
-          }
+      if (!organizationId || !threadId) {
+        return {
+          success: false,
+          message:
+            'artifact_create requires organizationId and threadId in the tool context.',
+        };
+      }
+      const createdByMessageId = messageId ?? '';
+
+      // Same-message guard: a reply that already produced an artifact should add files
+      // via `artifact_file_create` / `artifact_file_update`, not spawn a duplicate project.
+      // Gate on non-empty messageId — multi-step / sub-agent edge cases can fall back to ""
+      // and would otherwise cross-match every empty-string row in the thread.
+      // NOTE(review): any sibling hit is rejected, yet the messages allow a 2nd unrelated create — confirm.
+      if (createdByMessageId !== '') {
+        const sibling = await ctx.runQuery(
+          internal.artifacts.internal_queries.findArtifactByCreatedMessage,
+          { organizationId, threadId, createdByMessageId },
+        );
+        if (sibling !== null) {
+          const existingFiles =
+            sibling.files !== undefined
+              ? sibling.files.map((f) => f.path)
+              : sibling.entryFile !== undefined
+                ? [sibling.entryFile]
+                : [];
           return {
             success: false,
-            message:
-              'artifact_create requires organizationId and threadId in the tool context.',
+            conflict: 'already_created_in_message',
+            existingArtifactId: sibling._id,
+            existingType: sibling.type,
+            existingTitle: sibling.title,
+            existingFiles,
+            message: `An artifact "${sibling.title}" (${sibling.type}) was already created in this reply (artifactId: ${sibling._id}, files: ${existingFiles.join(', ') || '<none>'}, revision: ${sibling.revision}). To add files or content, call \`artifact_file_update({artifactId: "${sibling._id}", path: "<existing-path>", content: "...", expectedRevision: ${sibling.revision}})\` for existing files or \`artifact_file_create\` for new ones. Only call \`artifact_create\` again in this reply if the user explicitly asked for a second, unrelated project.`,
           };
         }
+      }
 
-        const editedByMessageId = messageId ?? '';
+      // Canonical create path: synchronous metadata insert. Always lands at
+      // revision 1 with an empty entry file. The LLM follows up with
+      // artifact_file_update / artifact_file_create to populate.
+      const result = await ctx.runMutation(
+        internal.artifacts.internal_mutations.createArtifact,
+        {
+          organizationId,
+          threadId,
+          type: args.type,
+          title: args.title,
+          language: args.language,
+          entryFile: args.entryFile,
+          createdByMessageId,
+        },
+      );
+
+      if (!result.success) {
+        return {
+          success: false,
+          conflict: result.conflict,
+          existingArtifactId: result.existingArtifactId,
+          existingType: result.existingType,
+          existingTitle: result.existingTitle,
+          existingFiles: result.existingFiles,
+          message: result.message,
+        };
+      }
 
-        if (state?.artifactId !== undefined) {
+      if (
+        isRunnableArtifactType(args.type) &&
+        args.packages !== undefined &&
+        result.isNew
+      ) {
+        // Persist into the grouped `runPackagesByLang` field. Mirror the
+        // entry-language bucket to the legacy flat `runPackages` field
+        // so single-runtime readers (legacy callers, audit row, canvas
+        // display) keep working unchanged.
+        const entryExt = result.entryFile.toLowerCase().split('.').pop();
+        const isPyEntry = entryExt === 'py'; // any non-`.py` entry falls through to the node bucket below
+        const pythonList = args.packages.python ?? [];
+        const nodeList = args.packages.node ?? [];
+        const flatList = isPyEntry ? pythonList : nodeList;
+        const hasGrouped = pythonList.length > 0 || nodeList.length > 0;
+        if (hasGrouped) { // flatList is always one of the two buckets, so a non-empty flatList implies hasGrouped
           await ctx.runMutation(
-            internal.artifacts.internal_mutations.finalizeStreamedCreate,
+            internal.artifacts.internal_mutations.setArtifactRunConfig,
             {
-              artifactId: state.artifactId,
-              title: args.title,
-              language: args.language,
-              content: args.content,
-              editedByMessageId,
+              artifactId: result.artifactId,
+              runPackages: flatList,
+              ...(hasGrouped && {
+                runPackagesByLang: {
+                  ...(pythonList.length > 0 && { python: pythonList }),
+                  ...(nodeList.length > 0 && { node: nodeList }),
+                },
+              }),
             },
           );
-          return {
-            success: true,
-            artifactId: state.artifactId,
-            revision: 1,
-            message: `Created artifact "${args.title}" (${args.type}, ${args.content.length} chars).`,
-          };
         }
+      }
 
-        const inserted = await ctx.runMutation(
-          internal.artifacts.internal_mutations.createArtifact,
-          {
-            organizationId,
-            threadId,
-            type: args.type,
-            title: args.title,
-            language: args.language,
-            content: args.content,
-            createdByMessageId: editedByMessageId,
-          },
-        );
+      const runHint = isRunnableArtifactType(args.type)
+        ? ` After populating, call \`artifact_run({artifactId: "${result.artifactId}"})\` to execute.`
+        : '';
+      const nextStep = `Call \`artifact_file_update({artifactId: "${result.artifactId}", path: "${result.entryFile}", content: "<full content>", expectedRevision: ${result.revision}})\` to populate the entry file. Add helper modules via \`artifact_file_create\` rather than packing everything into the entry file.`;
+
+      if (result.isNew) {
         return {
           success: true,
-          artifactId: inserted.artifactId,
-          revision: inserted.revision,
-          message: `Created artifact "${args.title}" (${args.type}, ${args.content.length} chars).`,
+          isNew: true,
+          artifactId: result.artifactId,
+          revision: result.revision,
+          entryFile: result.entryFile,
+          filePaths: [...result.filePaths],
+          message: `Created empty artifact "${args.title}" (${args.type}, ${result.filePaths.length} file(s)) at revision ${result.revision}. ${nextStep}${runHint}`,
         };
-      } finally {
-        clearState(options.toolCallId);
       }
+      const packagesNote = isRunnableArtifactType(args.type) && ((args.packages?.python?.length ?? 0) > 0 || (args.packages?.node?.length ?? 0) > 0) ? ' The `packages` argument was NOT applied to this existing artifact — add dependencies via `artifact_packages_add`.' : ''; // packages are only persisted when result.isNew; surface the drop instead of swallowing it
+      return {
+        success: true,
+        isNew: false,
+        artifactId: result.artifactId,
+        revision: result.revision,
+        entryFile: result.entryFile,
+        filePaths: [...result.filePaths],
+        message: `Artifact "${args.title}" already exists at revision ${result.revision} with entry file "${result.entryFile}" (${result.filePaths.length} file(s)). To modify, call \`artifact_file_update({artifactId: "${result.artifactId}", path: "${result.entryFile}", content: "<full content>", expectedRevision: ${result.revision}})\` or \`artifact_file_create\` for new files.${packagesNote}`,
+      };
     },
   }),
 } as const satisfies ToolDefinition;
diff --git a/services/platform/convex/agent_tools/artifacts/artifact_edit_tool.ts b/services/platform/convex/agent_tools/artifacts/artifact_edit_tool.ts
deleted file mode 100644
index 7350457d8..000000000
--- a/services/platform/convex/agent_tools/artifacts/artifact_edit_tool.ts
+++ /dev/null
@@ -1,371 +0,0 @@
-/**
- * Convex Tool: artifact_edit
- *
- * Modifies an existing artifact via either a list of search/replace
- * patches (`mode: 'patch'`) or a complete rewrite (`mode: 'rewrite'`).
- * Patch mode is preferred — it's smaller to stream and easier to validate.
- *
- * Streaming: `mode: 'patch'` shows a status badge while the LLM emits the
- * patch list; the actual content updates atomically when `execute` runs
- * (so half-emitted patches never partially mutate the document). For
- * `mode: 'rewrite'`, the partial content is mirrored to `streamingContent`
- * with throttling so the user sees live typing in the Canvas pane.
- */
-
-import type { ToolCtx } from '@convex-dev/agent';
-import { createTool } from '@convex-dev/agent';
-import type { ToolExecutionOptions } from 'ai';
-import { parsePartialJson } from 'ai';
-import { z } from 'zod/v4';
-
-import { getString, isRecord } from '../../../lib/utils/type-guards';
-import { internal } from '../../_generated/api';
-import { toId } from '../../lib/type_cast_helpers';
-import type { ToolDefinition } from '../types';
-import {
-  type StreamingPatchPair,
-  clearState,
-  getState,
-  initState,
-  markFlushedStreamingPatches,
-  markParsed,
-  scheduleStreamingFlush,
-  shouldFlushStreamingPatches,
-  shouldParse,
-} from './stream_state';
-
-const patchEntry = z.object({
-  search: z
-    .string()
-    .min(1)
-    .describe(
-      'Snippet that appears verbatim in the artifact and matches exactly once. Include enough surrounding context to make the snippet unique.',
-    ),
-  replace: z
-    .string()
-    .describe(
-      'Replacement text. Empty string deletes the search block entirely.',
-    ),
-});
-
-const patchModeArgs = z.object({
-  artifactId: z
-    .string()
-    .min(1)
-    .describe(
-      'Convex artifact ID returned by `artifact_create` (or referenced from the <artifacts> system context).',
-    ),
-  mode: z.literal('patch'),
-  patches: z
-    .array(patchEntry)
-    .min(1)
-    .max(20)
-    .describe(
-      'Ordered list of search/replace patches. Each patch operates on the result of the previous patch — so a later patch can match text introduced by an earlier one.',
-    ),
-});
-
-const rewriteModeArgs = z.object({
-  artifactId: z.string().min(1),
-  mode: z.literal('rewrite'),
-  content: z
-    .string()
-    .min(1)
-    .describe(
-      'Complete new artifact content. Use only when the change spans most of the file; otherwise prefer mode=`patch`.',
-    ),
-});
-
-const artifactEditArgs = z.discriminatedUnion('mode', [
-  patchModeArgs,
-  rewriteModeArgs,
-]);
-
-type ArtifactEditInput = z.infer<typeof artifactEditArgs>;
-
-interface ArtifactEditSuccess {
-  success: true;
-  artifactId: string;
-  revision: number;
-  applied: number;
-  content: string;
-  message: string;
-}
-
-interface ArtifactEditFailure {
-  success: false;
-  message: string;
-  failedIndex?: number;
-}
-
-type ArtifactEditResult = ArtifactEditSuccess | ArtifactEditFailure;
-
-export const artifactEditTool = {
-  name: 'artifact_edit' as const,
-  tool: createTool({
-    description: `**artifact_edit** — modify an existing artifact in place. Use this — never \`artifact_create\` — to revise an artifact you've already created.
-
-**MODES:**
-- \`patch\` (preferred) — list of search/replace blocks. Each \`search\` must appear in the artifact verbatim and match exactly once; if not, the tool returns an error and you should re-emit a more specific snippet. Patches apply sequentially.
-- \`rewrite\` — full replacement. Use only when more than ~50% of the file changes.
-
-**SEARCH/REPLACE RULES:**
-- The \`search\` block must match **exactly once** in the current artifact content. Zero matches and multiple matches both fail.
-- Include enough surrounding context (a unique line or two) to make the snippet unique.
-- Whitespace and newlines are significant. Do not normalise indentation.
-- Empty \`replace\` deletes the matched range.
-
-**ERROR HANDLING:**
-- If a patch fails ("matched 0 times" / "matched more than once"), re-read the current artifact content from the <artifacts> system context, then re-emit the failing patch with a more specific search block. Do not fall back to \`mode: 'rewrite'\` unless the change is genuinely large.
-
-**WHEN ADDING NEW FEATURES TO AN HTML ARTIFACT:** the same constraints from \`artifact_create\` apply — the iframe is offline (no \`fetch\` / WebSocket to any host), only the bundled \`/canvas-libs/*\` libraries are loadable, and features that need runtime intelligence (translate user input, score answers, conversational replies) belong in chat, not in the page. Don't introduce hardcoded lookup tables to fake AI behaviour.
-
-**RESPONSE:** returns the new \`revision\` number, how many patches were applied (\`applied\`), and the artifact's new \`content\` so you can reason about further edits in the same turn.`,
-    inputSchema: artifactEditArgs,
-    onInputStart: async (_ctx: ToolCtx, options: ToolExecutionOptions) => {
-      initState(options.toolCallId, 'artifact_edit');
-    },
-    onInputDelta: async (
-      ctx: ToolCtx,
-      options: { inputTextDelta: string } & ToolExecutionOptions,
-    ) => {
-      const state = getState(options.toolCallId);
-      if (!state) return;
-      state.accumulator += options.inputTextDelta;
-
-      if (!shouldParse(state, state.accumulator.length)) return;
-      const parsed = await parsePartialJson(state.accumulator);
-      markParsed(state, state.accumulator.length);
-      if (
-        parsed.state !== 'successful-parse' &&
-        parsed.state !== 'repaired-parse'
-      ) {
-        return;
-      }
-      const partial = parsed.value;
-      if (
-        typeof partial !== 'object' ||
-        partial === null ||
-        Array.isArray(partial)
-      ) {
-        return;
-      }
-      const obj = partial as Record<string, unknown>;
-      const artifactIdStr =
-        typeof obj.artifactId === 'string' ? obj.artifactId : undefined;
-      const mode = typeof obj.mode === 'string' ? obj.mode : undefined;
-
-      // Defer the lookup until `mode` is also in the parsed object —
-      // that's a structural signal the LLM closed the artifactId string
-      // and moved to the next field. Without this guard parsePartialJson
-      // hands back every streaming prefix ("k", "ks", "ks7", ...) and the
-      // Convex `v.id("artifacts")` validator rejects each one as a
-      // NonRetryableError that aborts the whole agent run.
-      if (
-        state.artifactId === undefined &&
-        artifactIdStr &&
-        mode !== undefined
-      ) {
-        try {
-          const artifactId = toId<'artifacts'>(artifactIdStr);
-          const artifact = await ctx.runQuery(
-            internal.artifacts.internal_queries.getById,
-            {
-              artifactId,
-              expectedOrganizationId: ctx.organizationId,
-              expectedThreadId: ctx.threadId,
-            },
-          );
-          if (!artifact) {
-            // Defer error reporting to execute — avoids silently no-oping
-            // when the LLM passes a bad ID; the tool result will explain.
-            return;
-          }
-          state.artifactId = artifactId;
-          state.baseContentLength = artifact.content.length;
-        } catch (err) {
-          // Malformed id (e.g. LLM hallucinated a token, or the parsed
-          // string is still partial despite the mode-field guard).
-          // Defer to execute for the canonical error message.
-          console.warn('[artifact_edit] preflight getById failed, deferring', {
-            artifactIdStr,
-            error: err instanceof Error ? err.message : String(err),
-          });
-          return;
-        }
-      }
-
-      if (
-        state.artifactId !== undefined &&
-        !state.rowInitialized &&
-        (mode === 'patch' || mode === 'rewrite')
-      ) {
-        state.resolvedMode = mode;
-        await ctx.runMutation(
-          internal.artifacts.internal_mutations.beginEditStream,
-          {
-            artifactId: state.artifactId,
-            liveStreamMode: mode,
-            // Stamp the toolCallId so the canvas can filter
-            // tool-input-deltas to this rewrite's stream. Patch mode also
-            // gets it for symmetry / debugging — patch flushes still go
-            // through `streamingPatches` independently.
-            toolCallId: options.toolCallId,
-          },
-        );
-        state.rowInitialized = true;
-      }
-
-      // Rewrite-mode partial content used to flush into `streamingContent`
-      // here; we now skip that. The canvas reads the same partial bytes from
-      // the agent SDK's tool-input-delta rows and decodes the JSON `content`
-      // field client-side. The canonical settle in execute() still writes
-      // the final `content` atomically via rewriteArtifact().
-
-      if (
-        state.resolvedMode === 'patch' &&
-        state.artifactId !== undefined &&
-        Array.isArray(obj.patches)
-      ) {
-        // Surface the partial patches as {search, replace} pairs so the
-        // Canvas pane can render an inline diff preview. We only push
-        // entries with a non-empty `search` — without that we cannot
-        // anchor the diff anywhere in the source. `replace` may still be
-        // streaming in (empty or partial); the renderer downgrades to a
-        // strikethrough-only mark in that case and upgrades to full diff
-        // once the replacement text arrives.
-        const pairs: StreamingPatchPair[] = [];
-        for (const item of obj.patches as readonly unknown[]) {
-          if (!isRecord(item)) continue;
-          const search = getString(item, 'search');
-          if (search === undefined || search.length === 0) continue;
-          const replace = getString(item, 'replace') ?? '';
-          pairs.push({ search, replace });
-        }
-        if (shouldFlushStreamingPatches(state, pairs)) {
-          markFlushedStreamingPatches(state, pairs);
-          const artifactId = state.artifactId;
-          const flushPairs = pairs;
-          scheduleStreamingFlush(state, () =>
-            ctx.runMutation(
-              internal.artifacts.internal_mutations.updateStreamingContent,
-              {
-                artifactId,
-                streamingPatches: flushPairs,
-              },
-            ),
-          );
-        }
-      }
-    },
-    execute: async (
-      ctx: ToolCtx,
-      args: ArtifactEditInput,
-      options: ToolExecutionOptions,
-    ): Promise<ArtifactEditResult> => {
-      const { messageId } = ctx;
-      const editedByMessageId = messageId ?? '';
-      const state = getState(options.toolCallId);
-      try {
-        const artifactId = toId<'artifacts'>(args.artifactId);
-        let artifact;
-        try {
-          artifact = await ctx.runQuery(
-            internal.artifacts.internal_queries.getById,
-            {
-              artifactId,
-              expectedOrganizationId: ctx.organizationId,
-              expectedThreadId: ctx.threadId,
-            },
-          );
-        } catch (err) {
-          // Convex `v.id("artifacts")` rejected the value — most often
-          // because the LLM hallucinated an id that doesn't match the
-          // expected format. Returning a tool-result error keeps the
-          // agent loop alive so the model can recover; throwing would
-          // abort the whole run as a NonRetryableError.
-          const message = err instanceof Error ? err.message : String(err);
-          return {
-            success: false,
-            message: `Artifact id "${args.artifactId}" is malformed: ${message}`,
-          };
-        }
-        if (!artifact) {
-          return {
-            success: false,
-            message: `Artifact ${args.artifactId} not found in this thread.`,
-          };
-        }
-
-        if (args.mode === 'patch') {
-          const result = await ctx.runMutation(
-            internal.artifacts.internal_mutations.applyToolPatches,
-            {
-              artifactId,
-              patches: args.patches,
-              editedByMessageId,
-              expectedRevision: artifact.revision,
-            },
-          );
-          if (!result.success) {
-            await ctx.runMutation(
-              internal.artifacts.internal_mutations.abortStream,
-              { artifactId },
-            );
-            return {
-              success: false,
-              message: result.stale
-                ? result.error
-                : `Patch ${result.failedIndex + 1} failed: ${result.error}`,
-              failedIndex: result.failedIndex,
-            };
-          }
-          return {
-            success: true,
-            artifactId: args.artifactId,
-            revision: result.revision,
-            applied: args.patches.length,
-            content: result.content,
-            message: `Applied ${args.patches.length} patch(es) to "${artifact.title}". New revision: ${result.revision}.`,
-          };
-        }
-
-        const result = await ctx.runMutation(
-          internal.artifacts.internal_mutations.rewriteArtifact,
-          {
-            artifactId,
-            content: args.content,
-            editedByMessageId,
-            expectedRevision: artifact.revision,
-          },
-        );
-        if (!result.success) {
-          await ctx.runMutation(
-            internal.artifacts.internal_mutations.abortStream,
-            { artifactId },
-          );
-          return { success: false, message: result.error };
-        }
-        return {
-          success: true,
-          artifactId: args.artifactId,
-          revision: result.revision,
-          applied: 1,
-          content: args.content,
-          message: `Rewrote "${artifact.title}". New revision: ${result.revision}.`,
-        };
-      } catch (err) {
-        if (state?.artifactId !== undefined) {
-          await ctx.runMutation(
-            internal.artifacts.internal_mutations.abortStream,
-            { artifactId: state.artifactId },
-          );
-        }
-        const message = err instanceof Error ? err.message : String(err);
-        return { success: false, message: `artifact_edit failed: ${message}` };
-      } finally {
-        clearState(options.toolCallId);
-      }
-    },
-  }),
-} as const satisfies ToolDefinition;
diff --git a/services/platform/convex/agent_tools/artifacts/artifact_file_create_tool.ts b/services/platform/convex/agent_tools/artifacts/artifact_file_create_tool.ts
new file mode 100644
index 000000000..2ba876d1c
--- /dev/null
+++ b/services/platform/convex/agent_tools/artifacts/artifact_file_create_tool.ts
@@ -0,0 +1,369 @@
+/**
+ * Convex Tool: artifact_file_create
+ *
+ * Add a NEW file to an artifact's project tree. Refused if `path` already
+ * exists (use `artifact_file_update` to overwrite). Streams content live to the
+ * canvas via the shared streaming mutations.
+ */
+
+import type { ToolCtx } from '@convex-dev/agent';
+import { createTool } from '@convex-dev/agent';
+import type { ToolExecutionOptions } from 'ai';
+import { parsePartialJson } from 'ai';
+import { z } from 'zod/v4';
+
+import { internal } from '../../_generated/api';
+import { toId } from '../../lib/type_cast_helpers';
+import type { ToolDefinition } from '../types';
+import { applyPackagesAddIfAny, isStringFieldClosed } from './_packages_helper';
+import {
+  InvalidArtifactPathError,
+  extractToolErrorShape,
+  isRunnableArtifactType,
+  validatePath,
+} from './shared';
+import {
+  clearState,
+  getState,
+  initState,
+  markFlushed,
+  markParsed,
+  shouldFlush,
+  shouldParse,
+} from './stream_state';
+
+const fileCreateArgs = z.object({
+  artifactId: z
+    .string()
+    .min(1)
+    .describe(
+      'Convex artifact ID returned by `artifact_create` (or referenced from the <artifacts> system context).',
+    ),
+  path: z
+    .string()
+    .min(1)
+    .max(200)
+    .describe(
+      'New file path inside the artifact. Must NOT already exist (use `artifact_file_update` to overwrite an existing file).',
+    ),
+  content: z
+    .string()
+    .describe(
+      'Complete content for the new file. Empty string is allowed (creates a placeholder).',
+    ),
+  expectedRevision: z
+    .number()
+    .int()
+    .nonnegative()
+    .describe(
+      'REQUIRED: the `revision="N"` attribute from the `<artifact>` block this create was authored against. OCC — rejects with `code: "stale"` and `currentRevision` if the artifact has moved.',
+    ),
+  packages_add: z
+    .array(z.string().max(120))
+    .max(20)
+    .optional()
+    .describe(
+      "Optional. Package names to UNION into the artifact's persistent `runPackages` list so the next `artifact_run` auto-installs them. Use when the new file imports a new dependency. Equivalent to a follow-up `artifact_packages_add` call.",
+    ),
+});
+
+type FileCreateInput = z.infer<typeof fileCreateArgs>;
+
+interface FileCreateSuccess {
+  success: true;
+  artifactId: string;
+  revision: number;
+  path: string;
+  byteLength: number;
+  message: string;
+}
+
+interface FileCreateFailure {
+  success: false;
+  code?: string;
+  message: string;
+  currentRevision?: number;
+}
+
+type FileCreateResult = FileCreateSuccess | FileCreateFailure;
+
+/**
+ * Runs an `abortStream` call on a best-effort basis. A rejection is logged
+ * and swallowed so it cannot replace the failure result `execute()` is about
+ * to return to the model.
+ */
+async function abortStreamQuietly(run: () => Promise<unknown>): Promise<void> {
+  try {
+    await run();
+  } catch (err) {
+    console.warn('[artifact_file_create] abortStream failed, ignoring', {
+      error: err instanceof Error ? err.message : String(err),
+    });
+  }
+}
+
+export const artifactFileCreateTool = {
+  name: 'artifact_file_create' as const,
+  tool: createTool({
+    description: `**artifact_file_create** — add a NEW file to an artifact's project tree. Streams content live to the canvas. Use this — NOT \`artifact_file_update\` — for paths that don't yet exist.
+
+**INPUTS:** \`artifactId\`, \`path\`, \`content\` (full file), \`expectedRevision\`, optional \`packages_add\`.
+
+**REFUSED ON** existing path (code: \`path_exists\`) — call \`artifact_file_update\` to overwrite, or pick a different name.
+
+**SIZE LIMIT (HARD):** The \`content\` field is sent as a JSON string literal inside this call's arguments — every byte of \`content\` consumes YOUR (the caller's) output token budget. If \`content\` exceeds your remaining budget, the arguments JSON gets truncated mid-string by \`max_tokens\` and the call fails with an unrecoverable parse error BEFORE this handler runs. To stay safe, keep any single \`content\` under ~40 KB (~1000 lines). When the file you want to write would exceed that, decide on a split BEFORE generating the call:
+ - Slide decks (pptxgenjs etc.) → \`main.js\` requires \`slide1.js\`, \`slide2.js\`, …, one builder per file.
+ - Long scripts → split by module/responsibility into multiple files (e.g. \`main.py\` + \`helpers.py\` + \`types.py\`).
+ - Long data tables → put each chunk in its own data file and import them.
+There is no \`append\` and no patch mode — splitting is the only way. This is a HARD limit of the calling protocol, not a soft preference. (Per-artifact aggregate cap is ~800 KB across all files.)
+
+**RUNNABLE ARTIFACTS:** if the new file imports a new dependency, set \`packages_add\` (or follow up with \`artifact_packages_add\`). Edits do NOT auto-execute — call \`artifact_run\` to re-run.
+
+**RESPONSE:** \`{revision, path, byteLength, message}\`. Errors carry \`code\` (\`not_found\`, \`stale\`, \`path_exists\`, \`too_large\`, \`too_many_files\`, \`duplicate_path\`, \`empty_project\`, \`invalid_path\`). Some failures (unhandled exceptions) come back with only \`message\`.`,
+    inputSchema: fileCreateArgs,
+    // Register per-call streaming state, keyed by `toolCallId`.
+    onInputStart: async (_ctx: ToolCtx, options: ToolExecutionOptions) => {
+      initState(options.toolCallId, 'artifact_file_create');
+    },
+    // Streaming preview: accumulate the raw argument JSON, parse it
+    // opportunistically, and mirror partial `content` to the canvas.
+    // Expected failures are logged and deferred to `execute()`.
+    onInputDelta: async (
+      ctx: ToolCtx,
+      options: { inputTextDelta: string } & ToolExecutionOptions,
+    ) => {
+      const state = getState(options.toolCallId);
+      if (!state) return;
+      // Sticky hard-fail: an earlier delta already rejected the path or
+      // failed `beginEditStream`. Stop here so neither is retried (and
+      // re-logged) on every subsequent delta; `execute()` reports the error.
+      if (state.streamingFailedHard) return;
+      state.accumulator += options.inputTextDelta;
+      if (!shouldParse(state, state.accumulator.length)) return;
+      const parsed = await parsePartialJson(state.accumulator);
+      markParsed(state, state.accumulator.length);
+      if (
+        parsed.state !== 'successful-parse' &&
+        parsed.state !== 'repaired-parse'
+      ) {
+        return;
+      }
+      const partial = parsed.value;
+      if (
+        typeof partial !== 'object' ||
+        partial === null ||
+        Array.isArray(partial)
+      ) {
+        return;
+      }
+      const obj = partial as Record<string, unknown>;
+      const artifactIdStr =
+        typeof obj.artifactId === 'string' ? obj.artifactId : undefined;
+      const path = typeof obj.path === 'string' ? obj.path : undefined;
+
+      if (
+        state.artifactId === undefined &&
+        artifactIdStr &&
+        isStringFieldClosed(state.accumulator, 'artifactId')
+      ) {
+        try {
+          const artifactId = toId<'artifacts'>(artifactIdStr);
+          const artifact = await ctx.runQuery(
+            internal.artifacts.internal_queries.getById,
+            {
+              artifactId,
+              expectedOrganizationId: ctx.organizationId,
+              expectedThreadId: ctx.threadId,
+            },
+          );
+          if (!artifact) return;
+          state.artifactId = artifactId;
+          state.baseContentLength = (artifact.content ?? '').length;
+        } catch (err) {
+          console.warn(
+            '[artifact_file_create] preflight getById failed, deferring',
+            {
+              artifactIdStr,
+              error: err instanceof Error ? err.message : String(err),
+            },
+          );
+          return;
+        }
+      }
+
+      if (
+        state.artifactId !== undefined &&
+        !state.rowInitialized &&
+        path !== undefined &&
+        path.length > 0 &&
+        isStringFieldClosed(state.accumulator, 'path')
+      ) {
+        // Pre-validate the path BEFORE issuing beginEditStream — that
+        // mutation runs `validatePath()` itself, so a malformed path
+        // (`..`, backslash, control chars, etc.) would throw mid-stream
+        // and the bare catch below would log WARN on every subsequent
+        // delta. Set the sticky hard-fail flag instead so `execute()`
+        // surfaces the structured failure once (audit follow-up F9).
+        try {
+          validatePath(path);
+        } catch (err) {
+          if (err instanceof InvalidArtifactPathError) {
+            state.streamingFailedHard = true;
+            console.warn(
+              '[artifact_file_create] streaming-preflight rejected invalid path',
+              { path, code: err.code },
+            );
+            return;
+          }
+          throw err;
+        }
+        state.resolvedMode = 'rewrite';
+        try {
+          await ctx.runMutation(
+            internal.artifacts.internal_mutations.beginEditStream,
+            {
+              artifactId: state.artifactId,
+              liveStreamMode: 'rewrite',
+              streamingPath: path,
+              toolCallId: options.toolCallId,
+            },
+          );
+          state.rowInitialized = true;
+        } catch (err) {
+          // Defensive: beginEditStream only throws `not_found` now (mutex
+          // removed). execute() will surface that via its own preflight.
+          // Mark hard-fail so we don't retry the same mutation on every
+          // subsequent delta.
+          state.streamingFailedHard = true;
+          console.warn(
+            '[artifact_file_create] beginEditStream failed, deferring',
+            {
+              error: err instanceof Error ? err.message : String(err),
+            },
+          );
+          return;
+        }
+      }
+
+      if (
+        !state.rowInitialized ||
+        state.artifactId === undefined ||
+        path === undefined ||
+        path.length === 0
+      ) {
+        return;
+      }
+      const contentRaw =
+        typeof obj.content === 'string' ? obj.content : undefined;
+      if (contentRaw === undefined) return;
+      if (!shouldFlush(state, contentRaw.length)) return;
+      try {
+        await ctx.runMutation(
+          internal.artifacts.internal_mutations.updateRewriteStreamingContent,
+          {
+            artifactId: state.artifactId,
+            toolCallId: options.toolCallId,
+            streamingPath: path,
+            content: contentRaw,
+          },
+        );
+        markFlushed(state, contentRaw.length);
+      } catch (err) {
+        console.warn('[artifact_file_create] streamingContent flush failed', {
+          error: err instanceof Error ? err.message : String(err),
+        });
+      }
+    },
+    // Authoritative write. Errors raised inside the `try` are returned as
+    // `{ success: false }` results rather than thrown out of the tool call.
+    execute: async (
+      ctx: ToolCtx,
+      args: FileCreateInput,
+      options: ToolExecutionOptions,
+    ): Promise<FileCreateResult> => {
+      const { messageId } = ctx;
+      const editedByMessageId = messageId ?? '';
+      const state = getState(options.toolCallId);
+      try {
+        const artifactId = toId<'artifacts'>(args.artifactId);
+        const artifact = await ctx.runQuery(
+          internal.artifacts.internal_queries.getById,
+          {
+            artifactId,
+            expectedOrganizationId: ctx.organizationId,
+            expectedThreadId: ctx.threadId,
+          },
+        );
+        if (!artifact) {
+          return {
+            success: false,
+            code: 'not_found',
+            message: `Artifact ${args.artifactId} not found in this thread.`,
+          };
+        }
+        const isRunnable = isRunnableArtifactType(artifact.type);
+        const runHint = isRunnable
+          ? ` Call \`artifact_run({artifactId: "${args.artifactId}"})\` to execute the updated project.`
+          : '';
+        const result = await ctx.runMutation(
+          internal.artifacts.internal_mutations.createFileInArtifact,
+          {
+            artifactId,
+            path: args.path,
+            content: args.content,
+            editedByMessageId,
+            expectedRevision: args.expectedRevision,
+          },
+        );
+        if (!result.success) {
+          // Best-effort: a failing abort must not mask the structured
+          // `code` / `currentRevision` returned by the mutation.
+          await abortStreamQuietly(() =>
+            ctx.runMutation(
+              internal.artifacts.internal_mutations.abortStream,
+              { artifactId },
+            ),
+          );
+          return {
+            success: false,
+            code: result.code,
+            message: result.message,
+            currentRevision: result.currentRevision,
+          };
+        }
+        const pkgNote = await applyPackagesAddIfAny(
+          ctx,
+          artifactId,
+          isRunnable,
+          args.packages_add,
+        );
+        return {
+          success: true,
+          artifactId: args.artifactId,
+          revision: result.revision,
+          path: result.path,
+          byteLength: result.byteLength,
+          message: `Created file "${result.path}" in "${artifact.title}" (${result.byteLength} bytes). New revision: ${result.revision}.${pkgNote}${runHint}`,
+        };
+      } catch (err) {
+        const streamedArtifactId = state?.artifactId;
+        if (streamedArtifactId !== undefined) {
+          await abortStreamQuietly(() =>
+            ctx.runMutation(
+              internal.artifacts.internal_mutations.abortStream,
+              { artifactId: streamedArtifactId },
+            ),
+          );
+        }
+        const shape = extractToolErrorShape(err);
+        return {
+          success: false,
+          ...(shape.code !== undefined && { code: shape.code }),
+          message: `artifact_file_create failed: ${shape.message}`,
+        };
+      } finally {
+        clearState(options.toolCallId);
+      }
+    },
+  }),
+} as const satisfies ToolDefinition;
diff --git a/services/platform/convex/agent_tools/artifacts/artifact_file_delete_tool.ts b/services/platform/convex/agent_tools/artifacts/artifact_file_delete_tool.ts
new file mode 100644
index 000000000..9ceee5b5e
--- /dev/null
+++ b/services/platform/convex/agent_tools/artifacts/artifact_file_delete_tool.ts
@@ -0,0 +1,149 @@
+/**
+ * Convex Tool: artifact_file_delete
+ *
+ * Remove one file from an artifact's project tree. Refused on the entry file
+ * (rename the entry away first) and on the last remaining file in the
+ * artifact (artifacts cannot be empty).
+ */
+
+import type { ToolCtx } from '@convex-dev/agent';
+import { createTool } from '@convex-dev/agent';
+import type { ToolExecutionOptions } from 'ai';
+import { z } from 'zod/v4';
+
+import { internal } from '../../_generated/api';
+import { toId } from '../../lib/type_cast_helpers';
+import type { ToolDefinition } from '../types';
+
+/** Input schema for `artifact_file_delete`. */
+const fileDeleteArgs = z.object({
+  artifactId: z.string().min(1),
+  path: z
+    .string()
+    .min(1)
+    .max(200)
+    .describe(
+      'File path inside the artifact to delete. Refused on the entry file (call `artifact_file_rename` first to repoint the entry to another file) and on the last file in the artifact.',
+    ),
+  expectedRevision: z
+    .number()
+    .int()
+    .nonnegative()
+    .describe(
+      'REQUIRED: revision the delete was authored against (from `<artifact revision="N">` or a prior `artifact_file_list` / `artifact_file_read`). OCC — rejects with `code: "stale"` and `currentRevision` if the artifact has moved.',
+    ),
+});
+
+type FileDeleteInput = z.infer<typeof fileDeleteArgs>;
+
+/** Result returned when the file was removed. */
+interface FileDeleteSuccess {
+  success: true;
+  artifactId: string;
+  revision: number;
+  path: string;
+  message: string;
+}
+
+/**
+ * Result returned when the delete was refused or failed. `code` is absent
+ * for missing-context and unexpected-exception failures.
+ */
+interface FileDeleteFailure {
+  success: false;
+  code?: string;
+  message: string;
+  currentRevision?: number;
+  entryFile?: string;
+}
+
+type FileDeleteResult = FileDeleteSuccess | FileDeleteFailure;
+
+export const artifactFileDeleteTool = {
+  name: 'artifact_file_delete' as const,
+  tool: createTool({
+    description: `**artifact_file_delete** — remove one file from an artifact's project tree.
+
+**INPUTS:** \`artifactId\`, \`path\`, \`expectedRevision\`.
+
+**REFUSED ON:**
+- the artifact's \`entryFile\` (code: \`entry_pin\`) — call \`artifact_file_rename\` first to repoint the entry to another file, or rename a sibling onto the entry path.
+- the last file in the artifact (code: \`last_file\`) — artifacts cannot be empty.
+
+**RESPONSE:** \`{revision, path, message}\` on success. Errors carry \`code\` (\`not_found\`, \`stale\`, \`file_missing\`, \`entry_pin\`, \`last_file\`) plus a recovery hint.`,
+    inputSchema: fileDeleteArgs,
+    // Checks the tool context, resolves the artifact, then delegates to the
+    // `deleteFileFromArtifact` mutation. Failures come back as results.
+    execute: async (
+      ctx: ToolCtx,
+      args: FileDeleteInput,
+      _options: ToolExecutionOptions,
+    ): Promise<FileDeleteResult> => {
+      const { organizationId, threadId, messageId } = ctx;
+      if (!organizationId || !threadId) {
+        return {
+          success: false,
+          message:
+            'artifact_file_delete requires organizationId and threadId in the tool context.',
+        };
+      }
+      let artifactId;
+      try {
+        artifactId = toId<'artifacts'>(args.artifactId);
+      } catch (err) {
+        return {
+          success: false,
+          message: `Artifact id "${args.artifactId}" is malformed: ${err instanceof Error ? err.message : String(err)}`,
+        };
+      }
+      try {
+        const artifact = await ctx.runQuery(
+          internal.artifacts.internal_queries.getById,
+          {
+            artifactId,
+            expectedOrganizationId: organizationId,
+            expectedThreadId: threadId,
+          },
+        );
+        if (!artifact) {
+          return {
+            success: false,
+            code: 'not_found',
+            message: `Artifact ${args.artifactId} not found in this thread.`,
+          };
+        }
+        const result = await ctx.runMutation(
+          internal.artifacts.internal_mutations.deleteFileFromArtifact,
+          {
+            artifactId,
+            path: args.path,
+            editedByMessageId: messageId ?? '',
+            expectedRevision: args.expectedRevision,
+          },
+        );
+        if (!result.success) {
+          return {
+            success: false,
+            code: result.code,
+            message: result.message,
+            currentRevision: result.currentRevision,
+            entryFile: result.entryFile,
+          };
+        }
+        return {
+          success: true,
+          artifactId: args.artifactId,
+          revision: result.revision,
+          path: result.path,
+          message: `Deleted "${result.path}" from "${artifact.title}". New revision: ${result.revision}.`,
+        };
+      } catch (err) {
+        // The query / mutation can itself reject — e.g. argument validation
+        // on a malformed / hallucinated id. Report that as a tool result
+        // rather than letting the exception escape the tool call.
+        return {
+          success: false,
+          message: `artifact_file_delete failed: ${err instanceof Error ? err.message : String(err)}`,
+        };
+      }
+    },
+  }),
+} as const satisfies ToolDefinition;
diff --git a/services/platform/convex/agent_tools/artifacts/artifact_file_list_tool.ts b/services/platform/convex/agent_tools/artifacts/artifact_file_list_tool.ts
new file mode 100644
index 000000000..a9b4e5690
--- /dev/null
+++ b/services/platform/convex/agent_tools/artifacts/artifact_file_list_tool.ts
@@ -0,0 +1,126 @@
+/**
+ * Convex Tool: artifact_file_list
+ *
+ * List metadata for every file in an artifact's project tree. Cheap; encourages
+ * the "list-then-read" CRUD pattern (call `artifact_file_list` first to enumerate paths,
+ * then `artifact_file_read` with explicit paths to fetch content).
+ */
+
+import type { ToolCtx } from '@convex-dev/agent';
+import { createTool } from '@convex-dev/agent';
+import type { ToolExecutionOptions } from 'ai';
+import { z } from 'zod/v4';
+
+import { internal } from '../../_generated/api';
+import { toId } from '../../lib/type_cast_helpers';
+import type { ToolDefinition } from '../types';
+
+const fileListArgs = z.object({
+  artifactId: z
+    .string()
+    .min(1)
+    .describe(
+      'Convex artifact ID returned by `artifact_create` (or referenced from the <artifacts> system context).',
+    ),
+});
+
+type FileListInput = z.infer<typeof fileListArgs>;
+
+/** Artifact metadata plus `{path, size}` for each file; no file content. */
+interface FileListSuccess {
+  success: true;
+  artifactId: string;
+  type: string;
+  title: string;
+  revision: number;
+  entryFile: string;
+  language?: string;
+  files: { path: string; size: number }[];
+}
+
+/** Failure result; `code` is only set for the `not_found` case. */
+interface FileListFailure {
+  success: false;
+  code?: string;
+  message: string;
+}
+
+type FileListResult = FileListSuccess | FileListFailure;
+
+export const artifactFileListTool = {
+  name: 'artifact_file_list' as const,
+  tool: createTool({
+    description: `**artifact_file_list** — list every file in an artifact's project tree as \`{path, size}\` metadata (no content). Cheap; use to enumerate before \`artifact_file_read\`.
+
+**INPUTS:** \`artifactId\` (required).
+
+**WHEN TO USE:**
+- Before \`artifact_file_read\` when you need to see what files exist.
+- After a failed \`artifact_file_update\` reporting \`file_missing\` — to see the correct paths.
+- When the \`<artifacts>\` system context was truncated and you need a fresh view.
+
+**RESPONSE:** \`{artifactId, type, title, revision, entryFile, files: [{path, size}]}\`. Use \`revision\` as \`expectedRevision\` on the next write call.`,
+    inputSchema: fileListArgs,
+    // Checks the tool context, then forwards to `listFilesByArtifact` scoped
+    // to the caller's organization and thread. Failures come back as results.
+    execute: async (
+      ctx: ToolCtx,
+      args: FileListInput,
+      _options: ToolExecutionOptions,
+    ): Promise<FileListResult> => {
+      const { organizationId, threadId } = ctx;
+      if (!organizationId || !threadId) {
+        return {
+          success: false,
+          message:
+            'artifact_file_list requires organizationId and threadId in the tool context.',
+        };
+      }
+      let artifactId;
+      try {
+        artifactId = toId<'artifacts'>(args.artifactId);
+      } catch (err) {
+        return {
+          success: false,
+          message: `Artifact id "${args.artifactId}" is malformed: ${err instanceof Error ? err.message : String(err)}`,
+        };
+      }
+      let result;
+      try {
+        result = await ctx.runQuery(
+          internal.artifacts.internal_queries.listFilesByArtifact,
+          {
+            artifactId,
+            expectedOrganizationId: organizationId,
+            expectedThreadId: threadId,
+          },
+        );
+      } catch (err) {
+        // The query itself can reject — e.g. argument validation on a
+        // malformed / hallucinated id. Report that as a tool result rather
+        // than letting the exception escape the tool call.
+        return {
+          success: false,
+          message: `artifact_file_list failed: ${err instanceof Error ? err.message : String(err)}`,
+        };
+      }
+      if (!result) {
+        return {
+          success: false,
+          code: 'not_found',
+          message: `Artifact ${args.artifactId} not found in this thread.`,
+        };
+      }
+      return {
+        success: true,
+        artifactId: args.artifactId,
+        type: result.type,
+        title: result.title,
+        revision: result.revision,
+        entryFile: result.entryFile,
+        language: result.language,
+        files: result.files,
+      };
+    },
+  }),
+} as const satisfies ToolDefinition;
diff --git a/services/platform/convex/agent_tools/artifacts/artifact_file_read_tool.ts b/services/platform/convex/agent_tools/artifacts/artifact_file_read_tool.ts
new file mode 100644
index 000000000..b661d99d8
--- /dev/null
+++ b/services/platform/convex/agent_tools/artifacts/artifact_file_read_tool.ts
@@ -0,0 +1,197 @@
+/**
+ * Convex Tool: artifact_file_read
+ *
+ * Read explicit file path(s) from an artifact. Required `path` — no "no path
+ * → smart inline aggregate" branch. Call `artifact_file_list` first if you need to
+ * enumerate available paths.
+ */
+
+import type { ToolCtx } from '@convex-dev/agent';
+import { createTool } from '@convex-dev/agent';
+import type { ToolExecutionOptions } from 'ai';
+import { z } from 'zod/v4';
+
+import { internal } from '../../_generated/api';
+import { toId } from '../../lib/type_cast_helpers';
+import type { ToolDefinition } from '../types';
+
+// Inline budget for multi-path reads. NOTE(review): it is compared against
+// `content.length`, so "bytes" is approximate for non-ASCII content.
+const AGGREGATE_INLINE_BYTES = 65_536;
+
+const fileReadArgs = z.object({
+  artifactId: z
+    .string()
+    .min(1)
+    .describe(
+      'Convex artifact ID. Look it up via `artifact_list({})` if you only have the title.',
+    ),
+  path: z
+    .union([z.string().min(1), z.array(z.string().min(1)).min(1).max(50)])
+    .describe(
+      'REQUIRED. A single file path (string) to fetch in full, or an array of paths to fetch several at once (subject to an aggregate ~64KB cap). To enumerate available paths first, call `artifact_file_list`.',
+    ),
+});
+
+type FileReadInput = z.infer<typeof fileReadArgs>;
+
+interface ReadFileEntry {
+  path: string;
+  size: number;
+  content?: string;
+}
+
+interface FileReadSuccess {
+  success: true;
+  artifactId: string;
+  type: string;
+  title: string;
+  revision: number;
+  entryFile: string;
+  language?: string;
+  files: ReadFileEntry[];
+  truncated: boolean;
+  message?: string;
+}
+
+interface FileReadFailure {
+  success: false;
+  code?: string;
+  message: string;
+}
+
+type FileReadResult = FileReadSuccess | FileReadFailure;
+
+export const artifactFileReadTool = {
+  name: 'artifact_file_read' as const,
+  tool: createTool({
+    description: `**artifact_file_read** — fetch file content by exact path(s). \`path\` is REQUIRED (string or string[]). To enumerate available paths first, call \`artifact_file_list\`.
+
+**INPUTS:**
+- \`artifactId\` — required.
+- \`path\` — required. Either a single \`string\` (returns that one file's full content) or a \`string[]\` (returns those files; aggregate ≤${AGGREGATE_INLINE_BYTES} bytes — anything over the cap comes back as \`{path, size}\` with no content; re-read by single path to fetch it).
+
+**WHEN TO USE:**
+- Before \`artifact_file_update\` when your snapshot of a file may be stale.
+- Before composing a multi-step edit that references several files.
+- When the \`<artifacts>\` system-context block was truncated.
+
+**RESPONSE:** \`{artifactId, type, title, revision, entryFile, files: [{path, size, content?}], truncated}\`. \`content\` is present iff the file fit under the inline thresholds. Use \`revision\` as the \`expectedRevision\` for any subsequent write.`,
+    inputSchema: fileReadArgs,
+    // Fetches the requested path(s) via `getFilesByPaths`. A single path is
+    // returned in full; an array is subject to the aggregate inline budget.
+    execute: async (
+      ctx: ToolCtx,
+      args: FileReadInput,
+      _options: ToolExecutionOptions,
+    ): Promise<FileReadResult> => {
+      const { organizationId, threadId } = ctx;
+      if (!organizationId || !threadId) {
+        return {
+          success: false,
+          message:
+            'artifact_file_read requires organizationId and threadId in the tool context.',
+        };
+      }
+      let artifactId;
+      try {
+        artifactId = toId<'artifacts'>(args.artifactId);
+      } catch (err) {
+        return {
+          success: false,
+          message: `Artifact id "${args.artifactId}" is malformed: ${err instanceof Error ? err.message : String(err)}`,
+        };
+      }
+      const paths = typeof args.path === 'string' ? [args.path] : args.path;
+      let result;
+      try {
+        result = await ctx.runQuery(
+          internal.artifacts.internal_queries.getFilesByPaths,
+          {
+            artifactId,
+            paths,
+            expectedOrganizationId: organizationId,
+            expectedThreadId: threadId,
+          },
+        );
+      } catch (err) {
+        // The query itself can reject — e.g. argument validation on a
+        // malformed / hallucinated id. Report that as a tool result rather
+        // than letting the exception escape the tool call.
+        return {
+          success: false,
+          message: `artifact_file_read failed: ${err instanceof Error ? err.message : String(err)}`,
+        };
+      }
+      if (!result) {
+        return {
+          success: false,
+          code: 'not_found',
+          message: `Artifact ${args.artifactId} not found in this thread.`,
+        };
+      }
+      if (result.missing.length > 0) {
+        return {
+          success: false,
+          code: 'file_missing',
+          message: `These paths do not exist: ${result.missing.join(', ')}. Available: ${result.availablePaths.join(', ')}.`,
+        };
+      }
+
+      // Single-path read: never truncate the caller's explicit ask.
+      if (typeof args.path === 'string') {
+        const f = result.files[0];
+        return {
+          success: true,
+          artifactId: args.artifactId,
+          type: result.type,
+          title: result.title,
+          revision: result.revision,
+          entryFile: result.entryFile,
+          language: result.language,
+          files: [{ path: f.path, size: f.content.length, content: f.content }],
+          truncated: false,
+        };
+      }
+
+      // Multi-path: smallest-first so a single large file doesn't push everything out.
+      // Entries are re-emitted in the caller's requested order further down.
+      let aggregate = 0;
+      let truncated = false;
+      const ordered = [...result.files].sort(
+        (a, b) => a.content.length - b.content.length,
+      );
+      const byPath = new Map<string, ReadFileEntry>();
+      for (const f of ordered) {
+        if (aggregate + f.content.length > AGGREGATE_INLINE_BYTES) {
+          byPath.set(f.path, { path: f.path, size: f.content.length });
+          truncated = true;
+          continue;
+        }
+        aggregate += f.content.length;
+        byPath.set(f.path, {
+          path: f.path,
+          size: f.content.length,
+          content: f.content,
+        });
+      }
+      const files = args.path
+        .map((p) => byPath.get(p))
+        .filter((x): x is ReadFileEntry => x !== undefined);
+      return {
+        success: true,
+        artifactId: args.artifactId,
+        type: result.type,
+        title: result.title,
+        revision: result.revision,
+        entryFile: result.entryFile,
+        language: result.language,
+        files,
+        truncated,
+        message: truncated
+          ? 'Some files exceeded the aggregate inline cap; re-read by single path to fetch them.'
+          : undefined,
+      };
+    },
+  }),
+} as const satisfies ToolDefinition;
diff --git a/services/platform/convex/agent_tools/artifacts/artifact_file_rename_tool.ts b/services/platform/convex/agent_tools/artifacts/artifact_file_rename_tool.ts
new file mode 100644
index 000000000..27ba987fc
--- /dev/null
+++ b/services/platform/convex/agent_tools/artifacts/artifact_file_rename_tool.ts
@@ -0,0 +1,142 @@
+/**
+ * Convex Tool: artifact_file_rename
+ *
+ * Rename one file in an artifact's project tree. If `from === entryFile`,
+ * the entry pointer atomically moves to `to`.
+ */
+
+import type { ToolCtx } from '@convex-dev/agent';
+import { createTool } from '@convex-dev/agent';
+import type { ToolExecutionOptions } from 'ai';
+import { z } from 'zod/v4';
+
+import { internal } from '../../_generated/api';
+import { toId } from '../../lib/type_cast_helpers';
+import type { ToolDefinition } from '../types';
+
+const fileRenameArgs = z.object({
+  artifactId: z.string().min(1),
+  from: z.string().min(1).max(200).describe('Existing file path to rename.'),
+  to: z
+    .string()
+    .min(1)
+    .max(200)
+    .describe(
+      'New file path. Must not already exist — call `artifact_file_delete` first if you intend to replace.',
+    ),
+  expectedRevision: z
+    .number()
+    .int()
+    .nonnegative()
+    .describe(
+      'REQUIRED: revision the rename was authored against. OCC — rejects with `code: "stale"` and `currentRevision` if the artifact has moved.',
+    ),
+});
+
+type FileRenameInput = z.infer<typeof fileRenameArgs>;
+
+interface FileRenameSuccess {
+  success: true;
+  artifactId: string;
+  revision: number;
+  from: string;
+  to: string;
+  entryFile: string;
+  entryUpdated: boolean;
+  message: string;
+}
+
+interface FileRenameFailure {
+  success: false;
+  code?: string;
+  message: string;
+  currentRevision?: number;
+}
+
+type FileRenameResult = FileRenameSuccess | FileRenameFailure;
+
+export const artifactFileRenameTool = {
+  name: 'artifact_file_rename' as const,
+  tool: createTool({
+    description: `**artifact_file_rename** — rename one file inside an artifact. If \`from === entryFile\`, the entry pointer atomically moves to \`to\`.
+
+**INPUTS:** \`artifactId\`, \`from\`, \`to\`, \`expectedRevision\`.
+
+**RULES:**
+- \`from === to\` is a no-op success (idempotent).
+- \`to\` must not already exist (code: \`path_exists\`).
+- \`from\` must exist (code: \`file_missing\`).
+
+**RESPONSE:** \`{revision, from, to, entryFile, entryUpdated, message}\`. \`entryUpdated\` is true iff the entry pointer moved with the rename. Errors carry \`code\` (\`not_found\`, \`stale\`, \`file_missing\`, \`path_exists\`).`,
+    inputSchema: fileRenameArgs,
+    execute: async (
+      ctx: ToolCtx,
+      args: FileRenameInput,
+      _options: ToolExecutionOptions,
+    ): Promise<FileRenameResult> => {
+      // The rename is only attempted inside an organization/thread scope.
+      const { organizationId, threadId, messageId } = ctx;
+      if (!organizationId || !threadId) {
+        return {
+          success: false,
+          message:
+            'artifact_file_rename requires organizationId and threadId in the tool context.',
+        };
+      }
+      // A malformed id becomes a structured failure instead of a throw.
+      let artifactId;
+      try {
+        artifactId = toId<'artifacts'>(args.artifactId);
+      } catch (cause) {
+        const detail = cause instanceof Error ? cause.message : String(cause);
+        return {
+          success: false,
+          message: `Artifact id "${args.artifactId}" is malformed: ${detail}`,
+        };
+      }
+      // Look the artifact up with the caller's organization/thread expected.
+      const owned = await ctx.runQuery(
+        internal.artifacts.internal_queries.getById,
+        {
+          artifactId,
+          expectedOrganizationId: organizationId,
+          expectedThreadId: threadId,
+        },
+      );
+      if (!owned) {
+        return {
+          success: false,
+          code: 'not_found',
+          message: `Artifact ${args.artifactId} not found in this thread.`,
+        };
+      }
+      const outcome = await ctx.runMutation(
+        internal.artifacts.internal_mutations.renameFileInArtifact,
+        {
+          artifactId,
+          from: args.from,
+          to: args.to,
+          editedByMessageId: messageId ?? '',
+          expectedRevision: args.expectedRevision,
+        },
+      );
+      if (!outcome.success) {
+        const { code, message, currentRevision } = outcome;
+        return { success: false, code, message, currentRevision };
+      }
+      const { revision, from, to, entryFile, entryUpdated } = outcome;
+      let entryNote = '';
+      if (entryUpdated) entryNote = ' Entry file repointed accordingly.';
+      return {
+        success: true,
+        artifactId: args.artifactId,
+        revision,
+        from,
+        to,
+        entryFile,
+        entryUpdated,
+        message: `Renamed "${from}" → "${to}" in "${owned.title}". New revision: ${revision}.${entryNote}`,
+      };
+    },
+  }),
+} as const satisfies ToolDefinition;
diff --git a/services/platform/convex/agent_tools/artifacts/artifact_file_update_tool.ts b/services/platform/convex/agent_tools/artifacts/artifact_file_update_tool.ts
new file mode 100644
index 000000000..0b08dcc15
--- /dev/null
+++ b/services/platform/convex/agent_tools/artifacts/artifact_file_update_tool.ts
@@ -0,0 +1,328 @@
+/**
+ * Convex Tool: artifact_file_update
+ *
+ * Overwrite an EXISTING file in an artifact's project tree. Refused if `path`
+ * does not exist (use `artifact_file_create` instead). Pure overwrite — no append, no
+ * patch. Streams content live to the canvas via the shared streaming
+ * mutations.
+ */
+
+import type { ToolCtx } from '@convex-dev/agent';
+import { createTool } from '@convex-dev/agent';
+import type { ToolExecutionOptions } from 'ai';
+import { parsePartialJson } from 'ai';
+import { z } from 'zod/v4';
+
+import { internal } from '../../_generated/api';
+import { toId } from '../../lib/type_cast_helpers';
+import type { ToolDefinition } from '../types';
+import { applyPackagesAddIfAny, isStringFieldClosed } from './_packages_helper';
+import {
+  InvalidArtifactPathError,
+  extractToolErrorShape,
+  isRunnableArtifactType,
+  validatePath,
+} from './shared';
+import {
+  clearState,
+  getState,
+  initState,
+  markFlushed,
+  markParsed,
+  shouldFlush,
+  shouldParse,
+} from './stream_state';
+
+const fileUpdateArgs = z.object({
+  artifactId: z.string().min(1),
+  path: z
+    .string()
+    .min(1)
+    .max(200)
+    .describe(
+      'Existing file path inside the artifact. Use `artifact_file_create` to add a new file.',
+    ),
+  content: z
+    .string()
+    .describe(
+      'Complete replacement content for the file. The previous content is fully replaced — there is no append or patch mode.',
+    ),
+  expectedRevision: z
+    .number()
+    .int()
+    .nonnegative()
+    .describe(
+      'REQUIRED: the `revision="N"` attribute from the `<artifact>` block this update was authored against. OCC — rejects with `code: "stale"` and `currentRevision` if the artifact has moved.',
+    ),
+  packages_add: z
+    .array(z.string().max(120))
+    .max(20)
+    .optional()
+    .describe(
+      "Optional. Package names to UNION into the artifact's persistent `runPackages` list so the next `artifact_run` auto-installs them. Use when the updated file imports a new dependency. Equivalent to a follow-up `artifact_packages_add` call.",
+    ),
+});
+
+type FileUpdateInput = z.infer<typeof fileUpdateArgs>;
+
+interface FileUpdateSuccess {
+  success: true;
+  artifactId: string;
+  revision: number;
+  path: string;
+  byteLength: number;
+  message: string;
+}
+
+interface FileUpdateFailure {
+  success: false;
+  code?: string;
+  message: string;
+  currentRevision?: number;
+}
+
+type FileUpdateResult = FileUpdateSuccess | FileUpdateFailure;
+
+export const artifactFileUpdateTool = {
+  name: 'artifact_file_update' as const,
+  tool: createTool({
+    description: `**artifact_file_update** — overwrite an EXISTING file in an artifact's project tree with full new content. Streams content live to the canvas. Pure overwrite — no append, no patch.
+
+**INPUTS:** \`artifactId\`, \`path\`, \`content\` (full file), \`expectedRevision\`, optional \`packages_add\`.
+
+**REFUSED ON** missing path (code: \`file_missing\`) — call \`artifact_file_create\` to add a new file, or \`artifact_file_list\` to see what exists.
+
+**SIZE LIMIT (HARD):** The \`content\` field is sent as a JSON string literal inside this call's arguments — every byte of \`content\` consumes YOUR (the caller's) output token budget. If \`content\` exceeds your remaining budget, the arguments JSON gets truncated mid-string by \`max_tokens\` and the call fails with an unrecoverable parse error BEFORE this handler runs. To stay safe, keep any single \`content\` under ~40 KB (~1000 lines). When the file you want to write would exceed that, decide on a split BEFORE generating the call:
+ - Slide decks (pptxgenjs etc.) → \`main.js\` requires \`slide1.js\`, \`slide2.js\`, …, one builder per file.
+ - Long scripts → split by module/responsibility into multiple files.
+ - Long data tables → put each chunk in its own data file and import them.
+There is no \`append\` and no patch mode — splitting is the only way for files that would otherwise be too big. This is a HARD limit of the calling protocol, not a soft preference. (Per-artifact aggregate cap is ~800 KB. If your local snapshot of the file is stale, call \`artifact_file_read\` first to anchor against current bytes.)
+
+**RUNNABLE ARTIFACTS:** if the updated file imports a new dependency, set \`packages_add\` (or follow up with \`artifact_packages_add\`). Edits do NOT auto-execute — call \`artifact_run\` to re-run.
+
+**RESPONSE:** \`{revision, path, byteLength, message}\`. Errors carry \`code\` (\`not_found\`, \`stale\`, \`file_missing\`, \`too_large\`, \`too_many_files\`, \`duplicate_path\`, \`empty_project\`, \`invalid_path\`). Some failures (unhandled exceptions) come back with only \`message\`.`,
+    inputSchema: fileUpdateArgs,
+    onInputStart: async (_ctx: ToolCtx, options: ToolExecutionOptions) => {
+      initState(options.toolCallId, 'artifact_file_update');
+    },
+    onInputDelta: async (
+      ctx: ToolCtx,
+      options: { inputTextDelta: string } & ToolExecutionOptions,
+    ) => {
+      const state = getState(options.toolCallId);
+      if (!state) return; // no streaming state for this call
+      state.accumulator += options.inputTextDelta; // raw args JSON so far
+      if (!shouldParse(state, state.accumulator.length)) return;
+      const parsed = await parsePartialJson(state.accumulator);
+      markParsed(state, state.accumulator.length);
+      if (
+        parsed.state !== 'successful-parse' &&
+        parsed.state !== 'repaired-parse'
+      ) {
+        return;
+      }
+      const partial = parsed.value;
+      if (
+        typeof partial !== 'object' ||
+        partial === null ||
+        Array.isArray(partial)
+      ) {
+        return;
+      }
+      const obj = partial as Record<string, unknown>;
+      const artifactIdStr =
+        typeof obj.artifactId === 'string' ? obj.artifactId : undefined;
+      const path = typeof obj.path === 'string' ? obj.path : undefined;
+
+      if (
+        state.artifactId === undefined &&
+        artifactIdStr &&
+        isStringFieldClosed(state.accumulator, 'artifactId')
+      ) {
+        try {
+          const artifactId = toId<'artifacts'>(artifactIdStr);
+          const artifact = await ctx.runQuery(
+            internal.artifacts.internal_queries.getById,
+            {
+              artifactId,
+              expectedOrganizationId: ctx.organizationId,
+              expectedThreadId: ctx.threadId,
+            },
+          );
+          if (!artifact) return; // id stays unset; retried on a later delta
+          state.artifactId = artifactId;
+          state.baseContentLength = (artifact.content ?? '').length;
+        } catch (err) {
+          console.warn(
+            '[artifact_file_update] preflight getById failed, deferring',
+            {
+              artifactIdStr,
+              error: err instanceof Error ? err.message : String(err),
+            },
+          );
+          return;
+        }
+      }
+
+      if (
+        state.artifactId !== undefined &&
+        !state.rowInitialized &&
+        path !== undefined &&
+        path.length > 0 &&
+        isStringFieldClosed(state.accumulator, 'path')
+      ) {
+        // Pre-validate the path BEFORE issuing beginEditStream — per the
+        // original note that mutation runs `validatePath()` itself, so a bad
+        // path would throw mid-stream and WARN on every later delta (audit
+        // follow-up F9). NOTE(review): `streamingFailedHard` is set here but
+        // never read in this handler; presumably `shouldParse` gates on it.
+        try {
+          validatePath(path);
+        } catch (err) {
+          if (err instanceof InvalidArtifactPathError) {
+            state.streamingFailedHard = true;
+            console.warn(
+              '[artifact_file_update] streaming-preflight rejected invalid path',
+              { path, code: err.code },
+            );
+            return;
+          }
+          throw err;
+        }
+        state.resolvedMode = 'rewrite';
+        try {
+          await ctx.runMutation(
+            internal.artifacts.internal_mutations.beginEditStream,
+            {
+              artifactId: state.artifactId,
+              liveStreamMode: 'rewrite',
+              streamingPath: path,
+              toolCallId: options.toolCallId,
+            },
+          );
+          state.rowInitialized = true;
+        } catch (err) {
+          state.streamingFailedHard = true;
+          console.warn(
+            '[artifact_file_update] beginEditStream failed, deferring',
+            {
+              error: err instanceof Error ? err.message : String(err),
+            },
+          );
+          return;
+        }
+      }
+
+      if (
+        !state.rowInitialized ||
+        state.artifactId === undefined ||
+        path === undefined ||
+        path.length === 0
+      ) {
+        return;
+      }
+      const contentRaw =
+        typeof obj.content === 'string' ? obj.content : undefined;
+      if (contentRaw === undefined) return; // no string `content` parsed yet
+      if (!shouldFlush(state, contentRaw.length)) return;
+      try {
+        await ctx.runMutation(
+          internal.artifacts.internal_mutations.updateRewriteStreamingContent,
+          {
+            artifactId: state.artifactId,
+            toolCallId: options.toolCallId,
+            streamingPath: path,
+            content: contentRaw,
+          },
+        );
+        markFlushed(state, contentRaw.length);
+      } catch (err) {
+        console.warn('[artifact_file_update] streamingContent flush failed', {
+          error: err instanceof Error ? err.message : String(err),
+        });
+      }
+    },
+    execute: async (
+      ctx: ToolCtx,
+      args: FileUpdateInput,
+      options: ToolExecutionOptions,
+    ): Promise<FileUpdateResult> => {
+      const editedByMessageId = ctx.messageId ?? '';
+      const state = getState(options.toolCallId);
+      const { abortStream, updateFileInArtifact } =
+        internal.artifacts.internal_mutations;
+      // Stream teardown is best-effort: a failed abort is logged, never thrown,
+      // so it cannot replace a `stale`/`currentRevision` reply or escape here.
+      const warnAbortFailed = (abortErr: unknown) =>
+        console.warn('[artifact_file_update] abortStream failed', abortErr);
+      try {
+        const artifactId = toId<'artifacts'>(args.artifactId);
+        const artifact = await ctx.runQuery(
+          internal.artifacts.internal_queries.getById,
+          {
+            artifactId,
+            expectedOrganizationId: ctx.organizationId,
+            expectedThreadId: ctx.threadId,
+          },
+        );
+        if (!artifact) {
+          return {
+            success: false,
+            code: 'not_found',
+            message: `Artifact ${args.artifactId} not found in this thread.`,
+          };
+        }
+        const isRunnable = isRunnableArtifactType(artifact.type);
+        const runHint = isRunnable
+          ? ` Call \`artifact_run({artifactId: "${args.artifactId}"})\` to execute the updated project.`
+          : '';
+        const result = await ctx.runMutation(updateFileInArtifact, {
+          artifactId,
+          path: args.path,
+          content: args.content,
+          editedByMessageId,
+          expectedRevision: args.expectedRevision,
+        });
+        if (!result.success) {
+          await ctx
+            .runMutation(abortStream, { artifactId })
+            .catch(warnAbortFailed);
+          return {
+            success: false,
+            code: result.code,
+            message: result.message,
+            currentRevision: result.currentRevision,
+          };
+        }
+        const pkgNote = await applyPackagesAddIfAny(
+          ctx,
+          artifactId,
+          isRunnable,
+          args.packages_add,
+        );
+        return {
+          success: true,
+          artifactId: args.artifactId,
+          revision: result.revision,
+          path: result.path,
+          byteLength: result.byteLength,
+          message: `Updated "${result.path}" in "${artifact.title}" (${result.byteLength} bytes). New revision: ${result.revision}.${pkgNote}${runHint}`,
+        };
+      } catch (err) {
+        if (state?.artifactId !== undefined) {
+          await ctx
+            .runMutation(abortStream, { artifactId: state.artifactId })
+            .catch(warnAbortFailed);
+        }
+        const shape = extractToolErrorShape(err);
+        return {
+          success: false,
+          ...(shape.code !== undefined && { code: shape.code }),
+          message: `artifact_file_update failed: ${shape.message}`,
+        };
+      } finally {
+        clearState(options.toolCallId);
+      }
+    },
+  }),
+} as const satisfies ToolDefinition;
diff --git a/services/platform/convex/agent_tools/artifacts/artifact_list_tool.ts b/services/platform/convex/agent_tools/artifacts/artifact_list_tool.ts
new file mode 100644
index 000000000..3482f0a3b
--- /dev/null
+++ b/services/platform/convex/agent_tools/artifacts/artifact_list_tool.ts
@@ -0,0 +1,112 @@
+/**
+ * Convex Tool: artifact_list
+ *
+ * Lists all artifacts in the current thread (metadata only). Used for
+ * title→id recovery when the LLM has lost track of an artifactId from an
+ * earlier turn, or for programmatic tool-chains ("list, then artifact_file_list N,
+ * then artifact_file_update one").
+ *
+ * Returns metadata only — no file content — to keep the response small.
+ * Call `artifact_file_list({artifactId})` afterward to enumerate paths inside an
+ * artifact, then `artifact_file_read({artifactId, path})` to fetch content.
+ */
+
+import type { ToolCtx } from '@convex-dev/agent';
+import { createTool } from '@convex-dev/agent';
+import type { ToolExecutionOptions } from 'ai';
+import { z } from 'zod/v4';
+
+import { internal } from '../../_generated/api';
+import type { ToolDefinition } from '../types';
+
+const MAX_LIST = 50;
+
+const artifactListArgs = z
+  .object({})
+  .describe('No arguments — scopes to the current thread.');
+
+type ArtifactListInput = z.infer<typeof artifactListArgs>;
+
+interface ArtifactListEntry {
+  artifactId: string;
+  type: string;
+  title: string;
+  revision: number;
+  entryFile: string;
+  fileCount: number;
+  totalBytes: number;
+  language?: string;
+  updatedAt: number;
+}
+
+interface ArtifactListResult {
+  success: true;
+  artifacts: ArtifactListEntry[];
+  truncated: boolean;
+  totalCount: number;
+  message?: string;
+}
+
+export const artifactListTool = {
+  name: 'artifact_list' as const,
+  tool: createTool({
+    description: `**artifact_list** — list all artifacts in the current thread (metadata only).
+
+Use when you've lost track of an \`artifactId\` from an earlier turn (e.g. a prior \`artifact_create\` returned \`isNew: false\` and you need to find the artifact's id by title), or when composing a tool chain that needs to enumerate all artifacts before acting.
+
+**RESPONSE:** \`{artifacts: [{artifactId, type, title, revision, entryFile, fileCount, totalBytes, language?, updatedAt}], truncated, totalCount}\`. Sorted by \`updatedAt\` desc (most recent first). Capped at ${MAX_LIST} entries.
+
+No file content is returned — call \`artifact_file_list({artifactId})\` to enumerate paths, then \`artifact_file_read({artifactId, path})\` to fetch content.`,
+    inputSchema: artifactListArgs,
+    execute: async (
+      ctx: ToolCtx,
+      _args: ArtifactListInput,
+      _options: ToolExecutionOptions,
+    ): Promise<ArtifactListResult> => {
+      const { organizationId: orgId, threadId: thread } = ctx;
+      if (!orgId || !thread) {
+        return {
+          success: true,
+          artifacts: [],
+          truncated: false,
+          totalCount: 0,
+          message: 'No organizationId/threadId in context.',
+        };
+      }
+      // Ask for metadata only: the server-side projection keeps MBs of file
+      // content out of the action when the goal is just to count bytes. The
+      // trade-off is described in the `listByThreadMetadata` docstring.
+      const metadataRows = await ctx.runQuery(
+        internal.artifacts.internal_queries.listByThreadMetadata,
+        { organizationId: orgId, threadId: thread },
+      );
+      // Most recently updated first (sorting a copy), capped at MAX_LIST.
+      const newestFirst = metadataRows.slice();
+      newestFirst.sort((left, right) => right.updatedAt - left.updatedAt);
+      const totalCount = newestFirst.length;
+      const truncated = totalCount > MAX_LIST;
+      const artifacts = newestFirst.slice(0, MAX_LIST).map(
+        (row): ArtifactListEntry => ({
+          artifactId: row._id,
+          type: row.type,
+          title: row.title,
+          revision: row.revision,
+          entryFile: row.entryFile,
+          fileCount: row.fileCount,
+          totalBytes: row.totalBytes,
+          updatedAt: row.updatedAt,
+          ...(row.language !== undefined && { language: row.language }),
+        }),
+      );
+      return {
+        success: true,
+        artifacts,
+        truncated,
+        totalCount,
+        message: truncated
+          ? `Showing the ${MAX_LIST} most recently updated of ${totalCount} artifacts.`
+          : undefined,
+      };
+    },
+  }),
+} as const satisfies ToolDefinition;
diff --git a/services/platform/convex/agent_tools/artifacts/artifact_packages_add_tool.ts b/services/platform/convex/agent_tools/artifacts/artifact_packages_add_tool.ts
new file mode 100644
index 000000000..3e2ea4897
--- /dev/null
+++ b/services/platform/convex/agent_tools/artifacts/artifact_packages_add_tool.ts
@@ -0,0 +1,191 @@
+/**
+ * Convex Tool: artifact_packages_add
+ *
+ * Union package names into a runnable artifact's persistent `runPackages`
+ * list so the next `artifact_run` auto-installs them. Idempotent: names
+ * already present are skipped. Never removes existing entries —
+ * `artifact_create` is the way to start fresh.
+ */
+
+import type { ToolCtx } from '@convex-dev/agent';
+import { createTool } from '@convex-dev/agent';
+import type { ToolExecutionOptions } from 'ai';
+import { z } from 'zod/v4';
+
+import { internal } from '../../_generated/api';
+import { toId } from '../../lib/type_cast_helpers';
+import type { ToolDefinition } from '../types';
+import {
+  isRunnableArtifactType,
+  refinePackagesObject,
+  runnableLanguage,
+} from './shared';
+
+const artifactPackagesAddArgs = z.object({
+  artifactId: z.string().min(1),
+  packages: z
+    .object({
+      python: z
+        .array(z.string().min(1).max(120))
+        .max(20)
+        .optional()
+        .describe('Pip specs (e.g. `markitdown[pptx]`).'),
+      node: z
+        .array(z.string().min(1).max(120))
+        .max(20)
+        .optional()
+        .describe('npm specs (e.g. `pptxgenjs`).'),
+    })
+    .describe(
+      "Per-runtime dependencies to UNION into the artifact's persistent package state. `python` is installed via `uv pip`, `node` via `npm`. At least one bucket must be non-empty. Pinned versions strongly preferred. Examples: `{python: ['markitdown[pptx]']}`, `{node: ['pptxgenjs']}`, `{python: ['numpy'], node: ['lodash']}`. Installs run with `pip --only-binary=:all:` and `npm --ignore-scripts`.",
+    )
+    .refine((val) => (val.python?.length ?? 0) + (val.node?.length ?? 0) > 0, {
+      message: 'packages must include at least one python or node entry',
+    })
+    .superRefine((val, ctx) => {
+      refinePackagesObject(val, (issue) => ctx.addIssue(issue));
+    }),
+});
+
+type ArtifactPackagesAddInput = z.infer<typeof artifactPackagesAddArgs>;
+
+interface ArtifactPackagesAddSuccess {
+  success: true;
+  artifactId: string;
+  runPackages: string[];
+  added: string[];
+  runPackagesByLang?: { python?: string[]; node?: string[] };
+  addedByLang?: { python?: string[]; node?: string[] };
+  message: string;
+}
+
+interface ArtifactPackagesAddFailure {
+  success: false;
+  code?: string;
+  message: string;
+}
+
+type ArtifactPackagesAddResult =
+  | ArtifactPackagesAddSuccess
+  | ArtifactPackagesAddFailure;
+
+export const artifactPackagesAddTool = {
+  name: 'artifact_packages_add' as const,
+  tool: createTool({
+    description: `**artifact_packages_add** — declare runtime dependencies for a runnable artifact (\`script_runnable\`, or legacy \`python_runnable\` / \`node_runnable\`). Union the per-runtime specs into the artifact's persistent package state so the next \`artifact_run\` auto-installs them.
+
+**WHEN TO CALL:** right after \`artifact_file_create\` / \`artifact_file_update\` introduces a new \`import\`/\`require\` for an external dependency, before \`artifact_run\`.
+
+**INPUTS:**
+- \`artifactId\` — required.
+- \`packages\` — required, **grouped object** \`{python?: string[], node?: string[]}\`. At least one bucket must contain at least one spec. \`python\` is installed via \`uv pip\`, \`node\` via \`npm\`. Pinned versions strongly preferred (e.g. \`"requests==2.31.0"\`, \`"pptxgenjs@3.12.0"\`).
+
+\`\`\`json
+// Python-only artifact:
+{ "artifactId": "...", "packages": { "python": ["markitdown[pptx]"] } }
+
+// Node-only artifact:
+{ "artifactId": "...", "packages": { "node": ["pptxgenjs"] } }
+
+// Mixed (script_runnable):
+{ "artifactId": "...", "packages": { "python": ["markitdown[pptx]"], "node": ["pptxgenjs"] } }
+\`\`\`
+
+**IDEMPOTENT:** existing entries are never removed; specs already present are silently skipped. To start fresh, create a new artifact via \`artifact_create\` with the desired \`packages\`.
+
+**REFUSED ON** non-runnable artifact types (code: \`not_runnable\`).
+
+**RESPONSE:** \`{runPackages, added, runPackagesByLang?, addedByLang?, message}\`. \`added\` / \`addedByLang\` list only the specs that were new.`,
+    inputSchema: artifactPackagesAddArgs,
+    execute: async (
+      ctx: ToolCtx,
+      args: ArtifactPackagesAddInput,
+      _options: ToolExecutionOptions,
+    ): Promise<ArtifactPackagesAddResult> => {
+      const { organizationId, threadId } = ctx;
+      if (!organizationId || !threadId) {
+        return {
+          success: false,
+          message:
+            'artifact_packages_add requires organizationId and threadId in the tool context.',
+        };
+      }
+      let artifactId;
+      try {
+        artifactId = toId<'artifacts'>(args.artifactId);
+      } catch (err) {
+        return {
+          success: false,
+          message: `Artifact id "${args.artifactId}" is malformed: ${err instanceof Error ? err.message : String(err)}`,
+        };
+      }
+      const artifact = await ctx.runQuery(
+        internal.artifacts.internal_queries.getById,
+        {
+          artifactId,
+          expectedOrganizationId: organizationId,
+          expectedThreadId: threadId,
+        },
+      );
+      if (!artifact) {
+        return {
+          success: false,
+          code: 'not_found',
+          message: `Artifact ${args.artifactId} not found in this thread.`,
+        };
+      }
+      if (!isRunnableArtifactType(artifact.type)) {
+        return {
+          success: false,
+          code: 'not_runnable',
+          message: `Artifact "${artifact.title}" is of type "${artifact.type}", which does not run packages. Only script_runnable (or legacy python_runnable / node_runnable) types support runPackages.`,
+        };
+      }
+      // Grouped buckets only (the schema `refine` ensures one is non-empty).
+      // The locked runtime's bucket is mirrored into the legacy flat
+      // `runPackages` field for single-runtime readers (audit row preview,
+      // canvas display); unlocked polyglot `script_runnable` mirrors python.
+      const locked = runnableLanguage(artifact.type);
+      const py = args.packages.python ?? [];
+      const node = args.packages.node ?? [];
+      const packagesAddByLang: { python?: string[]; node?: string[] } = {
+        ...(py.length > 0 && { python: py }),
+        ...(node.length > 0 && { node }),
+      };
+      const packagesAddFlat = locked === 'node' ? node : py;
+      const result = await ctx.runMutation(
+        internal.artifacts.internal_mutations.addArtifactPackages,
+        {
+          artifactId,
+          packagesAdd: packagesAddFlat,
+          ...(Object.keys(packagesAddByLang).length > 0 && {
+            packagesAddByLang,
+          }),
+        },
+      );
+      // Flat `added` mirrors a by-lang bucket: count distinct specs, not a sum.
+      const totalAdded = new Set([
+        ...result.added,
+        ...(result.addedByLang?.python ?? []),
+        ...(result.addedByLang?.node ?? []),
+      ]).size;
+      const addedNote =
+        totalAdded === 0
+          ? 'No new packages added (all were already present).'
+          : `Added ${totalAdded} package${totalAdded === 1 ? '' : 's'} (flat: ${result.added.join(', ') || '<none>'}; python: ${result.addedByLang?.python?.join(', ') || '<none>'}; node: ${result.addedByLang?.node?.join(', ') || '<none>'}).`;
+      return {
+        success: true,
+        artifactId: args.artifactId,
+        runPackages: result.runPackages,
+        added: result.added,
+        ...(result.runPackagesByLang !== undefined && {
+          runPackagesByLang: result.runPackagesByLang,
+        }),
+        ...(result.addedByLang !== undefined && {
+          addedByLang: result.addedByLang,
+        }),
+        message: `${addedNote} Current runPackages (${result.runPackages.length}): ${result.runPackages.join(', ') || '<empty>'}.`,
+      };
+    },
+  }),
+} as const satisfies ToolDefinition;
diff --git a/services/platform/convex/agent_tools/artifacts/artifact_run_tool.ts b/services/platform/convex/agent_tools/artifacts/artifact_run_tool.ts
new file mode 100644
index 000000000..da1b5e835
--- /dev/null
+++ b/services/platform/convex/agent_tools/artifacts/artifact_run_tool.ts
@@ -0,0 +1,825 @@
+/**
+ * Convex Tool: artifact_run
+ *
+ * Executes a `script_runnable` artifact (or its legacy
+ * `python_runnable` / `node_runnable` predecessors) in the sandbox.
+ * `artifact_create` creates the (empty) artifact and persists
+ * `runPackages` / `runPackagesByLang` / `runOptions` on the row;
+ * `artifact_file_create` / `artifact_file_update` populate the source
+ * files. This tool is the explicit, LLM-driven trigger to actually run
+ * them. Returns the full run outcome — including `runStatus`,
+ * `runErrorCode`, `runStderrPreview`, generated files — so the LLM can
+ * react to failures by calling `artifact_file_update` then
+ * `artifact_run` again.
+ *
+ * Per-step runtime selection: each executed file's interpreter is
+ * inferred from extension (`.py` → python3, `.js`/`.cjs`/`.mjs` →
+ * node). When the dispatched file set spans both runtimes, the
+ * spawner is called with `language: 'polyglot'` and the entrypoint
+ * installs both pip and npm package buckets in one container.
+ *
+ * Splitting execution out of `artifact_create` (Refinement 4) is what
+ * prevents the model from "fixing" a failure by emitting another
+ * `artifact_create` and stacking up duplicate artifact tabs.
+ */
+
+import type { ToolCtx } from '@convex-dev/agent';
+import { createTool } from '@convex-dev/agent';
+import type { ToolExecutionOptions } from 'ai';
+import { ConvexError } from 'convex/values';
+import { z } from 'zod/v4';
+
+import { internal } from '../../_generated/api';
+import { resolveArtifactFiles } from '../../artifacts/resolve_files';
+import { toId } from '../../lib/type_cast_helpers';
+import type { SandboxStepResult } from '../../sandbox/wire';
+import type { ToolDefinition } from '../types';
+import {
+  InvalidArtifactPathError,
+  classifyPackages,
+  inferStepLanguage,
+  isRunnableArtifactType,
+  refinePackagesObject,
+  runnableLanguage,
+  validatePath,
+} from './shared';
+
+/**
+ * Max `steps` per call (schema `.max()` + tool description). Duplicates
+ * `services/sandbox/src/wire.ts:MAX_STEPS_PER_REQUEST` since the spawner wire
+ * module is a separate package; the spawner's validator re-enforces the cap.
+ */
+const ARTIFACT_RUN_MAX_STEPS = 10;
+
+const artifactRunArgs = z // zod input schema for the artifact_run tool
+  .object({
+    artifactId: z
+      .string()
+      .describe(
+        'The id of the script_runnable artifact (or legacy python_runnable / node_runnable) to execute. Pass the artifactId returned by a prior `artifact_create` / `artifact_file_create` / `artifact_file_update` call.',
+      ),
+    path: z
+      .string()
+      .min(1)
+      .max(200)
+      .optional()
+      .describe(
+        "Single-script mode: file path within the artifact to execute. Defaults to the artifact's `entryFile`. Mutually exclusive with `steps`. Sibling files are still staged on disk so the executed script can `import` / `require` them.",
+      ),
+    steps: z
+      .array(
+        z.object({
+          path: z
+            .string()
+            .min(1)
+            .max(200)
+            .describe(
+              "Path inside the artifact's file tree to execute as this step.",
+            ),
+        }),
+      )
+      .min(1)
+      .max(ARTIFACT_RUN_MAX_STEPS)
+      .optional()
+      .describe(
+        'Multi-script mode: an ordered list of artifact files to execute IN SEQUENCE inside a single sandbox container. Each step sees the previous steps\' writes to `/workspace/output/`, so `[{path:"gen.py"},{path:"validate.py"}]` lets the validator inspect what the generator just wrote. Fail-fast: a non-zero exit aborts the remaining steps. Mutually exclusive with `path`.',
+      ),
+    timeoutMs: z
+      .number()
+      .int()
+      .min(1_000)
+      .max(300_000)
+      .optional()
+      .describe(
+        'Wall-clock cap including package install, in milliseconds. Applies to the WHOLE run (all steps combined). Default 30000, max 300000.',
+      ),
+    packages: z
+      .object({
+        python: z
+          .array(z.string().max(120))
+          .max(20)
+          .optional()
+          .describe('Pip specs (e.g. `markitdown[pptx]`).'),
+        node: z
+          .array(z.string().max(120))
+          .max(20)
+          .optional()
+          .describe('npm specs (e.g. `pptxgenjs`).'),
+      })
+      .optional()
+      .describe(
+        'One-off package override for this run only. Per-runtime buckets `{python?, node?}` — `python` is installed via `uv pip`, `node` via `npm`. Either bucket may be omitted. Usually omitted entirely — the artifact row already carries the `packages` you supplied at create time / via `artifact_packages_add`.',
+      )
+      .superRefine((val, ctx) => { // NOTE(review): chained after .optional(), so val may be undefined — confirm helper tolerates it
+        refinePackagesObject(val, (issue) => ctx.addIssue(issue));
+      }),
+    inputs: z
+      .object({
+        from_run: z
+          .string()
+          .min(1)
+          .describe(
+            'Either the literal string `"latest"` (use the most recent SUCCESSFUL run\'s outputs — the default behaviour when `inputs` is omitted) or a specific runId returned by a prior `artifact_run` call. When a runId is passed, that exact run\'s output files are pre-staged into `/workspace/output/` regardless of whether it succeeded or failed — useful for re-attempting analysis against a known intermediate state.',
+          ),
+      })
+      .optional()
+      .describe(
+        'Explicit pre-stage source for `/workspace/output/`. Omit to inherit the default ("latest succeeded run"). Pass a specific `{from_run: "<runId>"}` to pin to a particular prior run.',
+      ),
+    // NOTE: `allowSdist` / `allowInstallScripts` were previously LLM-callable
+    // here. They were removed (round-2 R2-B4) because a prompt-injected agent
+    // could disable the install-safety guards then ship an evil-pkg whose
+    // postinstall hook runs inside the runtime container. Installs are now
+    // hardcoded to use `pip --only-binary=:all:` + `npm --ignore-scripts`.
+  })
+  .superRefine((val, ctx) => { // cross-field rule: at most one of `path` / `steps` (both may be omitted)
+    if (val.path !== undefined && val.steps !== undefined) {
+      ctx.addIssue({
+        code: 'custom',
+        path: ['steps'],
+        message:
+          '`path` and `steps` are mutually exclusive. Use `steps` for multi-step workflows; use `path` (or omit both) for a single-script run.',
+      });
+    }
+  });
+
+type ArtifactRunInput = z.infer<typeof artifactRunArgs>; // parsed args type consumed by the tool's `execute`
+
+interface RunOutputFile { // one entry of a run's `files[]` (a deliverable output file)
+  name: string;
+  storageId: string;
+  fileMetadataId: string;
+  size: number; // presumably bytes — TODO confirm
+  contentType: string;
+}
+
+interface ArtifactRunSuccess { // full run outcome; despite the name, `success` is false for failed / cancelled runs
+  success: boolean; // runStatus === 'completed' AND files.length > 0
+  artifactId: string;
+  revision: number;
+  runStatus: 'completed' | 'failed' | 'cancelled';
+  runExitCode: number | null;
+  runErrorCode?: string;
+  runErrorMessage?: string;
+  runStdoutPreview: string;
+  runStderrPreview: string;
+  durationMs: number;
+  files: RunOutputFile[];
+  executionId: string; // audit-row link (per the tool description)
+  /**
+   * The persistent `artifactRuns` row id created for this run (Phase 2
+   * onward). Pass it back as `inputs: { from_run: "<runId>" }` on a
+   * follow-up call to pin pre-staging to this run's outputs. Omitted if
+   * the run never reached finalize (rare — only on infra crashes that
+   * never enter the finalize path).
+   */
+  runId?: string;
+  /**
+   * Populated only when the request used multi-step mode. One entry per
+   * requested step in submission order with per-step outcome. `skipped`
+   * means a prior step's failure aborted this one.
+   */
+  steps?: SandboxStepResult[];
+  /**
+   * Pre-stage attestation summary (crispy-curry plan §3). Populated on
+   * every run that had `priorOutputDownloads`. `staged[]` lists files the
+   * spawner confirmed landed in `/workspace/output/` before user code ran;
+   * `skipped[]` lists any expected files that didn't make it, with a
+   * structured reason. When `skipped[].length > 0` the run terminates
+   * with `runErrorCode: "PRE_STAGE_FAILED"` BEFORE user code runs — use
+   * `inputs.from_run` to pin an older snapshot if a specific blob has
+   * gone missing.
+   */
+  preStage?: {
+    staged: string[];
+    skipped: Array<{ name: string; reason: string; detail: string }>;
+  };
+  message: string;
+}
+
+interface ArtifactRunFailure { // minimal rejection shape used by the validation early-returns in `execute`
+  success: false;
+  /**
+   * Structured failure code so the LLM can branch on cause without
+   * substring-matching the human-readable `message`. Currently emitted by
+   * the `inputs.from_run` validator; other code paths leave it unset for
+   * legacy compatibility.
+   */
+  code?: 'pin_target_not_found';
+  message: string;
+}
+
+type ArtifactRunResult = ArtifactRunSuccess | ArtifactRunFailure; // `success === false` does not narrow: ArtifactRunSuccess.success is `boolean`
+
+interface ExecuteCodeResult { // NOTE(review): presumably the sandbox `executeCode` action's return shape — confirm against its definition
+  executionId: string;
+  success: boolean;
+  status: 'completed' | 'failed' | 'cancelled';
+  exitCode: number | null;
+  errorCode?: string;
+  errorMessage?: string;
+  stdoutPreview: string;
+  stderrPreview: string;
+  durationMs: number;
+  files: RunOutputFile[];
+  steps?: SandboxStepResult[];
+  /**
+   * Pre-stage attestation block (crispy-curry plan §3) — present when the
+   * request had `priorOutputDownloads`. Forwarded straight through to the
+   * tool result so the LLM sees what was staged and what was skipped.
+   */
+  preStage?: {
+    staged: string[];
+    skipped: Array<{ name: string; reason: string; detail: string }>;
+  };
+}
+
+export const artifactRunTool = {
+  name: 'artifact_run' as const,
+  tool: createTool({
+    description: `**artifact_run** — execute a runnable artifact (\`script_runnable\`, or its legacy single-language predecessors \`python_runnable\` / \`node_runnable\`) in the sandbox and return the run outcome.
+
+USE THIS TOOL after \`artifact_create\` + \`artifact_file_update\`/\`artifact_file_create\` (to run the entry script) or after a subsequent \`artifact_file_update\` (to re-run a patched revision). The previously-configured \`runPackages\` are reused unless you override; add new dependencies via \`artifact_packages_add\`.
+
+**WORKSPACE LIFECYCLE — READ FIRST.**
+- Every \`artifact_run\` invocation gets a **brand-new** \`/workspace/\` directory.
+- As a convenience, the artifact's **most recent run outputs** are pre-staged back into \`/workspace/output/\` before the script starts (up to ~10 MiB total). A follow-up \`artifact_run\` on the same artifact can therefore read what an earlier run produced — e.g. \`validate.py\` opens the \`.pptx\` that \`generate.py\` wrote on the previous call. If aggregate prior outputs exceed the cap, the pre-stage is skipped and a note appears in stderr; do not rely on this backstop for large workflows.
+- For tightly-coupled chains (build → test, generate → validate) **prefer \`steps: [...]\`** — same container, atomic outcome, fail-fast across steps, one round trip. Pre-staging is the safety net when separate calls are unavoidable, not a replacement for \`steps\`.
+- Creation patterns are unaffected: \`Presentation(path)\` *opens* an existing file. To create a new artifact output, call \`Presentation()\` (no arg), populate, then \`.save(...)\`.
+
+**MULTI-STEP WORKFLOWS — preferred over splitting into multiple \`artifact_run\` calls.**
+
+For generate-then-validate / build-then-test patterns, pass \`steps\` instead of \`path\`. All steps execute **sequentially inside the same container** and share \`/workspace/\`, so step 2 sees what step 1 wrote.
+
+\`\`\`json
+artifact_run({
+  artifactId,
+  steps: [{ "path": "gen.py" }, { "path": "validate.py" }]
+})
+\`\`\`
+
+- Fail-fast: a non-zero exit from any step aborts the remaining steps. Each step's exit code + duration come back in \`steps[]\` with \`status: "completed" | "failed" | "skipped"\`.
+- All files in the artifact are staged under \`/workspace/code/<path>\`, so step scripts can also \`import\` / \`require\` siblings the normal way.
+- Up to ${ARTIFACT_RUN_MAX_STEPS} steps per call. The overall \`timeoutMs\` is shared across all steps.
+- Step paths must reference existing non-empty files in the artifact. Any filename works — \`main.py\`, \`gen.py\`, \`test.py\`, whatever you used when you created the file.
+
+**Single-script mode** (use when there's nothing to chain): omit both \`steps\` and \`path\` to run the artifact's \`entryFile\`, or pass \`path\` to run a specific sibling file. \`subprocess.run(['python', 'validate.py'])\` from within the entry script also works if you want orchestration logic in-script.
+
+**ONE ARTIFACT, MANY RUNNABLE FILES.** Keep multi-script workflows in ONE artifact. Do NOT call \`artifact_create\` twice for "generator" and "validator" — add sibling files via \`artifact_file_create({artifactId, path:'validate.py', content:...})\` and reference them via \`steps\`.
+
+**DO NOT use this tool for:**
+- Static artifact types (\`html\`, \`svg\`, \`mermaid\`, \`markdown\`, \`code\`) — those render in the browser, not the sandbox. The tool will refuse them with a clear error.
+- Free-form code that isn't tied to an artifact. There is no other path; everything goes through an artifact.
+
+**MIXED-LANGUAGE STEPS.** For a \`script_runnable\` artifact you can mix \`.py\` and \`.js\` files in the same project — each step's interpreter is chosen from its extension (\`.py\` → python3, \`.js\`/\`.cjs\`/\`.mjs\` → node). Dependencies are always declared as a per-runtime object: \`{python?: string[], node?: string[]}\` — usually persisted via \`artifact_create\`'s \`packages\` or a later \`artifact_packages_add\`. The optional \`packages\` arg here is a one-shot override with the same shape.
+
+**SANDBOX ENVIRONMENT:**
+- Python 3.12 / Node 24 with on-demand \`pip\` / \`npm\` install per the row's \`runPackages\` (legacy) or \`runPackagesByLang\` (grouped). Mixed-language runs install both in the same container.
+- Wall-clock ≤300s (default 30s; raise via \`timeoutMs\`). Applies to the WHOLE run.
+- Memory cap 1 GB, 1 CPU.
+- Egress restricted to package registries (\`pypi.org\`, \`files.pythonhosted.org\`, \`registry.npmjs.org\`, GitHub release endpoints). Any other host returns \`EGRESS_DENIED\`.
+- Output files **must** be written under \`/workspace/output/\` to be collected.
+- stdout/stderr captured (16 KB preview returned; full text in \`_storage\` if larger). In multi-step mode the wrapper prints a \`====== STEP N/M: <path> ======\` banner around each step so the combined log stays readable.
+- **System binaries baked in**: \`python3.12\`, \`node\` (24), \`uv\`, \`npm\`, \`jq\`, \`fontconfig\`. **NOT available**: LibreOffice / \`soffice\`, \`pandoc\`, ImageMagick, \`ffmpeg\`, headless browsers (Chromium / Playwright), \`pdftoppm\` / Poppler, Tesseract OCR, or any other document/media-conversion tooling. If a task needs one of these, the sandbox cannot do it — tell the user.
+- **No runtime system-package install**: the container runs as unprivileged UID \`65534\` with a read-only root filesystem and no \`CAP_SYS_ADMIN\`. \`apt-get\`, \`dnf\`, \`brew\`, etc. cannot succeed. Only Python/Node packages declared via the artifact's \`packages\` field (installed via \`uv pip\` / \`npm\` to \`/workspace/.deps/\`) work. A user script that runs \`pip install\` directly will hit "site-packages not writeable" — declare the dep instead. Do **not** invent pip packages that wrap a missing system binary (e.g. there is no \`libreoffice-python\`, no \`pandoc-python\` on PyPI — these are LLM hallucinations).
+- **Out-of-scope task shapes** (do NOT attempt — surface the limitation to the user): visual rendering QA of PPTX/DOCX/PDF (the sandbox can rasterize images but has no vision LLM to judge them — that step belongs to the orchestrating agent, not the sandbox), headless-browser scraping or screenshotting, GUI / X11 binaries, OCR, video transcoding. For PPTX in particular, **content** QA via \`markitdown[pptx]\` (text extraction, placeholder scan, topic grep) is in-scope; **visual** QA is not.
+
+**ON FAILURE — read \`runStderrPreview\` BEFORE replying to the user.** When a multi-step run fails, check \`steps[]\` to see WHICH step failed and only re-run / patch that one. Recovery table:
+
+| \`runErrorCode\` | Meaning | Recovery |
+|---|---|---|
+| \`RUNTIME_ERROR\` | Code threw (most common) | Read stderr traceback, \`artifact_file_read\` then \`artifact_file_update\` to fix the offending step, then \`artifact_run\` again |
+| \`TIMEOUT\` | Wall-clock exceeded | Raise \`timeoutMs\` on the next \`artifact_run\` call, or \`artifact_file_update\` to split the work into multiple files / steps |
+| \`OOM\` | Memory cap hit (1 GB) | \`artifact_file_update\` to stream / reduce data in memory, then \`artifact_run\` again |
+| \`EGRESS_DENIED\` | Tried to reach a non-registry host | \`artifact_file_update\` to remove the external call — use the \`web\` tool instead |
+| \`INSTALL_FAILED\` | Package install errored | Read stderr. If a **Python/Node package** is misspelled or unresolved, call \`artifact_packages_add\` with the corrected spec (or re-create the artifact with a fresh package list), then \`artifact_run\` again. If stderr points to a **missing system binary** (e.g. \`soffice: not found\`, \`command not found: pandoc\`, \`pdftoppm: not found\`), **stop** — the sandbox cannot install system packages and there is no PyPI/npm wrapper that brings one. Report the limitation to the user and propose a pure-Python/Node alternative or a different approach. |
+| \`PACKAGE_NOT_FOUND\` | A spec doesn't resolve | \`artifact_packages_add\` with an alternate package name |
+| \`QUOTA_EXCEEDED\` | Org daily CPU cap | Don't retry — tell the user to wait |
+| \`SPAWNER_UNAVAILABLE\` | Transient infra | One \`artifact_run\` retry is fine; if it fails again, surface to user |
+| \`HARVEST_READ_FAILED\` | Sandbox couldn't read output dir | Check stderr — the script likely didn't write the expected file (typo in path, wrong cwd) |
+| \`UPLOAD_FAILED\` | Output upload to storage failed | One retry is fine — usually a transient blip on the storage path |
+| \`UPLOAD_QUOTA_EXCEEDED\` | Per-run output-file cap hit (>16 files) | Consolidate small files into a tar/zip, OR split work into multiple \`artifact_run\` calls / \`steps\` |
+| \`UPLOAD_REPORT_FAILED\` | Upload recorded with a delay | Non-fatal; check the audit row's \`uploadedStorageIds\` if files seem missing |
+
+**HARD RULE — NEVER tell the user the file is ready / generated / done unless \`success === true\` AND \`files.length > 0\`.** That is the most reported bug for this flow.
+
+**RESPONSE:** returns \`runStatus\`, \`runExitCode\`, optional \`runErrorCode\` / \`runErrorMessage\`, \`runStdoutPreview\`, \`runStderrPreview\`, \`files[]\` (the deliverable output files, each with \`name\` / \`storageId\` / \`size\` / \`contentType\`), \`durationMs\`, \`executionId\` (audit-row link), and \`steps[]\` when multi-step.`,
+    inputSchema: artifactRunArgs,
+    execute: async (
+      ctx: ToolCtx,
+      args: ArtifactRunInput,
+      options: ToolExecutionOptions,
+    ): Promise<ArtifactRunResult> => {
+      const { organizationId, threadId, messageId, userId } = ctx;
+      if (!organizationId || !threadId) {
+        return {
+          success: false,
+          message:
+            'artifact_run requires organizationId and threadId in the tool context.',
+        };
+      }
+      if (!userId) {
+        return {
+          success: false,
+          message: 'artifact_run requires userId in the tool context.',
+        };
+      }
+
+      // `toId` is a pure cast; it never throws. The Convex `v.id('artifacts')`
+      // validator inside `runQuery(getById)` is the real throw site for a
+      // malformed id, so wrap THAT call, not toId. Mirrors the pattern in
+      // the file_* tools.
+      const artifactId = toId<'artifacts'>(args.artifactId);
+      let artifact;
+      try {
+        artifact = await ctx.runQuery(
+          internal.artifacts.internal_queries.getById,
+          {
+            artifactId,
+            expectedOrganizationId: organizationId,
+            expectedThreadId: threadId,
+          },
+        );
+      } catch (err) {
+        const message = err instanceof Error ? err.message : String(err);
+        return {
+          success: false,
+          message: `Artifact id "${args.artifactId}" is malformed or inaccessible: ${message}`,
+        };
+      }
+      if (!artifact) {
+        return {
+          success: false,
+          message: `Artifact ${args.artifactId} not found in this thread.`,
+        };
+      }
+      if (!isRunnableArtifactType(artifact.type)) {
+        return {
+          success: false,
+          message: `Artifact ${args.artifactId} is type "${artifact.type}". artifact_run only runs script_runnable (or legacy python_runnable / node_runnable) types. Static types (html / svg / mermaid / markdown / code) render in the browser, not in the sandbox.`,
+        };
+      }
+      // Legacy single-runtime types (`python_runnable` / `node_runnable`)
+      // pin the runtime regardless of file extensions — preserves
+      // behavior for rows created before script_runnable existed. New
+      // `script_runnable` rows infer per-step / per-target.
+      const lockedLanguage = runnableLanguage(artifact.type);
+
+      // Resolve which files to execute. Two modes:
+      //   - Multi-step (`args.steps`): each step path must reference an
+      //     existing artifact file with non-empty content. All sibling
+      //     files are still staged on disk so steps can `import` /
+      //     `require` each other. There is no user-facing reserved name:
+      //     the spawner's wrapper lives at /workspace/.tale/runner.{py,js},
+      //     a dotfile-segment dir unreachable from artifact paths.
+      //   - Single-script: `args.path` or entryFile names the executed
+      //     file; the runtime entrypoint exec()s it at its declared path.
+      const resolved = resolveArtifactFiles(artifact);
+
+      type DispatchSingle = {
+        kind: 'single';
+        targetPath: string;
+        targetContent: string;
+      };
+      type DispatchSteps = {
+        kind: 'steps';
+        stepPaths: string[];
+      };
+      let dispatch: DispatchSingle | DispatchSteps;
+
+      if (args.steps !== undefined) {
+        const stepPaths: string[] = [];
+        const seen = new Set<string>();
+        for (let i = 0; i < args.steps.length; i += 1) {
+          const raw = args.steps[i]?.path ?? '';
+          let validated: string;
+          try {
+            validated = validatePath(raw);
+          } catch (err) {
+            if (err instanceof InvalidArtifactPathError) {
+              return {
+                success: false,
+                message: `steps[${i}].path "${raw}" rejected (${err.code}): ${err.message}`,
+              };
+            }
+            throw err;
+          }
+          if (seen.has(validated)) {
+            return {
+              success: false,
+              message: `steps[${i}].path "${validated}" appears twice. Each step path must be unique within one artifact_run call.`,
+            };
+          }
+          seen.add(validated);
+          const entry = resolved.files.find((f) => f.path === validated);
+          if (!entry) {
+            const known = resolved.files.map((f) => f.path).join(', ');
+            return {
+              success: false,
+              message: `steps[${i}].path "${validated}" is not in artifact ${args.artifactId}. Available paths: ${known}. Call artifact_file_create to add the file first if you intended to.`,
+            };
+          }
+          if (entry.content.length === 0) {
+            return {
+              success: false,
+              message: `steps[${i}].path "${validated}" is empty. Call artifact_file_update({artifactId, path: "${validated}", content: ..., expectedRevision}) first.`,
+            };
+          }
+          stepPaths.push(validated);
+        }
+        dispatch = { kind: 'steps', stepPaths };
+      } else {
+        let targetPath: string;
+        if (args.path !== undefined) {
+          try {
+            targetPath = validatePath(args.path);
+          } catch (err) {
+            if (err instanceof InvalidArtifactPathError) {
+              return {
+                success: false,
+                message: `path "${args.path}" rejected (${err.code}): ${err.message}`,
+              };
+            }
+            throw err;
+          }
+        } else {
+          targetPath = resolved.entryFile;
+        }
+        const targetEntry = resolved.files.find((f) => f.path === targetPath);
+        if (!targetEntry) {
+          const known = resolved.files.map((f) => f.path).join(', ');
+          return {
+            success: false,
+            message: `Artifact ${args.artifactId} has no file at path "${targetPath}". Available paths: ${known}.`,
+          };
+        }
+        if (targetEntry.content.length === 0) {
+          return {
+            success: false,
+            message: `Artifact ${args.artifactId} file "${targetPath}" is empty. Call artifact_file_update({artifactId, path: "${targetPath}", content: ..., expectedRevision}) first.`,
+          };
+        }
+        dispatch = {
+          kind: 'single',
+          targetPath,
+          targetContent: targetEntry.content,
+        };
+      }
+
+      // Collect the per-step runtimes the dispatch resolves to. Legacy
+      // single-runtime artifacts pin every step to their type's language
+      // (e.g. a `python_runnable` runs `helpers.js` with python — the
+      // wrapper would explode, but that's the legacy contract that
+      // pre-dated mixed-extension files). `script_runnable` rows infer
+      // per file: `.py` → python, `.js`/`.cjs`/`.mjs` → node. Anything
+      // else fails fast before we hit the sandbox.
+      const dispatchedPaths =
+        dispatch.kind === 'single' ? [dispatch.targetPath] : dispatch.stepPaths;
+      const runtimesNeeded = new Set<'python' | 'node'>();
+      if (lockedLanguage !== null) {
+        runtimesNeeded.add(lockedLanguage);
+      } else {
+        for (const path of dispatchedPaths) {
+          const lang = inferStepLanguage(path);
+          if (lang === null) {
+            return {
+              success: false,
+              message: `Path "${path}" has no recognized polyglot interpreter — supported extensions are .py, .js, .cjs, .mjs. Rename the file or split the run into separate \`steps\` if you intended multiple languages.`,
+            };
+          }
+          runtimesNeeded.add(lang);
+        }
+      }
+      // Choose the wire `language` for the spawner request. A pure-
+      // Python or pure-Node file set sends the lighter single-language
+      // path so legacy spawner code (and any operator dashboards keyed
+      // off `language`) keep working. Only true mixed runs send polyglot.
+      let spawnerLanguage: 'python' | 'node' | 'polyglot';
+      if (runtimesNeeded.size === 2) {
+        spawnerLanguage = 'polyglot';
+      } else if (runtimesNeeded.has('python')) {
+        spawnerLanguage = 'python';
+      } else {
+        spawnerLanguage = 'node';
+      }
+      // Polyglot requires multi-step (the spawner validator enforces this
+      // too, but rejecting here is a better diagnostic). A single-script
+      // polyglot request would just be a single-language run.
+      if (spawnerLanguage === 'polyglot' && dispatch.kind === 'single') {
+        return {
+          success: false,
+          message: `Polyglot runs require \`steps\` mode (one entry per file in execution order). Pass \`steps: [{path: "..."}]\` instead of \`path\`.`,
+        };
+      }
+
+      // Refresh the run-state row in case the user already saw a previous
+      // run's status — initArtifactRun resets runStatus to 'queued', clears
+      // runProgress / runErrorCode / etc. so the canvas right pane updates
+      // cleanly during this new run. The artifact row's persistent
+      // runPackages / runOptions are NOT overwritten here; per-call args
+      // are applied transiently to the spawner request below.
+      //
+      // initArtifactRun throws RUN_IN_FLIGHT if another run is still active
+      // on this artifact — surface as a structured failure so the LLM waits
+      // instead of racing with itself.
+      try {
+        await ctx.runMutation(
+          internal.artifacts.internal_mutations.initArtifactRun,
+          { artifactId },
+        );
+      } catch (err) {
+        if (
+          err instanceof ConvexError &&
+          typeof err.data === 'object' &&
+          err.data !== null &&
+          // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- ConvexError data shape is loose
+          (err.data as { code?: string }).code === 'RUN_IN_FLIGHT'
+        ) {
+          return {
+            success: false,
+            message: `Artifact ${args.artifactId} already has a run in flight. Wait for the current run to finish, then call artifact_run again. Do NOT call artifact_create or stack parallel runs.`,
+          };
+        }
+        throw err;
+      }
+
+      // Resolve effective packages for this run:
+      //   1. Pull persisted state from the artifact row (grouped form
+      //      first, fall back to legacy flat list routed to the
+      //      artifact's locked-or-inferred runtime).
+      //   2. Apply the per-call override (grouped `{python?, node?}`
+      //      only) — replacing the persisted state rather than
+      //      merging, so the LLM can opt to install a different set for
+      //      this one run.
+      //   3. Drop buckets the dispatched file set won't use (keeps the
+      //      install phase tight when an artifact has stale Node deps
+      //      from an earlier mixed run).
+      let pythonBucket: string[] = [];
+      let nodeBucket: string[] = [];
+      if (args.packages !== undefined) {
+        // Per-call grouped override. Either bucket may be omitted; an
+        // omitted bucket means "this run doesn't need that runtime's
+        // packages" — NOT "fall back to persisted state for that
+        // bucket" (overrides are absolute by design so the LLM can
+        // declare a clean-room run).
+        pythonBucket = args.packages.python ?? [];
+        nodeBucket = args.packages.node ?? [];
+      } else {
+        // No override — fall back to persisted state.
+        const stored = artifact.runPackagesByLang;
+        if (stored !== undefined) {
+          pythonBucket = stored.python ?? [];
+          nodeBucket = stored.node ?? [];
+        }
+        // Legacy `runPackages` (flat). Pre-grouped data may still carry
+        // prefixed specs (`python:foo`) from older code paths or
+        // hand-edited rows — `classifyPackages` strips the prefix and
+        // routes correctly so a stale flat entry doesn't ship a Python
+        // spec to npm. Only fills an empty bucket; never shadows the
+        // grouped state above.
+        const flat = artifact.runPackages ?? [];
+        if (flat.length > 0) {
+          // Default the un-prefixed specs to whichever runtime the
+          // dispatched files need (when single). For a mixed run, the
+          // flat list is ambiguous and we default to python.
+          const flatDefaultLang: 'python' | 'node' =
+            runtimesNeeded.size === 1 && runtimesNeeded.has('node')
+              ? 'node'
+              : 'python';
+          const classified = classifyPackages(flat, flatDefaultLang);
+          if (pythonBucket.length === 0) pythonBucket = classified.python;
+          if (nodeBucket.length === 0) nodeBucket = classified.node;
+        }
+      }
+      // Drop buckets the dispatched file set doesn't need so the
+      // entrypoint skips that install pass entirely.
+      if (!runtimesNeeded.has('python')) pythonBucket = [];
+      if (!runtimesNeeded.has('node')) nodeBucket = [];
+
+      const packagesByLang: { python?: string[]; node?: string[] } = {};
+      if (pythonBucket.length > 0) packagesByLang.python = pythonBucket;
+      if (nodeBucket.length > 0) packagesByLang.node = nodeBucket;
+      const hasGrouped = Object.keys(packagesByLang).length > 0;
+      // For single-language runs keep the legacy flat `packages` field
+      // populated so audit downstreams (and any code that hasn't been
+      // taught about the grouped shape) still see the install list.
+      let legacyFlat: string[] | undefined;
+      if (spawnerLanguage === 'python') {
+        legacyFlat = pythonBucket.length > 0 ? pythonBucket : undefined;
+      } else if (spawnerLanguage === 'node') {
+        legacyFlat = nodeBucket.length > 0 ? nodeBucket : undefined;
+      }
+      // `allowSdist` / `allowInstallScripts` are no longer LLM-callable; the
+      // legacy persisted `artifact.runOptions` is intentionally ignored.
+      // Server-side, `executeCode` always sends `false` for both flags.
+
+      // Resolve the agentSlug attribution from threadMetadata. The audit
+      // row records this so per-agent usage / model-cost analytics
+      // (project_usage_analytics) can attribute sandbox spend correctly.
+      // Best-effort: if the lookup fails or the metadata row is missing,
+      // we just skip the field — sandbox execution is not blocked.
+      const threadMeta = await ctx
+        .runQuery(internal.threads.internal_queries.getThreadMetadata, {
+          threadId,
+          callerOrgId: organizationId,
+        })
+        .catch((err) => {
+          console.warn(
+            '[artifact_run_tool] threadMetadata lookup failed:',
+            err,
+          );
+          return null;
+        });
+      const agentSlug = threadMeta?.agentSlug;
+
+      // Validate `inputs.from_run` against this artifact BEFORE we dispatch
+      // the run. Without this, the spawner action's `getLatestRunOutputs`
+      // would silently fall back to "latest succeeded" on a malformed or
+      // cross-artifact runId, pinning the run to outputs the LLM never
+      // intended. Surface the error directly so the LLM can correct the
+      // call instead of getting a confusing diff later.
+      if (args.inputs?.from_run !== undefined) {
+        const verdict = await ctx.runQuery(
+          internal.artifacts.internal_queries.validateRunIdForArtifact,
+          { artifactId, runId: args.inputs.from_run },
+        );
+        if (verdict !== 'ok') {
+          const reasonMessage =
+            verdict === 'malformed_run_id'
+              ? `'${args.inputs.from_run}' is not a valid artifactRuns id. Either omit \`inputs.from_run\` (uses the latest succeeded run by default) or pass an exact runId from a prior \`artifact_run\` response.`
+              : verdict === 'run_not_found'
+                ? `runId '${args.inputs.from_run}' was not found (it may have been GC'd, or you copied from a different deploy). Omit \`inputs.from_run\` to use the latest succeeded run, or call \`artifact_list_runs\` to enumerate available runIds.`
+                : `runId '${args.inputs.from_run}' belongs to a different artifact. Pre-stage only works against runs from this same artifact (\`${artifactId}\`).`;
+          return {
+            success: false,
+            code: 'pin_target_not_found',
+            message: reasonMessage,
+          };
+        }
+      }
+
+      let raw: unknown;
+      try {
+        raw = await ctx.runAction(
+          internal.node_only.sandbox.internal_actions.executeCode,
+          {
+            organizationId,
+            uploadedBy: userId,
+            threadId,
+            ...(messageId !== undefined && { messageId }),
+            ...(options.toolCallId && { toolCallId: options.toolCallId }),
+            ...(agentSlug !== undefined && { agentSlug }),
+            language: spawnerLanguage,
+            // Single-script mode sends `entryPath` (the file the runtime
+            // entrypoint exec()s). Multi-step mode sends `steps[]` and
+            // lets the spawner generate the wrapper under /workspace/.tale/.
+            // Mutual exclusion is enforced by the action AND the spawner
+            // validator — pass exactly one branch.
+            ...(dispatch.kind === 'single' && {
+              entryPath: dispatch.targetPath,
+            }),
+            ...(dispatch.kind === 'steps' && { steps: dispatch.stepPaths }),
+            // Stage every file in the project so siblings are importable.
+            // The spawner writes each to /workspace/code/<path>.
+            files: resolved.files.map((f) => ({
+              path: f.path,
+              content: f.content,
+            })),
+            ...(legacyFlat !== undefined && { packages: legacyFlat }),
+            ...(hasGrouped && { packagesByLang }),
+            ...(args.timeoutMs !== undefined && { timeoutMs: args.timeoutMs }),
+            ...(args.inputs?.from_run !== undefined && {
+              inputs: { fromRun: args.inputs.from_run },
+            }),
+            // allowSdist / allowInstallScripts intentionally omitted — the
+            // action hardcodes both to false (round-2 R2-B4).
+            purpose: `artifact_run: ${artifact.title}`,
+            artifactId,
+          },
+        );
+      } catch (err) {
+        // The action's contract is: infra failures → finalize THEN throw,
+        // user-code failures → finalize THEN return. So if we land here,
+        // either (a) reserveSlotAndInsert rejected with QUOTA_EXCEEDED
+        // before the audit row existed, or (b) spawnerExecute failed and
+        // failExecution already wrote terminal state to BOTH rows. In
+        // case (a) the artifact is still 'queued' from initArtifactRun
+        // above, so we must finalize it ourselves; case (b) is idempotent
+        // because finalizeArtifactRun's terminal guard no-ops on the
+        // second write.
+        const isConvexError = err instanceof ConvexError;
+        const code =
+          isConvexError &&
+          typeof err.data === 'object' &&
+          err.data !== null &&
+          // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- ConvexError data shape is loose
+          typeof (err.data as { code?: string }).code === 'string'
+            ? // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- ConvexError data shape is loose
+              (err.data as { code: string }).code
+            : undefined;
+        const errMessage = err instanceof Error ? err.message : String(err);
+        const runErrorCode =
+          code === 'QUOTA_EXCEEDED' ? 'QUOTA_EXCEEDED' : 'SPAWNER_UNAVAILABLE';
+        try {
+          // No runExecutionId here: when reserveSlotAndInsert throws (e.g.
+          // QUOTA_EXCEEDED pre-insert) no audit row exists; when
+          // spawnerExecute throws, the action's failExecution already wrote
+          // the executionId onto the artifact row, and the terminal guard
+          // makes this call a no-op.
+          await ctx.runMutation(
+            internal.artifacts.internal_mutations.finalizeArtifactRun,
+            {
+              artifactId,
+              runStatus: 'failed',
+              runErrorCode,
+              runErrorMessage: errMessage,
+              runOutputFiles: [],
+            },
+          );
+        } catch (finalizeErr) {
+          console.warn(
+            '[artifact_run_tool] finalizeArtifactRun after executeCode throw failed:',
+            finalizeErr,
+          );
+        }
+        const message =
+          runErrorCode === 'QUOTA_EXCEEDED'
+            ? `Run REFUSED: QUOTA_EXCEEDED — ${errMessage}. Don't retry; tell the user the org's daily sandbox budget is exhausted.`
+            : `Run FAILED before completion: ${errMessage}. One retry is fine if the underlying cause was transient; otherwise tell the user the sandbox is unavailable.`;
+        return {
+          success: false,
+          message,
+        };
+      }
+      // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- executeCode is typed `any` via the stale agent-SDK codegen path; the runtime shape is ExecuteCodeResult (asserted at the action return site).
+      const run = raw as ExecuteCodeResult;
+
+      const completed = run.status === 'completed';
+      const hasFiles = run.files.length > 0;
+      const success = completed && hasFiles;
+
+      // Locate the first failed step (if multi-step) so the message can
+      // name it directly — the LLM should patch THAT step, not the others.
+      const failedStep =
+        run.steps?.find((s) => s.status === 'failed') ?? undefined;
+      const totalSteps = run.steps?.length ?? 0;
+      const failedIdx =
+        failedStep && run.steps
+          ? run.steps.findIndex((s) => s === failedStep)
+          : -1;
+      const stepSuffix =
+        failedStep && totalSteps > 0
+          ? ` Step ${failedIdx + 1}/${totalSteps} ("${failedStep.path}") exited ${failedStep.exitCode ?? 'null'}; earlier steps completed.`
+          : '';
+
+      let message: string;
+      if (success) {
+        if (run.steps && run.steps.length > 0) {
+          const pathList = run.steps.map((s) => s.path).join(' → ');
+          message = `Ran "${artifact.title}" successfully across ${run.steps.length} step(s) [${pathList}]; produced ${run.files.length} output file(s) in ${run.durationMs}ms.`;
+        } else {
+          message = `Ran "${artifact.title}" successfully; produced ${run.files.length} output file(s) in ${run.durationMs}ms.`;
+        }
+      } else if (run.errorCode) {
+        message = `Run FAILED: ${run.errorCode}${run.errorMessage ? ` — ${run.errorMessage}` : ''}.${stepSuffix} Read runStderrPreview and call artifact_file_update on the SAME artifactId to fix${failedStep ? ` "${failedStep.path}"` : ''}, then artifact_run again. Do NOT call artifact_create — that creates a duplicate. Do NOT say the file is ready.`;
+      } else {
+        message = `Run finished with status=${run.status} but produced no output files.${stepSuffix} Inspect runStdoutPreview / runStderrPreview and decide whether to artifact_file_update + re-run.`;
+      }
+
+      // Surface the artifactRuns row id created by `applyFinalizeArtifactRun`
+      // so the LLM can pin a later run's pre-stage with
+      // `inputs: { from_run: "<runId>" }`. Lookup-by-executionId keeps the
+      // tool-side change small (no plumbing through executeCode's return).
+      // Best-effort: if finalize never ran (rare infra crash) we omit runId.
+      const runRow = await ctx
+        .runQuery(internal.artifacts.internal_queries.getRunByExecutionId, {
+          executionId: toId<'sandboxExecutions'>(run.executionId),
+        })
+        .catch((err) => {
+          console.warn('[artifact_run_tool] getRunByExecutionId failed:', err);
+          return null;
+        });
+
+      return {
+        success,
+        artifactId: args.artifactId,
+        revision: artifact.revision,
+        runStatus: run.status,
+        runExitCode: run.exitCode,
+        ...(run.errorCode !== undefined && { runErrorCode: run.errorCode }),
+        ...(run.errorMessage !== undefined && {
+          runErrorMessage: run.errorMessage,
+        }),
+        runStdoutPreview: run.stdoutPreview,
+        runStderrPreview: run.stderrPreview,
+        durationMs: run.durationMs,
+        files: run.files,
+        executionId: run.executionId,
+        ...(runRow !== null && { runId: String(runRow._id) }),
+        ...(run.steps !== undefined && { steps: run.steps }),
+        ...(run.preStage !== undefined && { preStage: run.preStage }),
+        message,
+      };
+    },
+  }),
+} as const satisfies ToolDefinition;
diff --git a/services/platform/convex/agent_tools/artifacts/shared.test.ts b/services/platform/convex/agent_tools/artifacts/shared.test.ts
new file mode 100644
index 000000000..54ccc5623
--- /dev/null
+++ b/services/platform/convex/agent_tools/artifacts/shared.test.ts
@@ -0,0 +1,254 @@
+import { describe, expect, it } from 'vitest';
+
+import {
+  classifyPackages,
+  defaultEntryFileFor,
+  detectNodeSpecError,
+  detectPythonSpecError,
+  inferStepLanguage,
+  isRunnableArtifactType,
+  refinePackagesObject,
+  runnableLanguage,
+  runtimesForFiles,
+} from './shared';
+
+describe('inferStepLanguage', () => {
+  it('maps .py to python', () => {
+    // Covers a nested path and an upper-case file name as well.
+    for (const path of ['main.py', 'nested/lib/helper.py', 'MAIN.PY']) {
+      expect(inferStepLanguage(path)).toBe('python');
+    }
+  });
+
+  it('maps .js / .cjs / .mjs to node', () => {
+    for (const path of ['main.js', 'legacy.cjs', 'module.mjs']) {
+      expect(inferStepLanguage(path)).toBe('node');
+    }
+  });
+
+  it('returns null for unknown extensions', () => {
+    for (const path of ['main.ts', 'main.rb', 'README.md', 'Makefile']) {
+      expect(inferStepLanguage(path)).toBe(null);
+    }
+  });
+});
+
+describe('runtimesForFiles', () => {
+  it('collects only the runtimes the file set needs', () => {
+    const pythonOnly = Array.from(runtimesForFiles(['main.py', 'helper.py']));
+    expect(pythonOnly).toEqual(['python']);
+    expect(Array.from(runtimesForFiles(['main.js']))).toEqual(['node']);
+    const mixed = Array.from(runtimesForFiles(['gen.js', 'qa.py'])).sort();
+    expect(mixed).toEqual(['node', 'python']);
+  });
+
+  it('skips unknown extensions silently — caller is expected to reject', () => {
+    const known = Array.from(runtimesForFiles(['main.py', 'extra.rb']));
+    expect(known).toEqual(['python']);
+  });
+});
+
+describe('isRunnableArtifactType', () => {
+  it('includes script_runnable and legacy literals', () => {
+    const runnable = ['script_runnable', 'python_runnable', 'node_runnable'];
+    expect(runnable.every((t) => isRunnableArtifactType(t))).toBe(true);
+  });
+
+  it('excludes static types', () => {
+    // Non-runnable artifact kinds must never be reported as runnable.
+    const staticTypes = ['code', 'html'];
+    expect(staticTypes.some((t) => isRunnableArtifactType(t))).toBe(false);
+  });
+});
+
+describe('runnableLanguage (legacy single-runtime helper)', () => {
+  it('returns the locked language for legacy literals', () => {
+    const legacy = ['python_runnable', 'node_runnable'] as const;
+    expect(legacy.map((t) => runnableLanguage(t))).toEqual(['python', 'node']);
+  });
+
+  it('returns null for script_runnable (polyglot — per-file)', () => {
+    expect(runnableLanguage('script_runnable')).toBeNull();
+  });
+});
+
+describe('classifyPackages', () => {
+  it('strips python: prefix and routes to the python bucket', () => {
+    // The untagged `pptxgenjs` follows the 'node' default.
+    const buckets = classifyPackages(
+      ['python:markitdown[pptx]', 'pptxgenjs'],
+      'node',
+    );
+    expect(buckets).toEqual({
+      python: ['markitdown[pptx]'],
+      node: ['pptxgenjs'],
+    });
+  });
+
+  it('strips node: / npm: prefix and routes to the node bucket', () => {
+    const specs = ['numpy', 'node:lodash', 'npm:axios'];
+    expect(classifyPackages(specs, 'python')).toEqual({
+      python: ['numpy'],
+      node: ['lodash', 'axios'],
+    });
+  });
+
+  it('treats pip: as a python alias', () => {
+    const buckets = classifyPackages(['pip:requests==2.31.0'], 'node');
+    expect(buckets).toEqual({
+      python: ['requests==2.31.0'],
+      node: [],
+    });
+  });
+
+  it('routes bare specs to defaultLang', () => {
+    // Same kind of input, opposite default: everything lands in one bucket.
+    const pyOnly = { python: ['numpy', 'pandas'], node: [] };
+    expect(classifyPackages(['numpy', 'pandas'], 'python')).toEqual(pyOnly);
+    const jsOnly = { python: [], node: ['lodash', 'axios'] };
+    expect(classifyPackages(['lodash', 'axios'], 'node')).toEqual(jsOnly);
+  });
+
+  it('falls back to python when defaultLang is null', () => {
+    const buckets = classifyPackages(['numpy'], null);
+    expect(buckets).toEqual({
+      python: ['numpy'],
+      node: [],
+    });
+  });
+
+  it('is case-insensitive on the prefix', () => {
+    const specs = ['PYTHON:numpy', 'Node:lodash'];
+    expect(classifyPackages(specs, 'python')).toEqual({
+      python: ['numpy'],
+      node: ['lodash'],
+    });
+  });
+
+  it('skips empty / whitespace-only specs', () => {
+    const specs = ['', '  ', 'numpy'];
+    expect(classifyPackages(specs, 'python')).toEqual({
+      python: ['numpy'],
+      node: [],
+    });
+  });
+
+  it('trims surrounding whitespace before classifying', () => {
+    const specs = ['  python:numpy  ', '  lodash  '];
+    expect(classifyPackages(specs, 'node')).toEqual({
+      python: ['numpy'],
+      node: ['lodash'],
+    });
+  });
+});
+
+describe('defaultEntryFileFor', () => {
+  it('uses main.py by default for script_runnable', () => {
+    const entry = defaultEntryFileFor('script_runnable');
+    expect(entry).toBe('main.py');
+  });
+
+  it('switches to main.js when the language hint is node-flavored', () => {
+    for (const hint of ['javascript', 'js', 'node'] as const) {
+      expect(defaultEntryFileFor('script_runnable', hint)).toBe('main.js');
+    }
+  });
+
+  it('preserves the legacy entry-file defaults', () => {
+    const python = defaultEntryFileFor('python_runnable');
+    const node = defaultEntryFileFor('node_runnable');
+    expect([python, node]).toEqual(['main.py', 'main.js']);
+  });
+});
+
+describe('detectPythonSpecError', () => {
+  it('rejects npm version pin (pkg@version)', () => {
+    const pinned = detectPythonSpecError('pptxgenjs@3.12.0');
+    expect(pinned).toMatch(/npm version pin.*packages\.node/);
+    const ranged = detectPythonSpecError('lodash@^4.0');
+    expect(ranged).toMatch(/packages\.node/);
+  });
+
+  it('rejects npm scoped packages', () => {
+    const scoped = detectPythonSpecError('@anthropic/sdk');
+    expect(scoped).toMatch(/npm scope.*packages\.node/);
+    const scopedPinned = detectPythonSpecError('@scope/pkg@1.0.0');
+    expect(scopedPinned).toMatch(/packages\.node/);
+  });
+
+  it('rejects npm range operators at start', () => {
+    for (const spec of ['^1.0.0', '~2.3']) {
+      expect(detectPythonSpecError(spec)).toMatch(/range operator/);
+    }
+  });
+
+  it('passes pip-canonical specs', () => {
+    // Includes pip's direct-URL form, written with whitespace around `@`.
+    const plain = ['numpy', 'requests==2.31.0', 'markitdown[pptx]'];
+    for (const spec of [...plain, 'pkg @ git+https://example.com/repo']) {
+      expect(detectPythonSpecError(spec)).toBe(null);
+    }
+  });
+});
+
+describe('detectNodeSpecError', () => {
+  it('rejects pip extras syntax', () => {
+    const verdict = detectNodeSpecError('markitdown[pptx]');
+    expect(verdict).toMatch(/pip extras.*packages\.python/);
+  });
+
+  it('rejects pip PEP 440 version operators', () => {
+    const pinned = detectNodeSpecError('requests==2.31.0');
+    expect(pinned).toMatch(/PEP 440.*packages\.python/);
+    for (const spec of ['pkg~=1.0', 'pkg!=1.0']) {
+      expect(detectNodeSpecError(spec)).toMatch(/packages\.python/);
+    }
+  });
+
+  it('rejects pip direct-URL form (whitespace around @)', () => {
+    const directUrl = 'pkg @ https://example.com/pkg.tar.gz';
+    const verdict = detectNodeSpecError(directUrl);
+    expect(verdict).toMatch(/direct-URL.*packages\.python/);
+  });
+
+  it('passes npm-canonical specs', () => {
+    const plain = ['pptxgenjs', 'pptxgenjs@3.12.0'];
+    const scopedOrRanged = ['@anthropic/sdk', 'lodash@^4.0.0'];
+    for (const spec of [...plain, ...scopedOrRanged]) {
+      expect(detectNodeSpecError(spec)).toBe(null);
+    }
+  });
+});
+
+describe('refinePackagesObject', () => {
+  it('emits one issue per bad spec, scoped to its bucket index', () => {
+    type Issue = { code: 'custom'; path: (string | number)[]; message: string };
+    const issues: Issue[] = [];
+    const packages = {
+      python: ['numpy', 'pptxgenjs@3.12.0', '@scope/x'],
+      node: ['lodash', 'markitdown[pptx]'],
+    };
+    refinePackagesObject(packages, (issue) => {
+      issues.push(issue);
+    });
+
+    expect(issues).toHaveLength(3);
+    const [first, second, third] = issues;
+    expect(first).toMatchObject({ path: ['python', 1] });
+    expect(first?.message).toMatch(/packages\.node/);
+    expect(second).toMatchObject({ path: ['python', 2] });
+    expect(third).toMatchObject({ path: ['node', 1] });
+    expect(third?.message).toMatch(/packages\.python/);
+  });
+
+  it('is a no-op when packages is undefined or all-canonical', () => {
+    const issues: unknown[] = [];
+    const record = () => issues.push('x');
+    refinePackagesObject(undefined, record);
+    refinePackagesObject(
+      { python: ['numpy', 'requests==2.31.0'], node: ['lodash@^4.0.0'] },
+      record,
+    );
+    expect(issues).toHaveLength(0);
+  });
+});
diff --git a/services/platform/convex/agent_tools/artifacts/shared.ts b/services/platform/convex/agent_tools/artifacts/shared.ts
index 9cd141b1a..d7e1dd21c 100644
--- a/services/platform/convex/agent_tools/artifacts/shared.ts
+++ b/services/platform/convex/agent_tools/artifacts/shared.ts
@@ -6,16 +6,678 @@ export const artifactTypeEnum = z.enum([
   'markdown',
   'mermaid',
   'code',
+  // Canonical runnable type. Source code that executes in the server
+  // sandbox; per-file runtime is inferred from extension (`.py` →
+  // python3, `.js`/`.cjs`/`.mjs` → node) so a single artifact can mix
+  // Python and Node files in one project. The canvas-runnable-code-
+  // renderer subscribes to the row's `run*` fields to show live
+  // progress + the final output file chips.
+  'script_runnable',
+  // @deprecated — legacy single-runtime literals. Kept here so existing
+  // artifact rows continue to validate (per
+  // [feedback_deprecate_dont_delete_schema_fields]). New artifact_create
+  // calls land at `script_runnable`; old rows route through the same
+  // polyglot pipeline with their single-runtime file set.
+  'python_runnable',
+  'node_runnable',
 ]);
 
 export type ArtifactType = z.infer<typeof artifactTypeEnum>;
 
+const RUNNABLE_TYPES: ReadonlySet<string> = new Set<ArtifactType>([
+  'script_runnable', // canonical polyglot type
+  'python_runnable', // deprecated legacy literal
+  'node_runnable', // deprecated legacy literal
+]);
+
 export function isValidArtifactType(value: string): value is ArtifactType {
   return (
     value === 'html' ||
     value === 'svg' ||
     value === 'markdown' ||
     value === 'mermaid' ||
-    value === 'code'
+    value === 'code' ||
+    value === 'script_runnable' ||
+    value === 'python_runnable' ||
+    value === 'node_runnable'
   );
 }
+
+export function isRunnableArtifactType(value: string): boolean {
+  return RUNNABLE_TYPES.has(value); // set membership; any other string → false
+}
+
+/**
+ * Resolve the single locked runtime of a legacy runnable artifact type.
+ *
+ * `python_runnable` → `'python'`, `node_runnable` → `'node'`. Every other
+ * type — including the polyglot `script_runnable`, whose runtime is decided
+ * per file — yields `null`. Dispatch code should call
+ * {@link inferStepLanguage} instead; it covers all three runnable types.
+ */
+export function runnableLanguage(type: ArtifactType): 'python' | 'node' | null {
+  const isPython = type === 'python_runnable';
+  const isNode = type === 'node_runnable';
+  return isPython ? 'python' : isNode ? 'node' : null;
+}
+
+/**
+ * Pick the sandbox runtime for one file from its extension.
+ *
+ * The match is case-insensitive and looks only at the trailing
+ * `.<alphanumeric>` suffix of `path`:
+ *  - `.py` → `'python'`
+ *  - `.js` / `.cjs` / `.mjs` → `'node'` (the entrypoint runs `node <path>`
+ *    and Node picks commonjs vs. esm from the extension itself)
+ *
+ * Anything else — including paths with no extension — yields `null`; the
+ * caller is expected to surface INPUT_REJECTED for those.
+ */
+export function inferStepLanguage(path: string): 'python' | 'node' | null {
+  const tail = /\.([a-z0-9]+)$/.exec(path.toLowerCase())?.[1];
+  if (tail === 'py') return 'python';
+  return tail === 'js' || tail === 'cjs' || tail === 'mjs' ? 'node' : null;
+}
+
+/**
+ * Gather the distinct sandbox runtimes a set of file paths requires.
+ *
+ * Paths whose extension {@link inferStepLanguage} does not recognise are
+ * ignored, so the result is empty when no path is runnable — callers
+ * should reject such a request before it reaches the spawner.
+ */
+export function runtimesForFiles(
+  paths: readonly string[],
+): Set<'python' | 'node'> {
+  const langs = paths
+    .map((p) => inferStepLanguage(p))
+    .filter((lang): lang is 'python' | 'node' => lang !== null);
+  return new Set(langs);
+}
+
+/*
+ * Overview for `classifyPackages` (declared further down), which splits
+ * a flat list of package specs into python / node buckets.
+ * Agents sometimes send a mixed flat list and tag the language with a
+ * `python:` / `pip:` / `node:` / `npm:` prefix instead of using the
+ * grouped `{python: [], node: []}` form. We accept that — strip the
+ * prefix and route to the matching bucket. Un-prefixed specs are first
+ * sniffed for unambiguous syntax (pip extras, npm scope); the rest go
+ * to the `defaultLang` bucket, or to python when `defaultLang` is
+ * `null` (the scientific-stack convention — npm specs are far more
+ * likely to be explicitly tagged than pip specs). This is purely a
+ * defensive parser — the canonical input shape is still the grouped
+ * `{python, node}` object.
+ *
+ * Examples:
+ *   classifyPackages(['python:markitdown[pptx]', 'pptxgenjs'], 'node')
+ *     → { python: ['markitdown[pptx]'], node: ['pptxgenjs'] }
+ *   classifyPackages(['numpy', 'pandas'], 'python')
+ *     → { python: ['numpy', 'pandas'], node: [] }
+ *   classifyPackages(['lodash'], 'node')
+ *     → { python: [], node: ['lodash'] }
+ */
+// Captures (1) the language tag, any case, and (2) the spec after the `:`.
+const PACKAGE_LANG_PREFIX_RE = /^(python|pip|node|npm):(.+)$/i;
+// Pip extras syntax: `pkg[extra]` / `pkg[a,b]`. npm package names
+// disallow `[` and `]` entirely, so a `[` anywhere in the spec is an
+// unambiguous pip signal — and saves an agent that sent a mixed flat
+// list from shipping `markitdown[pptx]` to `npm install` (which would
+// fail with EINVALIDTAGNAME).
+const PIP_EXTRAS_RE = /\[/;
+// npm scoped package: `@scope/name(@version)?`. Pip's own `@` syntax
+// for direct URLs (`pkg @ url`) requires whitespace, so a bare-leading
+// `@scope/` cannot match pip.
+const NPM_SCOPED_RE = /^@[A-Za-z0-9][^@/\s]*\//;
+
+/**
+ * Sniff a bare spec for syntax only one ecosystem allows: a `[` (pip
+ * extras) means python, a leading `@scope/` means node. Returns `null`
+ * when neither signal is present (e.g. a plain `numpy` or `lodash`).
+ */
+function detectLangSignal(spec: string): 'python' | 'node' | null {
+  const looksPip = PIP_EXTRAS_RE.test(spec);
+  if (looksPip) return 'python';
+  return NPM_SCOPED_RE.test(spec) ? 'node' : null;
+}
+
+// npm-only spec shapes, tested in this order by `detectPythonSpecError`.
+// `name@` followed by a digit, `^`, `~` or `v`: an npm version pin / range.
+const NPM_VERSION_PIN_RE = /^[A-Za-z0-9._-]+@[\d^~v]/;
+// Leading `@` plus an alphanumeric: the start of an npm scope.
+const NPM_SCOPE_RE = /^@[A-Za-z0-9]/;
+// Leading `^` or `~` directly followed by a digit: an npm range operator.
+const NPM_RANGE_RE = /^[\^~]\d/;
+
+/**
+ * Return an error string when `spec` is unambiguously NOT a pip spec —
+ * its syntax means something different in npm and would either silently
+ * mis-install or error obscurely if forwarded to `uv pip`. Returns
+ * `null` for canonical pip specs and for names valid on both sides.
+ *
+ * Detects (on the trimmed spec):
+ *  - npm version pin `pkg@1.2.3` / `pkg@^1.2` (pip's direct-URL form is
+ *    written `pkg @ url`, so `pkg@` followed by a version is npm)
+ *  - npm scoped package `@scope/name`
+ *  - a leading npm range operator `^1.0.0` / `~1.0`
+ *
+ * Wired into the Zod `packages` refine of the artifact tools so the LLM
+ * gets a "move this to packages.node" error at input-parse time.
+ */
+export function detectPythonSpecError(spec: string): string | null {
+  const trimmed = spec.trim();
+  if (NPM_VERSION_PIN_RE.test(trimmed)) {
+    return `"${trimmed}" looks like an npm version pin (\`pkg@version\` syntax) — move it to packages.node instead. Pip uses \`pkg==version\` for pins.`;
+  }
+  if (NPM_SCOPE_RE.test(trimmed)) {
+    return `"${trimmed}" starts with an npm scope (\`@scope/...\`) — move it to packages.node instead.`;
+  }
+  if (NPM_RANGE_RE.test(trimmed)) {
+    return `"${trimmed}" looks like an npm range operator (\`^x.y.z\` / \`~x.y.z\`) — move it to packages.node instead.`;
+  }
+  return null;
+}
+
+// Minimal structural view of a Zod refinement context's `addIssue`.
+// Typing it as a plain function lets the call sites pass `ctx` from
+// either `.superRefine` or `.refine` without depending on Zod internals
+// (Zod v4 exposes it as `RefinementCtx['addIssue']`).
+type AddIssue = (issue: {
+  code: 'custom';
+  path: (string | number)[];
+  message: string;
+}) => void;
+
+/**
+ * Run {@link detectPythonSpecError} over `packages.python` and
+ * {@link detectNodeSpecError} over `packages.node`, calling `addIssue`
+ * once per offending spec with `path: [bucket, index]`. Python issues are
+ * reported before node issues. A no-op when `packages` is `undefined`.
+ *
+ * Shared by the artifact tools that accept a `packages` object so the
+ * error messages stay identical.
+ */
+export function refinePackagesObject(
+  packages: { python?: string[]; node?: string[] } | undefined,
+  addIssue: AddIssue,
+): void {
+  if (packages === undefined) return;
+  // Python bucket first, then node — issue order follows this table.
+  const checks = [
+    ['python', packages.python ?? [], detectPythonSpecError],
+    ['node', packages.node ?? [], detectNodeSpecError],
+  ] as const;
+  for (const [bucket, specs, detect] of checks) {
+    for (const [i, spec] of specs.entries()) {
+      if (spec === undefined) continue;
+      const message = detect(spec);
+      if (message !== null) {
+        addIssue({ code: 'custom', path: [bucket, i], message });
+      }
+    }
+  }
+}
+
+// `[` anywhere in the spec: pip extras (`pkg[extra]`).
+const PIP_EXTRAS_BRACKET_RE = /\[/;
+// Any of `===`, `==`, `~=`, `!=` anywhere in the spec.
+const PIP_PEP440_OP_RE = /===|==|~=|!=/;
+// An `@` with whitespace on both sides: pip's `pkg @ url` form.
+const PIP_DIRECT_URL_RE = /\s@\s/;
+
+/**
+ * Mirror of {@link detectPythonSpecError} for the `packages.node` bucket:
+ * returns an error string when the trimmed `spec` is unambiguously a pip
+ * spec, otherwise `null`. Checked in order:
+ *  - pip extras `pkg[extra]` / `pkg[a,b]` (a `[` anywhere in the spec)
+ *  - PEP 440 operators `==` / `~=` / `!=` / `===`
+ *  - pip direct-URL form `pkg @ url` (whitespace around `@`)
+ */
+export function detectNodeSpecError(spec: string): string | null {
+  const trimmed = spec.trim();
+  if (PIP_EXTRAS_BRACKET_RE.test(trimmed)) {
+    return `"${trimmed}" uses pip extras syntax (\`pkg[extra]\`) — npm packages cannot contain \`[\`. Move it to packages.python instead.`;
+  }
+  if (PIP_PEP440_OP_RE.test(trimmed)) {
+    return `"${trimmed}" uses a pip PEP 440 version operator (\`==\` / \`~=\` / \`!=\`) — npm uses \`@version\` / \`^\` / \`~\`. Move it to packages.python instead, or rewrite as e.g. \`pkg@1.2.3\` if it really is an npm package.`;
+  }
+  if (PIP_DIRECT_URL_RE.test(trimmed)) {
+    return `"${trimmed}" looks like pip's direct-URL form (\`pkg @ url\` with whitespace) — move it to packages.python instead.`;
+  }
+  return null;
+}
+
+/**
+ * Split a flat list of package specs into python / node buckets.
+ * A leading `python:` / `pip:` / `node:` / `npm:` tag (any case) wins;
+ * otherwise unambiguous syntax decides (pip extras → python, npm scope →
+ * node); otherwise `defaultLang`, with `null` meaning python. Specs and
+ * tag remainders are trimmed, and empty ones are skipped.
+ */
+export function classifyPackages(
+  specs: readonly string[],
+  defaultLang: 'python' | 'node' | null,
+): { python: string[]; node: string[] } {
+  const python: string[] = [];
+  const node: string[] = [];
+  for (const raw of specs) {
+    const spec = raw.trim();
+    if (spec.length === 0) continue;
+    const prefixMatch = spec.match(PACKAGE_LANG_PREFIX_RE);
+    if (prefixMatch) {
+      const tag = prefixMatch[1]?.toLowerCase();
+      // Trim again: `python: numpy` must not be bucketed as ` numpy`.
+      const stripped = (prefixMatch[2] ?? '').trim();
+      if (stripped.length === 0) continue;
+      (tag === 'python' || tag === 'pip' ? python : node).push(stripped);
+      continue;
+    }
+    // Untagged: unambiguous syntax first, then the caller's default.
+    const lang = detectLangSignal(spec) ?? defaultLang ?? 'python';
+    (lang === 'node' ? node : python).push(spec);
+  }
+  return { python, node };
+}
+
+/**
+ * Types whose entry file is useless when empty: the LLM must supply content
+ * at `artifact_create` time (the create tool's Zod schema marks it required).
+ */
+const CONTENT_REQUIRED_TYPES: ReadonlySet<ArtifactType> = new Set([
+  'html',
+  'svg',
+  'mermaid',
+  'script_runnable',
+  'python_runnable',
+  'node_runnable',
+]);
+
+/** Whether `type` is a member of the content-required set. */
+export function isContentRequiredAtCreate(type: ArtifactType): boolean {
+  return CONTENT_REQUIRED_TYPES.has(type);
+}
+
+// =============================================================================
+// Title normalization (idempotency key)
+// =============================================================================
+
+/**
+ * Build the key `artifact_create` compares titles by for idempotency.
+ * Steps: NFC-normalize, trim, collapse each whitespace run to a single
+ * space, then lower-case using the `en` locale. The result is meant for
+ * comparison only; the title written to storage keeps its casing (see
+ * `normalizeTitleForStorage`).
+ */
+export function normalizeTitleForCompare(title: string): string {
+  const composed = title.normalize('NFC');
+  const trimmed = composed.trim();
+  const collapsed = trimmed.replace(/\s+/g, ' ');
+  return collapsed.toLocaleLowerCase('en');
+}
+
+/**
+ * Title as written to `artifacts.title`: NFC, trimmed, single-spaced.
+ */
+export function normalizeTitleForStorage(title: string): string {
+  const trimmed = title.normalize('NFC').trim();
+  return trimmed.replace(/\s+/g, ' ');
+}
+
+// =============================================================================
+// Default entry-file resolution
+// =============================================================================
+
+const LANGUAGE_TO_EXT: Record<string, string> = {
+  ts: 'ts',
+  typescript: 'ts',
+  tsx: 'tsx',
+  js: 'js',
+  javascript: 'js',
+  jsx: 'jsx',
+  py: 'py',
+  python: 'py',
+  rb: 'rb',
+  ruby: 'rb',
+  go: 'go',
+  rs: 'rs',
+  rust: 'rs',
+  java: 'java',
+  kotlin: 'kt',
+  kt: 'kt',
+  swift: 'swift',
+  c: 'c',
+  cpp: 'cpp',
+  'c++': 'cpp',
+  cs: 'cs',
+  csharp: 'cs',
+  php: 'php',
+  sh: 'sh',
+  bash: 'sh',
+  zsh: 'sh',
+  sql: 'sql',
+  yaml: 'yaml',
+  yml: 'yml',
+  json: 'json',
+  toml: 'toml',
+  html: 'html',
+  css: 'css',
+  scss: 'scss',
+  md: 'md',
+  markdown: 'md',
+};
+
+export function defaultExtensionForLanguage(
+  language: string | undefined,
+): string {
+  // typeof guard: keys like 'constructor' resolve to Object.prototype members.
+  const ext = LANGUAGE_TO_EXT[(language ?? '').toLocaleLowerCase('en')];
+  return typeof ext === 'string' ? ext : 'txt';
+}
+
+/**
+ * Entry-file path that seeds a new project when `artifact_create` is
+ * called without the optional `entryFile` override.
+ *
+ * The name is fixed per artifact type, except that `code` and
+ * `script_runnable` also consult the `language` hint.
+ */
+export function defaultEntryFileFor(
+  type: ArtifactType,
+  language?: string,
+): string {
+  switch (type) {
+    case 'html':
+      return 'index.html';
+    case 'svg':
+      return 'image.svg';
+    case 'mermaid':
+      return 'diagram.mmd';
+    case 'markdown':
+      return 'README.md';
+    // Single-language runnables ignore the `language` hint.
+    case 'python_runnable':
+      return 'main.py';
+    case 'node_runnable':
+      return 'main.js';
+    case 'code': {
+      // Static code: extension comes from the language table ('txt' fallback).
+      const ext = defaultExtensionForLanguage(language);
+      return `main.${ext}`;
+    }
+    case 'script_runnable': {
+      // Polyglot type. The `language` hint (the same one static `code`
+      // artifacts take) picks JavaScript for the aliases below; any other
+      // value, or no hint at all, falls back to Python.
+      const jsHints = ['js', 'javascript', 'node', 'mjs', 'cjs'];
+      const hint = (language ?? '').toLocaleLowerCase('en');
+      if (jsHints.includes(hint)) {
+        return 'main.js';
+      }
+      return 'main.py';
+    }
+    default: {
+      // Unreachable while the switch stays exhaustive — TS narrows `type`
+      // to `never` here. The return exists only so oxlint's
+      // `consistent-return` rule is satisfied.
+      const _exhaustive: never = type;
+      void _exhaustive;
+      return 'main.txt';
+    }
+  }
+}
+
+// =============================================================================
+// Path validation (16-rule pipeline; see plan §Path Validation)
+// =============================================================================
+
+const MAX_PATH_LENGTH = 200; // compared against `.length` of the NFC form
+export const MAX_FILES_PER_ARTIFACT = 50;
+
+// BiDi overrides + LRM/RLM. U+202A-U+202E, U+2066-U+2069, U+200E-U+200F.
+// Explicit \u escapes so the source has no invisible characters and
+// oxlint's `no-misleading-character-class` rule sees an unambiguous class.
+const BIDI_OVERRIDES = /[\u202A-\u202E\u2066-\u2069\u200E\u200F]/u;
+// Zero-width chars + BOM. ZWSP (200B), ZWNJ (200C), ZWJ (200D), BOM (FEFF).
+const ZERO_WIDTH = /[\u200B-\u200D\uFEFF]/u;
+const CONTROL_CHARS = /[\x00-\x1F\x7F]/; // C0 controls (incl. NUL) + DEL
+const URL_ENCODED_TRAVERSAL = /%(2e|2f|5c)/i; // %2E '.', %2F '/', %5C '\'
+const WINDOWS_RESERVED = /^(con|prn|aux|nul|com[1-9]|lpt[1-9])(\..*)?$/i;
+const ASCII_COMPONENT_ALLOWLIST = /^[A-Za-z0-9._-]+$/;
+
+export type PathValidationCode =
+  | 'EMPTY' // zero-length input
+  | 'TOO_LONG' // NFC form is longer than MAX_PATH_LENGTH
+  | 'CONTROL_CHARS' // matches CONTROL_CHARS
+  | 'ZERO_WIDTH' // matches ZERO_WIDTH
+  | 'BIDI_OVERRIDE' // matches BIDI_OVERRIDES
+  | 'ABSOLUTE' // leading "/" or a drive prefix such as "C:/" or "C:\"
+  | 'BACKSLASH' // contains "\" anywhere
+  | 'URL_ENCODED_TRAVERSAL' // matches URL_ENCODED_TRAVERSAL
+  | 'TRAVERSAL' // a segment is exactly "." or ".."
+  | 'EMPTY_SEGMENT' // splitting on "/" produced an empty segment
+  | 'MULTI_SLASH' // contains "//"
+  | 'LEADING_DOT_SLASH' // starts with "./"
+  | 'TRAILING_SLASH' // ends with "/"
+  | 'HIDDEN_DOTFILE' // a segment starts with "."
+  | 'DISALLOWED_CHAR' // a segment fails ASCII_COMPONENT_ALLOWLIST
+  | 'WINDOWS_RESERVED'; // a segment matches WINDOWS_RESERVED
+
+export interface PathValidationError {
+  code: PathValidationCode; // which validation rule rejected the path
+  path: string; // path reported by the failing check in `validatePath`
+  message: string; // human-readable text; becomes the Error message
+}
+
+export class InvalidArtifactPathError extends Error {
+  readonly code: PathValidationCode; // structured rule code from the error
+  readonly path: string; // path carried by the validation error
+  constructor({ code, path, message }: PathValidationError) {
+    super(message);
+    this.name = 'InvalidArtifactPathError';
+    this.code = code;
+    this.path = path;
+  }
+}
+
+/**
+ * Narrow a caught error to its structured code + message for return to
+ * the LLM. Tool catch blocks used to flatten every error into
+ * `{success: false, message}` with NO code field, even though the
+ * underlying `ConvexError`/`InvalidArtifactPathError` already carries a
+ * stable code. Returning the code lets the LLM react programmatically
+ * (e.g. retry with smaller content on `too_large`, pick a different
+ * path on `invalid_path`) instead of string-sniffing the message
+ * (audit follow-up F8).
+ */
+export function extractToolErrorShape(err: unknown): {
+  code?: string;
+  message: string;
+} {
+  if (err instanceof InvalidArtifactPathError) {
+    return {
+      // Surface a stable snake_case code so the LLM can dispatch on it
+      // alongside the mutation's discriminated-union codes (which are
+      // also snake_case). All path-validation failures collapse to
+      // `invalid_path` — the more granular `PathValidationCode` is
+      // included in the message text for human triage.
+      code: 'invalid_path',
+      message: `${err.message} (${err.code})`,
+    };
+  }
+  // ConvexError carries its structured payload on `.data`. We can't
+  // rely on `instanceof ConvexError` reaching across the action/mutation
+  // bundle boundary cleanly, so shape-narrow on the `.data` field.
+  if (err instanceof Error) {
+    // Type-cast the error object to a partial structural shape rather
+    // than `any`. `data` is whatever the throwing site passed to
+    // `new ConvexError({...})`.
+    // oxlint-disable-next-line typescript/no-unsafe-type-assertion
+    const data = (err as { data?: unknown }).data;
+    if (
+      typeof data === 'object' &&
+      data !== null &&
+      'code' in data &&
+      typeof (data as { code: unknown }).code === 'string'
+    ) {
+      // oxlint-disable-next-line typescript/no-unsafe-type-assertion
+      const dCode = (data as { code: string }).code;
+      // oxlint-disable-next-line typescript/no-unsafe-type-assertion
+      const dMessage = (data as { message?: unknown }).message;
+      return {
+        code: dCode,
+        message:
+          typeof dMessage === 'string' && dMessage.length > 0
+            ? dMessage
+            : err.message,
+      };
+    }
+    return { message: err.message };
+  }
+  return { message: String(err) };
+}
+
+/**
+ * Validate a file path for safe storage and sandbox-write. Run at every
+ * mutation boundary that accepts a path. Throws `InvalidArtifactPathError`
+ * with a structured code on failure. On success, returns the NFC-normalized
+ * form — callers MUST store the returned value, not the input.
+ *
+ * Pipeline order matters: empty check, then NFC normalization (so later
+ * checks see canonical bytes), then byte-level rejections, then structural.
+ */
+export function validatePath(input: string): string {
+  if (input.length === 0) {
+    throw new InvalidArtifactPathError({
+      code: 'EMPTY',
+      path: input,
+      message: 'Path is empty.',
+    });
+  }
+  const path = input.normalize('NFC');
+  if (path.length > MAX_PATH_LENGTH) {
+    throw new InvalidArtifactPathError({
+      code: 'TOO_LONG',
+      path,
+      message: `Path is ${path.length} chars; max ${MAX_PATH_LENGTH}.`,
+    });
+  }
+  if (CONTROL_CHARS.test(path)) {
+    throw new InvalidArtifactPathError({
+      code: 'CONTROL_CHARS',
+      path,
+      message: 'Path contains control characters (incl. NUL).',
+    });
+  }
+  if (ZERO_WIDTH.test(path)) {
+    throw new InvalidArtifactPathError({
+      code: 'ZERO_WIDTH',
+      path,
+      message: 'Path contains zero-width or BOM characters.',
+    });
+  }
+  if (BIDI_OVERRIDES.test(path)) {
+    throw new InvalidArtifactPathError({
+      code: 'BIDI_OVERRIDE',
+      path,
+      message: 'Path contains bidirectional-text overrides.',
+    });
+  }
+  if (path.startsWith('/') || /^[A-Za-z]:[\\/]/.test(path)) {
+    throw new InvalidArtifactPathError({
+      code: 'ABSOLUTE',
+      path,
+      message: 'Path must be relative; absolute paths are rejected.',
+    });
+  }
+  if (path.includes('\\')) {
+    throw new InvalidArtifactPathError({
+      code: 'BACKSLASH',
+      path,
+      message: 'Path must use forward slashes only.',
+    });
+  }
+  if (URL_ENCODED_TRAVERSAL.test(path)) {
+    throw new InvalidArtifactPathError({
+      code: 'URL_ENCODED_TRAVERSAL',
+      path,
+      message: 'Path contains URL-encoded traversal sequences.',
+    });
+  }
+  if (path.startsWith('./')) {
+    throw new InvalidArtifactPathError({
+      code: 'LEADING_DOT_SLASH',
+      path,
+      message: 'Path must not start with "./".',
+    });
+  }
+  if (path.endsWith('/')) {
+    throw new InvalidArtifactPathError({
+      code: 'TRAILING_SLASH',
+      path,
+      message: 'Path must not end with "/".',
+    });
+  }
+  if (path.includes('//')) {
+    throw new InvalidArtifactPathError({
+      code: 'MULTI_SLASH',
+      path,
+      message: 'Path must not contain consecutive slashes.',
+    });
+  }
+  const segments = path.split('/');
+  for (const segment of segments) {
+    if (segment === '') {
+      throw new InvalidArtifactPathError({
+        code: 'EMPTY_SEGMENT', // defensive; earlier slash checks catch this
+        path,
+        message: 'Path contains an empty segment.',
+      });
+    }
+    if (segment === '.' || segment === '..') {
+      throw new InvalidArtifactPathError({
+        code: 'TRAVERSAL',
+        path,
+        message: 'Path contains "." or ".." segment.',
+      });
+    }
+    if (segment.startsWith('.')) {
+      throw new InvalidArtifactPathError({
+        code: 'HIDDEN_DOTFILE',
+        path,
+        message: `Hidden dotfile segment "${segment}" rejected.`,
+      });
+    }
+    if (!ASCII_COMPONENT_ALLOWLIST.test(segment)) {
+      throw new InvalidArtifactPathError({
+        code: 'DISALLOWED_CHAR',
+        path,
+        message: `Path segment "${segment}" contains characters outside [A-Za-z0-9._-].`,
+      });
+    }
+    if (WINDOWS_RESERVED.test(segment)) {
+      throw new InvalidArtifactPathError({
+        code: 'WINDOWS_RESERVED',
+        path,
+        message: `Path segment "${segment}" matches a Windows-reserved name.`,
+      });
+    }
+  }
+  return path;
+}
+
+/**
+ * Find the first path that collides with an earlier one when compared
+ * case-insensitively (`en` locale lower-casing), e.g. `Main.py` after
+ * `main.py`. Returns that later path as given, or `null` if none collide.
+ */
+export function findDuplicatePath(
+  files: readonly { readonly path: string }[],
+): string | null {
+  const lowered = new Set<string>();
+  for (const { path } of files) {
+    const folded = path.toLocaleLowerCase('en');
+    if (lowered.has(folded)) return path;
+    lowered.add(folded);
+  }
+  return null;
+}
diff --git a/services/platform/convex/agent_tools/artifacts/stream_state.ts b/services/platform/convex/agent_tools/artifacts/stream_state.ts
index 3d012c2fa..13714e0f7 100644
--- a/services/platform/convex/agent_tools/artifacts/stream_state.ts
+++ b/services/platform/convex/agent_tools/artifacts/stream_state.ts
@@ -1,7 +1,7 @@
 /**
  * Per-tool-call streaming state for the artifact tools.
  *
- * Both `artifact_create` and `artifact_edit` use the AI SDK / @convex-dev
+ * The `artifact_file_create` and `artifact_file_update` tools use the AI SDK / @convex-dev
  * /agent createTool hooks (`onInputStart`, `onInputDelta`, `execute`).
  * These run sequentially within a single agent action invocation, in the
  * same Node process, so a module-level Map keyed by `toolCallId` is a
@@ -14,19 +14,35 @@ import type { Id } from '../../_generated/dataModel';
 
 export interface ArtifactStreamState {
   toolCallId: string;
-  toolName: 'artifact_create' | 'artifact_edit';
+  toolName: 'artifact_create' | 'artifact_file_create' | 'artifact_file_update';
   accumulator: string;
   artifactId?: Id<'artifacts'>;
   // Last byte length of the parsed `content` value flushed to the row.
   // Used to throttle DB writes during create / rewrite streaming.
   lastFlushedContentLength: number;
   lastFlushAt: number;
-  // Set once the parser has seen enough JSON to know the streaming mode
-  // (only relevant for artifact_edit which carries `mode` in its input).
-  resolvedMode?: 'create' | 'rewrite' | 'patch';
+  // Resolved streaming mode for the current tool call. artifact_file_create /
+  // artifact_file_update both stream as 'rewrite'; older tools used other modes.
+  resolvedMode?: 'create' | 'rewrite' | 'append' | 'patch';
   // True once we have either inserted the placeholder (create) or marked
   // the existing row (edit). Avoids double-init on rapid deltas.
   rowInitialized: boolean;
+  // Sticky hard-fail flag for the streaming preflight. When set, deltas
+  // skip `parsePartialJson` AND the beginEditStream re-attempt loop so
+  // the same invalid path doesn't spam WARN logs on every subsequent
+  // delta. `execute()` still runs and surfaces the structured failure
+  // (audit follow-up F9).
+  streamingFailedHard: boolean;
+  // For artifact_create only — captures the outcome of `beginCreateStream`
+  // so `execute()` knows whether to finalize the placeholder, hand off to
+  // the existing `createArtifact` mutation (collision), or return a
+  // type-mismatch error without further DB writes.
+  createOutcome?: 'placeholder' | 'collision' | 'type_mismatch';
+  typeMismatchInfo?: {
+    existingArtifactId: Id<'artifacts'>;
+    existingType: string;
+    message: string;
+  };
   // Last title / language values written to the row so we don't issue a
   // mutation on every delta when nothing changed.
   lastFlushedTitle?: string;
@@ -36,9 +52,9 @@ export interface ArtifactStreamState {
   lastFlushedPatchesKey?: string;
   lastPatchesFlushAt: number;
   // Byte length of the existing artifact content at edit time. Set during
-  // artifact_edit preflight; used to slow down the patch-stream flush rate
-  // for large sources, where each tick forces the client to re-render a
-  // diff overlay that spans tens of KB. Unset for artifact_create.
+  // artifact_file_create / artifact_file_update preflight; used to scale the flush rate for
+  // large sources where each tick forces the client to re-render a content
+  // overlay that spans tens of KB.
   baseContentLength?: number;
   // Length of the accumulator at the last `parsePartialJson` call, plus
   // the wall-clock timestamp. Used by `shouldParse` to amortise the
@@ -48,17 +64,6 @@ export interface ArtifactStreamState {
   // than its configured interval.
   lastParsedLength: number;
   lastParsedAt: number;
-  // Coalesced fire-and-forget flush state. Streaming flushes (the
-  // `updateStreamingContent` mutation) are NOT awaited inside
-  // `onInputDelta` because a 30 KB+ payload roundtrip blocks the AI SDK's
-  // event loop, builds buffer pressure, and produces a "wait several
-  // seconds, then dump a big chunk" cadence on screen. Instead we keep
-  // at most one mutation in flight; subsequent flush requests overwrite
-  // `pendingFlush` with the latest payload, and the in-flight callback's
-  // `.finally` drains it. Final consistency is guaranteed by the canonical
-  // settle in `execute()`, which clears streaming flags atomically.
-  flushInFlight: boolean;
-  pendingFlush?: () => Promise<unknown>;
 }
 
 export interface StreamingPatchPair {
@@ -82,46 +87,12 @@ export function initState(
     lastParsedLength: 0,
     lastParsedAt: 0,
     rowInitialized: false,
-    flushInFlight: false,
+    streamingFailedHard: false,
   };
   STATE.set(toolCallId, next);
   return next;
 }
 
-/**
- * Hand a streaming-flush mutation off to the background. At most one flush
- * is in flight at a time; if another request arrives while one is running,
- * the previous queued payload is replaced (we always want the latest).
- * The in-flight callback's `.finally` drains any payload that was queued
- * during its run.
- *
- * `runMutation` is a closure provided by the caller — keeping the Convex
- * api reference out of this module so this file stays import-light.
- */
-export function scheduleStreamingFlush(
-  state: ArtifactStreamState,
-  runMutation: () => Promise<unknown>,
-): void {
-  state.pendingFlush = runMutation;
-  if (state.flushInFlight) return;
-  drainFlush(state);
-}
-
-function drainFlush(state: ArtifactStreamState): void {
-  if (state.flushInFlight || !state.pendingFlush) return;
-  const next = state.pendingFlush;
-  state.pendingFlush = undefined;
-  state.flushInFlight = true;
-  void next()
-    .catch((err) => {
-      console.error('[artifact streaming] flush failed:', err);
-    })
-    .finally(() => {
-      state.flushInFlight = false;
-      drainFlush(state);
-    });
-}
-
 export function getState(toolCallId: string): ArtifactStreamState | undefined {
   return STATE.get(toolCallId);
 }
@@ -221,6 +192,10 @@ export function shouldParse(
   state: ArtifactStreamState,
   accumulatorLength: number,
 ): boolean {
+  // Hard-fail short-circuit: once preflight validation has rejected the
+  // path / artifact, every subsequent delta would re-trigger the same
+  // failure. Stop parsing the accumulator until `execute()` runs.
+  if (state.streamingFailedHard) return false;
   if (!state.rowInitialized) return true;
   const grew = accumulatorLength - state.lastParsedLength;
   const [byteDelta, minIntervalMs] = parseGateFor(accumulatorLength);
diff --git a/services/platform/convex/agent_tools/tool_names.ts b/services/platform/convex/agent_tools/tool_names.ts
index 2c8d66afa..b05d0e029 100644
--- a/services/platform/convex/agent_tools/tool_names.ts
+++ b/services/platform/convex/agent_tools/tool_names.ts
@@ -11,7 +11,15 @@
 
 export const TOOL_NAMES = [
   'artifact_create',
-  'artifact_edit',
+  'artifact_list',
+  'artifact_run',
+  'artifact_packages_add',
+  'artifact_file_create',
+  'artifact_file_update',
+  'artifact_file_delete',
+  'artifact_file_rename',
+  'artifact_file_read',
+  'artifact_file_list',
   'customer_read',
   'product_read',
   'rag_search',
diff --git a/services/platform/convex/agent_tools/tool_registry.ts b/services/platform/convex/agent_tools/tool_registry.ts
index 7ac0b9c82..6b00f11ae 100644
--- a/services/platform/convex/agent_tools/tool_registry.ts
+++ b/services/platform/convex/agent_tools/tool_registry.ts
@@ -6,7 +6,15 @@
  */
 
 import { artifactCreateTool } from './artifacts/artifact_create_tool';
-import { artifactEditTool } from './artifacts/artifact_edit_tool';
+import { artifactFileCreateTool } from './artifacts/artifact_file_create_tool';
+import { artifactFileDeleteTool } from './artifacts/artifact_file_delete_tool';
+import { artifactFileListTool } from './artifacts/artifact_file_list_tool';
+import { artifactFileReadTool } from './artifacts/artifact_file_read_tool';
+import { artifactFileRenameTool } from './artifacts/artifact_file_rename_tool';
+import { artifactFileUpdateTool } from './artifacts/artifact_file_update_tool';
+import { artifactListTool } from './artifacts/artifact_list_tool';
+import { artifactPackagesAddTool } from './artifacts/artifact_packages_add_tool';
+import { artifactRunTool } from './artifacts/artifact_run_tool';
 import { conversationReadTool } from './conversations/conversation_read_tool';
 import { customerReadTool } from './customers/customer_read_tool';
 import { databaseSchemaTool } from './database/database_schema_tool';
@@ -45,7 +53,15 @@ export { TOOL_NAMES, type ToolName } from './tool_names';
  */
 export const TOOL_REGISTRY = [
   artifactCreateTool,
-  artifactEditTool,
+  artifactListTool,
+  artifactRunTool,
+  artifactPackagesAddTool,
+  artifactFileCreateTool,
+  artifactFileUpdateTool,
+  artifactFileDeleteTool,
+  artifactFileRenameTool,
+  artifactFileReadTool,
+  artifactFileListTool,
   customerReadTool,
   productReadTool,
   ragSearchTool,
diff --git a/services/platform/convex/artifacts/handlers/content_edits.ts b/services/platform/convex/artifacts/handlers/content_edits.ts
new file mode 100644
index 000000000..c64a8c91a
--- /dev/null
+++ b/services/platform/convex/artifacts/handlers/content_edits.ts
@@ -0,0 +1,619 @@
+/**
+ * Handler bodies + arg/return validators for content-bearing artifact
+ * mutations: createArtifact, deleteFileFromArtifact, renameFileInArtifact,
+ * createFileInArtifact, updateFileInArtifact. Registered by
+ * `internal_mutations.ts` as the public Convex internalMutation surface.
+ */
+
+import { ConvexError, v } from 'convex/values';
+
+import type { MutationCtx } from '../../_generated/server';
+import {
+  defaultEntryFileFor,
+  normalizeTitleForCompare,
+  normalizeTitleForStorage,
+  validatePath,
+} from '../../agent_tools/artifacts/shared';
+import { mirrorLegacyContent, resolveArtifactFiles } from '../resolve_files';
+import { artifactTypeValidator } from '../schema';
+import {
+  clearStreamingFlags,
+  syncArtifactFiles,
+  trimRevisionHistory,
+  validateFiles,
+} from './shared';
+
+// =============================================================================
+// createArtifact — idempotent on (thread, type, normalized-title)
+// =============================================================================
+
+export const createArtifactArgs = {
+  organizationId: v.string(),
+  threadId: v.string(),
+  type: artifactTypeValidator,
+  title: v.string(), // blank after normalization throws 'invalid_title'
+  language: v.optional(v.string()),
+  /** Initial content for the entry file; required for runnable/mermaid/svg/html. */
+  content: v.optional(v.string()),
+  /** Optional entry-file override. Defaults from `defaultEntryFileFor(type, language)`. */
+  entryFile: v.optional(v.string()),
+  createdByMessageId: v.string(),
+} as const;
+
+export const createArtifactReturns = v.union(
+  v.object({
+    success: v.literal(true),
+    isNew: v.boolean(), // false when an existing match is returned
+    artifactId: v.id('artifacts'),
+    revision: v.number(),
+    entryFile: v.string(),
+    filePaths: v.array(v.string()),
+  }),
+  v.object({
+    success: v.literal(false),
+    conflict: v.literal('type_mismatch'),
+    existingArtifactId: v.id('artifacts'),
+    existingType: artifactTypeValidator,
+    // Title + file paths of the conflicting artifact — surfaced so the
+    // LLM can decide whether to use the existing artifact via
+    // `artifact_file_update` (when paths overlap) or rename and retry
+    // (when truly different). Without these, the LLM had to follow up
+    // with `artifact_file_list` to make the call.
+    existingTitle: v.string(),
+    existingFiles: v.array(v.string()),
+    message: v.string(),
+  }),
+);
+
+export async function createArtifactHandler(
+  ctx: MutationCtx,
+  args: {
+    organizationId: string;
+    threadId: string;
+    type:
+      | 'html'
+      | 'svg'
+      | 'markdown'
+      | 'mermaid'
+      | 'code'
+      | 'script_runnable'
+      | 'python_runnable'
+      | 'node_runnable';
+    title: string;
+    language?: string;
+    content?: string;
+    entryFile?: string;
+    createdByMessageId: string;
+  },
+) {
+  const storedTitle = normalizeTitleForStorage(args.title);
+  if (storedTitle.length === 0) {
+    throw new ConvexError({
+      code: 'invalid_title',
+      message: 'Title must contain at least one non-whitespace character.',
+    });
+  }
+  const compareKey = normalizeTitleForCompare(args.title);
+
+  // Idempotency scan: match on normalized title across the thread's rows.
+  for await (const row of ctx.db
+    .query('artifacts')
+    .withIndex('by_organizationId_and_thread', (q) =>
+      q.eq('organizationId', args.organizationId).eq('threadId', args.threadId),
+    )) {
+    const rowKey = normalizeTitleForCompare(row.title);
+    if (rowKey !== compareKey) continue;
+    if (row.type !== args.type) {
+      const conflictingResolved = resolveArtifactFiles(row);
+      return {
+        success: false as const,
+        conflict: 'type_mismatch' as const,
+        existingArtifactId: row._id,
+        existingType: row.type,
+        existingTitle: row.title,
+        existingFiles: conflictingResolved.files.map((f) => f.path),
+        message: `An artifact titled "${row.title}" already exists in this thread with type "${row.type}". Either pick a different title or use the existing artifactId ${row._id} via artifact_file_create / artifact_file_update.`,
+      };
+    }
+    // Title + type match → return existing. Do NOT overwrite content.
+    const resolved = resolveArtifactFiles(row);
+    return {
+      success: true as const,
+      isNew: false,
+      artifactId: row._id,
+      revision: row.revision,
+      entryFile: resolved.entryFile,
+      filePaths: resolved.files.map((f) => f.path),
+    };
+  }
+
+  // No collision — insert the artifact plus its revision-1 history row.
+  const entryFile = validatePath(
+    args.entryFile ?? defaultEntryFileFor(args.type, args.language),
+  );
+  const initialContent = args.content ?? '';
+  const files = validateFiles([{ path: entryFile, content: initialContent }]);
+  const now = Date.now();
+  const artifactId = await ctx.db.insert('artifacts', {
+    organizationId: args.organizationId,
+    threadId: args.threadId,
+    type: args.type,
+    title: storedTitle,
+    language: args.language,
+    files,
+    entryFile,
+    content: mirrorLegacyContent(files, entryFile),
+    revision: 1,
+    createdByMessageId: args.createdByMessageId,
+    lastEditedByMessageId: args.createdByMessageId,
+    createdAt: now,
+    updatedAt: now,
+  });
+  await ctx.db.insert('artifactRevisions', {
+    artifactId,
+    revision: 1,
+    content: mirrorLegacyContent(files, entryFile),
+    files,
+    entryFile,
+    filePath: entryFile,
+    editedByMessageId: args.createdByMessageId,
+    editKind: 'create',
+    createdAt: now,
+  });
+  await syncArtifactFiles(ctx, artifactId, files, now);
+  return {
+    success: true as const,
+    isNew: true,
+    artifactId,
+    revision: 1,
+    entryFile,
+    filePaths: files.map((f) => f.path),
+  };
+}
+
+// =============================================================================
+// deleteFileFromArtifact — refuses on entryFile and on last-file
+// =============================================================================
+
+export const deleteFileFromArtifactArgs = {
+  artifactId: v.id('artifacts'),
+  path: v.string(),
+  editedByMessageId: v.string(),
+  expectedRevision: v.number(), // mismatch with stored revision → 'stale'
+} as const;
+
+export const deleteFileFromArtifactReturns = v.union(
+  v.object({
+    success: v.literal(true),
+    revision: v.number(),
+    path: v.string(),
+  }),
+  v.object({
+    success: v.literal(false),
+    code: v.union(
+      v.literal('not_found'),
+      v.literal('stale'),
+      v.literal('file_missing'),
+      v.literal('entry_pin'),
+      v.literal('last_file'),
+    ),
+    message: v.string(),
+    currentRevision: v.optional(v.number()), // returned with 'stale'
+    entryFile: v.optional(v.string()), // returned with 'entry_pin'
+  }),
+);
+
+export async function deleteFileFromArtifactHandler(
+  ctx: MutationCtx,
+  args: {
+    artifactId: import('../../_generated/dataModel').Id<'artifacts'>;
+    path: string;
+    editedByMessageId: string;
+    expectedRevision: number;
+  },
+) {
+  const artifact = await ctx.db.get(args.artifactId);
+  if (!artifact) {
+    return {
+      success: false as const,
+      code: 'not_found' as const,
+      message: `Artifact ${args.artifactId} not found.`,
+    };
+  }
+  if (artifact.revision !== args.expectedRevision) {
+    return {
+      success: false as const,
+      code: 'stale' as const,
+      message: `Artifact has been modified since you last read it (revision ${artifact.revision}, you sent ${args.expectedRevision}). Re-read with artifact_file_list / artifact_file_read and retry.`,
+      currentRevision: artifact.revision,
+    };
+  }
+  const path = validatePath(args.path);
+  const resolved = resolveArtifactFiles(artifact);
+  if (!resolved.files.some((f) => f.path === path)) {
+    return {
+      success: false as const,
+      code: 'file_missing' as const,
+      message: `File "${path}" does not exist in this artifact.`,
+    };
+  }
+  if (path === resolved.entryFile) {
+    return {
+      success: false as const,
+      code: 'entry_pin' as const,
+      message: `Cannot delete entry file "${path}". Call artifact_file_rename to repoint the entry to another file first (renaming the entry file moves the entry pointer along with it).`,
+      entryFile: resolved.entryFile,
+    };
+  }
+  if (resolved.files.length <= 1) {
+    return {
+      success: false as const,
+      code: 'last_file' as const, // NOTE(review): needs a sole non-entry file
+      message: `Cannot delete the only file in an artifact. Delete the artifact instead.`,
+    };
+  }
+  const nextFiles = resolved.files.filter((f) => f.path !== path);
+  const validatedFiles = validateFiles(nextFiles);
+  const nextRevision = artifact.revision + 1;
+  const now = Date.now();
+  await ctx.db.patch(args.artifactId, {
+    files: validatedFiles,
+    entryFile: resolved.entryFile,
+    content: mirrorLegacyContent(validatedFiles, resolved.entryFile),
+    revision: nextRevision,
+    lastEditedByMessageId: args.editedByMessageId,
+    ...clearStreamingFlags(),
+    updatedAt: now,
+  });
+  await ctx.db.insert('artifactRevisions', {
+    artifactId: args.artifactId,
+    revision: nextRevision,
+    content: mirrorLegacyContent(validatedFiles, resolved.entryFile),
+    files: validatedFiles,
+    entryFile: resolved.entryFile,
+    filePath: path,
+    editedByMessageId: args.editedByMessageId,
+    editKind: 'file_delete',
+    createdAt: now,
+  });
+  await syncArtifactFiles(ctx, args.artifactId, validatedFiles, now);
+  await trimRevisionHistory(ctx, args.artifactId);
+  return {
+    success: true as const,
+    revision: nextRevision,
+    path,
+  };
+}
+
+// =============================================================================
+// renameFileInArtifact — atomic; repoints entryFile if from === entryFile
+// =============================================================================
+
+export const renameFileInArtifactArgs = {
+  artifactId: v.id('artifacts'),
+  from: v.string(),
+  to: v.string(),
+  editedByMessageId: v.string(),
+  expectedRevision: v.number(), // mismatch with stored revision → 'stale'
+} as const;
+
+export const renameFileInArtifactReturns = v.union(
+  v.object({
+    success: v.literal(true),
+    revision: v.number(),
+    from: v.string(),
+    to: v.string(),
+    entryFile: v.string(),
+    entryUpdated: v.boolean(),
+  }),
+  v.object({
+    success: v.literal(false),
+    code: v.union(
+      v.literal('not_found'),
+      v.literal('stale'),
+      v.literal('file_missing'),
+      v.literal('path_exists'),
+    ),
+    message: v.string(),
+    currentRevision: v.optional(v.number()), // returned with 'stale'
+  }),
+);
+
+/**
+ * Renames one file inside an artifact and records a `file_rename` revision.
+ *
+ * Checks run in this order: artifact exists → `expectedRevision` matches →
+ * `from` exists → `from === to` no-op → `to` is free. The source-existence
+ * check deliberately precedes the no-op shortcut so that "renaming" a file
+ * that is not in the artifact never reports success.
+ *
+ * On success the artifact row is patched (files, entry file, mirrored legacy
+ * content, bumped revision), a revision row is inserted, and the sibling
+ * helpers `syncArtifactFiles` / `trimRevisionHistory` are invoked.
+ *
+ * @param ctx - Convex mutation context.
+ * @param args - Artifact id, source/target paths, the editing message id and
+ *   the revision the caller last read.
+ * @returns A success object with the resulting revision and entry-file info,
+ *   or a failure object whose `code` is `not_found`, `stale`, `file_missing`
+ *   or `path_exists`.
+ */
+export async function renameFileInArtifactHandler(
+  ctx: MutationCtx,
+  args: {
+    artifactId: import('../../_generated/dataModel').Id<'artifacts'>;
+    from: string;
+    to: string;
+    editedByMessageId: string;
+    expectedRevision: number;
+  },
+) {
+  const artifact = await ctx.db.get(args.artifactId);
+  if (!artifact) {
+    return {
+      success: false as const,
+      code: 'not_found' as const,
+      message: `Artifact ${args.artifactId} not found.`,
+    };
+  }
+  // Optimistic concurrency: refuse to write over a revision the caller has
+  // not seen.
+  if (artifact.revision !== args.expectedRevision) {
+    return {
+      success: false as const,
+      code: 'stale' as const,
+      message: `Artifact has been modified since you last read it (revision ${artifact.revision}, you sent ${args.expectedRevision}). Re-read with artifact_file_list / artifact_file_read and retry.`,
+      currentRevision: artifact.revision,
+    };
+  }
+  const from = validatePath(args.from);
+  const to = validatePath(args.to);
+  const resolved = resolveArtifactFiles(artifact);
+  // The source must exist before anything — including the no-op shortcut
+  // below — may report success.
+  if (!resolved.files.some((f) => f.path === from)) {
+    return {
+      success: false as const,
+      code: 'file_missing' as const,
+      message: `File "${from}" does not exist in this artifact.`,
+    };
+  }
+  // Idempotent: from === to → no-op success (no writes, revision unchanged).
+  if (from === to) {
+    return {
+      success: true as const,
+      revision: artifact.revision,
+      from,
+      to,
+      entryFile: resolved.entryFile,
+      entryUpdated: false,
+    };
+  }
+  if (resolved.files.some((f) => f.path === to)) {
+    return {
+      success: false as const,
+      code: 'path_exists' as const,
+      message: `Target path "${to}" already exists. Delete it first or pick a different name.`,
+    };
+  }
+  // Rename in place so the file keeps its position in the list.
+  const nextFiles = resolved.files.map((f) =>
+    f.path === from ? { path: to, content: f.content } : f,
+  );
+  const validatedFiles = validateFiles(nextFiles);
+  // Renaming the entry file repoints `entryFile` at the new path.
+  const entryUpdated = from === resolved.entryFile;
+  const nextEntry = entryUpdated ? to : resolved.entryFile;
+  const nextRevision = artifact.revision + 1;
+  const now = Date.now();
+  await ctx.db.patch(args.artifactId, {
+    files: validatedFiles,
+    entryFile: nextEntry,
+    content: mirrorLegacyContent(validatedFiles, nextEntry),
+    revision: nextRevision,
+    lastEditedByMessageId: args.editedByMessageId,
+    ...clearStreamingFlags(),
+    updatedAt: now,
+  });
+  await ctx.db.insert('artifactRevisions', {
+    artifactId: args.artifactId,
+    revision: nextRevision,
+    content: mirrorLegacyContent(validatedFiles, nextEntry),
+    files: validatedFiles,
+    entryFile: nextEntry,
+    filePath: to,
+    fromPath: from,
+    editedByMessageId: args.editedByMessageId,
+    editKind: 'file_rename',
+    createdAt: now,
+  });
+  await syncArtifactFiles(ctx, args.artifactId, validatedFiles, now);
+  await trimRevisionHistory(ctx, args.artifactId);
+  return {
+    success: true as const,
+    revision: nextRevision,
+    from,
+    to,
+    entryFile: nextEntry,
+    entryUpdated,
+  };
+}
+
+// =============================================================================
+// createFileInArtifact — strict CRUD: refuse if path already exists
+// =============================================================================
+
+/**
+ * Argument validators for the create-file mutation. `expectedRevision` is
+ * compared with the artifact's current revision by the handler; a mismatch
+ * produces a `stale` failure instead of a write.
+ */
+export const createFileInArtifactArgs = {
+  artifactId: v.id('artifacts'),
+  path: v.string(),
+  content: v.string(),
+  editedByMessageId: v.string(),
+  expectedRevision: v.number(),
+} as const;
+
+/**
+ * Result validator for the create-file mutation: a success payload with the
+ * resulting revision, the created path and its `byteLength`, or a failure
+ * with a machine-readable `code`.
+ */
+export const createFileInArtifactReturns = v.union(
+  v.object({
+    success: v.literal(true),
+    revision: v.number(),
+    path: v.string(),
+    byteLength: v.number(),
+  }),
+  v.object({
+    success: v.literal(false),
+    code: v.union(
+      v.literal('not_found'),
+      v.literal('stale'),
+      v.literal('path_exists'),
+    ),
+    message: v.string(),
+    currentRevision: v.optional(v.number()),
+  }),
+);
+
+/**
+ * Adds a new file to an artifact and records a `file_create` revision.
+ *
+ * Refuses when the artifact is missing (`not_found`), when the caller's
+ * `expectedRevision` does not match (`stale`), or when the path is already
+ * taken (`path_exists`). On success the artifact row is patched, a revision
+ * row is inserted, and `syncArtifactFiles` / `trimRevisionHistory` are run.
+ *
+ * @param ctx - Convex mutation context.
+ * @param args - Artifact id, target path, file content, the editing message
+ *   id and the revision the caller last read.
+ * @returns A success object carrying the resulting revision, the path and the
+ *   UTF-8 byte length of the written content, or a typed failure object.
+ */
+export async function createFileInArtifactHandler(
+  ctx: MutationCtx,
+  args: {
+    artifactId: import('../../_generated/dataModel').Id<'artifacts'>;
+    path: string;
+    content: string;
+    editedByMessageId: string;
+    expectedRevision: number;
+  },
+) {
+  const artifact = await ctx.db.get(args.artifactId);
+  if (!artifact) {
+    return {
+      success: false as const,
+      code: 'not_found' as const,
+      message: `Artifact ${args.artifactId} not found.`,
+    };
+  }
+  // Optimistic concurrency: refuse to write over a revision the caller has
+  // not seen.
+  if (artifact.revision !== args.expectedRevision) {
+    return {
+      success: false as const,
+      code: 'stale' as const,
+      message: `Artifact has been modified since you last read it (revision ${artifact.revision}, you sent ${args.expectedRevision}). Re-read with artifact_file_list and retry.`,
+      currentRevision: artifact.revision,
+    };
+  }
+  const path = validatePath(args.path);
+  const resolved = resolveArtifactFiles(artifact);
+  // Strict create: never overwrite an existing file.
+  if (resolved.files.some((f) => f.path === path)) {
+    return {
+      success: false as const,
+      code: 'path_exists' as const,
+      message: `File "${path}" already exists in this artifact. Use artifact_file_update to overwrite, or pick a different path.`,
+    };
+  }
+  const nextFiles = [...resolved.files, { path, content: args.content }];
+  const validatedFiles = validateFiles(nextFiles);
+  const nextRevision = artifact.revision + 1;
+  const now = Date.now();
+  await ctx.db.patch(args.artifactId, {
+    files: validatedFiles,
+    entryFile: resolved.entryFile,
+    content: mirrorLegacyContent(validatedFiles, resolved.entryFile),
+    revision: nextRevision,
+    lastEditedByMessageId: args.editedByMessageId,
+    ...clearStreamingFlags(),
+    updatedAt: now,
+  });
+  await ctx.db.insert('artifactRevisions', {
+    artifactId: args.artifactId,
+    revision: nextRevision,
+    content: mirrorLegacyContent(validatedFiles, resolved.entryFile),
+    files: validatedFiles,
+    entryFile: resolved.entryFile,
+    filePath: path,
+    editedByMessageId: args.editedByMessageId,
+    editKind: 'file_create',
+    createdAt: now,
+  });
+  await syncArtifactFiles(ctx, args.artifactId, validatedFiles, now);
+  await trimRevisionHistory(ctx, args.artifactId);
+  return {
+    success: true as const,
+    revision: nextRevision,
+    path,
+    // `String.length` counts UTF-16 code units, not bytes; encode to report
+    // the real UTF-8 size for non-ASCII content.
+    byteLength: new TextEncoder().encode(args.content).byteLength,
+  };
+}
+
+// =============================================================================
+// updateFileInArtifact — strict CRUD: refuse if path does not exist (overwrite-only)
+// =============================================================================
+
+/**
+ * Argument validators for the update-file mutation. `expectedRevision` is
+ * compared with the artifact's current revision by the handler; a mismatch
+ * produces a `stale` failure instead of a write.
+ */
+export const updateFileInArtifactArgs = {
+  artifactId: v.id('artifacts'),
+  path: v.string(),
+  content: v.string(),
+  editedByMessageId: v.string(),
+  expectedRevision: v.number(),
+} as const;
+
+/**
+ * Result validator for the update-file mutation: a success payload with the
+ * resulting revision, the updated path and its `byteLength`, or a failure
+ * with a machine-readable `code`.
+ */
+export const updateFileInArtifactReturns = v.union(
+  v.object({
+    success: v.literal(true),
+    revision: v.number(),
+    path: v.string(),
+    byteLength: v.number(),
+  }),
+  v.object({
+    success: v.literal(false),
+    code: v.union(
+      v.literal('not_found'),
+      v.literal('stale'),
+      v.literal('file_missing'),
+    ),
+    message: v.string(),
+    currentRevision: v.optional(v.number()),
+  }),
+);
+
+/**
+ * Overwrites the content of an existing file in an artifact and records a
+ * `rewrite` revision.
+ *
+ * Refuses when the artifact is missing (`not_found`), when the caller's
+ * `expectedRevision` does not match (`stale`), or when the path is not in the
+ * artifact (`file_missing`). On success the artifact row is patched, a
+ * revision row is inserted, and `syncArtifactFiles` / `trimRevisionHistory`
+ * are run.
+ *
+ * @param ctx - Convex mutation context.
+ * @param args - Artifact id, target path, replacement content, the editing
+ *   message id and the revision the caller last read.
+ * @returns A success object carrying the resulting revision, the path and the
+ *   UTF-8 byte length of the written content, or a typed failure object.
+ */
+export async function updateFileInArtifactHandler(
+  ctx: MutationCtx,
+  args: {
+    artifactId: import('../../_generated/dataModel').Id<'artifacts'>;
+    path: string;
+    content: string;
+    editedByMessageId: string;
+    expectedRevision: number;
+  },
+) {
+  const artifact = await ctx.db.get(args.artifactId);
+  if (!artifact) {
+    return {
+      success: false as const,
+      code: 'not_found' as const,
+      message: `Artifact ${args.artifactId} not found.`,
+    };
+  }
+  // Optimistic concurrency: refuse to write over a revision the caller has
+  // not seen.
+  if (artifact.revision !== args.expectedRevision) {
+    return {
+      success: false as const,
+      code: 'stale' as const,
+      message: `Artifact has been modified since you last read it (revision ${artifact.revision}, you sent ${args.expectedRevision}). Re-read with artifact_file_list and retry.`,
+      currentRevision: artifact.revision,
+    };
+  }
+  const path = validatePath(args.path);
+  const resolved = resolveArtifactFiles(artifact);
+  // Strict update: the file must already exist; the message lists the known
+  // paths so the caller can correct a typo.
+  if (!resolved.files.some((f) => f.path === path)) {
+    return {
+      success: false as const,
+      code: 'file_missing' as const,
+      message: `File "${path}" does not exist in this artifact. Existing paths: ${resolved.files
+        .map((f) => f.path)
+        .join(', ')}. Use artifact_file_create to add a new file.`,
+    };
+  }
+  const nextFiles = resolved.files.map((f) =>
+    f.path === path ? { path, content: args.content } : f,
+  );
+  const validatedFiles = validateFiles(nextFiles);
+  const nextRevision = artifact.revision + 1;
+  const now = Date.now();
+  await ctx.db.patch(args.artifactId, {
+    files: validatedFiles,
+    entryFile: resolved.entryFile,
+    content: mirrorLegacyContent(validatedFiles, resolved.entryFile),
+    revision: nextRevision,
+    lastEditedByMessageId: args.editedByMessageId,
+    ...clearStreamingFlags(),
+    updatedAt: now,
+  });
+  await ctx.db.insert('artifactRevisions', {
+    artifactId: args.artifactId,
+    revision: nextRevision,
+    content: mirrorLegacyContent(validatedFiles, resolved.entryFile),
+    files: validatedFiles,
+    entryFile: resolved.entryFile,
+    filePath: path,
+    editedByMessageId: args.editedByMessageId,
+    editKind: 'rewrite',
+    createdAt: now,
+  });
+  await syncArtifactFiles(ctx, args.artifactId, validatedFiles, now);
+  await trimRevisionHistory(ctx, args.artifactId);
+  return {
+    success: true as const,
+    revision: nextRevision,
+    path,
+    // `String.length` counts UTF-16 code units, not bytes; encode to report
+    // the real UTF-8 size for non-ASCII content.
+    byteLength: new TextEncoder().encode(args.content).byteLength,
+  };
+}
diff --git a/services/platform/convex/artifacts/handlers/run_state.ts b/services/platform/convex/artifacts/handlers/run_state.ts
new file mode 100644
index 000000000..8b411a305
--- /dev/null
+++ b/services/platform/convex/artifacts/handlers/run_state.ts
@@ -0,0 +1,679 @@
+/**
+ * Handler bodies + validators for runnable-artifact run-state mutations:
+ * setArtifactRunConfig, initArtifactRun, appendArtifactRunOutput,
+ * patchArtifactRunProgress, finalizeArtifactRun (+ the pure-function
+ * `applyFinalizeArtifactRun` shared with the sandbox internal_mutations).
+ */
+
+import { ConvexError, type Infer, v } from 'convex/values';
+
+import type { Id } from '../../_generated/dataModel';
+import type { MutationCtx } from '../../_generated/server';
+import { isRunnableArtifactType } from '../../agent_tools/artifacts/shared';
+import {
+  SANDBOX_STDERR_PREVIEW_MAX,
+  SANDBOX_STDOUT_PREVIEW_MAX,
+} from '../../sandbox/schema';
+import {
+  sandboxRunProgressValidator,
+  sandboxTerminalStatuses,
+} from '../../sandbox/wire';
+import {
+  artifactRunErrorCodeValidator,
+  artifactRunOutputFileValidator,
+  artifactRunStatusValidator,
+} from '../schema';
+
+type ArtifactRunErrorCode = Infer<typeof artifactRunErrorCodeValidator>;
+type ArtifactRunOutputFile = Infer<typeof artifactRunOutputFileValidator>;
+
+// =============================================================================
+// setArtifactRunConfig — persist packages / runOptions on the artifact row
+// =============================================================================
+
+/**
+ * Argument validators for `setArtifactRunConfig`. `runPackages` is always
+ * written by the handler; the two optional fields are written only when the
+ * caller supplies them.
+ */
+export const setArtifactRunConfigArgs = {
+  artifactId: v.id('artifacts'),
+  runPackages: v.array(v.string()),
+  /**
+   * Optional grouped form persisted alongside the legacy flat list.
+   * Polyglot runs read from here; single-runtime runs fall back to
+   * `runPackages` when this is absent.
+   */
+  runPackagesByLang: v.optional(
+    v.object({
+      python: v.optional(v.array(v.string())),
+      node: v.optional(v.array(v.string())),
+    }),
+  ),
+  runOptions: v.optional(
+    v.object({
+      allowSdist: v.optional(v.boolean()),
+      allowInstallScripts: v.optional(v.boolean()),
+    }),
+  ),
+} as const;
+
+/** The mutation has no payload; the handler always resolves to `null`. */
+export const setArtifactRunConfigReturns = v.null();
+
+/**
+ * Persists the run configuration (packages and options) on an artifact row.
+ *
+ * Missing artifacts and artifacts whose type is not runnable are ignored.
+ * `runPackages` is always overwritten; `runPackagesByLang` and `runOptions`
+ * are only touched when the caller provides them.
+ *
+ * @returns Always `null`.
+ */
+export async function setArtifactRunConfigHandler(
+  ctx: MutationCtx,
+  args: {
+    artifactId: Id<'artifacts'>;
+    runPackages: string[];
+    runPackagesByLang?: { python?: string[]; node?: string[] };
+    runOptions?: { allowSdist?: boolean; allowInstallScripts?: boolean };
+  },
+) {
+  const artifact = await ctx.db.get(args.artifactId);
+  if (!artifact || !isRunnableArtifactType(artifact.type)) return null;
+
+  const { runPackages, runPackagesByLang, runOptions } = args;
+  const update: {
+    runPackages: string[];
+    runPackagesByLang?: { python?: string[]; node?: string[] };
+    runOptions?: { allowSdist?: boolean; allowInstallScripts?: boolean };
+  } = { runPackages };
+  // Omitted optionals must stay out of the patch so stored values survive.
+  if (runPackagesByLang !== undefined) {
+    update.runPackagesByLang = runPackagesByLang;
+  }
+  if (runOptions !== undefined) {
+    update.runOptions = runOptions;
+  }
+  await ctx.db.patch(args.artifactId, update);
+  return null;
+}
+
+// =============================================================================
+// addArtifactPackages — union packages_add into the persistent runPackages
+//
+// Used by the `artifact_packages_add` tool and the `artifact_file_create` /
+// `artifact_file_update` tools' optional `packages_add` arg so the LLM can declare
+// new dependencies inline with the edit that introduces them. Dedupe is
+// case-sensitive (matches pip/npm's own resolution rules). Existing
+// entries are never removed — `artifact_create` is the way to start
+// fresh.
+// =============================================================================
+
+/** Argument validators for `addArtifactPackages`. */
+export const addArtifactPackagesArgs = {
+  artifactId: v.id('artifacts'),
+  /**
+   * Flat-list union into `runPackages`. Kept for callers that don't
+   * know which runtime their specs belong to (legacy single-runtime
+   * artifacts). Polyglot callers should use {@link packagesAddByLang}
+   * instead.
+   */
+  packagesAdd: v.array(v.string()),
+  /**
+   * Grouped union into `runPackagesByLang`. Either bucket may be
+   * omitted. Both `packagesAdd` and `packagesAddByLang` can be sent in
+   * the same call — they're applied independently.
+   */
+  packagesAddByLang: v.optional(
+    v.object({
+      python: v.optional(v.array(v.string())),
+      node: v.optional(v.array(v.string())),
+    }),
+  ),
+} as const;
+
+/**
+ * Result validator for `addArtifactPackages`: the flat package list after
+ * the union, the flat entries that were newly added, and — when present —
+ * the grouped list and the grouped entries that were newly added.
+ */
+export const addArtifactPackagesReturns = v.object({
+  runPackages: v.array(v.string()),
+  added: v.array(v.string()),
+  runPackagesByLang: v.optional(
+    v.object({
+      python: v.optional(v.array(v.string())),
+      node: v.optional(v.array(v.string())),
+    }),
+  ),
+  addedByLang: v.optional(
+    v.object({
+      python: v.optional(v.array(v.string())),
+      node: v.optional(v.array(v.string())),
+    }),
+  ),
+});
+
+/**
+ * Order-preserving, case-sensitive union of two package lists.
+ *
+ * Empty strings and entries already present (in `existing` or earlier in
+ * `incoming`) are skipped.
+ *
+ * @returns `next` — a fresh array of `existing` followed by the new entries;
+ *   `added` — only the new entries, in the order they appeared.
+ */
+function unionPackages(
+  existing: readonly string[],
+  incoming: readonly string[],
+): { next: string[]; added: string[] } {
+  const known = new Set<string>(existing);
+  const added = incoming.filter((candidate) => {
+    if (candidate.length === 0 || known.has(candidate)) return false;
+    // Remember it so a repeat later in `incoming` is not added twice.
+    known.add(candidate);
+    return true;
+  });
+  return { next: [...existing, ...added], added };
+}
+
+/**
+ * Unions new package specs into an artifact's stored package lists.
+ *
+ * The flat list (`runPackages`) and the grouped lists (`runPackagesByLang`)
+ * are merged independently via `unionPackages`. Nothing is written when the
+ * artifact is missing, is not a runnable type, or when no new entry was
+ * introduced.
+ *
+ * @returns The resulting flat list and the flat entries added, plus the
+ *   grouped list / grouped additions when they are non-empty.
+ */
+export async function addArtifactPackagesHandler(
+  ctx: MutationCtx,
+  args: {
+    artifactId: Id<'artifacts'>;
+    packagesAdd: string[];
+    packagesAddByLang?: { python?: string[]; node?: string[] };
+  },
+) {
+  type ByLang = { python?: string[]; node?: string[] };
+
+  const artifact = await ctx.db.get(args.artifactId);
+  if (!artifact) return { runPackages: [], added: [] };
+  if (!isRunnableArtifactType(artifact.type)) {
+    return { runPackages: artifact.runPackages ?? [], added: [] };
+  }
+
+  const persisted = artifact.runPackagesByLang ?? {};
+  const flat = unionPackages(artifact.runPackages ?? [], args.packagesAdd);
+  const python = unionPackages(
+    persisted.python ?? [],
+    args.packagesAddByLang?.python ?? [],
+  );
+  const node = unionPackages(
+    persisted.node ?? [],
+    args.packagesAddByLang?.node ?? [],
+  );
+  const flatTouched = flat.added.length > 0;
+  const groupedTouched = python.added.length > 0 || node.added.length > 0;
+
+  if (!(flatTouched || groupedTouched)) {
+    // Nothing new: skip the write and echo back what is already stored.
+    const unchanged: {
+      runPackages: string[];
+      added: string[];
+      runPackagesByLang?: ByLang;
+    } = { runPackages: flat.next, added: [] };
+    if (persisted.python !== undefined || persisted.node !== undefined) {
+      const echoed: ByLang = {};
+      if (persisted.python !== undefined) echoed.python = persisted.python;
+      if (persisted.node !== undefined) echoed.node = persisted.node;
+      unchanged.runPackagesByLang = echoed;
+    }
+    return unchanged;
+  }
+
+  // Only the lists that actually gained entries are written back.
+  const update: Record<string, unknown> = {};
+  if (flatTouched) update.runPackages = flat.next;
+  if (groupedTouched) {
+    const groupedToStore: ByLang = {};
+    if (python.next.length > 0) groupedToStore.python = python.next;
+    if (node.next.length > 0) groupedToStore.node = node.next;
+    update.runPackagesByLang = groupedToStore;
+  }
+  await ctx.db.patch(args.artifactId, update);
+
+  const hasGrouped = python.next.length > 0 || node.next.length > 0;
+  return {
+    runPackages: flat.next,
+    added: flat.added,
+    ...(hasGrouped && {
+      runPackagesByLang: {
+        ...(python.next.length > 0 && { python: python.next }),
+        ...(node.next.length > 0 && { node: node.next }),
+      },
+    }),
+    ...(groupedTouched && {
+      addedByLang: {
+        ...(python.added.length > 0 && { python: python.added }),
+        ...(node.added.length > 0 && { node: node.added }),
+      },
+    }),
+  };
+}
+
+// =============================================================================
+// initArtifactRun — clear run-progress fields at the start of a new run
+//
+// `runOutputFiles` intentionally NOT cleared here — keep the prior
+// successful run's outputs available for pre-staging during this run.
+// Successful finalize will replace; failed/empty finalize preserves.
+// =============================================================================
+
+/** Argument validators for `initArtifactRun`: only the target artifact id. */
+export const initArtifactRunArgs = {
+  artifactId: v.id('artifacts'),
+} as const;
+
+/** The mutation has no payload; the handler resolves to `null`. */
+export const initArtifactRunReturns = v.null();
+
+/**
+ * Marks an artifact as `queued` for a new run and clears the previous run's
+ * progress, result and log fields.
+ *
+ * Missing artifacts and non-runnable artifact types are ignored.
+ *
+ * @returns Always `null`.
+ * @throws ConvexError with code `RUN_IN_FLIGHT` when the artifact's current
+ *   `runStatus` is `queued`, `installing` or `running`.
+ */
+export async function initArtifactRunHandler(
+  ctx: MutationCtx,
+  args: { artifactId: Id<'artifacts'> },
+) {
+  const artifact = await ctx.db.get(args.artifactId);
+  if (!artifact || !isRunnableArtifactType(artifact.type)) return null;
+
+  switch (artifact.runStatus) {
+    case 'queued':
+    case 'installing':
+    case 'running':
+      // Never start a second run on top of one that has not settled.
+      throw new ConvexError({
+        code: 'RUN_IN_FLIGHT',
+        message: `artifact ${args.artifactId} already has a run in flight (status: ${artifact.runStatus}); wait for it to settle before starting another.`,
+      });
+    default:
+      break;
+  }
+
+  // `undefined` values remove the corresponding fields from the row.
+  // `runOutputFiles` is deliberately absent from this list.
+  await ctx.db.patch(args.artifactId, {
+    runStatus: 'queued',
+    runProgress: { kind: 'queued' },
+    runStartedAt: Date.now(),
+    runRevision: artifact.revision,
+    runCompletedAt: undefined,
+    runExitCode: undefined,
+    runErrorCode: undefined,
+    runErrorMessage: undefined,
+    runStdoutPreview: undefined,
+    runStderrPreview: undefined,
+    runStdoutStorageId: undefined,
+    runStderrStorageId: undefined,
+    runExecutionId: undefined,
+  });
+  return null;
+}
+
+// =============================================================================
+// appendArtifactRunOutput — incremental tail of the running stdout/stderr
+//
+// Caps + ordering:
+//  - Each preview field caps at SANDBOX_{STDOUT,STDERR}_PREVIEW_MAX = 16 KB.
+//    Bytes past the cap are silently dropped — the canonical preview written
+//    at `finalizeArtifactRun` is the first 16 KB of the buffer, so matching
+//    semantics here avoids a content-switch the user would notice at
+//    terminal time.
+//  - Mutation no-ops on terminal `runStatus` (a late-arriving delta from a
+//    canceled run can't overwrite the finalize-time preview).
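+//  - Mutation no-ops when the row already records a `runExecutionId` that
+//    differs from `args.executionId` (a stale delta from a previous run
+//    can't pollute a freshly-started one). While the row has no
+//    `runExecutionId` yet, deltas are accepted.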
+// =============================================================================
+
+/**
+ * Argument validators for `appendArtifactRunOutput`. Either delta may be
+ * omitted; the handler ignores absent or empty deltas.
+ */
+export const appendArtifactRunOutputArgs = {
+  artifactId: v.id('artifacts'),
+  executionId: v.id('sandboxExecutions'),
+  stdoutDelta: v.optional(v.string()),
+  stderrDelta: v.optional(v.string()),
+} as const;
+
+/** The mutation has no payload; the handler always resolves to `null`. */
+export const appendArtifactRunOutputReturns = v.null();
+
+/**
+ * Appends stdout/stderr deltas to the artifact's live preview fields.
+ *
+ * No-ops when the artifact is missing or not runnable, when its `runStatus`
+ * is terminal, or when the row records a `runExecutionId` different from
+ * `args.executionId`. Each preview stops growing once it reaches its
+ * `SANDBOX_*_PREVIEW_MAX` length; characters past the cap are dropped.
+ *
+ * @returns Always `null`.
+ */
+export async function appendArtifactRunOutputHandler(
+  ctx: MutationCtx,
+  args: {
+    artifactId: Id<'artifacts'>;
+    executionId: Id<'sandboxExecutions'>;
+    stdoutDelta?: string;
+    stderrDelta?: string;
+  },
+) {
+  const artifact = await ctx.db.get(args.artifactId);
+  if (!artifact || !isRunnableArtifactType(artifact.type)) return null;
+
+  const alreadyTerminal =
+    artifact.runStatus !== undefined &&
+    sandboxTerminalStatuses.has(artifact.runStatus);
+  if (alreadyTerminal) return null;
+
+  const belongsToOtherExecution =
+    artifact.runExecutionId !== undefined &&
+    artifact.runExecutionId !== args.executionId;
+  if (belongsToOtherExecution) return null;
+
+  // Returns the grown preview, or `undefined` when nothing should be written.
+  const appendCapped = (
+    current: string,
+    delta: string | undefined,
+    cap: number,
+  ): string | undefined => {
+    if (!delta || current.length >= cap) return undefined;
+    const accepted = delta.slice(0, cap - current.length);
+    return accepted.length > 0 ? current + accepted : undefined;
+  };
+
+  const nextStdout = appendCapped(
+    artifact.runStdoutPreview ?? '',
+    args.stdoutDelta,
+    SANDBOX_STDOUT_PREVIEW_MAX,
+  );
+  const nextStderr = appendCapped(
+    artifact.runStderrPreview ?? '',
+    args.stderrDelta,
+    SANDBOX_STDERR_PREVIEW_MAX,
+  );
+
+  const update: Record<string, unknown> = {};
+  if (nextStdout !== undefined) update.runStdoutPreview = nextStdout;
+  if (nextStderr !== undefined) update.runStderrPreview = nextStderr;
+  if (Object.keys(update).length === 0) return null;
+  await ctx.db.patch(args.artifactId, update);
+  return null;
+}
+
+// =============================================================================
+// patchArtifactRunProgress — structured phase updates from the spawner
+// =============================================================================
+
+/**
+ * Argument validators for `patchArtifactRunProgress`. Every field except
+ * `artifactId` is optional; the handler writes only the fields supplied.
+ */
+export const patchArtifactRunProgressArgs = {
+  artifactId: v.id('artifacts'),
+  runStatus: v.optional(artifactRunStatusValidator),
+  runProgress: v.optional(sandboxRunProgressValidator),
+  runExecutionId: v.optional(v.id('sandboxExecutions')),
+} as const;
+
+/** The mutation has no payload; the handler always resolves to `null`. */
+export const patchArtifactRunProgressReturns = v.null();
+
+/**
+ * Applies a partial run-progress update (status, structured progress,
+ * execution id) to an artifact row.
+ *
+ * No-ops when the artifact is missing or not runnable, when the row is
+ * already in a terminal `runStatus` (a warning is logged), or when the caller
+ * supplied none of the optional fields.
+ *
+ * @returns Always `null`.
+ */
+export async function patchArtifactRunProgressHandler(
+  ctx: MutationCtx,
+  args: {
+    artifactId: Id<'artifacts'>;
+    runStatus?: Infer<typeof artifactRunStatusValidator>;
+    runProgress?: Infer<typeof sandboxRunProgressValidator>;
+    runExecutionId?: Id<'sandboxExecutions'>;
+  },
+) {
+  const artifact = await ctx.db.get(args.artifactId);
+  if (!artifact || !isRunnableArtifactType(artifact.type)) return null;
+
+  const alreadyTerminal =
+    artifact.runStatus !== undefined &&
+    sandboxTerminalStatuses.has(artifact.runStatus);
+  if (alreadyTerminal) {
+    console.warn(
+      `[patchArtifactRunProgress] no-op: artifact ${args.artifactId} already terminal as ${artifact.runStatus}`,
+    );
+    return null;
+  }
+
+  const { runStatus, runProgress, runExecutionId } = args;
+  // Only fields the caller actually supplied end up in the patch.
+  const update: Record<string, unknown> = {
+    ...(runStatus !== undefined && { runStatus }),
+    ...(runProgress !== undefined && { runProgress }),
+    ...(runExecutionId !== undefined && { runExecutionId }),
+  };
+  if (Object.keys(update).length === 0) return null;
+  await ctx.db.patch(args.artifactId, update);
+  return null;
+}
+
+// =============================================================================
+// applyFinalizeArtifactRun — pure helper shared with sandbox internal_mutations
+//
+// `runOutputFiles` is only written when the harvest produced at least one
+// file. A run with an empty harvest — regardless of run status — must NOT
+// wipe the prior run's outputs. The footgun this guards against: a
+// `qa.py`-only run that exits 0 with no /workspace/output writes counts
+// as `completed`; if it overwrites the legacy `runOutputFiles` field
+// with `[]`, the next `artifact_run`'s pre-stage falls back to that
+// empty list and the user hits `FileNotFoundError` on a file that
+// demonstrably existed before. The `artifactRunFiles` table is append-
+// only and not affected by this rule.
+// =============================================================================
+
+/**
+ * Writes the terminal state of a run onto the artifact row and records the
+ * run in `artifactRuns`, `artifactRunFiles` and `artifactOutputs`.
+ *
+ * No-ops when the artifact is missing or not runnable, or when the row is
+ * already terminal and the incoming finalize is a duplicate (same execution
+ * id, or a fallback finalize without an execution id arriving after an
+ * execution already recorded one).
+ *
+ * @param ctx - Convex mutation context.
+ * @param args - Terminal status plus optional exit code, error info, log
+ *   previews / storage ids, harvested output files and the execution id.
+ */
+export async function applyFinalizeArtifactRun(
+  ctx: MutationCtx,
+  args: {
+    artifactId: Id<'artifacts'>;
+    runStatus: 'completed' | 'failed' | 'cancelled';
+    runExitCode?: number;
+    runErrorCode?: ArtifactRunErrorCode;
+    runErrorMessage?: string;
+    runStdoutPreview?: string;
+    runStderrPreview?: string;
+    runStdoutStorageId?: Id<'_storage'>;
+    runStderrStorageId?: Id<'_storage'>;
+    runOutputFiles: ArtifactRunOutputFile[];
+    runExecutionId?: Id<'sandboxExecutions'>;
+  },
+): Promise<void> {
+  const row = await ctx.db.get(args.artifactId);
+  if (!row) return;
+  if (!isRunnableArtifactType(row.type)) return;
+  // Duplicate-finalize guard. The original purpose is to drop late-arriving
+  // deltas that would clobber already-finalized state for the SAME run.
+  // The bare terminal-status check has a subtler footgun: when a caller
+  // invokes `executeCode` on the same artifact twice without going through
+  // `initArtifactRun` (e.g. direct test harnesses, future custom callers),
+  // the artifact row is still terminal from the previous run and the
+  // second finalize gets dropped silently — `artifactRuns` /
+  // `artifactRunFiles` / `artifactOutputs` never see the new run.
+  //
+  // Gate on the executionId instead: only no-op when the incoming finalize
+  // targets the SAME execution as the one that already terminated the row.
+  // A different execution means a genuinely new run is finalizing — let it
+  // through so the dual-write tables capture it.
+  //
+  // Fallback finalize (caller omits `runExecutionId`): the tool-side catch
+  // in artifact_run_tool.ts hits this after `failExecution` already wrote
+  // terminal state with the executionId. That case must be de-duplicated,
+  // but a fresh execution whose executionId never landed on the row must
+  // not be. The narrow rule: no-op a fallback finalize only when the row
+  // ALREADY carries an executionId — that proves a real execution wrote
+  // the terminal state, and the fallback is the idempotent retry. If the
+  // row's executionId is also unset, this is a fresh execution whose
+  // history would otherwise be lost (audit finding R2-V1 #A) — let it
+  // through.
+  const sameExecution =
+    args.runExecutionId !== undefined &&
+    row.runExecutionId !== undefined &&
+    args.runExecutionId === row.runExecutionId;
+  const fallbackOverPriorExecution =
+    args.runExecutionId === undefined && row.runExecutionId !== undefined;
+  if (
+    row.runStatus !== undefined &&
+    sandboxTerminalStatuses.has(row.runStatus) &&
+    (sameExecution || fallbackOverPriorExecution)
+  ) {
+    console.warn(
+      `[finalizeArtifactRun] no-op: artifact ${args.artifactId} already terminal as ${row.runStatus} for execution ${row.runExecutionId ?? '<unset>'}; dropping duplicate ${args.runStatus}${fallbackOverPriorExecution ? ' (fallback finalize over recorded execution)' : ''}`,
+    );
+    return;
+  }
+  const completedAt = Date.now();
+  // Optional fields are only written when supplied; `runOutputFiles` is only
+  // written for a non-empty harvest so an empty run keeps the prior outputs.
+  await ctx.db.patch(args.artifactId, {
+    runStatus: args.runStatus,
+    runProgress: undefined,
+    runCompletedAt: completedAt,
+    ...(args.runExitCode !== undefined && { runExitCode: args.runExitCode }),
+    ...(args.runErrorCode !== undefined && {
+      runErrorCode: args.runErrorCode,
+    }),
+    ...(args.runErrorMessage !== undefined && {
+      runErrorMessage: args.runErrorMessage,
+    }),
+    ...(args.runStdoutPreview !== undefined && {
+      runStdoutPreview: args.runStdoutPreview,
+    }),
+    ...(args.runStderrPreview !== undefined && {
+      runStderrPreview: args.runStderrPreview,
+    }),
+    ...(args.runStdoutStorageId !== undefined && {
+      runStdoutStorageId: args.runStdoutStorageId,
+    }),
+    ...(args.runStderrStorageId !== undefined && {
+      runStderrStorageId: args.runStderrStorageId,
+    }),
+    ...(args.runOutputFiles.length > 0 && {
+      runOutputFiles: args.runOutputFiles,
+    }),
+    ...(args.runExecutionId !== undefined && {
+      runExecutionId: args.runExecutionId,
+    }),
+  });
+
+  // Dual-write to the new artifactRuns / artifactRunFiles tables. The
+  // legacy artifacts.runOutputFiles write above remains as a fallback
+  // source per [feedback_deprecate_dont_delete_schema_fields]; later
+  // phases will switch readers and stop writing the old field. Append-
+  // only — every finalize creates a new artifactRuns row (including
+  // failed/cancelled runs, so the LLM can introspect history).
+  const startedAt = row.runStartedAt ?? completedAt;
+  const runId = await ctx.db.insert('artifactRuns', {
+    artifactId: args.artifactId,
+    status: args.runStatus,
+    ...(args.runExitCode !== undefined && { exitCode: args.runExitCode }),
+    ...(args.runErrorCode !== undefined && { errorCode: args.runErrorCode }),
+    ...(args.runErrorMessage !== undefined && {
+      errorMessage: args.runErrorMessage,
+    }),
+    startedAt,
+    endedAt: completedAt,
+    revision: row.runRevision ?? row.revision,
+    ...(args.runExecutionId !== undefined && {
+      executionId: args.runExecutionId,
+    }),
+  });
+  for (const f of args.runOutputFiles) {
+    // Files without a storage id cannot be referenced later; skip them.
+    if (f.storageId === undefined) continue;
+    await ctx.db.insert('artifactRunFiles', {
+      runId,
+      artifactId: args.artifactId,
+      name: f.name,
+      storageId: f.storageId,
+      size: f.size,
+      ...(f.contentType !== undefined && { contentType: f.contentType }),
+      ...(f.sha256 !== undefined && { sha256: f.sha256 }),
+      createdAt: completedAt,
+    });
+  }
+
+  // Upsert into `artifactOutputs` — the cumulative workspace-state manifest
+  // that backs pre-stage on the next run. Keyed by (artifactId, name);
+  // same-name files patch in place (newest wins), new names accumulate.
+  // Empty harvests don't touch the manifest, so a no-op run never wipes
+  // earlier output. This is the single source of truth that replaces the
+  // "latest-run walk-back" model — multi-run histories with different
+  // filenames no longer lose older files.
+  for (const f of args.runOutputFiles) {
+    if (f.storageId === undefined) continue;
+    const existing = await ctx.db
+      .query('artifactOutputs')
+      .withIndex('by_artifact_name', (q) =>
+        q.eq('artifactId', args.artifactId).eq('name', f.name),
+      )
+      .unique();
+    if (existing === null) {
+      await ctx.db.insert('artifactOutputs', {
+        artifactId: args.artifactId,
+        name: f.name,
+        storageId: f.storageId,
+        size: f.size,
+        ...(f.contentType !== undefined && { contentType: f.contentType }),
+        ...(f.sha256 !== undefined && { sha256: f.sha256 }),
+        producedByRunId: runId,
+        updatedAt: completedAt,
+      });
+    } else {
+      // Pass `contentType` / `sha256` explicitly: a patch field set to
+      // `undefined` is removed, so a replacement file that lacks one of them
+      // cannot inherit the previous file's value alongside the new
+      // `storageId`.
+      await ctx.db.patch(existing._id, {
+        storageId: f.storageId,
+        size: f.size,
+        contentType: f.contentType,
+        sha256: f.sha256,
+        producedByRunId: runId,
+        updatedAt: completedAt,
+      });
+    }
+  }
+}
+
+/**
+ * Argument validators for `finalizeArtifactRun`. `runStatus` is restricted
+ * to the three terminal literals; `runOutputFiles` is required (it may be an
+ * empty array), everything else is optional.
+ */
+export const finalizeArtifactRunArgs = {
+  artifactId: v.id('artifacts'),
+  runStatus: v.union(
+    v.literal('completed'),
+    v.literal('failed'),
+    v.literal('cancelled'),
+  ),
+  runExitCode: v.optional(v.number()),
+  runErrorCode: v.optional(artifactRunErrorCodeValidator),
+  runErrorMessage: v.optional(v.string()),
+  runStdoutPreview: v.optional(v.string()),
+  runStderrPreview: v.optional(v.string()),
+  runStdoutStorageId: v.optional(v.id('_storage')),
+  runStderrStorageId: v.optional(v.id('_storage')),
+  runOutputFiles: v.array(artifactRunOutputFileValidator),
+  runExecutionId: v.optional(v.id('sandboxExecutions')),
+} as const;
+
+/** The mutation has no payload; the handler always resolves to `null`. */
+export const finalizeArtifactRunReturns = v.null();
+
+/**
+ * Mutation handler for `finalizeArtifactRun`: forwards the arguments
+ * unchanged to `applyFinalizeArtifactRun` and resolves to `null`.
+ */
+export async function finalizeArtifactRunHandler(
+  ctx: MutationCtx,
+  args: {
+    artifactId: Id<'artifacts'>;
+    runStatus: 'completed' | 'failed' | 'cancelled';
+    runExitCode?: number;
+    runErrorCode?: ArtifactRunErrorCode;
+    runErrorMessage?: string;
+    runStdoutPreview?: string;
+    runStderrPreview?: string;
+    runStdoutStorageId?: Id<'_storage'>;
+    runStderrStorageId?: Id<'_storage'>;
+    runOutputFiles: ArtifactRunOutputFile[];
+    runExecutionId?: Id<'sandboxExecutions'>;
+  },
+) {
+  // All of the logic lives in the shared helper; this wrapper only adapts the
+  // return value to the `v.null()` validator.
+  await applyFinalizeArtifactRun(ctx, args);
+  return null;
+}
+
+// =============================================================================
+// deriveOutputManifestFromHistory — lazy migration from artifactRunFiles
+//
+// Idempotent. Builds the cumulative `artifactOutputs` manifest for an
+// artifact by walking `artifactRunFiles` newest-first and reducing
+// (name → most-recent file). Used by `getLatestRunOutputs` on the
+// FIRST pre-stage read for an artifact created before the manifest
+// existed; subsequent runs maintain the manifest via the upsert in
+// `applyFinalizeArtifactRun`.
+//
+// `sha256` is left undefined on legacy entries (the spawner-side hash
+// wasn't computed at the time those rows landed). The pre-stage
+// attestation treats no-sha256 entries as "presence only" — a successful
+// download by name is enough; byte-exact diff is only enforced once the
+// manifest has been refreshed by a fresh harvest.
+// =============================================================================
+
+/** Argument validators for `deriveOutputManifestFromHistory`. */
+export const deriveOutputManifestFromHistoryArgs = {
+  artifactId: v.id('artifacts'),
+} as const;
+
+/**
+ * Result validator: `inserted` is the number of manifest rows created by
+ * this call; `alreadyPresent` is `true` when a manifest row already existed
+ * and the handler returned without inserting anything.
+ */
+export const deriveOutputManifestFromHistoryReturns = v.object({
+  inserted: v.number(),
+  alreadyPresent: v.boolean(),
+});
+
+/**
+ * Builds the `artifactOutputs` manifest for an artifact from its
+ * `artifactRunFiles` history, keeping the first row seen per file name when
+ * the history is read in descending index order.
+ *
+ * Returns early, inserting nothing, when any manifest row already exists for
+ * the artifact. `sha256` is not copied onto the derived rows.
+ *
+ * @returns The number of manifest rows inserted and whether a manifest was
+ *   already present.
+ */
+export async function deriveOutputManifestFromHistoryHandler(
+  ctx: MutationCtx,
+  args: { artifactId: Id<'artifacts'> },
+): Promise<{ inserted: number; alreadyPresent: boolean }> {
+  // A single existing manifest row is taken as proof that derivation (or a
+  // finalize-time upsert) has already populated the manifest.
+  const manifestProbe = await ctx.db
+    .query('artifactOutputs')
+    .withIndex('by_artifact', (q) => q.eq('artifactId', args.artifactId))
+    .first();
+  if (manifestProbe !== null) {
+    return { inserted: 0, alreadyPresent: true };
+  }
+
+  type LatestFile = {
+    runId: Id<'artifactRuns'>;
+    storageId: Id<'_storage'>;
+    size: number;
+    contentType?: string;
+    createdAt: number;
+  };
+  const latestByName = new Map<string, LatestFile>();
+
+  // Descending order puts the newest rows first, so the first occurrence of
+  // a name wins and later (older) occurrences are skipped.
+  const history = ctx.db
+    .query('artifactRunFiles')
+    .withIndex('by_artifact', (q) => q.eq('artifactId', args.artifactId))
+    .order('desc');
+  for await (const file of history) {
+    if (latestByName.has(file.name)) continue;
+    const latest: LatestFile = {
+      runId: file.runId,
+      storageId: file.storageId,
+      size: file.size,
+      createdAt: file.createdAt,
+    };
+    if (file.contentType !== undefined) latest.contentType = file.contentType;
+    latestByName.set(file.name, latest);
+  }
+
+  const now = Date.now();
+  for (const [name, latest] of latestByName) {
+    await ctx.db.insert('artifactOutputs', {
+      artifactId: args.artifactId,
+      name,
+      storageId: latest.storageId,
+      size: latest.size,
+      ...(latest.contentType !== undefined && {
+        contentType: latest.contentType,
+      }),
+      producedByRunId: latest.runId,
+      updatedAt: now,
+    });
+  }
+
+  // One row is inserted per distinct file name collected above.
+  return { inserted: latestByName.size, alreadyPresent: false };
+}
diff --git a/services/platform/convex/artifacts/handlers/shared.ts b/services/platform/convex/artifacts/handlers/shared.ts
new file mode 100644
index 000000000..9a8f52beb
--- /dev/null
+++ b/services/platform/convex/artifacts/handlers/shared.ts
@@ -0,0 +1,188 @@
+/**
+ * Shared helpers + constants for the artifact mutation handlers.
+ *
+ * Lives next to the handler modules so the per-mutation files can stay free
+ * of helper bodies; the `internal_mutations.ts` shell file re-exports the
+ * public-facing symbols (`MAX_ARTIFACT_BYTES`, `assertAggregateSize`) so
+ * existing imports continue to resolve.
+ */
+
+import { ConvexError } from 'convex/values';
+
+import type { Doc, Id } from '../../_generated/dataModel';
+import type { MutationCtx } from '../../_generated/server';
+import {
+  MAX_FILES_PER_ARTIFACT,
+  findDuplicatePath,
+  validatePath,
+} from '../../agent_tools/artifacts/shared';
+import { aggregateFileBytes } from '../resolve_files';
+
+export const STALE_STREAM_THRESHOLD_MS = 60_000;
+
+/**
+ * Hard cap on an artifact's TOTAL content (sum of all `files[].content` bytes).
+ * Convex's per-document limit is 1 MiB; we cap below that so a single mutation
+ * that also writes a revision row (full files snapshot) stays under the limit,
+ * and so an LLM rewrite that runs away yields a clean `too_large` error.
+ */
+export const MAX_ARTIFACT_BYTES = 800_000;
+
+/** Lazy-GC retention: keep the N most recent revisions per artifact. */
+export const REVISIONS_RETENTION = 20;
+
+/**
+ * @deprecated — single-file size check. Kept for backward-compat with
+ * existing callers; new code should use {@link assertAggregateSize}.
+ */
+export function assertContentSize(content: string): void {
+  // Measure encoded UTF-8 bytes, not UTF-16 string length.
+  const byteLength = new TextEncoder().encode(content).byteLength;
+  if (byteLength <= MAX_ARTIFACT_BYTES) return;
+  throw new ConvexError({
+    code: 'too_large',
+    message: `Artifact content is ${byteLength} bytes; max ${MAX_ARTIFACT_BYTES}.`,
+  });
+}
+
+export function assertAggregateSize(
+  files: readonly { readonly content: string }[],
+): void {
+  const totalBytes = aggregateFileBytes(files);
+  // Guard clause: only a total above the cap is an error.
+  if (!(totalBytes > MAX_ARTIFACT_BYTES)) return;
+  throw new ConvexError({
+    code: 'too_large',
+    message: `Artifact total content is ${totalBytes} bytes across ${files.length} files; max ${MAX_ARTIFACT_BYTES}.`,
+  });
+}
+
+/**
+ * Single source of truth for the fields that "end a stream": each one is set
+ * to `undefined` so settle / abort / cleanup paths reset them all together.
+ */
+export function clearStreamingFlags(): Partial<Doc<'artifacts'>> {
+  const endOfStream: Partial<Doc<'artifacts'>> = {
+    toolCallId: undefined,
+    liveStreamMode: undefined,
+    liveStreamStartedAt: undefined,
+    streamingPath: undefined,
+    streamingContent: undefined,
+    streamingPatches: undefined,
+  };
+  return endOfStream;
+}
+
+/**
+ * Lazy GC of revision history. Called at the tail of every revision-emitting
+ * mutation. Keeps the {@link REVISIONS_RETENTION} most recent revisions and
+ * deletes older ones opportunistically. No cron — per memory
+ * feedback_lazy_cleanup_over_cron.
+ */
+export async function trimRevisionHistory(
+  ctx: MutationCtx,
+  artifactId: Id<'artifacts'>,
+): Promise<void> {
+  // Descending index scan, capped so one call never walks the whole history.
+  const newestFirst: Id<'artifactRevisions'>[] = [];
+  const history = ctx.db
+    .query('artifactRevisions')
+    .withIndex('by_artifact', (q) => q.eq('artifactId', artifactId))
+    .order('desc');
+  for await (const revisionRow of history) {
+    newestFirst.push(revisionRow._id);
+    if (newestFirst.length > REVISIONS_RETENTION * 2) break;
+  }
+  const surplus = newestFirst.slice(REVISIONS_RETENTION);
+  for (const staleId of surplus) await ctx.db.delete(staleId);
+}
+
+/**
+ * Reconcile the `artifactFiles` table with the artifact's authoritative
+ * `files[]` array after a settle. The artifact-row write is the source of
+ * truth for the in-flight refactor (plan llm-majestic-hamming.md →
+ * artifact-breezy-codd.md); this helper keeps the per-file table in sync so
+ * canvas reads from `artifactFiles` see the same view.
+ *
+ * Insert rows for new paths, patch content/updatedAt for changed paths,
+ * delete rows whose path is no longer in `files`. `streamingWriteToolCallId`
+ * is cleared on every settle — the stream that wrote this revision is done.
+ */
+export async function syncArtifactFiles(
+  ctx: MutationCtx,
+  artifactId: Id<'artifacts'>,
+  files: readonly { readonly path: string; readonly content: string }[],
+  now: number,
+): Promise<void> {
+  // Snapshot the stored per-file rows up front; all decisions below are made
+  // against this snapshot, not against rows written during this call.
+  const storedRows = await ctx.db
+    .query('artifactFiles')
+    .withIndex('by_artifact', (q) => q.eq('artifactId', artifactId))
+    .collect();
+  const storedByPath = new Map(
+    storedRows.map((stored) => [stored.path, stored] as const),
+  );
+  // Pass 1: upsert every wanted file, in input order.
+  const wantedPaths = new Set<string>();
+  for (const { path, content } of files) {
+    wantedPaths.add(path);
+    const stored = storedByPath.get(path);
+    if (stored === undefined) {
+      const row = { artifactId, path, content, createdAt: now, updatedAt: now };
+      await ctx.db.insert('artifactFiles', row);
+      continue;
+    }
+    // Rewrite when the content differs or a streaming-write marker is still
+    // set; an unchanged, unmarked row is left alone.
+    const needsPatch =
+      stored.content !== content ||
+      stored.streamingWriteToolCallId !== undefined;
+    if (!needsPatch) continue;
+    await ctx.db.patch(stored._id, {
+      content,
+      streamingWriteToolCallId: undefined,
+      updatedAt: now,
+    });
+  }
+
+  // Pass 2: drop stored rows whose path is no longer wanted.
+  for (const stored of storedRows) {
+    if (!wantedPaths.has(stored.path)) await ctx.db.delete(stored._id);
+  }
+}
+
+/**
+ * Pre-write gate: rejects an empty list, too many files, invalid or duplicate
+ * paths, and oversize totals; returns the list with NFC-normalized paths.
+ */
+export function validateFiles(
+  input: readonly { readonly path: string; readonly content: string }[],
+): { readonly path: string; readonly content: string }[] {
+  const reject = (code: string, message: string): never => {
+    throw new ConvexError({ code, message });
+  };
+  if (input.length === 0) {
+    reject('empty_project', 'Artifact must contain at least one file.');
+  }
+  if (input.length > MAX_FILES_PER_ARTIFACT) {
+    reject(
+      'too_many_files',
+      `Artifact has ${input.length} files; max ${MAX_FILES_PER_ARTIFACT}.`,
+    );
+  }
+  // Paths go through validatePath; content is carried over untouched.
+  const canonical = input.map(({ path, content }) => ({
+    path: validatePath(path),
+    content,
+  }));
+  const clash = findDuplicatePath(canonical);
+  if (clash !== null) {
+    reject(
+      'duplicate_path',
+      `Duplicate file path "${clash}" (paths are compared case-insensitively).`,
+    );
+  }
+  assertAggregateSize(canonical);
+  return canonical;
+}
diff --git a/services/platform/convex/artifacts/handlers/streaming.ts b/services/platform/convex/artifacts/handlers/streaming.ts
new file mode 100644
index 000000000..70297aeb0
--- /dev/null
+++ b/services/platform/convex/artifacts/handlers/streaming.ts
@@ -0,0 +1,229 @@
+/**
+ * Handler bodies + validators for streaming-lifecycle mutations:
+ * beginEditStream, abortStream, updateRewriteStreamingContent,
+ * discardActiveStreamsForThread, cleanupStaleStreams.
+ *
+ * These manage the transient "currently-streaming" state on the artifact
+ * row (liveStreamMode, streamingContent, streamingPath, etc.) — kept off
+ * the canonical content fields so a crashed write cannot corrupt settled
+ * revisions.
+ */
+
+import { ConvexError, v } from 'convex/values';
+
+import type { MutationCtx } from '../../_generated/server';
+import { validatePath } from '../../agent_tools/artifacts/shared';
+import { liveStreamModeValidator } from '../schema';
+import { STALE_STREAM_THRESHOLD_MS, clearStreamingFlags } from './shared';
+
+// =============================================================================
+// beginEditStream — stamp initial streaming state on the row
+//
+// Row-level streaming fields (liveStreamMode / streamingPath / toolCallId /
+// streamingContent) are the canvas's "live preview" signal, NOT a concurrency
+// guard. Same-path collisions are handled by `expectedRevision` OCC at settle
+// time. Cross-path concurrent writes (two `artifact_file_create`s to different paths)
+// are semantically independent — last-writer-wins is fine for the canvas
+// signal; both writes commit independently on their own settle path.
+//
+// Stale flags from a crashed prior stream are cleaned by
+// `cleanupStaleStreams` / `discardActiveStreamsForThread`.
+// =============================================================================
+
+export const beginEditStreamArgs = {
+  artifactId: v.id('artifacts'),
+  liveStreamMode: liveStreamModeValidator,
+  /** For mode='rewrite': the file path being streamed (advisory). */
+  streamingPath: v.optional(v.string()),
+  toolCallId: v.optional(v.string()),
+} as const;
+
+export const beginEditStreamReturns = v.null();
+
+export async function beginEditStreamHandler(
+  ctx: MutationCtx,
+  args: {
+    artifactId: import('../../_generated/dataModel').Id<'artifacts'>;
+    liveStreamMode: 'create' | 'rewrite' | 'append' | 'patch';
+    streamingPath?: string;
+    toolCallId?: string;
+  },
+) {
+  const { artifactId, liveStreamMode: mode, streamingPath, toolCallId } = args;
+  const artifact = await ctx.db.get(artifactId);
+  if (!artifact) {
+    throw new ConvexError({
+      code: 'not_found',
+      message: `Artifact ${artifactId} not found.`,
+    });
+  }
+  // The path is optional; when present it must pass validatePath, and the
+  // value validatePath returns is what gets stored.
+  let storedPath: string | undefined;
+  if (streamingPath !== undefined) storedPath = validatePath(streamingPath);
+
+  // `rewrite` / `append` receive content through tool-input deltas, so the
+  // content buffer starts as '' to give readers a defined value for the
+  // whole stream. `patch` collects into `streamingPatches` instead. Any
+  // buffer the chosen mode does not use is reset to `undefined`.
+  const streamsContent = mode === 'rewrite' || mode === 'append';
+  await ctx.db.patch(artifactId, {
+    liveStreamMode: mode,
+    liveStreamStartedAt: Date.now(),
+    streamingContent: streamsContent ? '' : undefined,
+    streamingPatches: mode === 'patch' ? [] : undefined,
+    streamingPath: storedPath,
+    toolCallId,
+  });
+  return null;
+}
+
+// =============================================================================
+// abortStream — clears all live-stream flags
+// =============================================================================
+
+export const abortStreamArgs = {
+  artifactId: v.id('artifacts'),
+} as const;
+
+export const abortStreamReturns = v.null();
+
+export async function abortStreamHandler(
+  ctx: MutationCtx,
+  args: { artifactId: import('../../_generated/dataModel').Id<'artifacts'> },
+) {
+  // The row may already be deleted by a cleanup path; `db.patch` would throw.
+  const row = await ctx.db.get(args.artifactId);
+  if (row !== null) await ctx.db.patch(row._id, clearStreamingFlags());
+  return null;
+}
+
+// =============================================================================
+// updateRewriteStreamingContent — mid-stream incremental persistence
+//
+// Bails (no-op) if the row no longer matches the streaming session
+// (different `toolCallId`, mode changed, path changed) — protects against
+// a stale delta from an aborted call overwriting a newer stream.
+//
+// Never touches `files[]`, `content`, or `revision`. Settled state stays
+// exactly as it was until `createFileInArtifact` / `updateFileInArtifact`
+// runs at execute-time.
+//
+// Shared by `artifact_file_create` and `artifact_file_update` — both stream their `content`
+// arg in via tool-input deltas, so the canvas's "show whatever bytes we've
+// seen so far" path is identical.
+// =============================================================================
+
+export const updateRewriteStreamingContentArgs = {
+  artifactId: v.id('artifacts'),
+  toolCallId: v.string(),
+  streamingPath: v.string(),
+  content: v.string(),
+} as const;
+
+export const updateRewriteStreamingContentReturns = v.null();
+
+export async function updateRewriteStreamingContentHandler(
+  ctx: MutationCtx,
+  args: {
+    artifactId: import('../../_generated/dataModel').Id<'artifacts'>;
+    toolCallId: string;
+    streamingPath: string;
+    content: string;
+  },
+) {
+  const current = await ctx.db.get(args.artifactId);
+  // Drop the delta unless the row is still in the very session that sent it.
+  const sameSession =
+    (current?.liveStreamMode === 'rewrite' ||
+      current?.liveStreamMode === 'append') &&
+    current?.toolCallId === args.toolCallId &&
+    current?.streamingPath === args.streamingPath;
+  if (sameSession) {
+    const patch = { streamingContent: args.content, updatedAt: Date.now() };
+    await ctx.db.patch(args.artifactId, patch);
+  }
+  return null;
+}
+
+// =============================================================================
+// discardActiveStreamsForThread — user-Stop cascade
+//
+// When the user clicks Stop, the SDK abort fires before any `tool.execute()`
+// runs, so `discardCreateStream` / `abortStream` never get called for the
+// stream that was mid-author. Without this mutation the placeholder row
+// (revision 0, `liveStreamMode='create'`) lingers in the canvas sidebar
+// with a streaming badge until the `cleanupStaleStreams` cron picks it up
+// (60 s threshold + 5-min cron interval = up to ~6 min ghost tile).
+//
+// Mirror of `cleanupStaleStreams` logic but scoped to one thread and not
+// gated on `liveStreamStartedAt` age. Called inline from
+// `convex/threads/cancel_generation.ts`.
+// =============================================================================
+
+export const discardActiveStreamsForThreadArgs = {
+  organizationId: v.string(),
+  threadId: v.string(),
+} as const;
+
+export const discardActiveStreamsForThreadReturns = v.object({
+  cleared: v.number(),
+});
+
+export async function discardActiveStreamsForThreadHandler(
+  ctx: MutationCtx,
+  args: { organizationId: string; threadId: string },
+) {
+  const { organizationId, threadId } = args;
+  const threadArtifacts = await ctx.db
+    .query('artifacts')
+    .withIndex('by_organizationId_and_thread', (q) =>
+      q.eq('organizationId', organizationId).eq('threadId', threadId),
+    )
+    .collect();
+  const streaming = threadArtifacts.filter(
+    (artifact) => artifact.liveStreamMode !== undefined,
+  );
+  for (const artifact of streaming) {
+    // Revision 0 = never-settled placeholder: delete it. Settled rows keep
+    // their content and only lose the streaming flags.
+    if (artifact.revision === 0) await ctx.db.delete(artifact._id);
+    else await ctx.db.patch(artifact._id, clearStreamingFlags());
+  }
+  return { cleared: streaming.length };
+}
+
+// =============================================================================
+// cleanupStaleStreams — periodic janitor (cron-invoked)
+// =============================================================================
+
+export const cleanupStaleStreamsArgs = {} as const;
+
+export const cleanupStaleStreamsReturns = v.object({ cleared: v.number() });
+
+export async function cleanupStaleStreamsHandler(ctx: MutationCtx) {
+  const staleBefore = Date.now() - STALE_STREAM_THRESHOLD_MS;
+  let cleared = 0;
+  // NOTE(review): no index range is supplied, so this appears to walk every
+  // `artifacts` row via `by_liveStreamMode`, idle rows included — confirm
+  // that is acceptable for the expected table size.
+  const indexed = ctx.db
+    .query('artifacts')
+    .withIndex('by_liveStreamMode');
+  for await (const artifact of indexed) {
+    const startedAt = artifact.liveStreamStartedAt;
+    // Only streams that started before the cutoff count as stale.
+    if (startedAt === undefined || !(startedAt < staleBefore)) continue;
+    // Revision 0 marks a placeholder whose create stream never settled, so
+    // there is no revision history behind it: delete the whole row rather
+    // than leave an empty artifact in the thread. Rows at revision >= 1
+    // keep their settled content and only lose the streaming flags.
+    if (artifact.revision === 0) {
+      await ctx.db.delete(artifact._id);
+    } else {
+      await ctx.db.patch(artifact._id, clearStreamingFlags());
+    }
+    cleared += 1;
+  }
+  return { cleared };
+}
diff --git a/services/platform/convex/artifacts/internal_mutations.test.ts b/services/platform/convex/artifacts/internal_mutations.test.ts
new file mode 100644
index 000000000..0147b4cfa
--- /dev/null
+++ b/services/platform/convex/artifacts/internal_mutations.test.ts
@@ -0,0 +1,917 @@
+// Regression gates for the two artifact-write paths that need them:
+//
+//   1. `createArtifact` — title-idempotent insert (commit 511e6b361
+//      changed the dedup key from `toolCallId` to a normalized title).
+//      Returns either {success: true, isNew} or {success: false,
+//      conflict: 'type_mismatch'}.
+//
+//   2. `discardActiveStreamsForThread` — the user-Stop cascade added in
+//      this PR. Deletes `revision === 0` placeholders (artifact_create
+//      mid-stream when the user clicked Stop) and clears streaming flags
+//      on settled rows where artifact_file_create / artifact_file_update was mid-stream.
+
+import { describe, expect, it, vi } from 'vitest';
+
+vi.mock('../_generated/server', async (importOriginal) => {
+  const mod = await importOriginal<Record<string, unknown>>();
+  return {
+    ...mod,
+    internalMutation: (config: Record<string, unknown>) => config,
+  };
+});
+
+import {
+  applyFinalizeArtifactRun,
+  createArtifact,
+  createFileInArtifact,
+  discardActiveStreamsForThread,
+  updateFileInArtifact,
+  updateRewriteStreamingContent,
+} from './internal_mutations';
+
+interface FakeArtifactRow {
+  _id: string;
+  organizationId: string;
+  threadId: string;
+  type: string;
+  title: string;
+  language?: string;
+  content?: string;
+  files?: Array<{ path: string; content: string }>;
+  entryFile?: string;
+  revision: number;
+  liveStreamMode?: 'create' | 'rewrite' | 'append' | 'patch';
+  toolCallId?: string;
+  createdByMessageId?: string;
+  lastEditedByMessageId?: string;
+  streamingContent?: string;
+  streamingPath?: string;
+  liveStreamStartedAt?: number;
+  createdAt?: number;
+  updatedAt?: number;
+  runStatus?:
+    | 'queued'
+    | 'installing'
+    | 'running'
+    | 'completed'
+    | 'failed'
+    | 'cancelled';
+  runExecutionId?: string;
+  runStartedAt?: number;
+  runRevision?: number;
+  runOutputFiles?: Array<{
+    name: string;
+    storageId?: string;
+    size: number;
+    contentType?: string;
+  }>;
+}
+
+interface MutHandler<TArgs, TReturn> {
+  handler: (ctx: unknown, args: TArgs) => Promise<TReturn> | TReturn;
+}
+
+function asyncIter<T>(rows: T[]): AsyncIterable<T> {
+  // Named async generator: yields the rows in array order, one per step.
+  async function* walk() {
+    for (const row of rows) yield row;
+  }
+  return { [Symbol.asyncIterator]: walk };
+}
+
+// Fake mutation ctx; also returns the recorded inserts, patches and deletes.
+function createMockCtx(initial: FakeArtifactRow[] = []) {
+  const rows: FakeArtifactRow[] = [...initial];
+  // Per-table side stores for auxiliary tables (e.g. `artifactFiles`, written
+  // by `syncArtifactFiles`), so a query on another table never sees artifact
+  // rows — previously that made `syncArtifactFiles` delete artifact rows it
+  // mistook for stale file rows.
+  const auxRows = new Map<string, Record<string, unknown>[]>();
+  const inserted: Array<{
+    table: string;
+    payload: Record<string, unknown>;
+    insertedId: string;
+  }> = [];
+  const patched: Array<{ id: string; patch: Record<string, unknown> }> = [];
+  const deleted: string[] = [];
+  let next = 1;
+
+  function makeBuilder(table: string) {
+    const eqs: Record<string, unknown> = {};
+    // The builder exposes BOTH `[Symbol.asyncIterator]` and `.collect()`:
+    //   - `for await (const r of ctx.db.query(...).withIndex(...))` (createArtifact)
+    //   - `await ctx.db.query(...).withIndex(...).collect()`         (discardActiveStreamsForThread)
+    const filtered = (): Record<string, unknown>[] => {
+      if (table === 'artifacts') {
+        return rows.filter((r) => {
+          if (
+            eqs.organizationId !== undefined &&
+            r.organizationId !== eqs.organizationId
+          ) {
+            return false;
+          }
+          if (eqs.threadId !== undefined && r.threadId !== eqs.threadId) {
+            return false;
+          }
+          return true;
+        }) as unknown as Record<string, unknown>[];
+      }
+      const tableRows = auxRows.get(table) ?? [];
+      return tableRows.filter((r) => {
+        for (const key of Object.keys(eqs)) {
+          if (r[key] !== eqs[key]) return false;
+        }
+        return true;
+      });
+    };
+    const builder: Record<string | symbol, unknown> = {};
+    builder.withIndex = vi.fn((_name: string, cb: (q: unknown) => unknown) => {
+      const q = {
+        eq: (field: string, value: unknown) => {
+          eqs[field] = value;
+          return q;
+        },
+      };
+      cb(q);
+      return builder;
+    });
+    builder.collect = vi.fn(async () => filtered());
+    builder.order = vi.fn((_dir: 'asc' | 'desc') => builder);
+    builder.unique = vi.fn(async () => {
+      const list = filtered();
+      return list.length > 0 ? list[0] : null;
+    });
+    builder.first = vi.fn(async () => {
+      const list = filtered();
+      return list.length > 0 ? list[0] : null;
+    });
+    builder[Symbol.asyncIterator] = () =>
+      asyncIter(filtered())[Symbol.asyncIterator]();
+    return builder;
+  }
+
+  return {
+    ctx: {
+      db: {
+        query: vi.fn((table: string) => makeBuilder(table)),
+        // Only artifact rows are resolvable by id; aux-table ids yield null.
+        get: vi.fn(async (id: string) => {
+          return rows.find((r) => r._id === id) ?? null;
+        }),
+        insert: vi.fn(
+          async (table: string, payload: Record<string, unknown>) => {
+            const insertedId =
+              table === 'artifacts' ? `art_${next++}` : `${table}_${next++}`;
+            inserted.push({ table, payload, insertedId });
+            if (table === 'artifacts') {
+              rows.push({
+                _id: insertedId,
+                organizationId: payload.organizationId as string,
+                threadId: payload.threadId as string,
+                type: payload.type as string,
+                title: payload.title as string,
+                language: payload.language as string | undefined,
+                content: payload.content as string | undefined,
+                files: payload.files as
+                  | Array<{ path: string; content: string }>
+                  | undefined,
+                entryFile: payload.entryFile as string | undefined,
+                revision: payload.revision as number,
+              });
+            } else {
+              const tableRows = auxRows.get(table) ?? [];
+              tableRows.push({ ...payload, _id: insertedId });
+              auxRows.set(table, tableRows);
+            }
+            return insertedId;
+          },
+        ),
+        patch: vi.fn(async (id: string, patch: Record<string, unknown>) => {
+          patched.push({ id, patch });
+          const row = rows.find((r) => r._id === id);
+          if (row !== undefined) {
+            Object.assign(row, patch);
+            return;
+          }
+          for (const tableRows of auxRows.values()) {
+            const aux = tableRows.find((r) => r._id === id);
+            if (aux !== undefined) {
+              Object.assign(aux, patch);
+              return;
+            }
+          }
+        }),
+        delete: vi.fn(async (id: string) => {
+          deleted.push(id);
+          const idx = rows.findIndex((r) => r._id === id);
+          if (idx >= 0) {
+            rows.splice(idx, 1);
+            return;
+          }
+          for (const [, tableRows] of auxRows) {
+            const auxIdx = tableRows.findIndex((r) => r._id === id);
+            if (auxIdx >= 0) {
+              tableRows.splice(auxIdx, 1);
+              return;
+            }
+          }
+        }),
+      },
+    },
+    inserted,
+    patched,
+    deleted,
+    rows,
+  };
+}
+
+type CreateArtifactArgs = {
+  organizationId: string;
+  threadId: string;
+  type: 'code' | 'markdown' | 'html' | 'svg' | 'mermaid';
+  title: string;
+  language?: string;
+  content?: string;
+  entryFile?: string;
+  createdByMessageId: string;
+};
+
+type CreateArtifactResult =
+  | {
+      success: true;
+      isNew: boolean;
+      artifactId: string;
+      revision: number;
+      entryFile: string;
+      filePaths: string[];
+    }
+  | {
+      success: false;
+      conflict: 'type_mismatch';
+      existingArtifactId: string;
+      existingType: string;
+      message: string;
+    };
+
+const create = createArtifact as unknown as MutHandler<
+  CreateArtifactArgs,
+  CreateArtifactResult
+>;
+
+const base: CreateArtifactArgs = {
+  organizationId: 'org_a',
+  threadId: 'thr_a',
+  type: 'code',
+  title: 'hello',
+  language: 'javascript',
+  content: 'console.log("hi");\n',
+  createdByMessageId: 'msg_1',
+};
+
+describe('createArtifact (title-idempotent insert)', () => {
+  it('inserts a new artifact + revision when no row exists', async () => {
+    const { ctx, inserted } = createMockCtx();
+    const r = await create.handler(ctx, base);
+    expect(r.success).toBe(true);
+    if (!r.success) return;
+    expect(r.isNew).toBe(true);
+    expect(r.revision).toBe(1);
+    expect(r.filePaths).toContain(r.entryFile);
+    expect(inserted.filter((i) => i.table === 'artifacts')).toHaveLength(1);
+    expect(
+      inserted.filter((i) => i.table === 'artifactRevisions'),
+    ).toHaveLength(1);
+  });
+
+  it('returns the existing artifact (isNew=false) when title+type collide', async () => {
+    const existing: FakeArtifactRow = {
+      _id: 'art_existing',
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+      type: 'code',
+      title: 'hello',
+      content: 'old content',
+      files: [{ path: 'main.js', content: 'old content' }],
+      entryFile: 'main.js',
+      revision: 3,
+    };
+    const { ctx, inserted } = createMockCtx([existing]);
+    const r = await create.handler(ctx, {
+      ...base,
+      content: 'NEW content that should be IGNORED',
+    });
+    expect(r.success).toBe(true);
+    if (!r.success) return;
+    expect(r.isNew).toBe(false);
+    expect(r.artifactId).toBe('art_existing');
+    expect(r.revision).toBe(3);
+    // No new rows inserted — caller's content is dropped on collision.
+    expect(inserted).toHaveLength(0);
+  });
+
+  it('rejects with type_mismatch when title matches but type differs', async () => {
+    const existing: FakeArtifactRow = {
+      _id: 'art_existing',
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+      type: 'markdown',
+      title: 'hello',
+      revision: 1,
+    };
+    const { ctx, inserted } = createMockCtx([existing]);
+    const r = await create.handler(ctx, { ...base, type: 'code' });
+    expect(r.success).toBe(false);
+    if (r.success) return;
+    expect(r.conflict).toBe('type_mismatch');
+    expect(r.existingArtifactId).toBe('art_existing');
+    expect(r.existingType).toBe('markdown');
+    expect(inserted).toHaveLength(0);
+  });
+
+  it('dedup is scoped to (organizationId, threadId)', async () => {
+    const otherThread: FakeArtifactRow = {
+      _id: 'art_other',
+      organizationId: 'org_a',
+      threadId: 'thr_b',
+      type: 'code',
+      title: 'hello',
+      revision: 1,
+    };
+    const { ctx, inserted } = createMockCtx([otherThread]);
+    const r = await create.handler(ctx, base);
+    expect(r.success).toBe(true);
+    if (!r.success) return;
+    expect(r.isNew).toBe(true);
+    expect(inserted.filter((i) => i.table === 'artifacts')).toHaveLength(1);
+  });
+
+  it('normalizes the comparison key (trims + collapses whitespace + case-fold)', async () => {
+    const existing: FakeArtifactRow = {
+      _id: 'art_existing',
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+      type: 'code',
+      title: 'Hello World',
+      revision: 1,
+    };
+    const { ctx, inserted } = createMockCtx([existing]);
+    const r = await create.handler(ctx, {
+      ...base,
+      title: '   hello   world   ',
+    });
+    expect(r.success).toBe(true);
+    if (!r.success) return;
+    expect(r.isNew).toBe(false);
+    expect(r.artifactId).toBe('art_existing');
+    expect(inserted).toHaveLength(0);
+  });
+});
+
+type DiscardArgs = { organizationId: string; threadId: string };
+type DiscardResult = { cleared: number };
+
+const discard = discardActiveStreamsForThread as unknown as MutHandler<
+  DiscardArgs,
+  DiscardResult
+>;
+
+describe('discardActiveStreamsForThread (user-Stop cascade)', () => {
+  it('deletes revision-0 placeholder rows with active streaming', async () => {
+    const placeholder: FakeArtifactRow = {
+      _id: 'art_ph',
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+      type: 'code',
+      title: 'WIP',
+      revision: 0,
+      liveStreamMode: 'create',
+      streamingContent: 'partial...',
+      liveStreamStartedAt: Date.now(),
+    };
+    const { ctx, deleted, patched } = createMockCtx([placeholder]);
+    const r = await discard.handler(ctx, {
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+    });
+    expect(r.cleared).toBe(1);
+    expect(deleted).toEqual(['art_ph']);
+    expect(patched).toHaveLength(0);
+  });
+
+  it('clears streaming flags on settled (revision >= 1) rows', async () => {
+    const settled: FakeArtifactRow = {
+      _id: 'art_settled',
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+      type: 'code',
+      title: 'edited',
+      revision: 4,
+      liveStreamMode: 'rewrite',
+      streamingContent: 'new content...',
+      liveStreamStartedAt: Date.now(),
+    };
+    const { ctx, deleted, patched } = createMockCtx([settled]);
+    const r = await discard.handler(ctx, {
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+    });
+    expect(r.cleared).toBe(1);
+    expect(deleted).toHaveLength(0);
+    expect(patched).toHaveLength(1);
+    expect(patched[0]?.id).toBe('art_settled');
+    // clearStreamingFlags() sets streaming-state fields to undefined.
+    expect(patched[0]?.patch).toMatchObject({
+      liveStreamMode: undefined,
+      streamingContent: undefined,
+    });
+  });
+
+  it('ignores rows without an active stream', async () => {
+    const idle: FakeArtifactRow = {
+      _id: 'art_idle',
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+      type: 'code',
+      title: 'idle',
+      revision: 2,
+    };
+    const { ctx, deleted, patched } = createMockCtx([idle]);
+    const r = await discard.handler(ctx, {
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+    });
+    expect(r.cleared).toBe(0);
+    expect(deleted).toHaveLength(0);
+    expect(patched).toHaveLength(0);
+  });
+
+  it('scoped to (organizationId, threadId) — does not touch other threads', async () => {
+    const otherThread: FakeArtifactRow = {
+      _id: 'art_other',
+      organizationId: 'org_a',
+      threadId: 'thr_b',
+      type: 'code',
+      title: 'WIP',
+      revision: 0,
+      liveStreamMode: 'create',
+      streamingContent: 'partial',
+    };
+    const { ctx, deleted, patched } = createMockCtx([otherThread]);
+    const r = await discard.handler(ctx, {
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+    });
+    expect(r.cleared).toBe(0);
+    expect(deleted).toHaveLength(0);
+    expect(patched).toHaveLength(0);
+  });
+});
+
+type UpdateRewriteStreamingContentArgs = {
+  artifactId: string;
+  streamingPath: string;
+  toolCallId: string;
+  content: string;
+};
+
+const updateRewriteStreaming =
+  updateRewriteStreamingContent as unknown as MutHandler<
+    UpdateRewriteStreamingContentArgs,
+    null
+  >;
+
+describe('updateRewriteStreamingContent (incremental persistence)', () => {
+  it('patches only streamingContent + updatedAt on a matching rewrite session', async () => {
+    const session: FakeArtifactRow = {
+      _id: 'art_rw',
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+      type: 'code',
+      title: 'edit',
+      revision: 5,
+      liveStreamMode: 'rewrite',
+      toolCallId: 'call_2',
+      streamingPath: 'main.py',
+      streamingContent: '',
+    };
+    const { ctx, patched: writes } = createMockCtx([session]);
+    await updateRewriteStreaming.handler(ctx, {
+      artifactId: 'art_rw',
+      streamingPath: 'main.py',
+      toolCallId: 'call_2',
+      content: 'rewritten so far...',
+    });
+    expect(writes).toHaveLength(1);
+    expect(writes[0].patch.streamingContent).toBe('rewritten so far...');
+    expect(typeof writes[0].patch.updatedAt).toBe('number');
+  });
+
+  it('no-ops on a streamingPath mismatch (defensive — different file in flight)', async () => {
+    const inFlight: FakeArtifactRow = {
+      _id: 'art_rw',
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+      type: 'code',
+      title: 'edit',
+      revision: 5,
+      liveStreamMode: 'rewrite',
+      toolCallId: 'call_2',
+      streamingPath: 'main.py',
+    };
+    const { ctx, patched: writes } = createMockCtx([inFlight]);
+    await updateRewriteStreaming.handler(ctx, {
+      artifactId: 'art_rw',
+      streamingPath: 'other.py',
+      toolCallId: 'call_2',
+      content: 'stray content',
+    });
+    expect(writes).toHaveLength(0);
+  });
+
+  it('no-ops when the row is in create mode rather than rewrite', async () => {
+    const createRow: FakeArtifactRow = {
+      _id: 'art_ph',
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+      type: 'code',
+      title: 'WIP',
+      revision: 0,
+      liveStreamMode: 'create',
+      toolCallId: 'call_2',
+      streamingPath: 'main.py',
+    };
+    const { ctx, patched: writes } = createMockCtx([createRow]);
+    await updateRewriteStreaming.handler(ctx, {
+      artifactId: 'art_ph',
+      streamingPath: 'main.py',
+      toolCallId: 'call_2',
+      content: 'should not land',
+    });
+    expect(writes).toHaveLength(0);
+  });
+});
+
+type CreateFileArgs = {
+  artifactId: string;
+  path: string;
+  content: string;
+  expectedRevision: number;
+  editedByMessageId: string;
+};
+
+type CreateFileResult =
+  | {
+      success: true;
+      path: string;
+      revision: number;
+      byteLength: number;
+    }
+  | {
+      success: false;
+      code: 'not_found' | 'stale' | 'path_exists';
+      message: string;
+      currentRevision?: number;
+    };
+
+const createFile = createFileInArtifact as unknown as MutHandler<
+  CreateFileArgs,
+  CreateFileResult
+>;
+
+describe('createFileInArtifact (strict-CRUD)', () => {
+  it('inserts a new file and bumps revision', async () => {
+    const seed: FakeArtifactRow = {
+      _id: 'art_cc',
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+      type: 'code',
+      title: 'Proj',
+      revision: 3,
+      entryFile: 'main.py',
+      files: [{ path: 'main.py', content: 'print(1)\n' }],
+      content: 'print(1)\n',
+    };
+    const { ctx, inserted } = createMockCtx([seed]);
+    const res = await createFile.handler(ctx, {
+      artifactId: 'art_cc',
+      path: 'helpers.py',
+      content: 'def x():\n  pass\n',
+      editedByMessageId: 'msg_x',
+      expectedRevision: 3,
+    });
+    expect(res.success).toBe(true);
+    if (!res.success) return;
+    expect(res.revision).toBe(4);
+    expect(res.path).toBe('helpers.py');
+    expect(res.byteLength).toBe('def x():\n  pass\n'.length);
+    // One artifactFiles row per path: the new file plus the existing entry file.
+    const fileInserts = inserted.filter((f) => f.table === 'artifactFiles');
+    expect(
+      fileInserts
+        .map((f) => f.payload.path)
+        .sort((x, y) => String(x).localeCompare(String(y))),
+    ).toEqual(['helpers.py', 'main.py']);
+  });
+
+  it('refuses with code: "path_exists" when the path already exists', async () => {
+    const seed: FakeArtifactRow = {
+      _id: 'art_pe',
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+      type: 'code',
+      title: 'Proj',
+      revision: 2,
+      entryFile: 'main.py',
+      files: [{ path: 'main.py', content: 'print(1)\n' }],
+      content: 'print(1)\n',
+    };
+    const { ctx, patched } = createMockCtx([seed]);
+    const res = await createFile.handler(ctx, {
+      artifactId: 'art_pe',
+      path: 'main.py',
+      content: 'something else',
+      editedByMessageId: 'msg_x',
+      expectedRevision: 2,
+    });
+    expect(res.success).toBe(false);
+    if (res.success) return;
+    expect(res.code).toBe('path_exists');
+    expect(patched).toHaveLength(0);
+  });
+
+  it('refuses with code: "stale" on OCC mismatch', async () => {
+    const seed: FakeArtifactRow = {
+      _id: 'art_st',
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+      type: 'code',
+      title: 'Proj',
+      revision: 5,
+      entryFile: 'main.py',
+      files: [{ path: 'main.py', content: '' }],
+      content: '',
+    };
+    const { ctx, patched } = createMockCtx([seed]);
+    const res = await createFile.handler(ctx, {
+      artifactId: 'art_st',
+      path: 'helpers.py',
+      content: 'x',
+      editedByMessageId: 'msg_x',
+      expectedRevision: 4,
+    });
+    expect(res.success).toBe(false);
+    if (res.success) return;
+    expect(res.code).toBe('stale');
+    expect(res.currentRevision).toBe(5);
+    expect(patched).toHaveLength(0);
+  });
+});
+
+type UpdateFileArgs = CreateFileArgs;
+type UpdateFileResult =
+  | {
+      success: true;
+      path: string;
+      revision: number;
+      byteLength: number;
+    }
+  | {
+      success: false;
+      code: 'not_found' | 'stale' | 'file_missing';
+      message: string;
+      currentRevision?: number;
+    };
+
+const updateFile = updateFileInArtifact as unknown as MutHandler<
+  UpdateFileArgs,
+  UpdateFileResult
+>;
+
+describe('updateFileInArtifact (strict-CRUD overwrite-only)', () => {
+  it('overwrites an existing file and bumps revision', async () => {
+    const seed: FakeArtifactRow = {
+      _id: 'art_up',
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+      type: 'code',
+      title: 'Proj',
+      revision: 7,
+      entryFile: 'main.py',
+      files: [
+        { path: 'main.py', content: 'old' },
+        { path: 'helpers.py', content: 'helper' },
+      ],
+      content: 'old',
+    };
+    const { ctx, patched } = createMockCtx([seed]);
+    const res = await updateFile.handler(ctx, {
+      artifactId: 'art_up',
+      path: 'helpers.py',
+      content: 'def x(): pass',
+      editedByMessageId: 'msg_x',
+      expectedRevision: 7,
+    });
+    expect(res.success).toBe(true);
+    if (!res.success) return;
+    expect(res.revision).toBe(8);
+    expect(res.path).toBe('helpers.py');
+    expect(res.byteLength).toBe('def x(): pass'.length);
+    // The artifact row itself was patched and now carries revision 8.
+    const rowPatch = patched.find((p) => p.id === 'art_up');
+    expect(rowPatch?.patch.revision).toBe(8);
+  });
+
+  it('refuses with code: "file_missing" when path does not exist', async () => {
+    const seed: FakeArtifactRow = {
+      _id: 'art_um',
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+      type: 'code',
+      title: 'Proj',
+      revision: 2,
+      entryFile: 'main.py',
+      files: [{ path: 'main.py', content: 'print(1)\n' }],
+      content: 'print(1)\n',
+    };
+    const { ctx, patched } = createMockCtx([seed]);
+    const res = await updateFile.handler(ctx, {
+      artifactId: 'art_um',
+      path: 'doesnt_exist.py',
+      content: 'x',
+      editedByMessageId: 'msg_x',
+      expectedRevision: 2,
+    });
+    expect(res.success).toBe(false);
+    if (res.success) return;
+    expect(res.code).toBe('file_missing');
+    expect(patched).toHaveLength(0);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// applyFinalizeArtifactRun terminal-guard semantics.
+//
+// The original guard "no-op when artifact row is already terminal" was too
+// coarse: a follow-up run that legitimately re-finalizes the same artifact
+// (because the caller forgot to invoke `initArtifactRun` between runs) had
+// its `artifactRuns` / `artifactRunFiles` / `artifactOutputs` writes
+// silently dropped. The fix gates the no-op on `runExecutionId` parity:
+//   - same execution (or none passed) on a terminal row → duplicate, no-op
+//   - different execution, or row not yet terminal     → new run, proceed
+// ---------------------------------------------------------------------------
+
+describe('applyFinalizeArtifactRun (terminal-guard executionId parity)', () => {
+  it('no-ops when finalize fires twice for the SAME executionId (duplicate delta)', async () => {
+    const seed: FakeArtifactRow = {
+      _id: 'art_dup',
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+      type: 'script_runnable',
+      title: 'dup-finalize',
+      revision: 1,
+      runStatus: 'completed',
+      runExecutionId: 'exec_same',
+    };
+    const { ctx, inserted, patched } = createMockCtx([seed]);
+    await applyFinalizeArtifactRun(ctx as never, {
+      artifactId: 'art_dup' as never,
+      runStatus: 'completed',
+      runOutputFiles: [],
+      runExecutionId: 'exec_same' as never,
+    });
+    // The guard short-circuited: the artifact row is not patched and
+    // nothing is inserted into the dual-write tables.
+    expect(patched.filter((p) => p.id === 'art_dup')).toHaveLength(0);
+    expect(inserted.filter((i) => i.table === 'artifactRuns')).toHaveLength(0);
+    expect(inserted.filter((i) => i.table === 'artifactRunFiles')).toHaveLength(
+      0,
+    );
+    expect(inserted.filter((i) => i.table === 'artifactOutputs')).toHaveLength(
+      0,
+    );
+  });
+
+  it('proceeds when finalize fires for a DIFFERENT executionId on a terminal row (fresh run without initArtifactRun)', async () => {
+    // This is the regression: a caller (test harness, direct executeCode
+    // invocation, future custom path) re-uses an artifact without going
+    // through `initArtifactRun`. The artifact row still carries the
+    // previous run's terminal status + executionId. The new finalize MUST
+    // be allowed through so its run history lands in the dual-write
+    // tables.
+    const seed: FakeArtifactRow = {
+      _id: 'art_diff',
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+      type: 'script_runnable',
+      title: 'cross-execution finalize',
+      revision: 1,
+      runStatus: 'completed',
+      runExecutionId: 'exec_prior',
+      runStartedAt: 1000,
+    };
+    const { ctx, inserted, patched } = createMockCtx([seed]);
+    await applyFinalizeArtifactRun(ctx as never, {
+      artifactId: 'art_diff' as never,
+      runStatus: 'completed',
+      runOutputFiles: [
+        {
+          name: 'out.txt',
+          storageId: 'st_out' as never,
+          size: 5,
+          fileMetadataId: 'fm_out' as never,
+          contentType: 'text/plain',
+          sha256: 'abc123',
+        },
+      ],
+      runExecutionId: 'exec_new' as never,
+    });
+    // The artifact row received a patch carrying the new run's status.
+    const rowPatches = patched.filter((p) => p.id === 'art_diff');
+    expect(rowPatches.length).toBeGreaterThan(0);
+    expect(rowPatches[0]?.patch.runStatus).toBe('completed');
+    // Exactly one artifactRuns row, tagged with the new executionId.
+    const runRows = inserted.filter((i) => i.table === 'artifactRuns');
+    expect(runRows).toHaveLength(1);
+    expect(runRows[0]?.payload.executionId).toBe('exec_new');
+    // Exactly one artifactRunFiles row for the single output file.
+    expect(inserted.filter((i) => i.table === 'artifactRunFiles')).toHaveLength(
+      1,
+    );
+    // Exactly one artifactOutputs manifest row, matching the output file.
+    const outRows = inserted.filter((i) => i.table === 'artifactOutputs');
+    expect(outRows).toHaveLength(1);
+    expect(outRows[0]?.payload.name).toBe('out.txt');
+    expect(outRows[0]?.payload.sha256).toBe('abc123');
+  });
+
+  it('proceeds when the artifact row has no runStatus yet (first run on the artifact)', async () => {
+    const seed: FakeArtifactRow = {
+      _id: 'art_first',
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+      type: 'script_runnable',
+      title: 'first-finalize',
+      revision: 1,
+    };
+    const { ctx, inserted } = createMockCtx([seed]);
+    await applyFinalizeArtifactRun(ctx as never, {
+      artifactId: 'art_first' as never,
+      runStatus: 'completed',
+      runOutputFiles: [],
+      runExecutionId: 'exec_first' as never,
+    });
+    expect(inserted.filter((i) => i.table === 'artifactRuns')).toHaveLength(1);
+  });
+
+  it('no-ops when args.runExecutionId is omitted and the row is already terminal (fallback finalize trusts the row state)', async () => {
+    // Audit follow-up F7: the tool-side fallback finalize at
+    // artifact_run_tool.ts:696-705 passes no `runExecutionId`. Without
+    // this short-circuit, a fallback finalize landing AFTER
+    // `failExecution` already terminalized the row would slip past
+    // `sameExecution=false` and insert a duplicate `artifactRuns` row.
+    // Treat "no executionId on a terminal row" as "trust the row's
+    // terminal state".
+    const seed: FakeArtifactRow = {
+      _id: 'art_legacy',
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+      type: 'script_runnable',
+      title: 'legacy-finalize',
+      revision: 1,
+      runStatus: 'completed',
+      runExecutionId: 'exec_prior',
+    };
+    const { ctx, inserted } = createMockCtx([seed]);
+    await applyFinalizeArtifactRun(ctx as never, {
+      artifactId: 'art_legacy' as never,
+      runStatus: 'completed',
+      runOutputFiles: [],
+      // runExecutionId intentionally omitted — fallback finalize path
+    });
+    expect(inserted.filter((i) => i.table === 'artifactRuns')).toHaveLength(0);
+  });
+
+  it('proceeds when args.runExecutionId is omitted and the row is NOT terminal (first finalize without executionId still lands)', async () => {
+    // The trust-the-row shortcut only fires when the row is already
+    // terminal. A non-terminal row with omitted executionId still
+    // finalizes normally — otherwise legacy callers that haven't
+    // adopted the executionId argument couldn't make progress.
+    const seed: FakeArtifactRow = {
+      _id: 'art_running',
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+      type: 'script_runnable',
+      title: 'running-finalize',
+      revision: 1,
+      runStatus: 'running',
+    };
+    const { ctx, inserted } = createMockCtx([seed]);
+    await applyFinalizeArtifactRun(ctx as never, {
+      artifactId: 'art_running' as never,
+      runStatus: 'completed',
+      runOutputFiles: [],
+      // runExecutionId intentionally omitted
+    });
+    expect(inserted.filter((i) => i.table === 'artifactRuns')).toHaveLength(1);
+  });
+});
diff --git a/services/platform/convex/artifacts/internal_mutations.ts b/services/platform/convex/artifacts/internal_mutations.ts
index 46e55ca9a..a7fe96b1d 100644
--- a/services/platform/convex/artifacts/internal_mutations.ts
+++ b/services/platform/convex/artifacts/internal_mutations.ts
@@ -1,423 +1,207 @@
-import { ConvexError, v } from 'convex/values';
+/**
+ * Thin Convex internalMutation surface for artifact writes.
+ *
+ * The actual handler bodies, arg validators, and return validators live in
+ * the `handlers/` subdirectory, grouped by concern:
+ *
+ *   - `handlers/shared.ts`        — helpers, size guards, validateFiles,
+ *                                   clearStreamingFlags, trimRevisionHistory
+ *   - `handlers/content_edits.ts` — createArtifact + file-level CRUD
+ *                                   (artifact_file_create / artifact_file_update / artifact_file_delete
+ *                                   / artifact_file_rename)
+ *   - `handlers/streaming.ts`     — beginEditStream / abortStream /
+ *                                   updateRewriteStreamingContent /
+ *                                   discardActiveStreamsForThread /
+ *                                   cleanupStaleStreams
+ *   - `handlers/run_state.ts`     — setArtifactRunConfig / initArtifactRun /
+ *                                   addArtifactPackages / appendArtifactRunOutput /
+ *                                   patchArtifactRunProgress /
+ *                                   deriveOutputManifestFromHistory / finalizeArtifactRun
+ *                                   (+ the pure `applyFinalizeArtifactRun` helper)
+ *
+ * This file's job is purely to (1) declare the Convex API surface by
+ * registering each handler with `internalMutation(...)` and (2) re-export
+ * a few cross-module helpers (`MAX_ARTIFACT_BYTES`, `assertAggregateSize`,
+ * `assertContentSize`, `applyFinalizeArtifactRun`) other modules import directly.
+ */
 
 import { internalMutation } from '../_generated/server';
-import { applyPatches } from '../agent_tools/artifacts/apply_patches';
 import {
-  artifactPatchValidator,
-  artifactTypeValidator,
-  liveStreamModeValidator,
-} from './schema';
-
-const STALE_STREAM_THRESHOLD_MS = 60_000;
-/**
- * Minimum interval between `liveStreamStartedAt` heartbeat refreshes inside
- * `updateStreamingContent`. The cron janitor (`cleanupStaleStreams`) reaps
- * any row whose heartbeat is older than `STALE_STREAM_THRESHOLD_MS`, so
- * refreshing the heartbeat well inside that window is sufficient. Skipping
- * the redundant patch on every chunk also keeps the doc-level `useQuery`
- * subscriptions (artifact-bar, MessageArtifactPills) from re-running on
- * every flush — content-stream flushes happen every ~100-250 ms, but the
- * subscribed queries only need to invalidate when their projected metadata
- * (title, revision, liveStreamMode) actually changed. Must stay <<
- * STALE_STREAM_THRESHOLD_MS.
- */
-const HEARTBEAT_THROTTLE_MS = 5_000;
+  createArtifactArgs,
+  createArtifactHandler,
+  createArtifactReturns,
+  createFileInArtifactArgs,
+  createFileInArtifactHandler,
+  createFileInArtifactReturns,
+  deleteFileFromArtifactArgs,
+  deleteFileFromArtifactHandler,
+  deleteFileFromArtifactReturns,
+  renameFileInArtifactArgs,
+  renameFileInArtifactHandler,
+  renameFileInArtifactReturns,
+  updateFileInArtifactArgs,
+  updateFileInArtifactHandler,
+  updateFileInArtifactReturns,
+} from './handlers/content_edits';
+import {
+  addArtifactPackagesArgs,
+  addArtifactPackagesHandler,
+  addArtifactPackagesReturns,
+  appendArtifactRunOutputArgs,
+  appendArtifactRunOutputHandler,
+  appendArtifactRunOutputReturns,
+  deriveOutputManifestFromHistoryArgs,
+  deriveOutputManifestFromHistoryHandler,
+  deriveOutputManifestFromHistoryReturns,
+  finalizeArtifactRunArgs,
+  finalizeArtifactRunHandler,
+  finalizeArtifactRunReturns,
+  initArtifactRunArgs,
+  initArtifactRunHandler,
+  initArtifactRunReturns,
+  patchArtifactRunProgressArgs,
+  patchArtifactRunProgressHandler,
+  patchArtifactRunProgressReturns,
+  setArtifactRunConfigArgs,
+  setArtifactRunConfigHandler,
+  setArtifactRunConfigReturns,
+} from './handlers/run_state';
+import {
+  abortStreamArgs,
+  abortStreamHandler,
+  abortStreamReturns,
+  beginEditStreamArgs,
+  beginEditStreamHandler,
+  beginEditStreamReturns,
+  cleanupStaleStreamsArgs,
+  cleanupStaleStreamsHandler,
+  cleanupStaleStreamsReturns,
+  discardActiveStreamsForThreadArgs,
+  discardActiveStreamsForThreadHandler,
+  discardActiveStreamsForThreadReturns,
+  updateRewriteStreamingContentArgs,
+  updateRewriteStreamingContentHandler,
+  updateRewriteStreamingContentReturns,
+} from './handlers/streaming';
 
-/**
- * Hard cap on a stored artifact's content (settled or streaming). Convex's
- * per-document limit is 1 MiB; we cap below that so a single mutation that
- * also writes a revision row (which stores the same content) stays under
- * the limit, and so an LLM rewrite that runs away yields a clean
- * `too_large` error instead of a generic 500.
- */
-export const MAX_ARTIFACT_BYTES = 800_000;
+// Re-export cross-module helpers so existing callers keep resolving.
+export {
+  MAX_ARTIFACT_BYTES,
+  assertAggregateSize,
+  assertContentSize,
+} from './handlers/shared';
+export { applyFinalizeArtifactRun } from './handlers/run_state';
 
-export function assertContentSize(content: string): void {
-  const size = new TextEncoder().encode(content).byteLength;
-  if (size > MAX_ARTIFACT_BYTES) {
-    throw new ConvexError({
-      code: 'too_large',
-      message: `Artifact content is ${size} bytes; max ${MAX_ARTIFACT_BYTES}.`,
-    });
-  }
-}
+// =============================================================================
+// Content edits
+// =============================================================================
 
-/**
- * Insert a new artifact (revision 1) and its initial revision row. Used by
- * the `artifact_create` tool both at the streaming-placeholder moment and
- * at the final settle. When `liveStreamMode` is provided, the row is
- * marked as actively-streaming.
- */
 export const createArtifact = internalMutation({
-  args: {
-    organizationId: v.string(),
-    threadId: v.string(),
-    type: artifactTypeValidator,
-    title: v.string(),
-    language: v.optional(v.string()),
-    content: v.string(),
-    createdByMessageId: v.string(),
-    liveStreamMode: v.optional(liveStreamModeValidator),
-    // Set by the artifact_create tool so the canvas can filter
-    // `tool-input-delta` rows in the agent SDK's streamDeltas down to this
-    // artifact's stream during the create flow.
-    toolCallId: v.optional(v.string()),
-  },
-  returns: v.object({ artifactId: v.id('artifacts'), revision: v.number() }),
-  handler: async (ctx, args) => {
-    assertContentSize(args.content);
-    const now = Date.now();
-    const isStreaming = args.liveStreamMode !== undefined;
-    const artifactId = await ctx.db.insert('artifacts', {
-      organizationId: args.organizationId,
-      threadId: args.threadId,
-      type: args.type,
-      title: args.title,
-      language: args.language,
-      content: isStreaming ? '' : args.content,
-      revision: 1,
-      createdByMessageId: args.createdByMessageId,
-      lastEditedByMessageId: args.createdByMessageId,
-      createdAt: now,
-      updatedAt: now,
-      liveStreamMode: args.liveStreamMode,
-      liveStreamStartedAt: isStreaming ? now : undefined,
-      streamingContent: isStreaming ? args.content : undefined,
-      toolCallId: args.toolCallId,
-    });
-    if (!isStreaming) {
-      await ctx.db.insert('artifactRevisions', {
-        artifactId,
-        revision: 1,
-        content: args.content,
-        editedByMessageId: args.createdByMessageId,
-        editKind: 'create',
-        createdAt: now,
-      });
-    }
-    return { artifactId, revision: 1 };
-  },
+  args: createArtifactArgs,
+  returns: createArtifactReturns,
+  handler: createArtifactHandler,
 });
 
-/**
- * Settle the streaming-placeholder row inserted by `createArtifact`:
- * write the canonical title/language/content, drop streamingContent,
- * write the initial revision row, and clear streaming flags.
- */
-export const finalizeStreamedCreate = internalMutation({
-  args: {
-    artifactId: v.id('artifacts'),
-    title: v.string(),
-    language: v.optional(v.string()),
-    content: v.string(),
-    editedByMessageId: v.string(),
-  },
-  returns: v.null(),
-  handler: async (ctx, args) => {
-    assertContentSize(args.content);
-    const artifact = await ctx.db.get(args.artifactId);
-    if (!artifact) {
-      throw new ConvexError({
-        code: 'not_found',
-        message: `artifact ${args.artifactId} not found during finalize.`,
-      });
-    }
-    if (artifact.liveStreamMode !== 'create') {
-      // Defensive: the placeholder row was tampered with (e.g. a userEdit
-      // landed on a streaming-create row, or another tool-call clobbered
-      // the flags). Hard-fail so the agent can recover, instead of writing
-      // a revision row that desynchronises with the artifact's content.
-      throw new ConvexError({
-        code: 'lifecycle',
-        message: `artifact ${args.artifactId} is not in create-streaming state.`,
-      });
-    }
-    const now = Date.now();
-    await ctx.db.patch(args.artifactId, {
-      title: args.title,
-      language: args.language,
-      content: args.content,
-      streamingContent: undefined,
-      streamingPatches: undefined,
-      liveStreamMode: undefined,
-      liveStreamStartedAt: undefined,
-      toolCallId: undefined,
-      updatedAt: now,
-    });
-    await ctx.db.insert('artifactRevisions', {
-      artifactId: args.artifactId,
-      revision: artifact.revision,
-      content: args.content,
-      editedByMessageId: args.editedByMessageId,
-      editKind: 'create',
-      createdAt: now,
-    });
-    return null;
-  },
+export const deleteFileFromArtifact = internalMutation({
+  args: deleteFileFromArtifactArgs,
+  returns: deleteFileFromArtifactReturns,
+  handler: deleteFileFromArtifactHandler,
 });
 
-export const applyToolPatches = internalMutation({
-  args: {
-    artifactId: v.id('artifacts'),
-    patches: v.array(artifactPatchValidator),
-    editedByMessageId: v.string(),
-    // OCC guard — the revision the caller read when planning these patches.
-    // Mismatch means another writer landed between the read and this call,
-    // so the patch's `search` snippets may now match the wrong region.
-    expectedRevision: v.number(),
-  },
-  returns: v.union(
-    v.object({
-      success: v.literal(true),
-      revision: v.number(),
-      content: v.string(),
-    }),
-    v.object({
-      success: v.literal(false),
-      error: v.string(),
-      failedIndex: v.number(),
-      stale: v.optional(v.boolean()),
-      currentRevision: v.optional(v.number()),
-    }),
-  ),
-  handler: async (ctx, args) => {
-    const artifact = await ctx.db.get(args.artifactId);
-    if (!artifact) {
-      return {
-        success: false as const,
-        error: `artifact ${args.artifactId} not found`,
-        failedIndex: 0,
-      };
-    }
-    if (artifact.revision !== args.expectedRevision) {
-      return {
-        success: false as const,
-        error: `artifact has been modified since you last read it (revision ${artifact.revision}, you sent ${args.expectedRevision}). Re-read and retry.`,
-        failedIndex: 0,
-        stale: true,
-        currentRevision: artifact.revision,
-      };
-    }
-    const result = applyPatches(artifact.content, args.patches);
-    if (!result.ok) {
-      return {
-        success: false as const,
-        error: result.error,
-        failedIndex: result.failedIndex,
-      };
-    }
-    assertContentSize(result.content);
-    const nextRevision = artifact.revision + 1;
-    const now = Date.now();
-    await ctx.db.patch(args.artifactId, {
-      content: result.content,
-      revision: nextRevision,
-      lastEditedByMessageId: args.editedByMessageId,
-      streamingContent: undefined,
-      streamingPatches: undefined,
-      liveStreamMode: undefined,
-      liveStreamStartedAt: undefined,
-      toolCallId: undefined,
-      updatedAt: now,
-    });
-    await ctx.db.insert('artifactRevisions', {
-      artifactId: args.artifactId,
-      revision: nextRevision,
-      content: result.content,
-      editedByMessageId: args.editedByMessageId,
-      editKind: 'patch',
-      patches: [...args.patches],
-      createdAt: now,
-    });
-    return {
-      success: true as const,
-      revision: nextRevision,
-      content: result.content,
-    };
-  },
+export const renameFileInArtifact = internalMutation({
+  args: renameFileInArtifactArgs,
+  returns: renameFileInArtifactReturns,
+  handler: renameFileInArtifactHandler,
 });
 
-export const rewriteArtifact = internalMutation({
-  args: {
-    artifactId: v.id('artifacts'),
-    content: v.string(),
-    editedByMessageId: v.string(),
-    expectedRevision: v.number(),
-  },
-  returns: v.union(
-    v.object({ success: v.literal(true), revision: v.number() }),
-    v.object({
-      success: v.literal(false),
-      stale: v.literal(true),
-      currentRevision: v.number(),
-      error: v.string(),
-    }),
-  ),
-  handler: async (ctx, args) => {
-    assertContentSize(args.content);
-    const artifact = await ctx.db.get(args.artifactId);
-    if (!artifact) {
-      throw new Error(`artifact ${args.artifactId} not found`);
-    }
-    if (artifact.revision !== args.expectedRevision) {
-      return {
-        success: false as const,
-        stale: true as const,
-        currentRevision: artifact.revision,
-        error: `artifact has been modified since you last read it (revision ${artifact.revision}, you sent ${args.expectedRevision}). Re-read and retry.`,
-      };
-    }
-    const nextRevision = artifact.revision + 1;
-    const now = Date.now();
-    await ctx.db.patch(args.artifactId, {
-      content: args.content,
-      revision: nextRevision,
-      lastEditedByMessageId: args.editedByMessageId,
-      streamingContent: undefined,
-      streamingPatches: undefined,
-      liveStreamMode: undefined,
-      liveStreamStartedAt: undefined,
-      toolCallId: undefined,
-      updatedAt: now,
-    });
-    await ctx.db.insert('artifactRevisions', {
-      artifactId: args.artifactId,
-      revision: nextRevision,
-      content: args.content,
-      editedByMessageId: args.editedByMessageId,
-      editKind: 'rewrite',
-      createdAt: now,
-    });
-    return { success: true as const, revision: nextRevision };
-  },
+export const createFileInArtifact = internalMutation({
+  args: createFileInArtifactArgs,
+  returns: createFileInArtifactReturns,
+  handler: createFileInArtifactHandler,
 });
 
-/**
- * Mark an existing artifact as actively streaming. Used by `artifact_edit`
- * once the tool input has parsed enough JSON to identify the target.
- */
-export const beginEditStream = internalMutation({
-  args: {
-    artifactId: v.id('artifacts'),
-    liveStreamMode: liveStreamModeValidator,
-    // Set by the artifact_edit tool so the canvas can filter
-    // `tool-input-delta` rows down to this edit's stream. Stored on the row
-    // so subscribers can pick up the right toolCallId without a separate
-    // round-trip; cleared at settle alongside the other streaming flags.
-    toolCallId: v.optional(v.string()),
-  },
-  returns: v.null(),
-  handler: async (ctx, args) => {
-    await ctx.db.patch(args.artifactId, {
-      liveStreamMode: args.liveStreamMode,
-      liveStreamStartedAt: Date.now(),
-      streamingContent: args.liveStreamMode === 'rewrite' ? '' : undefined,
-      streamingPatches: args.liveStreamMode === 'patch' ? [] : undefined,
-      toolCallId: args.toolCallId,
-    });
-    return null;
-  },
+export const updateFileInArtifact = internalMutation({
+  args: updateFileInArtifactArgs,
+  returns: updateFileInArtifactReturns,
+  handler: updateFileInArtifactHandler,
 });
 
-/**
- * Throttled-by-the-caller update of the partial content as the LLM streams
- * its tool-call argument. Writes to the shadow `streamingContent` field so
- * a mid-stream crash cannot corrupt the previously-settled `content`. The
- * title and language fields are also patched here as they grow during
- * streaming — titles are short enough that throttling them isn't worth it.
- *
- * For `mode: 'patch'` streams, `streamingPatches` is populated with the
- * partial list of `search` snippets so the Canvas pane can highlight which
- * regions are about to change.
- */
-export const updateStreamingContent = internalMutation({
-  args: {
-    artifactId: v.id('artifacts'),
-    streamingContent: v.optional(v.string()),
-    title: v.optional(v.string()),
-    language: v.optional(v.string()),
-    streamingPatches: v.optional(v.array(artifactPatchValidator)),
-  },
-  returns: v.null(),
-  handler: async (ctx, args) => {
-    if (args.streamingContent !== undefined) {
-      assertContentSize(args.streamingContent);
-    }
-    const patch: Record<string, unknown> = {};
-    if (args.streamingContent !== undefined) {
-      patch.streamingContent = args.streamingContent;
-    }
-    if (args.title !== undefined) patch.title = args.title;
-    if (args.language !== undefined) patch.language = args.language;
-    if (args.streamingPatches !== undefined) {
-      patch.streamingPatches = args.streamingPatches;
-    }
-    if (Object.keys(patch).length === 0) return null;
-    // Refresh the liveness timestamp at most every HEARTBEAT_THROTTLE_MS.
-    // `liveStreamStartedAt` is the watchdog input for `cleanupStaleStreams`;
-    // refreshing inside the threshold window is enough to keep the row alive
-    // and avoids invalidating doc-level Convex subscriptions on every chunk.
-    const existing = await ctx.db.get(args.artifactId);
-    const now = Date.now();
-    const lastBeat = existing?.liveStreamStartedAt ?? 0;
-    if (now - lastBeat >= HEARTBEAT_THROTTLE_MS) {
-      patch.liveStreamStartedAt = now;
-    }
-    await ctx.db.patch(args.artifactId, patch);
-    return null;
-  },
+// =============================================================================
+// Streaming lifecycle
+// =============================================================================
+
+export const beginEditStream = internalMutation({
+  args: beginEditStreamArgs,
+  returns: beginEditStreamReturns,
+  handler: beginEditStreamHandler,
 });
 
-/**
- * Defensive cleanup: clears all streaming flags without touching `content`.
- * Used by tools in their finally-block when execute fails before any of
- * the canonical settle mutations ran.
- */
 export const abortStream = internalMutation({
-  args: { artifactId: v.id('artifacts') },
-  returns: v.null(),
-  handler: async (ctx, { artifactId }) => {
-    await ctx.db.patch(artifactId, {
-      streamingContent: undefined,
-      streamingPatches: undefined,
-      liveStreamMode: undefined,
-      liveStreamStartedAt: undefined,
-      toolCallId: undefined,
-    });
-    return null;
-  },
+  args: abortStreamArgs,
+  returns: abortStreamReturns,
+  handler: abortStreamHandler,
+});
+
+export const updateRewriteStreamingContent = internalMutation({
+  args: updateRewriteStreamingContentArgs,
+  returns: updateRewriteStreamingContentReturns,
+  handler: updateRewriteStreamingContentHandler,
+});
+
+export const discardActiveStreamsForThread = internalMutation({
+  args: discardActiveStreamsForThreadArgs,
+  returns: discardActiveStreamsForThreadReturns,
+  handler: discardActiveStreamsForThreadHandler,
 });
 
-/**
- * Janitor — clears stream flags on rows where the writer has been silent
- * past the threshold. Covers crashed agent runs that never reached a
- * tool's finally-block. Idempotent and safe to run on a cron.
- */
 export const cleanupStaleStreams = internalMutation({
-  args: {},
-  returns: v.object({ cleared: v.number() }),
-  handler: async (ctx) => {
-    const cutoff = Date.now() - STALE_STREAM_THRESHOLD_MS;
-    let cleared = 0;
-    // The `by_liveStreamMode` index is sparse: rows with `liveStreamMode`
-    // undefined are not in it. So this iterator only touches active streams.
-    for await (const row of ctx.db
-      .query('artifacts')
-      .withIndex('by_liveStreamMode')) {
-      if (
-        row.liveStreamStartedAt !== undefined &&
-        row.liveStreamStartedAt < cutoff
-      ) {
-        await ctx.db.patch(row._id, {
-          streamingContent: undefined,
-          streamingPatches: undefined,
-          liveStreamMode: undefined,
-          liveStreamStartedAt: undefined,
-          toolCallId: undefined,
-        });
-        cleared += 1;
-      }
-    }
-    return { cleared };
-  },
+  args: cleanupStaleStreamsArgs,
+  returns: cleanupStaleStreamsReturns,
+  handler: cleanupStaleStreamsHandler,
+});
+
+// =============================================================================
+// Runnable-artifact run state
+// =============================================================================
+
+export const setArtifactRunConfig = internalMutation({
+  args: setArtifactRunConfigArgs,
+  returns: setArtifactRunConfigReturns,
+  handler: setArtifactRunConfigHandler,
+});
+
+export const addArtifactPackages = internalMutation({
+  args: addArtifactPackagesArgs,
+  returns: addArtifactPackagesReturns,
+  handler: addArtifactPackagesHandler,
+});
+
+export const initArtifactRun = internalMutation({
+  args: initArtifactRunArgs,
+  returns: initArtifactRunReturns,
+  handler: initArtifactRunHandler,
+});
+
+export const appendArtifactRunOutput = internalMutation({
+  args: appendArtifactRunOutputArgs,
+  returns: appendArtifactRunOutputReturns,
+  handler: appendArtifactRunOutputHandler,
+});
+
+export const patchArtifactRunProgress = internalMutation({
+  args: patchArtifactRunProgressArgs,
+  returns: patchArtifactRunProgressReturns,
+  handler: patchArtifactRunProgressHandler,
+});
+
+export const finalizeArtifactRun = internalMutation({
+  args: finalizeArtifactRunArgs,
+  returns: finalizeArtifactRunReturns,
+  handler: finalizeArtifactRunHandler,
+});
+
+export const deriveOutputManifestFromHistory = internalMutation({
+  args: deriveOutputManifestFromHistoryArgs,
+  returns: deriveOutputManifestFromHistoryReturns,
+  handler: deriveOutputManifestFromHistoryHandler,
 });
diff --git a/services/platform/convex/artifacts/internal_queries.test.ts b/services/platform/convex/artifacts/internal_queries.test.ts
new file mode 100644
index 000000000..7b67d2a84
--- /dev/null
+++ b/services/platform/convex/artifacts/internal_queries.test.ts
@@ -0,0 +1,770 @@
+/**
+ * Unit tests for the artifact-side internal queries.
+ *
+ * Covers `findArtifactByCreatedMessage`, which backs the `artifact_create`
+ * same-message guard: when an assistant reply has already produced an
+ * artifact, a second `artifact_create` call gets a soft
+ * `already_created_in_message` conflict instead of spawning a duplicate
+ * project. An empty-string `createdByMessageId` must short-circuit to null
+ * so multi-step / sub-agent edge cases don't cross-match every empty-string
+ * row in the thread. Also covers `getLatestRunOutputs` (see further below).
+ */
+
+import { describe, expect, it, vi } from 'vitest';
+
+// Replace `internalQuery` with the identity function so each exported query
+// is the raw config object and tests can invoke its `handler` directly.
+vi.mock('../_generated/server', async (importOriginal) => {
+  const actual = await importOriginal<Record<string, unknown>>();
+  const internalQuery = (config: Record<string, unknown>) => config;
+  return { ...actual, internalQuery };
+});
+
+import {
+  findArtifactByCreatedMessage,
+  getLatestRunOutputs,
+} from './internal_queries';
+
+interface FakeArtifactRow {
+  _id: string;
+  organizationId: string;
+  threadId: string;
+  createdByMessageId?: string;
+}
+
+interface QueryHandler<TArgs, TReturn> {
+  handler: (ctx: unknown, args: TArgs) => Promise<TReturn> | TReturn;
+}
+
+/**
+ * Builds a fake Convex `ctx` whose `db.query()` chain supports just
+ * `.withIndex(name, cb)` and `.first()`. Each `query()` call starts with a
+ * fresh set of equality constraints; constraints that were never set (or
+ * were set to `undefined`) do not filter.
+ */
+function createMockCtx(rows: FakeArtifactRow[]) {
+  const SCOPED_FIELDS = [
+    'organizationId',
+    'threadId',
+    'createdByMessageId',
+  ] as const;
+
+  const newQueryChain = () => {
+    const constraints: Record<string, unknown> = {};
+    // A row matches when every constrained field equals the recorded value;
+    // only the three scoped fields above are ever compared.
+    const rowMatches = (row: FakeArtifactRow): boolean =>
+      SCOPED_FIELDS.every(
+        (field) =>
+          constraints[field] === undefined ||
+          row[field] === constraints[field],
+      );
+    const chain: Record<string, unknown> = {};
+    chain.withIndex = vi.fn((_name: string, cb: (q: unknown) => unknown) => {
+      const q = {
+        eq: (field: string, value: unknown) => {
+          constraints[field] = value;
+          return q;
+        },
+      };
+      cb(q);
+      return chain;
+    });
+    // Mirrors `.first()`: the first matching row in fixture order, else null.
+    chain.first = vi.fn(async () => rows.find(rowMatches) ?? null);
+    return chain;
+  };
+
+  return {
+    ctx: { db: { query: vi.fn(() => newQueryChain()) } },
+  };
+}
+
+// `internalQuery` is mocked to return its config, so the export is really
+// `{ handler, ... }`; the cast exposes `handler` with test-friendly types.
+const find = findArtifactByCreatedMessage as unknown as QueryHandler<
+  {
+    organizationId: string;
+    threadId: string;
+    createdByMessageId: string;
+  },
+  FakeArtifactRow | null
+>;
+
+describe('findArtifactByCreatedMessage', () => {
+  it('returns the existing artifact row when one matches the message id', async () => {
+    const { ctx } = createMockCtx([
+      {
+        _id: 'art_1',
+        organizationId: 'org_a',
+        threadId: 'thr_a',
+        createdByMessageId: 'msg_1',
+      },
+    ]);
+
+    const result = await find.handler(ctx, {
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+      createdByMessageId: 'msg_1',
+    });
+
+    expect(result).not.toBeNull();
+    expect(result?._id).toBe('art_1');
+  });
+
+  it('returns null when no artifact was created in this message', async () => {
+    const { ctx } = createMockCtx([
+      {
+        _id: 'art_1',
+        organizationId: 'org_a',
+        threadId: 'thr_a',
+        createdByMessageId: 'msg_OTHER',
+      },
+    ]);
+
+    const result = await find.handler(ctx, {
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+      createdByMessageId: 'msg_1',
+    });
+
+    expect(result).toBeNull();
+  });
+
+  it('returns null without touching the db when createdByMessageId is empty', async () => {
+    // Empty-string `createdByMessageId` is the multi-step / sub-agent
+    // fallback — guarding against it prevents a stray empty-string row in
+    // the thread from cross-matching every new tool call.
+    const { ctx } = createMockCtx([
+      {
+        _id: 'art_1',
+        organizationId: 'org_a',
+        threadId: 'thr_a',
+        createdByMessageId: '',
+      },
+    ]);
+
+    const result = await find.handler(ctx, {
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+      createdByMessageId: '',
+    });
+
+    expect(result).toBeNull();
+    expect(ctx.db.query).not.toHaveBeenCalled();
+  });
+
+  it('scopes the lookup to (organizationId, threadId, createdByMessageId)', async () => {
+    const { ctx } = createMockCtx([
+      {
+        _id: 'art_other_org',
+        organizationId: 'org_OTHER',
+        threadId: 'thr_a',
+        createdByMessageId: 'msg_1',
+      },
+      {
+        _id: 'art_other_thread',
+        organizationId: 'org_a',
+        threadId: 'thr_OTHER',
+        createdByMessageId: 'msg_1',
+      },
+    ]);
+
+    const result = await find.handler(ctx, {
+      organizationId: 'org_a',
+      threadId: 'thr_a',
+      createdByMessageId: 'msg_1',
+    });
+
+    // Both candidate rows live outside the current (org, thread) scope.
+    expect(result).toBeNull();
+  });
+});
+
+// ---------------------------------------------------------------------------
+// getLatestRunOutputs — pre-stage source resolution
+//
+// The pre-stage path that feeds /workspace/output/ in a follow-up
+// `artifact_run` must NOT be defeated by intermediate runs that happen
+// to be `status: 'completed'` but produced no files (e.g. a qa.py that
+// exits 0 without writing anything). The walk-back has to find the
+// most recent run that actually produced files, regardless of status.
+// ---------------------------------------------------------------------------
+
+interface FakeArtifactRow_ {
+  _id: string;
+  organizationId: string;
+  type: string;
+  runOutputFiles?: Array<{
+    name: string;
+    storageId?: string;
+    size: number;
+    contentType?: string;
+  }>;
+}
+
+interface FakeRunRow {
+  _id: string;
+  _creationTime: number;
+  artifactId: string;
+  status: 'completed' | 'failed' | 'cancelled';
+}
+
+interface FakeRunFile {
+  _id: string;
+  _creationTime: number;
+  runId: string;
+  artifactId: string;
+  name: string;
+  storageId: string;
+  size: number;
+  contentType?: string;
+}
+
+interface FakeArtifactOutput {
+  _id: string;
+  artifactId: string;
+  name: string;
+  storageId: string;
+  size: number;
+  contentType?: string;
+  sha256?: string;
+  producedByRunId: string;
+  updatedAt: number;
+}
+
+/**
+ * Fake Convex `ctx` for `getLatestRunOutputs`. `db.get` resolves the
+ * artifact or a run by id, `db.normalizeId` is the identity, and
+ * `db.query(table)` yields an async-iterable chain over the in-memory
+ * `artifactRuns` / `artifactRunFiles` / `artifactOutputs` fixtures.
+ */
+function createPreStageCtx(opts: {
+  artifact: FakeArtifactRow_;
+  runs: FakeRunRow[];
+  runFiles: FakeRunFile[];
+  artifactOutputs?: FakeArtifactOutput[];
+}) {
+  const get = vi.fn(async (id: string) => {
+    if (id === opts.artifact._id) return opts.artifact;
+    // Any other id is treated as a run id: the `from_run` pin path loads
+    // the run row to find its artifactId before walking runFiles.
+    return opts.runs.find((run) => run._id === id) ?? null;
+  });
+
+  const query = vi.fn((table: string) => {
+    const constraints: Record<string, unknown> = {};
+    let direction: 'asc' | 'desc' = 'asc';
+    // Copies before sorting so the caller's fixture arrays stay untouched.
+    const byCreationTime = <T extends { _creationTime: number }>(
+      items: T[],
+    ): T[] =>
+      [...items].sort((a, b) =>
+        direction === 'desc'
+          ? b._creationTime - a._creationTime
+          : a._creationTime - b._creationTime,
+      );
+
+    const chain: Record<string | symbol, unknown> = {};
+    chain.withIndex = vi.fn((_name: string, cb: (q: unknown) => unknown) => {
+      const q = {
+        eq: (field: string, value: unknown) => {
+          constraints[field] = value;
+          return q;
+        },
+      };
+      cb(q);
+      return chain;
+    });
+    chain.order = vi.fn((dir: 'asc' | 'desc') => {
+      direction = dir;
+      return chain;
+    });
+    chain[Symbol.asyncIterator] = async function* () {
+      switch (table) {
+        case 'artifactRuns': {
+          const owned = opts.runs.filter(
+            (run) => run.artifactId === constraints.artifactId,
+          );
+          yield* byCreationTime(owned);
+          return;
+        }
+        case 'artifactRunFiles': {
+          // Serves both index shapes: `runId` (explicit `from_run` pin) and
+          // `artifactId` (cumulative walk-back, read newest-first so the
+          // first occurrence of each name wins). Unset fields don't filter.
+          const { runId, artifactId } = constraints;
+          const selected = opts.runFiles.filter(
+            (file) =>
+              (runId === undefined || file.runId === runId) &&
+              (artifactId === undefined || file.artifactId === artifactId),
+          );
+          yield* byCreationTime(selected);
+          return;
+        }
+        case 'artifactOutputs': {
+          // Manifest rows are yielded in fixture order; `order()` is ignored.
+          yield* (opts.artifactOutputs ?? []).filter(
+            (output) => output.artifactId === constraints.artifactId,
+          );
+          return;
+        }
+      }
+    };
+    return chain;
+  });
+
+  const normalizeId = vi.fn((_table: string, id: string) => id);
+  return { ctx: { db: { get, normalizeId, query } } };
+}
+
+const getLatest = getLatestRunOutputs as unknown as QueryHandler<
+  {
+    artifactId: string;
+    expectedOrganizationId?: string;
+    fromRun?: string;
+  },
+  {
+    files: Array<{ name: string; storageId: string; size: number }>;
+    source: string;
+  }
+>;
+
+describe('getLatestRunOutputs', () => {
+  it('returns files from a failed-but-with-files run when the latest completed run produced nothing', async () => {
+    // The exact scenario the user reported:
+    //   - Run 1 (older): main.js + qa.py multi-step. main.js wrote a
+    //     pptx, qa.py crashed → overall status='failed', PPTX in
+    //     artifactRunFiles.
+    //   - Run 2 (newer): qa.py-only. Exits 0 with no /workspace/output
+    //     writes → status='completed', empty artifactRunFiles.
+    // The next pre-stage must pick up Run 1's pptx, not Run 2's empty
+    // file set.
+    const { ctx } = createPreStageCtx({
+      artifact: {
+        _id: 'art_1',
+        organizationId: 'org_a',
+        type: 'script_runnable',
+      },
+      runs: [
+        {
+          _id: 'run_old_failed',
+          _creationTime: 1_000,
+          artifactId: 'art_1',
+          status: 'failed',
+        },
+        {
+          _id: 'run_new_completed',
+          _creationTime: 2_000,
+          artifactId: 'art_1',
+          status: 'completed',
+        },
+      ],
+      runFiles: [
+        {
+          _id: 'rf_1',
+          _creationTime: 1_100,
+          runId: 'run_old_failed',
+          artifactId: 'art_1',
+          name: 'test.pptx',
+          storageId: 'st_pptx',
+          size: 250_000,
+        },
+      ],
+    });
+
+    const result = await getLatest.handler(ctx, { artifactId: 'art_1' });
+
+    expect(result.source).toBe('artifact_run_files');
+    expect(result.files).toHaveLength(1);
+    expect(result.files[0]?.name).toBe('test.pptx');
+    expect(result.files[0]?.storageId).toBe('st_pptx');
+  });
+
+  it('walks back through cancelled / failed runs alike, first run with files wins', async () => {
+    const { ctx } = createPreStageCtx({
+      artifact: {
+        _id: 'art_1',
+        organizationId: 'org_a',
+        type: 'script_runnable',
+      },
+      runs: [
+        {
+          _id: 'run_oldest_with_file',
+          _creationTime: 1_000,
+          artifactId: 'art_1',
+          status: 'failed',
+        },
+        {
+          _id: 'run_middle_cancelled_empty',
+          _creationTime: 2_000,
+          artifactId: 'art_1',
+          status: 'cancelled',
+        },
+        {
+          _id: 'run_newest_completed_empty',
+          _creationTime: 3_000,
+          artifactId: 'art_1',
+          status: 'completed',
+        },
+      ],
+      runFiles: [
+        {
+          _id: 'rf_1',
+          _creationTime: 1_100,
+          runId: 'run_oldest_with_file',
+          artifactId: 'art_1',
+          name: 'first.txt',
+          storageId: 'st_first',
+          size: 100,
+        },
+      ],
+    });
+
+    const result = await getLatest.handler(ctx, { artifactId: 'art_1' });
+
+    expect(result.source).toBe('artifact_run_files');
+    expect(result.files[0]?.name).toBe('first.txt');
+  });
+
+  it('falls back to legacy artifacts.runOutputFiles when no run produced files', async () => {
+    const { ctx } = createPreStageCtx({
+      artifact: {
+        _id: 'art_1',
+        organizationId: 'org_a',
+        type: 'script_runnable',
+        runOutputFiles: [
+          {
+            name: 'legacy.txt',
+            storageId: 'st_legacy',
+            size: 50,
+          },
+        ],
+      },
+      runs: [
+        {
+          _id: 'run_empty',
+          _creationTime: 1_000,
+          artifactId: 'art_1',
+          status: 'completed',
+        },
+      ],
+      runFiles: [],
+    });
+
+    const result = await getLatest.handler(ctx, { artifactId: 'art_1' });
+
+    expect(result.source).toBe('legacy_artifact_field');
+    expect(result.files[0]?.name).toBe('legacy.txt');
+  });
+
+  it('returns none when both walk-back and legacy field are empty', async () => {
+    const { ctx } = createPreStageCtx({
+      artifact: {
+        _id: 'art_1',
+        organizationId: 'org_a',
+        type: 'script_runnable',
+      },
+      runs: [],
+      runFiles: [],
+    });
+
+    const result = await getLatest.handler(ctx, { artifactId: 'art_1' });
+
+    expect(result.source).toBe('none');
+    expect(result.files).toHaveLength(0);
+  });
+
+  it('respects expectedOrganizationId IDOR check', async () => {
+    const { ctx } = createPreStageCtx({
+      artifact: {
+        _id: 'art_1',
+        organizationId: 'org_a',
+        type: 'script_runnable',
+      },
+      runs: [],
+      runFiles: [],
+    });
+
+    const result = await getLatest.handler(ctx, {
+      artifactId: 'art_1',
+      expectedOrganizationId: 'org_OTHER',
+    });
+
+    expect(result.source).toBe('none');
+    expect(result.files).toHaveLength(0);
+  });
+
+  // ---------------------------------------------------------------------
+  // Cumulative-state invariant (crispy-curry plan Defect 1).
+  //
+  // The old walk-back returned a single run's files. If Run 1 produced
+  // foo.pptx and Run 2 produced only bar.txt (no foo.pptx), the next
+  // pre-stage saw Run 2 first and returned [bar.txt] — losing foo.pptx
+  // from /workspace/output/ even though it still existed in _storage.
+  //
+  // The new walk-back merges files across all runs, keeping the newest
+  // file for each name, so Run 3 sees BOTH foo.pptx and bar.txt. This is
+  // the regression test for the user's exact reported failure mode.
+  // ---------------------------------------------------------------------
+
+  it('accumulates files across runs even when newer runs produced different filenames (no-shadow invariant)', async () => {
+    const { ctx } = createPreStageCtx({
+      artifact: {
+        _id: 'art_1',
+        organizationId: 'org_a',
+        type: 'script_runnable',
+      },
+      runs: [
+        {
+          _id: 'run_1',
+          _creationTime: 1_000,
+          artifactId: 'art_1',
+          status: 'completed',
+        },
+        {
+          _id: 'run_2',
+          _creationTime: 2_000,
+          artifactId: 'art_1',
+          status: 'completed',
+        },
+      ],
+      runFiles: [
+        {
+          _id: 'rf_old',
+          _creationTime: 1_100,
+          runId: 'run_1',
+          artifactId: 'art_1',
+          name: 'foo.pptx',
+          storageId: 'st_foo',
+          size: 250_000,
+        },
+        {
+          _id: 'rf_new',
+          _creationTime: 2_100,
+          runId: 'run_2',
+          artifactId: 'art_1',
+          name: 'bar.txt',
+          storageId: 'st_bar',
+          size: 50,
+        },
+      ],
+    });
+
+    const result = await getLatest.handler(ctx, { artifactId: 'art_1' });
+
+    expect(result.source).toBe('artifact_run_files');
+    // Both files should be visible — newer-different-filename must not
+    // shadow earlier output.
+    expect(result.files.map((f) => f.name).sort()).toEqual([
+      'bar.txt',
+      'foo.pptx',
+    ]);
+    // Walk-back path signals lazy-derive is needed so the next read
+    // hits the manifest table directly.
+    expect(
+      (result as unknown as { needsManifestDerive: boolean })
+        .needsManifestDerive,
+    ).toBe(true);
+  });
+
+  it('takes newest-by-creation-time when the same filename appears across runs', async () => {
+    const { ctx } = createPreStageCtx({
+      artifact: {
+        _id: 'art_1',
+        organizationId: 'org_a',
+        type: 'script_runnable',
+      },
+      runs: [
+        {
+          _id: 'run_1',
+          _creationTime: 1_000,
+          artifactId: 'art_1',
+          status: 'completed',
+        },
+        {
+          _id: 'run_2',
+          _creationTime: 2_000,
+          artifactId: 'art_1',
+          status: 'completed',
+        },
+      ],
+      runFiles: [
+        {
+          _id: 'rf_old',
+          _creationTime: 1_100,
+          runId: 'run_1',
+          artifactId: 'art_1',
+          name: 'report.txt',
+          storageId: 'st_old',
+          size: 10,
+        },
+        {
+          _id: 'rf_new',
+          _creationTime: 2_100,
+          runId: 'run_2',
+          artifactId: 'art_1',
+          name: 'report.txt',
+          storageId: 'st_new',
+          size: 20,
+        },
+      ],
+    });
+
+    const result = await getLatest.handler(ctx, { artifactId: 'art_1' });
+
+    expect(result.source).toBe('artifact_run_files');
+    expect(result.files).toHaveLength(1);
+    expect(result.files[0]?.name).toBe('report.txt');
+    expect(result.files[0]?.storageId).toBe('st_new');
+  });
+
+  // ---------------------------------------------------------------------
+  // Manifest precedence (crispy-curry plan §1).
+  //
+  // Once the artifact has any rows in `artifactOutputs`, the cumulative
+  // manifest is the source of truth — the walk-back fallback is
+  // bypassed. `needsManifestDerive` should be false because no
+  // lazy-derive is needed.
+  // ---------------------------------------------------------------------
+
+  it('reads from artifactOutputs manifest when present, skipping the run-files walk-back', async () => {
+    const { ctx } = createPreStageCtx({
+      artifact: {
+        _id: 'art_1',
+        organizationId: 'org_a',
+        type: 'script_runnable',
+      },
+      runs: [
+        {
+          _id: 'run_stale',
+          _creationTime: 1_000,
+          artifactId: 'art_1',
+          status: 'completed',
+        },
+      ],
+      // The walk-back would have surfaced this file. The manifest takes
+      // precedence; we should NEVER see `walked_only.txt` in the result.
+      runFiles: [
+        {
+          _id: 'rf_walked',
+          _creationTime: 1_100,
+          runId: 'run_stale',
+          artifactId: 'art_1',
+          name: 'walked_only.txt',
+          storageId: 'st_walked',
+          size: 10,
+        },
+      ],
+      artifactOutputs: [
+        {
+          _id: 'ao_1',
+          artifactId: 'art_1',
+          name: 'manifest_a.txt',
+          storageId: 'st_a',
+          size: 100,
+          sha256: 'deadbeef',
+          producedByRunId: 'run_x',
+          updatedAt: 5_000,
+        },
+        {
+          _id: 'ao_2',
+          artifactId: 'art_1',
+          name: 'manifest_b.txt',
+          storageId: 'st_b',
+          size: 200,
+          producedByRunId: 'run_y',
+          updatedAt: 6_000,
+        },
+      ],
+    });
+
+    const result = await getLatest.handler(ctx, { artifactId: 'art_1' });
+
+    expect(result.source).toBe('artifact_outputs');
+    expect(result.files.map((f) => f.name).sort()).toEqual([
+      'manifest_a.txt',
+      'manifest_b.txt',
+    ]);
+    // Manifest path → no derive needed.
+    expect(
+      (result as unknown as { needsManifestDerive: boolean })
+        .needsManifestDerive,
+    ).toBe(false);
+    // sha256 from the manifest is preserved through the query.
+    const a = result.files.find((f) => f.name === 'manifest_a.txt');
+    expect((a as unknown as { sha256?: string } | undefined)?.sha256).toBe(
+      'deadbeef',
+    );
+  });
+
+  // ---------------------------------------------------------------------
+  // `from_run` pin still scopes to a single run's files (crispy-curry plan §1).
+  // The pin is a positive lever — "give me the state run X produced" —
+  // so it deliberately bypasses the cumulative manifest.
+  // ---------------------------------------------------------------------
+
+  it("from_run pin returns only that one run's files, ignoring the cumulative manifest", async () => {
+    const { ctx } = createPreStageCtx({
+      artifact: {
+        _id: 'art_1',
+        organizationId: 'org_a',
+        type: 'script_runnable',
+      },
+      runs: [
+        {
+          _id: 'run_pinned',
+          _creationTime: 1_000,
+          artifactId: 'art_1',
+          status: 'completed',
+        },
+        {
+          _id: 'run_other',
+          _creationTime: 2_000,
+          artifactId: 'art_1',
+          status: 'completed',
+        },
+      ],
+      runFiles: [
+        {
+          _id: 'rf_pinned',
+          _creationTime: 1_100,
+          runId: 'run_pinned',
+          artifactId: 'art_1',
+          name: 'pinned.txt',
+          storageId: 'st_pinned',
+          size: 10,
+        },
+        {
+          _id: 'rf_other',
+          _creationTime: 2_100,
+          runId: 'run_other',
+          artifactId: 'art_1',
+          name: 'other.txt',
+          storageId: 'st_other',
+          size: 20,
+        },
+      ],
+      artifactOutputs: [
+        {
+          _id: 'ao_1',
+          artifactId: 'art_1',
+          name: 'manifest.txt',
+          storageId: 'st_manifest',
+          size: 100,
+          producedByRunId: 'run_other',
+          updatedAt: 5_000,
+        },
+      ],
+    });
+
+    const result = await getLatest.handler(ctx, {
+      artifactId: 'art_1',
+      fromRun: 'run_pinned',
+    });
+
+    expect(result.source).toBe('artifact_run_files');
+    expect(result.files).toHaveLength(1);
+    expect(result.files[0]?.name).toBe('pinned.txt');
+  });
+});
diff --git a/services/platform/convex/artifacts/internal_queries.ts b/services/platform/convex/artifacts/internal_queries.ts
index 3b2c659d7..9198dfabd 100644
--- a/services/platform/convex/artifacts/internal_queries.ts
+++ b/services/platform/convex/artifacts/internal_queries.ts
@@ -1,6 +1,7 @@
 import { v } from 'convex/values';
 
 import { internalQuery } from '../_generated/server';
+import { loadArtifactWithFiles, resolveArtifactFiles } from './resolve_files';
 
 export const getById = internalQuery({
   args: {
@@ -12,7 +13,7 @@ export const getById = internalQuery({
     ctx,
     { artifactId, expectedOrganizationId, expectedThreadId },
   ) => {
-    const artifact = await ctx.db.get(artifactId);
+    const artifact = await loadArtifactWithFiles(ctx, artifactId);
     if (!artifact) return null;
     if (
       expectedOrganizationId !== undefined &&
@@ -48,3 +49,511 @@ export const listByThread = internalQuery({
     return rows;
   },
 });
+
+/**
+ * Metadata-only projection of artifacts in a thread. Returned shape carries
+ * the fields the `artifact_list` agent tool exposes to the LLM:
+ *   { _id, type, title, revision, entryFile, fileCount, totalBytes,
+ *     language?, updatedAt }
+ *
+ * Why a separate query: the heavier `listByThread` returns full rows with
+ * embedded `files[]` content for `build_artifacts_context` (which actually
+ * needs the bytes). The agent-tool path doesn't — it just summarizes —
+ * but the original implementation pulled the full rows and aggregated
+ * `content.length` on the action side, allocating MB of strings per call
+ * for no user-visible benefit. This query projects server-side via
+ * `resolveArtifactFiles`, keeping the wire payload bounded.
+ */
+export const listByThreadMetadata = internalQuery({
+  args: {
+    organizationId: v.string(),
+    threadId: v.string(),
+  },
+  returns: v.array(
+    v.object({
+      _id: v.id('artifacts'),
+      type: v.string(),
+      title: v.string(),
+      revision: v.number(),
+      entryFile: v.string(),
+      fileCount: v.number(),
+      totalBytes: v.number(),
+      language: v.optional(v.string()),
+      updatedAt: v.number(),
+    }),
+  ),
+  handler: async (ctx, { organizationId, threadId }) => {
+    const out: Array<{
+      _id: import('../_generated/dataModel').Id<'artifacts'>;
+      type: string;
+      title: string;
+      revision: number;
+      entryFile: string;
+      fileCount: number;
+      totalBytes: number;
+      language?: string;
+      updatedAt: number;
+    }> = [];
+    const encoder = new TextEncoder();
+    for await (const row of ctx.db
+      .query('artifacts')
+      .withIndex('by_organizationId_and_thread', (q) =>
+        q.eq('organizationId', organizationId).eq('threadId', threadId),
+      )
+      .order('asc')) {
+      const resolved = resolveArtifactFiles(row);
+      // `totalBytes` is the UTF-8 encoded size — the same measure
+      // `listFilesByArtifact` reports per file. `content.length` counts
+      // UTF-16 code units and under-reports any non-ASCII content.
+      let totalBytes = 0;
+      for (const f of resolved.files) {
+        totalBytes += encoder.encode(f.content).byteLength;
+      }
+      // Element type is derived from `out` so the projected shape is
+      // declared once rather than repeated per iteration.
+      const entry: (typeof out)[number] = {
+        _id: row._id,
+        type: row.type,
+        title: row.title,
+        revision: row.revision,
+        entryFile: resolved.entryFile,
+        fileCount: resolved.files.length,
+        totalBytes,
+        updatedAt: row.updatedAt,
+      };
+      // Leave the key absent when the row has no language (`language` is
+      // declared optional in the `returns` validator above).
+      if (row.language !== undefined) entry.language = row.language;
+      out.push(entry);
+    }
+    return out;
+  },
+});
+
+/**
+ * Returns the artifact's CUMULATIVE output manifest for pre-staging into the
+ * next sandbox run's `/workspace/output/`. Each `(artifactId, name)` survives
+ * across runs — empty runs don't wipe earlier files, and a later run that
+ * produces a different filename doesn't shadow the earlier one.
+ *
+ * Source precedence (highest first):
+ *   1. `artifactOutputs` table — cumulative manifest, maintained by
+ *      `applyFinalizeArtifactRun` upserts. O(1) per artifact.
+ *   2. Newest-name-wins reduction across `artifactRunFiles` — for artifacts
+ *      that predate the manifest. Walks all runs newest-first, builds a
+ *      `Map<name, file>` taking the first occurrence per name. The caller
+ *      (action) is expected to follow up with `deriveOutputManifestFromHistory`
+ *      so subsequent reads land in source 1.
+ *   3. Legacy `artifacts.runOutputFiles` field — pre-`artifactRunFiles` rows
+ *      (kept for backward compat per [feedback_deprecate_dont_delete_schema_fields]).
+ *
+ * Pre-stage source selection:
+ *   - omitted `fromRun` (or `"latest"`): cumulative manifest as described above.
+ *   - explicit runId string: pin to that exact run's files via
+ *     `artifactRunFiles` (status-agnostic). Bypasses the cumulative model
+ *     because the LLM is explicitly asking for "the state run X produced"
+ *     rather than the artifact's accumulated workspace.
+ */
+export const getLatestRunOutputs = internalQuery({
+  args: {
+    artifactId: v.id('artifacts'),
+    expectedOrganizationId: v.optional(v.string()),
+    fromRun: v.optional(v.string()),
+  },
+  returns: v.object({
+    files: v.array(
+      v.object({
+        name: v.string(),
+        storageId: v.id('_storage'),
+        size: v.number(),
+        contentType: v.optional(v.string()),
+        sha256: v.optional(v.string()),
+      }),
+    ),
+    source: v.union(
+      v.literal('artifact_outputs'),
+      v.literal('artifact_run_files'),
+      v.literal('legacy_artifact_field'),
+      v.literal('none'),
+    ),
+    /**
+     * True only when the cumulative manifest table is empty for this
+     * artifact and the `artifact_run_files` fallback supplied the data; the
+     * caller should then run `deriveOutputManifestFromHistory` so the next
+     * read is O(1). Always false for pins and `legacy_artifact_field`.
+     */
+    needsManifestDerive: v.boolean(),
+  }),
+  handler: async (ctx, { artifactId, expectedOrganizationId, fromRun }) => {
+    type PriorOutputFile = {
+      name: string;
+      storageId: import('../_generated/dataModel').Id<'_storage'>;
+      size: number;
+      contentType?: string;
+      sha256?: string;
+    };
+    const artifact = await ctx.db.get(artifactId);
+    if (!artifact) {
+      return { files: [], source: 'none' as const, needsManifestDerive: false };
+    }
+    if (
+      expectedOrganizationId !== undefined &&
+      artifact.organizationId !== expectedOrganizationId
+    ) {
+      return { files: [], source: 'none' as const, needsManifestDerive: false };
+    }
+
+    // 1. Explicit `from_run` pin — caller named a specific runId. Returns
+    //    that run's `artifactRunFiles` exactly (status-agnostic, no
+    //    cumulative reduce). Pin is a positive lever ("I want the state
+    //    run X produced"), so we deliberately bypass the manifest path.
+    if (fromRun !== undefined && fromRun !== 'latest') {
+      let pinnedRun: Awaited<ReturnType<typeof ctx.db.get<'artifactRuns'>>> =
+        null;
+      try {
+        const pinnedRunId = ctx.db.normalizeId('artifactRuns', fromRun);
+        if (pinnedRunId !== null) {
+          pinnedRun = await ctx.db.get(pinnedRunId);
+        }
+      } catch (err) {
+        console.warn(
+          '[getLatestRunOutputs] malformed fromRun id, falling back:',
+          err,
+        );
+      }
+      if (pinnedRun !== null && pinnedRun.artifactId === artifactId) {
+        const pinnedFiles: PriorOutputFile[] = [];
+        for await (const f of ctx.db
+          .query('artifactRunFiles')
+          .withIndex('by_run', (q) => q.eq('runId', pinnedRun._id))) {
+          pinnedFiles.push({
+            name: f.name,
+            storageId: f.storageId,
+            size: f.size,
+            ...(f.contentType !== undefined && { contentType: f.contentType }),
+            ...(f.sha256 !== undefined && { sha256: f.sha256 }),
+          });
+        }
+        return {
+          files: pinnedFiles,
+          source: 'artifact_run_files' as const,
+          needsManifestDerive: false,
+        };
+      }
+    }
+
+    // 2. Cumulative manifest (preferred). One index scan, no walk-back.
+    const manifestFiles: PriorOutputFile[] = [];
+    for await (const row of ctx.db
+      .query('artifactOutputs')
+      .withIndex('by_artifact', (q) => q.eq('artifactId', artifactId))) {
+      manifestFiles.push({
+        name: row.name,
+        storageId: row.storageId,
+        size: row.size,
+        ...(row.contentType !== undefined && { contentType: row.contentType }),
+        ...(row.sha256 !== undefined && { sha256: row.sha256 }),
+      });
+    }
+    if (manifestFiles.length > 0) {
+      return {
+        files: manifestFiles,
+        source: 'artifact_outputs' as const,
+        needsManifestDerive: false,
+      };
+    }
+
+    // 3. Pre-manifest fallback: walk `artifactRunFiles` newest-first and
+    //    build a cumulative `Map<name, file>` (first occurrence wins).
+    //    This already fixes the "newest-shadows-older" architectural
+    //    defect even before the artifact's manifest gets derived. The
+    //    caller is expected to follow up with the derive mutation so the
+    //    next read lands in branch 2 above.
+    //
+    //    Status-agnostic by design — `artifactRunFiles` is append-only and
+    //    only carries files that survived harvest + storage upload, so the
+    //    row's presence is the "this file was really produced" signal.
+    const byName = new Map<string, Omit<PriorOutputFile, 'name'>>();
+    for await (const row of ctx.db
+      .query('artifactRunFiles')
+      .withIndex('by_artifact', (q) => q.eq('artifactId', artifactId))
+      .order('desc')) {
+      if (byName.has(row.name)) continue;
+      byName.set(row.name, {
+        storageId: row.storageId,
+        size: row.size,
+        ...(row.contentType !== undefined && { contentType: row.contentType }),
+        ...(row.sha256 !== undefined && { sha256: row.sha256 }),
+      });
+    }
+    if (byName.size > 0) {
+      const files: PriorOutputFile[] = Array.from(byName, ([name, info]) => ({
+        name,
+        ...info,
+      }));
+      return {
+        files,
+        source: 'artifact_run_files' as const,
+        needsManifestDerive: true,
+      };
+    }
+
+    // 4. Final fallback: legacy artifacts.runOutputFiles (pre-table data).
+    const files: PriorOutputFile[] = [];
+    for (const f of artifact.runOutputFiles ?? []) {
+      if (f.storageId === undefined) continue;
+      const entry: PriorOutputFile = {
+        name: f.name,
+        storageId: f.storageId,
+        size: f.size,
+      };
+      if (f.contentType !== undefined) entry.contentType = f.contentType;
+      files.push(entry);
+    }
+    return {
+      files,
+      source:
+        files.length > 0
+          ? ('legacy_artifact_field' as const)
+          : ('none' as const),
+      // Legacy field can't be derived into manifest from a query — the
+      // action's lazy-derive path explicitly only walks artifactRunFiles
+      // (the legacy field has no producedByRunId reference). So this
+      // flag stays false here; the next harvest will populate the
+      // manifest naturally via applyFinalizeArtifactRun.
+      needsManifestDerive: false,
+    };
+  },
+});
+
+/**
+ * Validates that a `runId` (LLM-supplied as `artifact_run({inputs.from_run})`)
+ * actually belongs to the given `artifactId`. Returns `'ok'` if valid, or a
+ * structured reason for the tool layer to surface to the LLM. Without this
+ * validation the spawner action silently falls back to "latest succeeded"
+ * when the runId is malformed or points at a different artifact's run,
+ * masking the misuse and producing a run pinned to outputs the LLM did not
+ * intend.
+ */
+export const validateRunIdForArtifact = internalQuery({
+  args: {
+    artifactId: v.id('artifacts'),
+    runId: v.string(),
+  },
+  returns: v.union(
+    v.literal('ok'),
+    v.literal('malformed_run_id'),
+    v.literal('run_not_found'),
+    v.literal('run_belongs_to_other_artifact'),
+  ),
+  handler: async (ctx, { artifactId, runId }) => {
+    if (runId === 'latest' || runId.length === 0) {
+      // 'latest' is the sentinel for "no explicit pin"; both paths fall
+      // through to the cumulative-manifest branch in `getLatestRunOutputs`
+      // so they're equivalent — accept here so the tool doesn't have to
+      // pre-strip them.
+      return 'ok' as const;
+    }
+    const normalized = ctx.db.normalizeId('artifactRuns', runId);
+    if (normalized === null) return 'malformed_run_id' as const;
+    const row = await ctx.db.get(normalized);
+    if (row === null) return 'run_not_found' as const;
+    if (row.artifactId !== artifactId) {
+      return 'run_belongs_to_other_artifact' as const;
+    }
+    return 'ok' as const;
+  },
+});
+
+/**
+ * Returns the `artifactRuns` row created by `applyFinalizeArtifactRun` for
+ * a given sandbox `executionId`, or null if the run never finalized (rare
+ * — only infra crashes that bypass the finalize path). Used by
+ * `artifact_run` to surface the persistent run id to the LLM so a later
+ * call can pin pre-staging via `inputs: { from_run: "<runId>" }`.
+ */
+export const getRunByExecutionId = internalQuery({
+  args: { executionId: v.id('sandboxExecutions') },
+  returns: v.union(
+    v.null(),
+    v.object({
+      _id: v.id('artifactRuns'),
+      artifactId: v.id('artifacts'),
+      status: v.string(),
+    }),
+  ),
+  handler: async (ctx, { executionId }) => {
+    const row = await ctx.db
+      .query('artifactRuns')
+      .withIndex('by_executionId', (q) => q.eq('executionId', executionId))
+      .first();
+    if (row === null) return null;
+    return {
+      _id: row._id,
+      artifactId: row.artifactId,
+      status: row.status,
+    };
+  },
+});
+
+/**
+ * List all files in an artifact (metadata only — path + size). Backs the
+ * `artifact_file_list` agent tool. Reads canonical `artifactFiles` rows; falls back
+ * to the artifact-row `files[]` / synthesized-from-`content` projection
+ * via `resolveArtifactFiles` for rows that predate the multi-file refactor.
+ */
+export const listFilesByArtifact = internalQuery({
+  args: {
+    artifactId: v.id('artifacts'),
+    expectedOrganizationId: v.optional(v.string()),
+    expectedThreadId: v.optional(v.string()),
+  },
+  handler: async (
+    ctx,
+    { artifactId, expectedOrganizationId, expectedThreadId },
+  ) => {
+    const artifact = await ctx.db.get(artifactId);
+    if (!artifact) return null;
+    if (
+      expectedOrganizationId !== undefined &&
+      artifact.organizationId !== expectedOrganizationId
+    ) {
+      return null;
+    }
+    if (
+      expectedThreadId !== undefined &&
+      artifact.threadId !== expectedThreadId
+    ) {
+      return null;
+    }
+    const rows = [];
+    for await (const row of ctx.db
+      .query('artifactFiles')
+      .withIndex('by_artifact', (q) => q.eq('artifactId', artifactId))) {
+      rows.push(row);
+    }
+    if (rows.length > 0) {
+      const resolved = resolveArtifactFiles(artifact);
+      return {
+        artifactId,
+        revision: artifact.revision,
+        type: artifact.type,
+        title: artifact.title,
+        language: artifact.language,
+        entryFile: resolved.entryFile,
+        files: rows.map((r) => ({
+          path: r.path,
+          size: new TextEncoder().encode(r.content).byteLength,
+        })),
+      };
+    }
+    // Fallback: row predates artifactFiles backfill — derive from doc.
+    const resolved = resolveArtifactFiles(artifact);
+    return {
+      artifactId,
+      revision: artifact.revision,
+      type: artifact.type,
+      title: artifact.title,
+      language: artifact.language,
+      entryFile: resolved.entryFile,
+      files: resolved.files.map((f) => ({
+        path: f.path,
+        size: new TextEncoder().encode(f.content).byteLength,
+      })),
+    };
+  },
+});
+
+/**
+ * Read file contents by exact path(s). Backs the `artifact_file_read` agent tool.
+ * Returns each requested path's full content; unknown paths are reported
+ * in `missing` so the tool can surface a structured `file_missing` error.
+ */
+export const getFilesByPaths = internalQuery({
+  args: {
+    artifactId: v.id('artifacts'),
+    paths: v.array(v.string()),
+    expectedOrganizationId: v.optional(v.string()),
+    expectedThreadId: v.optional(v.string()),
+  },
+  handler: async (
+    ctx,
+    { artifactId, paths, expectedOrganizationId, expectedThreadId },
+  ) => {
+    const artifact = await ctx.db.get(artifactId);
+    if (!artifact) return null;
+    if (
+      expectedOrganizationId !== undefined &&
+      artifact.organizationId !== expectedOrganizationId
+    ) {
+      return null;
+    }
+    if (
+      expectedThreadId !== undefined &&
+      artifact.threadId !== expectedThreadId
+    ) {
+      return null;
+    }
+    const resolved = resolveArtifactFiles(artifact);
+    // Prefer artifactFiles rows when present; fall back to resolved files.
+    const tableRows: { path: string; content: string }[] = [];
+    for await (const row of ctx.db
+      .query('artifactFiles')
+      .withIndex('by_artifact', (q) => q.eq('artifactId', artifactId))) {
+      tableRows.push({ path: row.path, content: row.content });
+    }
+    const source = tableRows.length > 0 ? tableRows : resolved.files;
+    const byPath = new Map<string, string>();
+    for (const f of source) byPath.set(f.path, f.content);
+    const found: { path: string; content: string }[] = [];
+    const missing: string[] = [];
+    for (const p of paths) {
+      const content = byPath.get(p);
+      if (content === undefined) {
+        missing.push(p);
+      } else {
+        found.push({ path: p, content });
+      }
+    }
+    return {
+      artifactId,
+      revision: artifact.revision,
+      type: artifact.type,
+      title: artifact.title,
+      language: artifact.language,
+      entryFile: resolved.entryFile,
+      availablePaths: Array.from(byPath.keys()),
+      files: found,
+      missing,
+    };
+  },
+});
+
+/**
+ * Returns the first artifact in this thread whose `createdByMessageId` matches
+ * the supplied id, or null. Backs the `artifact_create` same-message guard:
+ * the tool short-circuits to a soft-conflict response so the model uses
+ * `artifact_file_create` / `artifact_file_update` instead of spawning a duplicate project on the same reply.
+ *
+ * Caller must pass a non-empty `createdByMessageId` — empty-string artifacts
+ * from multi-step / sub-agent edge cases would otherwise cross-match.
+ */
+export const findArtifactByCreatedMessage = internalQuery({
+  args: {
+    organizationId: v.string(),
+    threadId: v.string(),
+    createdByMessageId: v.string(),
+  },
+  handler: async (ctx, { organizationId, threadId, createdByMessageId }) => {
+    if (createdByMessageId === '') return null;
+    return await ctx.db
+      .query('artifacts')
+      .withIndex('by_organizationId_thread_createdByMessageId', (q) =>
+        q
+          .eq('organizationId', organizationId)
+          .eq('threadId', threadId)
+          .eq('createdByMessageId', createdByMessageId),
+      )
+      .first();
+  },
+});
diff --git a/services/platform/convex/artifacts/mutations.ts b/services/platform/convex/artifacts/mutations.ts
index d0d9dce4f..8a6c5e507 100644
--- a/services/platform/convex/artifacts/mutations.ts
+++ b/services/platform/convex/artifacts/mutations.ts
@@ -2,16 +2,27 @@ import { v } from 'convex/values';
 import { ConvexError } from 'convex/values';
 
 import { mutation } from '../_generated/server';
+import { validatePath } from '../agent_tools/artifacts/shared';
 import { getAuthUserIdentity } from '../lib/rls';
 import { assertThreadAccess } from '../lib/rls/auth/can_access_thread';
-import { assertContentSize } from './internal_mutations';
+import { authorizeRls } from '../lib/rls/helpers/access_control';
+import { getUserOrganizations } from '../lib/rls/organization/get_user_organizations';
+import { assertAggregateSize } from './internal_mutations';
+import { mirrorLegacyContent, resolveArtifactFiles } from './resolve_files';
 
+/**
+ * User-driven edit from the Canvas pane. Path-aware: writes to a specific
+ * file in the project. Refuses to overwrite the file currently being
+ * streamed-to by the LLM, but allows concurrent edits to OTHER files.
+ */
 export const userEdit = mutation({
   args: {
     artifactId: v.id('artifacts'),
+    /** File path within the artifact. Defaults to the row's `entryFile`. */
+    path: v.optional(v.string()),
     content: v.string(),
   },
-  returns: v.object({ revision: v.number() }),
+  returns: v.object({ revision: v.number(), path: v.string() }),
   handler: async (ctx, args) => {
     const authUser = await getAuthUserIdentity(ctx);
     if (!authUser) {
@@ -34,24 +45,71 @@ export const userEdit = mutation({
         message: 'Not authorized to access this thread.',
       });
     }
+    // Role gate: the access-control matrix (access_control.ts) makes
+    // `member` READ-ONLY for `artifacts`. `userEdit` is a plain `mutation`,
+    // not `mutationWithRLS`, so without this explicit check a member
+    // could edit artifacts via the public mutation (audit follow-up F13).
+    // Explicit-fail when membership is absent (e.g. revoked org access mid-
+    // session): previously this leaned on `authorizeRls(undefined, …)`
+    // implicitly coercing to the most-restrictive `member` row, which is
+    // correct today but couples correctness to the role matrix never
+    // changing the undefined behaviour. Surface the forbidden state
+    // directly so a future matrix change can't quietly open a hole.
+    const memberships = await getUserOrganizations(ctx, authUser);
+    const membership = memberships.find(
+      (m) => m.organizationId === artifact.organizationId,
+    );
+    if (!membership) {
+      throw new ConvexError({
+        code: 'forbidden',
+        message: "You are not a member of this artifact's organization.",
+      });
+    }
+    if (!authorizeRls(membership.role, 'artifacts', 'write')) {
+      throw new ConvexError({
+        code: 'forbidden',
+        message: 'Your role does not permit editing artifacts.',
+      });
+    }
+
+    const resolved = resolveArtifactFiles(artifact);
+    const targetPath =
+      args.path !== undefined ? validatePath(args.path) : resolved.entryFile;
 
-    if (artifact.liveStreamMode !== undefined) {
+    // Refuse iff the LLM is streaming to THIS specific file. Edits to other
+    // files in the same project are allowed concurrently (per R2-07).
+    if (
+      artifact.liveStreamMode !== undefined &&
+      artifact.streamingPath === targetPath
+    ) {
       throw new ConvexError({
         code: 'streaming',
-        message: 'Cannot edit while the agent is streaming this artifact.',
+        message: `Cannot edit "${targetPath}" while the agent is streaming to it.`,
       });
     }
 
-    assertContentSize(args.content);
-
-    if (args.content === artifact.content) {
-      return { revision: artifact.revision };
+    // Find existing or treat as new file.
+    const existing = resolved.files.find((f) => f.path === targetPath);
+    if (existing && existing.content === args.content) {
+      return { revision: artifact.revision, path: targetPath };
     }
 
+    const nextFiles = existing
+      ? resolved.files.map((f) =>
+          f.path === targetPath
+            ? { path: targetPath, content: args.content }
+            : f,
+        )
+      : [...resolved.files, { path: targetPath, content: args.content }];
+
+    assertAggregateSize(nextFiles);
+
     const nextRevision = artifact.revision + 1;
     const now = Date.now();
     await ctx.db.patch(args.artifactId, {
-      content: args.content,
+      files: nextFiles,
+      entryFile: resolved.entryFile,
+      content: mirrorLegacyContent(nextFiles, resolved.entryFile),
       revision: nextRevision,
       lastEditedByMessageId: undefined,
       updatedAt: now,
@@ -59,11 +117,14 @@ export const userEdit = mutation({
     await ctx.db.insert('artifactRevisions', {
       artifactId: args.artifactId,
       revision: nextRevision,
-      content: args.content,
+      content: mirrorLegacyContent(nextFiles, resolved.entryFile),
+      files: nextFiles,
+      entryFile: resolved.entryFile,
+      filePath: targetPath,
       editedByMessageId: undefined,
       editKind: 'user',
       createdAt: now,
     });
-    return { revision: nextRevision };
+    return { revision: nextRevision, path: targetPath };
   },
 });
diff --git a/services/platform/convex/artifacts/queries.test.ts b/services/platform/convex/artifacts/queries.test.ts
new file mode 100644
index 000000000..4e959cb32
--- /dev/null
+++ b/services/platform/convex/artifacts/queries.test.ts
@@ -0,0 +1,239 @@
+/**
+ * Unit tests for `selectRunsPerFile` — the pure projection helper that
+ * powers the canvas `RunResultPanel`. The Convex wrapper around it
+ * (`listRunsPerFile`) handles auth + row fetching only; this helper owns
+ * all the logic worth verifying: latest-per-path collapsing, entry-first
+ * ordering, deleted-file filtering, and the legacy single-file fallback.
+ */
+
+import { describe, expect, it } from 'vitest';
+
+import { selectRunsPerFile } from './queries';
+
+interface FakeArtifact {
+  _id: string;
+  files?: Array<{ path: string; content: string }>;
+  entryFile?: string;
+  revision: number;
+  runStatus?: string;
+  runExecutionId?: string;
+  runProgress?: unknown;
+  runErrorCode?: string;
+  runErrorMessage?: string;
+  runStdoutPreview?: string;
+  runStderrPreview?: string;
+  runOutputFiles?: unknown[];
+  runRevision?: number;
+  runExitCode?: number;
+}
+
+interface FakeExecution {
+  _id: string;
+  _creationTime: number;
+  artifactId: string;
+  path?: string;
+  status: string;
+  errorCode?: string;
+  errorMessage?: string;
+  stdoutPreview?: string;
+  stderrPreview?: string;
+  outputFiles?: unknown[];
+  exitCode?: number;
+}
+
+// `selectRunsPerFile` is typed against `Doc<'artifacts'>` /
+// `Doc<'sandboxExecutions'>`; from a unit-test point of view those are
+// structurally compatible with our fakes (we only touch the fields the
+// helper reads). The casts below keep the test bodies readable.
+type SelectFn = (
+  artifact: FakeArtifact,
+  rowsNewestFirst: FakeExecution[],
+  entryFile: string,
+  declaredFiles: ReadonlyArray<string>,
+) => Array<{
+  executionId: unknown;
+  path: string;
+  runStatus?: string;
+  runRevision?: number;
+}>;
+
+const select = selectRunsPerFile as unknown as SelectFn;
+
+const baseArtifact: FakeArtifact = {
+  _id: 'art_1',
+  files: [
+    { path: 'main.py', content: '' },
+    { path: 'helper.py', content: '' },
+    { path: 'verify.py', content: '' },
+  ],
+  entryFile: 'main.py',
+  revision: 3,
+  runExecutionId: 'exec_main_latest',
+  runRevision: 3,
+};
+
+describe('selectRunsPerFile', () => {
+  it('orders the result with entry file first, then declared file order', () => {
+    const executions: FakeExecution[] = [
+      {
+        _id: 'exec_main_latest',
+        _creationTime: 300,
+        artifactId: 'art_1',
+        path: 'main.py',
+        status: 'completed',
+      },
+      {
+        _id: 'exec_verify',
+        _creationTime: 200,
+        artifactId: 'art_1',
+        path: 'verify.py',
+        status: 'completed',
+      },
+      {
+        _id: 'exec_helper',
+        _creationTime: 100,
+        artifactId: 'art_1',
+        path: 'helper.py',
+        status: 'completed',
+      },
+    ];
+    const result = select(baseArtifact, executions, 'main.py', [
+      'main.py',
+      'helper.py',
+      'verify.py',
+    ]);
+    expect(result.map((r) => r.path)).toEqual([
+      'main.py',
+      'helper.py',
+      'verify.py',
+    ]);
+  });
+
+  it('keeps only the newest execution per path when there are repeats', () => {
+    const executions: FakeExecution[] = [
+      {
+        _id: 'exec_main_new',
+        _creationTime: 500,
+        artifactId: 'art_1',
+        path: 'main.py',
+        status: 'completed',
+      },
+      {
+        _id: 'exec_main_mid',
+        _creationTime: 300,
+        artifactId: 'art_1',
+        path: 'main.py',
+        status: 'failed',
+      },
+      {
+        _id: 'exec_main_old',
+        _creationTime: 100,
+        artifactId: 'art_1',
+        path: 'main.py',
+        status: 'completed',
+      },
+    ];
+    const result = select(baseArtifact, executions, 'main.py', ['main.py']);
+    expect(result).toHaveLength(1);
+    expect(result[0]?.executionId).toBe('exec_main_new');
+  });
+
+  it('skips runs whose path is no longer declared (file deleted via canvas)', () => {
+    const executions: FakeExecution[] = [
+      {
+        _id: 'exec_orphan',
+        _creationTime: 200,
+        artifactId: 'art_1',
+        path: 'deleted.py',
+        status: 'completed',
+      },
+      {
+        _id: 'exec_main',
+        _creationTime: 100,
+        artifactId: 'art_1',
+        path: 'main.py',
+        status: 'completed',
+      },
+    ];
+    const result = select(baseArtifact, executions, 'main.py', ['main.py']);
+    expect(result.map((r) => r.path)).toEqual(['main.py']);
+  });
+
+  it('mirrors live runProgress / runRevision only onto the row matching artifact.runExecutionId', () => {
+    const executions: FakeExecution[] = [
+      {
+        _id: 'exec_main_latest',
+        _creationTime: 500,
+        artifactId: 'art_1',
+        path: 'main.py',
+        status: 'running',
+      },
+      {
+        _id: 'exec_helper_old',
+        _creationTime: 100,
+        artifactId: 'art_1',
+        path: 'helper.py',
+        status: 'completed',
+      },
+    ];
+    const result = select(baseArtifact, executions, 'main.py', [
+      'main.py',
+      'helper.py',
+    ]);
+    const main = result.find((r) => r.path === 'main.py');
+    const helper = result.find((r) => r.path === 'helper.py');
+    // The current latest (matches artifact.runExecutionId) inherits the
+    // live freshness flag; the older execution row does NOT — that's the
+    // signal the canvas uses to gate stale output chrome.
+    expect(main?.runRevision).toBe(3);
+    expect(helper?.runRevision).toBeUndefined();
+  });
+
+  it('falls back to the artifact row when no executions exist but artifact carries runStatus (legacy)', () => {
+    const legacyArtifact: FakeArtifact = {
+      _id: 'art_legacy',
+      files: [{ path: 'main.py', content: '' }],
+      entryFile: 'main.py',
+      revision: 5,
+      runStatus: 'completed',
+      runRevision: 5,
+      runStdoutPreview: 'legacy stdout',
+    };
+    const result = select(legacyArtifact, [], 'main.py', ['main.py']);
+    expect(result).toHaveLength(1);
+    expect(result[0]?.path).toBe('main.py');
+    expect(result[0]?.runStatus).toBe('completed');
+  });
+
+  it('returns an empty array when nothing has ever run', () => {
+    const freshArtifact: FakeArtifact = {
+      _id: 'art_fresh',
+      files: [{ path: 'main.py', content: '' }],
+      entryFile: 'main.py',
+      revision: 1,
+    };
+    const result = select(freshArtifact, [], 'main.py', ['main.py']);
+    expect(result).toEqual([]);
+  });
+
+  it('skips executions with no `path` (legacy pre-multi-file rows)', () => {
+    const executions: FakeExecution[] = [
+      {
+        _id: 'exec_unpathed',
+        _creationTime: 500,
+        artifactId: 'art_1',
+        status: 'completed',
+      },
+      {
+        _id: 'exec_main',
+        _creationTime: 100,
+        artifactId: 'art_1',
+        path: 'main.py',
+        status: 'completed',
+      },
+    ];
+    const result = select(baseArtifact, executions, 'main.py', ['main.py']);
+    expect(result).toHaveLength(1);
+    expect(result[0]?.executionId).toBe('exec_main');
+  });
+});
diff --git a/services/platform/convex/artifacts/queries.ts b/services/platform/convex/artifacts/queries.ts
index 5a97b4ca6..d8bebb252 100644
--- a/services/platform/convex/artifacts/queries.ts
+++ b/services/platform/convex/artifacts/queries.ts
@@ -6,6 +6,7 @@ import type { Doc } from '../_generated/dataModel';
 import { query } from '../_generated/server';
 import { getAuthUserIdentity } from '../lib/rls';
 import { canAccessThread } from '../lib/rls/auth/can_access_thread';
+import { loadArtifactWithFiles, resolveArtifactFiles } from './resolve_files';
 
 const MAX_LIST_BY_THREAD = 50;
 
@@ -26,6 +27,12 @@ export interface ArtifactListItem {
   language?: string;
   revision: number;
   liveStreamMode?: Doc<'artifacts'>['liveStreamMode'];
+  /** Number of files in the project. Derived from `files` or 1 for legacy. */
+  fileCount: number;
+  /** Entry-file path. Synthesized for legacy rows via {@link resolveArtifactFiles}. */
+  entryFile: string;
+  /** Aggregate byte length of file contents (entry file's content for legacy rows). */
+  totalBytes: number;
   createdByMessageId: string;
   lastEditedByMessageId?: string;
   createdAt: number;
@@ -33,6 +40,11 @@ export interface ArtifactListItem {
 }
 
 function projectListItem(row: Doc<'artifacts'>): ArtifactListItem {
+  const resolved = resolveArtifactFiles(row);
+  const utf8 = new TextEncoder();
+  let totalBytes = 0; // UTF-8 bytes, not UTF-16 code units (`.length`)
+  for (const f of resolved.files)
+    totalBytes += utf8.encode(f.content).byteLength;
   return {
     _id: row._id,
     _creationTime: row._creationTime,
@@ -41,6 +53,9 @@ function projectListItem(row: Doc<'artifacts'>): ArtifactListItem {
     language: row.language,
     revision: row.revision,
     liveStreamMode: row.liveStreamMode,
+    fileCount: resolved.files.length,
+    entryFile: resolved.entryFile,
+    totalBytes,
     createdByMessageId: row.createdByMessageId,
     lastEditedByMessageId: row.lastEditedByMessageId,
     createdAt: row.createdAt,
@@ -53,7 +68,7 @@ export const getById = query({
   handler: async (ctx, { artifactId }): Promise<Doc<'artifacts'> | null> => {
     const authUser = await getAuthUserIdentity(ctx);
     if (!authUser) return null;
-    const artifact = await ctx.db.get(artifactId);
+    const artifact = await loadArtifactWithFiles(ctx, artifactId);
     if (!artifact) return null;
     const metadata = await canAccessThread(
       ctx,
@@ -151,9 +166,150 @@ export const syncArtifactStream = query({
   },
 });
 
-export const listRevisions = query({
+/**
+ * Shared per-file run projection returned by both `projectExecutionRow` and
+ * the legacy `projectArtifactRowFallback`, so callers (the `listRunsPerFile`
+ * query, its pure helper, the canvas `RunResultPanel`) can treat both
+ * branches uniformly. `executionId` is `null` only on the fallback branch.
+ */
+export interface ArtifactRunFileProjection {
+  executionId: Doc<'sandboxExecutions'>['_id'] | null;
+  path: string;
+  runStatus: Doc<'sandboxExecutions'>['status'] | undefined;
+  runProgress: Doc<'artifacts'>['runProgress'] | undefined;
+  runErrorCode: Doc<'sandboxExecutions'>['errorCode'] | undefined;
+  runErrorMessage: Doc<'sandboxExecutions'>['errorMessage'] | undefined;
+  runStdoutPreview: Doc<'sandboxExecutions'>['stdoutPreview'] | undefined;
+  runStderrPreview: Doc<'sandboxExecutions'>['stderrPreview'] | undefined;
+  runOutputFiles: Doc<'sandboxExecutions'>['outputFiles'] | undefined;
+  runRevision: number | undefined;
+  runExitCode: number | undefined;
+}
+
+/**
+ * Map one `sandboxExecutions` row onto the legacy `artifact.run*` shape the
+ * canvas renderer consumes. `runProgress` and `runRevision` are copied from
+ * the artifact row ONLY when `artifact.runExecutionId` points at this row
+ * (the currently-active run); otherwise both are `undefined`, so a finished
+ * run keeps its final status without a later run's progress chrome.
+ */
+function projectExecutionRow(
+  artifact: Doc<'artifacts'>,
+  row: Doc<'sandboxExecutions'>,
+  path: string,
+): ArtifactRunFileProjection {
+  const activeId = artifact.runExecutionId;
+  // Guard the unset pointer explicitly before comparing ids.
+  const isActive = activeId !== undefined && activeId === row._id;
+  return {
+    executionId: row._id,
+    path,
+    runStatus: row.status,
+    runProgress: isActive ? artifact.runProgress : undefined,
+    runErrorCode: row.errorCode,
+    runErrorMessage: row.errorMessage,
+    runStdoutPreview: row.stdoutPreview,
+    runStderrPreview: row.stderrPreview,
+    runOutputFiles: row.outputFiles,
+    runRevision: isActive ? artifact.runRevision : undefined,
+    runExitCode: row.exitCode,
+  };
+}
+
+/**
+ * Legacy fallback for single-file artifacts whose runs predate the
+ * `sandboxExecutions.path` column: run state is read straight off the
+ * artifact row. Callers only use this for the entry file.
+ */
+function projectArtifactRowFallback(
+  artifact: Doc<'artifacts'>,
+  path: string,
+): ArtifactRunFileProjection {
+  const outputFiles = artifact.runOutputFiles ?? [];
+  return {
+    executionId: artifact.runExecutionId ?? null,
+    path,
+    runStatus: artifact.runStatus,
+    runProgress: artifact.runProgress,
+    runErrorCode: artifact.runErrorCode,
+    runErrorMessage: artifact.runErrorMessage,
+    runStdoutPreview: artifact.runStdoutPreview,
+    runStderrPreview: artifact.runStderrPreview,
+    runOutputFiles: outputFiles,
+    runRevision: artifact.runRevision,
+    runExitCode: artifact.runExitCode,
+  };
+}
+
+/**
+ * Pure core of `listRunsPerFile`, split out so it can be unit-tested
+ * without a Convex `ctx`: keeps the newest execution per path, orders the
+ * result (entry file first, then declared order) and projects each row.
+ *
+ * `executionsNewestFirst` must already be sorted newest-first — the first
+ * row seen for a `path` wins. Rows whose `path` is missing or not in
+ * `declaredFiles` are dropped; a path repeated in `declaredFiles` is
+ * projected once.
+ */
+export function selectRunsPerFile(
+  artifact: Doc<'artifacts'>,
+  executionsNewestFirst: Doc<'sandboxExecutions'>[],
+  entryFile: string,
+  declaredFiles: ReadonlyArray<string>,
+): ArtifactRunFileProjection[] {
+  // A Set answers membership and iterates in first-seen (declared) order.
+  const declared = new Set(declaredFiles);
+  const latestByPath = new Map<string, Doc<'sandboxExecutions'>>();
+  for (const row of executionsNewestFirst) {
+    const rowPath = row.path;
+    // Skip unpathed (legacy), undeclared (deleted), and older same-path rows.
+    if (rowPath === undefined || !declared.has(rowPath)) continue;
+    if (!latestByPath.has(rowPath)) latestByPath.set(rowPath, row);
+  }
+
+  // Legacy fallback: no per-file rows at all but the artifact row carries
+  // run state (pre-`path` column data) — synthesize a single entry-file
+  // projection so the user still sees their last run.
+  if (
+    latestByPath.size === 0 &&
+    artifact.runStatus !== undefined &&
+    declared.has(entryFile)
+  ) {
+    return [projectArtifactRowFallback(artifact, entryFile)];
+  }
+
+  // Entry file first, then the rest in declared order. `latestByPath` only
+  // holds declared paths, so an undeclared entry file is skipped here, and
+  // walking the Set (not the raw array) projects a duplicated path once.
+  const projections: ArtifactRunFileProjection[] = [];
+  const projectIfRan = (path: string): void => {
+    const row = latestByPath.get(path);
+    if (row === undefined) return;
+    projections.push(projectExecutionRow(artifact, row, path));
+  };
+  projectIfRan(entryFile);
+  for (const path of declared) {
+    if (path !== entryFile) projectIfRan(path);
+  }
+  return projections;
+}
+
+/**
+ * Per-file run projections for every file in `artifact.files[]` that has a
+ * recorded execution row. Backs the canvas `RunResultPanel`, which displays
+ * the entry file's run as a primary fixture and other files' runs as
+ * collapsible secondaries — independent of the sidebar's active file.
+ *
+ * Ordering: entry file first if present, then the remaining files in
+ * `files[]` declaration order. Files without any recorded execution row
+ * are omitted (the panel stays quiet for files that have never run).
+ *
+ * For legacy single-file artifacts whose runs predate `sandboxExecutions.path`,
+ * we synthesize a single entry-file row from the artifact's `run*` fields.
+ */
+export const listRunsPerFile = query({
   args: { artifactId: v.id('artifacts') },
-  handler: async (ctx, { artifactId }): Promise<Doc<'artifactRevisions'>[]> => {
+  handler: async (ctx, { artifactId }) => {
     const authUser = await getAuthUserIdentity(ctx);
     if (!authUser) return [];
     const artifact = await ctx.db.get(artifactId);
@@ -167,13 +323,44 @@ export const listRevisions = query({
     if (!metadata || metadata.organizationId !== artifact.organizationId) {
       return [];
     }
-    const rows: Doc<'artifactRevisions'>[] = [];
+
+    const resolved = resolveArtifactFiles(artifact);
+    const executions: Doc<'sandboxExecutions'>[] = [];
     for await (const row of ctx.db
-      .query('artifactRevisions')
-      .withIndex('by_artifact', (q) => q.eq('artifactId', artifactId))
-      .order('asc')) {
-      rows.push(row);
+      .query('sandboxExecutions')
+      .withIndex('by_artifactId', (q) => q.eq('artifactId', artifactId))
+      .order('desc')) {
+      executions.push(row);
     }
-    return rows;
+    const projections = selectRunsPerFile(
+      artifact,
+      executions,
+      resolved.entryFile,
+      resolved.files.map((f) => f.path),
+    );
+
+    // `sandboxExecutions.outputFiles` is the audit projection and intentionally
+    // omits `storageId` (see [sandbox/wire.ts] — "audit row, no denormalized
+    // storageId"). The canvas's <FileChip> needs `storageId` to render a
+    // download link, so look it up per file via the `fileMetadata` row. Keeps
+    // `selectRunsPerFile` pure (no ctx) so its unit tests stay synchronous.
+    /* oxlint-disable oxc/no-map-spread -- copy-on-write enrichment; mutating
+       the query-row projection in place would leak into the next reactive
+       subscription delivery */
+    return await Promise.all(
+      projections.map(async (p) => {
+        if (!p.runOutputFiles || p.runOutputFiles.length === 0) return p;
+        const enriched = await Promise.all(
+          p.runOutputFiles.map(async (f) => {
+            if (f.storageId !== undefined) return f;
+            const meta = await ctx.db.get(f.fileMetadataId);
+            if (meta === null) return f;
+            return { ...f, storageId: meta.storageId };
+          }),
+        );
+        return { ...p, runOutputFiles: enriched };
+      }),
+    );
+    /* oxlint-enable oxc/no-map-spread */
   },
 });
diff --git a/services/platform/convex/artifacts/resolve_files.ts b/services/platform/convex/artifacts/resolve_files.ts
new file mode 100644
index 000000000..d961da40f
--- /dev/null
+++ b/services/platform/convex/artifacts/resolve_files.ts
@@ -0,0 +1,104 @@
+import type { Doc, Id } from '../_generated/dataModel';
+import type { MutationCtx, QueryCtx } from '../_generated/server';
+import {
+  defaultEntryFileFor,
+  isValidArtifactType,
+} from '../agent_tools/artifacts/shared';
+
+export interface ResolvedArtifactFiles {
+  files: ReadonlyArray<Readonly<{ path: string; content: string }>>;
+  entryFile: string;
+  /** True iff `files`/`entryFile` were synthesized from the legacy `content` column rather than read off the row. */
+  synthesized: boolean;
+}
+
+/**
+ * Single source of truth for reading an artifact's project shape, regardless
+ * of whether the row has migrated to the multi-file schema yet.
+ *
+ * - If the row has a non-empty `files` array AND an `entryFile`, both are
+ *   returned as-is (`synthesized: false`).
+ * - Otherwise a single-file project is synthesized from the legacy `content`
+ *   column (empty string when unset), named with the type's default
+ *   entry-file name; an unrecognized `type` is treated as `'code'`.
+ *
+ * Every read path in Convex queries / mutations / UI / preview server MUST
+ * route through this helper. Direct reads of `artifact.content` outside the
+ * dual-write mirroring in mutations are a Phase A bug.
+ *
+ * @param artifact - The artifact row, or just the fields this helper reads.
+ * @returns The resolved files, entry-file path, and whether they were
+ *   synthesized.
+ */
+export function resolveArtifactFiles(
+  artifact: Pick<
+    Doc<'artifacts'>,
+    'type' | 'language' | 'content' | 'files' | 'entryFile'
+  >,
+): ResolvedArtifactFiles {
+  const { files, entryFile } = artifact;
+  if (files !== undefined && files.length > 0 && entryFile !== undefined) {
+    return { files, entryFile, synthesized: false };
+  }
+
+  // Legacy single-file row OR a row mid-migration: synthesize a one-file
+  // project from `content`, named after the type's default entry file.
+  const fallbackPath = defaultEntryFileFor(
+    isValidArtifactType(artifact.type) ? artifact.type : 'code',
+    artifact.language,
+  );
+  const legacyFile = { path: fallbackPath, content: artifact.content ?? '' };
+  return { files: [legacyFile], entryFile: fallbackPath, synthesized: true };
+}
+
+/**
+ * Entry-file content (or '' if absent) to mirror into the legacy `content`
+ * column; settle-path mutations MUST write it for Phase A rollback safety.
+ */
+export function mirrorLegacyContent(
+  files: readonly { readonly path: string; readonly content: string }[],
+  entryFile: string,
+): string {
+  for (const { path, content } of files) {
+    if (path === entryFile) return content;
+  }
+  return '';
+}
+
+/**
+ * Load an artifact and, when `artifactFiles` rows exist for it, replace its
+ * `files` field with those rows (`path` + `content` only). Mutations
+ * dual-write the embedded `artifacts.files[]` array and the table via
+ * `syncArtifactFiles`; rows predating the backfill have no table rows and
+ * are returned unchanged. Resolves to `null` if the artifact doesn't exist.
+ */
+export async function loadArtifactWithFiles(
+  ctx: QueryCtx | MutationCtx,
+  artifactId: Id<'artifacts'>,
+): Promise<Doc<'artifacts'> | null> {
+  const artifact = await ctx.db.get(artifactId);
+  if (artifact === null) return null;
+
+  const fileRows = ctx.db
+    .query('artifactFiles')
+    .withIndex('by_artifact', (q) => q.eq('artifactId', artifactId));
+  const overlay: { path: string; content: string }[] = [];
+  for await (const { path, content } of fileRows) {
+    overlay.push({ path, content });
+  }
+  // No table rows means a legacy artifact: keep whatever the doc carries.
+  return overlay.length > 0 ? { ...artifact, files: overlay } : artifact;
+}
+
+/**
+ * Total size of all file contents in UTF-8 bytes (used for
+ * `assertAggregateSize`). This is the encoded byte length, not
+ * `String.length`, so non-ASCII characters count for more than one.
+ */
+export function aggregateFileBytes(
+  files: readonly { readonly content: string }[],
+): number {
+  const utf8 = new TextEncoder();
+  const sizes = files.map(({ content }) => utf8.encode(content).byteLength);
+  return sizes.reduce((sum, size) => sum + size, 0);
+}
diff --git a/services/platform/convex/artifacts/schema.ts b/services/platform/convex/artifacts/schema.ts
index 53d9306f6..f27987ca7 100644
--- a/services/platform/convex/artifacts/schema.ts
+++ b/services/platform/convex/artifacts/schema.ts
@@ -1,19 +1,63 @@
 import { defineTable } from 'convex/server';
 import { v } from 'convex/values';
 
+import {
+  sandboxErrorCodeValidator,
+  sandboxOutputFileValidator,
+  sandboxRunProgressValidator,
+  sandboxRunStatusValidator,
+} from '../sandbox/wire';
+
 export const artifactTypeValidator = v.union(
   v.literal('html'),
   v.literal('svg'),
   v.literal('markdown'),
   v.literal('mermaid'),
   v.literal('code'),
+  // Canonical runnable type. The artifact's `files[]` carry the source;
+  // per-file runtime is inferred from extension (`.py` → python3,
+  // `.js`/`.cjs`/`.mjs` → node) so one artifact can mix languages. The
+  // `run*` fields below carry the execution state (status, stdout/stderr
+  // preview, output files, ...). Editing a runnable artifact via
+  // artifact_file_update re-runs on the next artifact_run call.
+  v.literal('script_runnable'),
+  // @deprecated — legacy single-runtime literals. Retained in the
+  // validator so existing rows continue to parse (per
+  // [feedback_deprecate_dont_delete_schema_fields]). New artifact_create
+  // calls only emit `script_runnable`; the run-side pipeline routes the
+  // legacy literals through the same polyglot path with a single-runtime
+  // file set.
+  v.literal('python_runnable'),
+  v.literal('node_runnable'),
 );
 
+// Re-export the canonical sandbox validators under their legacy names so
+// existing imports keep working without churn. New code should import the
+// `sandbox*` names directly from `convex/sandbox/wire`.
+export const artifactRunStatusValidator = sandboxRunStatusValidator;
+export const artifactRunErrorCodeValidator = sandboxErrorCodeValidator;
+export const artifactRunOutputFileValidator = sandboxOutputFileValidator;
+
 export const artifactEditKindValidator = v.union(
   v.literal('create'),
   v.literal('patch'),
   v.literal('rewrite'),
+  // Chunked content delivery introduced with the streaming-create retirement —
+  // each historical `artifact_edit({mode: 'append'})` call concatenated a
+  // slice onto the file's existing content. The tool is retired; the value
+  // is kept here so historical `artifactRevisions` rows continue to parse.
+  v.literal('append'),
   v.literal('user'),
+  // File-level operations introduced with the multi-file refactor.
+  v.literal('file_create'),
+  v.literal('file_delete'),
+  v.literal('file_rename'),
+  // Project-level metadata: entry-point repoint without touching files.
+  // Retained for read-validator compatibility with existing rows; the
+  // historical `set_entry` surface has been retired (use
+  // `artifact_file_rename` instead — its `from === entryFile` follow-along
+  // covers the common case atomically).
+  v.literal('set_entry'),
   // Snapshot taken when a chat branch was forked: the artifact is cloned
   // from the parent thread at its current state into the new branch's
   // namespace. The `revision` on this row preserves the parent's revision
@@ -26,26 +70,38 @@ export const artifactPatchValidator = v.object({
   replace: v.string(),
 });
 
+/**
+ * One file in an artifact's project tree. The validator only enforces that
+ * `path` and `content` are strings; `path` is expected to be a POSIX-style,
+ * NFC-normalized relative path per `agent_tools/artifacts/shared.ts:validatePath`.
+ */
+export const artifactFileValidator = v.object({
+  path: v.string(),
+  content: v.string(),
+});
+
 export const liveStreamModeValidator = v.union(
   v.literal('create'),
   v.literal('rewrite'),
+  // Chunked content delivery — same on-the-wire shape as rewrite (content
+  // streams in via tool input) but the mutation concatenates instead of
+  // replacing at execute time.
+  v.literal('append'),
   v.literal('patch'),
 );
 
 /**
  * Thread-scoped runnable/editable documents the LLM can create and patch
- * via the `artifact_create` / `artifact_edit` tools. Lives outside the
+ * via the `artifact_create` + file-level CRUD tools. Lives outside the
  * message stream so a single artifact can be mutated across many turns
  * without re-emitting its full content.
  *
- * `liveStreamMode` is set while a tool call is actively writing into this
- * row. For `create` and `rewrite` modes, `streamingContent` carries the
- * partial content the LLM has emitted so far — kept off `content` so a
- * crashed write cannot corrupt the previously-settled revision. For
- * `patch` mode, `streamingContent` stays empty (the row's content does
- * not change until execute settles atomically) and the partial patches
- * are mirrored to `streamingPatches` so the UI can render an inline diff
- * preview of the regions about to change.
+ * **In-flight refactor (see plan llm-majestic-hamming.md)**: many fields
+ * on this row are being migrated to dedicated tables (`artifactFiles`,
+ * `artifactRuns`, `artifactRunFiles`). They remain here as `@deprecated`
+ * per [feedback_deprecate_dont_delete_schema_fields] so existing rows
+ * keep parsing — new code reads/writes the new tables, with a fallback
+ * to these fields during the migration window.
  */
 export const artifactsTable = defineTable({
   organizationId: v.string(),
@@ -53,7 +109,26 @@ export const artifactsTable = defineTable({
   type: artifactTypeValidator,
   title: v.string(),
   language: v.optional(v.string()),
-  content: v.string(),
+  /**
+   * @deprecated — legacy single-file content. Phase A of the multi-file
+   * refactor: marked optional; `files[entryFile].content` is the canonical
+   * source. New mutations mirror entry-file content back here for rollback
+   * safety. Phase C will drop this column.
+   */
+  content: v.optional(v.string()),
+  /**
+   * @deprecated — migrating to `artifactFiles` table (one row per file
+   * keyed by `(artifactId, path)`). Reads still fall back here during the
+   * migration window; new writes go to `artifactFiles`. Do NOT remove —
+   * historical rows still carry this array.
+   */
+  files: v.optional(v.array(artifactFileValidator)),
+  /**
+   * Which file in `files[]` is the entry-point — used by `artifact_run`
+   * (executed script), HTML preview (entry document), and renderers for
+   * static types (the file the canvas displays by default).
+   */
+  entryFile: v.optional(v.string()),
   revision: v.number(),
   createdByMessageId: v.string(),
   // Cleared when the user edits the artifact via the Canvas pane — there
@@ -61,31 +136,135 @@ export const artifactsTable = defineTable({
   lastEditedByMessageId: v.optional(v.string()),
   createdAt: v.number(),
   updatedAt: v.number(),
+  /**
+   * @deprecated — transient streaming state. Migrating to the per-file
+   * `artifactFiles.streamingWriteToolCallId` pointer + the agent
+   * component's `streamDeltas` table. Kept on the row so historical data
+   * passes the read validator; new code does not write this.
+   */
   liveStreamMode: v.optional(liveStreamModeValidator),
+  /** @deprecated — see {@link liveStreamMode}. */
   liveStreamStartedAt: v.optional(v.number()),
-  // The AI-SDK toolCallId of the create/edit invocation that produced this
-  // row (or whose latest edit produced it). The Canvas pane uses it to
-  // filter `tool-input-delta` parts in the agent SDK's streamDeltas table
-  // down to this artifact's stream and decode the partial `content` JSON
-  // field client-side — that's how chat-style smooth streaming is
-  // delivered without an extra deltas table on our side. Optional because
-  // pre-existing rows from before this field shipped don't have it; the
-  // canvas falls back to `streamingContent` for those.
+  /**
+   * @deprecated — the canvas now finds the active write toolCallId on the
+   * per-file `artifactFiles.streamingWriteToolCallId` pointer. Kept for
+   * historical rows; new code does not write this.
+   */
   toolCallId: v.optional(v.string()),
+  /**
+   * @deprecated — streamed content now lives in the agent component's
+   * `streamDeltas` table (looked up by toolCallId). Kept for historical
+   * rows that still carry partial bytes here.
+   */
   streamingContent: v.optional(v.string()),
-  // While `liveStreamMode === 'patch'`, the partial patches array parsed
-  // from the LLM's tool input is mirrored here as {search, replace} pairs
-  // (only entries with a complete `search`; `replace` may still be
-  // streaming in). The Canvas pane uses these to render an inline diff
-  // preview over the (still settled) source — patch mode never writes
-  // `streamingContent`, so this is the only mid-stream signal users have.
+  /**
+   * @deprecated — advisory streaming-path hint. Historical rows may still
+   * carry it; the current `artifact_file_create` / `artifact_file_update` flow no longer
+   * relies on this field as a load-bearing signal.
+   */
+  streamingPath: v.optional(v.string()),
+  /**
+   * @deprecated — patch-mode preview rendering is being moved client-side
+   * over streamDeltas. Kept for historical rows.
+   */
   streamingPatches: v.optional(v.array(artifactPatchValidator)),
+
+  // --- Runnable-artifact run state ---
+  // Populated only for runnable types: `script_runnable` (canonical) or
+  // `python_runnable` / `node_runnable` (legacy). All optional per the
+  // [feedback_deprecate_dont_delete_schema_fields] rule so existing rows
+  // pass the read validator unchanged. The canvas-runnable-code-renderer
+  // subscribes to these fields for live progress + final output display.
+
+  /**
+   * Legacy flat package list — still written by single-runtime callers
+   * and by the polyglot pipeline for legacy `python_runnable` /
+   * `node_runnable` rows. New polyglot writes go to
+   * {@link runPackagesByLang}; readers fall back here when the grouped
+   * field is absent. Retained per
+   * [feedback_deprecate_dont_delete_schema_fields].
+   */
+  runPackages: v.optional(v.array(v.string())),
+  /**
+   * Per-language package buckets for `script_runnable` artifacts. Each
+   * bucket is sent to its native installer (`uv pip install` /
+   * `npm install`) on the next run. Either side is optional — a pure-
+   * Python or pure-Node artifact still uses this field with only one
+   * bucket populated.
+   */
+  runPackagesByLang: v.optional(
+    v.object({
+      python: v.optional(v.array(v.string())),
+      node: v.optional(v.array(v.string())),
+    }),
+  ),
+  runOptions: v.optional(
+    v.object({
+      allowSdist: v.optional(v.boolean()),
+      allowInstallScripts: v.optional(v.boolean()),
+    }),
+  ),
+  runStatus: v.optional(artifactRunStatusValidator),
+  // Structured progress payload patched by the Convex action as the
+  // spawner emits phase events. `kind` is rendered via the
+  // `chat.runnable.progress.*` i18n keys; the optional `package` /
+  // `version` fields fill ICU placeholders for `installingPackage`.
+  // Server never writes user-visible English text here.
+  runProgress: v.optional(sandboxRunProgressValidator),
+  runStartedAt: v.optional(v.number()),
+  runCompletedAt: v.optional(v.number()),
+  runExitCode: v.optional(v.number()),
+  runErrorCode: v.optional(artifactRunErrorCodeValidator),
+  runErrorMessage: v.optional(v.string()),
+  runStdoutPreview: v.optional(v.string()),
+  runStderrPreview: v.optional(v.string()),
+  runStdoutStorageId: v.optional(v.id('_storage')),
+  runStderrStorageId: v.optional(v.id('_storage')),
+  /**
+   * @deprecated — migrating to `artifactRunFiles` table (append-only, one
+   * row per produced file per run). Reads fall back here during migration
+   * window; new writes go to `artifactRunFiles` via an `artifactRuns` row.
+   */
+  runOutputFiles: v.optional(v.array(artifactRunOutputFileValidator)),
+  // Link to the latest per-execution audit row. The sandboxExecutions
+  // table is the source of truth for execution history; the artifact row
+  // holds only the *latest* result for fast canvas reads.
+  runExecutionId: v.optional(v.id('sandboxExecutions')),
+  // The `revision` the source content held when this run started. After a
+  // subsequent edit bumps `revision`, the inequality `runRevision !==
+  // revision` is the canonical "the displayed run is stale" signal — used
+  // by buildRunAttrs (to omit run state from the LLM context) and by the
+  // canvas renderer (to grey out the panel). Avoids the alternative of
+  // clearing every run-state field on edit, which would surprise users by
+  // wiping the prior output the moment they touch the script (round-2
+  // R2-B10).
+  runRevision: v.optional(v.number()),
+  /**
+   * Phase-B migration sentinel: set to `true` by
+   * `migrations/backfill_artifact_files_table.ts` as the LAST write after
+   * all of an artifact's `artifactFiles` + `artifactRuns` + `artifactRunFiles`
+   * rows are inserted. On retry the backfill skips artifacts where this is
+   * truthy. Optional + sparse — non-migrated rows omit it. Once Phase B
+   * is universally applied this field could be dropped, but per the
+   * "deprecate, don't delete" rule it stays optional indefinitely.
+   */
+  _phaseB_complete: v.optional(v.boolean()),
 })
   .index('by_organizationId', ['organizationId'])
   .index('by_organizationId_and_thread', ['organizationId', 'threadId'])
   // Sparse-by-construction: rows where `liveStreamMode` is undefined are
   // excluded from this index, so the cleanup cron only walks live streams.
-  .index('by_liveStreamMode', ['liveStreamMode']);
+  .index('by_liveStreamMode', ['liveStreamMode'])
+  // Backs the `artifact_create` same-message guard: when a tool call lands
+  // in a thread that already produced an artifact within the same assistant
+  // message (`createdByMessageId`), short-circuit to a soft-conflict
+  // response steering the model toward `artifact_file_create` / `artifact_file_update`
+  // instead of spawning a duplicate project.
+  .index('by_organizationId_thread_createdByMessageId', [
+    'organizationId',
+    'threadId',
+    'createdByMessageId',
+  ]);
 
 /**
  * Append-only revision history for `artifacts`. One row per write — including
@@ -96,10 +275,162 @@ export const artifactsTable = defineTable({
 export const artifactRevisionsTable = defineTable({
   artifactId: v.id('artifacts'),
   revision: v.number(),
-  content: v.string(),
+  /**
+   * @deprecated — legacy single-file content snapshot. Phase A: optional.
+   * New revisions write `files` (full snapshot for content edits) instead.
+   * For `editKind === 'set_entry'`, BOTH `files` and `content` are omitted
+   * (pure metadata revision); read-fold logic walks back to find the most
+   * recent revision carrying file state.
+   */
+  content: v.optional(v.string()),
+  /** Full files snapshot at this revision (for content-touching edits). */
+  files: v.optional(v.array(artifactFileValidator)),
+  /** Entry-file pointer at this revision. */
+  entryFile: v.optional(v.string()),
+  /** Which file the patch/rewrite/delete operated on. */
+  filePath: v.optional(v.string()),
+  /** Source path for `editKind === 'file_rename'`. */
+  fromPath: v.optional(v.string()),
   // Omitted when editKind === 'user' (Canvas pane textarea edit).
   editedByMessageId: v.optional(v.string()),
   editKind: artifactEditKindValidator,
   patches: v.optional(v.array(artifactPatchValidator)),
   createdAt: v.number(),
 }).index('by_artifact', ['artifactId', 'revision']);
+
+// =============================================================================
+// Refactor target tables (plan: llm-majestic-hamming.md)
+//
+// Replace the embedded `files[]` / `runOutputFiles[]` / streaming-state
+// fields on `artifactsTable` with dedicated tables. The old fields remain
+// `@deprecated` on the parent row so historical data continues to parse;
+// new write paths target the tables below.
+// =============================================================================
+
+/**
+ * One row per source file in an artifact's project tree.
+ *
+ * Replaces the embedded `artifacts.files[]` array. Keyed by
+ * `(artifactId, path)`. `streamingWriteToolCallId` is the only transient
+ * state — set by `artifact_file_create` / `artifact_file_update` onStart, cleared on commit;
+ * the canvas uses it to find the corresponding `streamDeltas` entries for
+ * live content rendering.
+ */
+export const artifactFilesTable = defineTable({
+  artifactId: v.id('artifacts'),
+  path: v.string(),
+  content: v.string(),
+  /**
+   * AI-SDK toolCallId of the active `artifact_file_create` / `artifact_file_update` (or
+   * equivalent) tool call currently streaming bytes into this file. Cleared
+   * on commit. When set, the canvas reads agent-component `streamDeltas`
+   * filtered by this toolCallId for live content display.
+   */
+  streamingWriteToolCallId: v.optional(v.string()),
+  createdAt: v.number(),
+  updatedAt: v.number(),
+})
+  .index('by_artifact_path', ['artifactId', 'path'])
+  .index('by_artifact', ['artifactId']);
+
+/**
+ * One row per artifact execution attempt. Append-only — failed and
+ * cancelled runs leave their row in place so the user (and the LLM via
+ * `artifact_list_runs`) can see history. The next-run pre-stage resolves
+ * an `inputsFromRun` reference (defaulting to "latest succeeded") to
+ * decide which run's outputs to seed into `/workspace/output/`.
+ */
+export const artifactRunsTable = defineTable({
+  artifactId: v.id('artifacts'),
+  status: artifactRunStatusValidator,
+  exitCode: v.optional(v.number()),
+  errorCode: v.optional(artifactRunErrorCodeValidator),
+  errorMessage: v.optional(v.string()),
+  startedAt: v.number(),
+  endedAt: v.optional(v.number()),
+  /** Artifact `revision` at the moment this run started. */
+  revision: v.number(),
+  /** Audit row in `sandboxExecutions` table. */
+  executionId: v.optional(v.id('sandboxExecutions')),
+  /**
+   * The prior run whose `/workspace/output/` files were pre-staged into
+   * this run's container. `undefined` means "latest succeeded was used"
+   * (the default) or "nothing was pre-staged".
+   */
+  inputsFromRun: v.optional(v.id('artifactRuns')),
+})
+  .index('by_artifact', ['artifactId'])
+  .index('by_artifact_status', ['artifactId', 'status'])
+  // Backs `getRunByExecutionId` — `artifact_run` tool uses it to surface
+  // the persistent runId to the LLM after `executeCode` returns.
+  .index('by_executionId', ['executionId']);
+
+/**
+ * One row per file produced by a run (harvested from `/workspace/output/`
+ * at run end). Append-only — never overwritten. A failed run that
+ * produced partial files still gets rows here (per [D5]); the parent
+ * `artifactRuns.status` distinguishes the source.
+ */
+export const artifactRunFilesTable = defineTable({
+  runId: v.id('artifactRuns'),
+  /** Denormalized from `artifactRuns.artifactId` for direct queries. */
+  artifactId: v.id('artifacts'),
+  name: v.string(),
+  storageId: v.id('_storage'),
+  size: v.number(),
+  contentType: v.optional(v.string()),
+  /**
+   * SHA-256 (hex) of the harvested bytes, mirrored from
+   * `fileMetadata.sha256`. Needed by the pinned-run pre-stage path
+   * (`getLatestRunOutputs` branch 1) to return attestation hashes
+   * symmetric with the cumulative `artifactOutputs` manifest. Optional
+   * because rows written before sha256 was plumbed all the way through
+   * the harvest pipeline don't carry it; attestation falls back to
+   * "presence only" in that case.
+   */
+  sha256: v.optional(v.string()),
+  createdAt: v.number(),
+})
+  .index('by_run', ['runId'])
+  .index('by_artifact', ['artifactId']);
+
+/**
+ * Cumulative output manifest per artifact. Authoritative source of truth for
+ * "files that should currently exist in /workspace/output/ for this artifact".
+ *
+ * Keyed by `(artifactId, name)`. Every successful harvest upserts each
+ * produced file here (newer wins for same name; new names accumulate).
+ * Empty harvests don't touch the manifest. This replaces the prior
+ * "latest run's files" walk-back model — multi-run histories with
+ * different filenames no longer lose older files.
+ *
+ * The `artifactRunFiles` table remains the per-run audit (append-only,
+ * never overwritten); this table is the workspace-state-of-truth used
+ * by pre-stage. `sha256` is computed at harvest time and used both for
+ * dedupe and for the spawner pre-stage attestation.
+ */
+export const artifactOutputsTable = defineTable({
+  artifactId: v.id('artifacts'),
+  /** POSIX-relative name inside `/workspace/output/`. Path-safety enforced by sandbox. */
+  name: v.string(),
+  storageId: v.id('_storage'),
+  size: v.number(),
+  contentType: v.optional(v.string()),
+  /**
+   * sha256 hex of the file bytes. Populated by every new harvest (computed
+   * spawner-side); used for both pre-stage attestation and dedupe.
+   *
+   * Optional because the lazy-derive migration backfills from legacy
+   * `artifactRunFiles` rows that predate sha256 capture — those entries
+   * land with `sha256` undefined and the attestation path treats them as
+   * "presence only" rather than "byte-exact". Once an artifact has been
+   * exercised by a fresh run, all of its entries carry sha256.
+   */
+  sha256: v.optional(v.string()),
+  /** The run that most recently produced this name. */
+  producedByRunId: v.id('artifactRuns'),
+  updatedAt: v.number(),
+})
+  .index('by_artifact', ['artifactId'])
+  .index('by_artifact_name', ['artifactId', 'name'])
+  .index('by_storageId', ['storageId']);
diff --git a/services/platform/convex/artifacts/snapshot_for_branch.ts b/services/platform/convex/artifacts/snapshot_for_branch.ts
index 70c156342..b37c8ad84 100644
--- a/services/platform/convex/artifacts/snapshot_for_branch.ts
+++ b/services/platform/convex/artifacts/snapshot_for_branch.ts
@@ -1,43 +1,71 @@
 import type { Doc } from '../_generated/dataModel';
 import type { MutationCtx } from '../_generated/server';
+import { mirrorLegacyContent, resolveArtifactFiles } from './resolve_files';
 
 /**
  * Snapshot a single artifact from a parent thread into a freshly-forked
  * branch thread. Called by `createBranchThread` while copying messages.
  *
- * The caller decides which revision to snapshot (the latest in-scope one,
- * walked from `artifactRevisions` so the branch sees the artifact as it
- * stood at the fork point — not the parent's current state, which may
- * include post-fork edits the branch shouldn't inherit).
+ * The caller decides which revision to snapshot via `snapshotRevision` AND
+ * supplies the file state captured at that revision (`revisionFiles` +
+ * `revisionEntryFile`, falling back to `revisionContent` for legacy
+ * content-only rows). Using the source row's CURRENT files would mix in
+ * out-of-scope edits made on the parent after the fork point — exactly the
+ * bug the `create_branch_thread_artifacts` "later edits out of scope" test
+ * pins down. When no revision-level snapshot is supplied we fall back to
+ * the source row's current state (used by callers that branch from a
+ * single-revision artifact, where current === in-scope).
  *
  * Behaviour:
  *   - Inserts a new `artifacts` row scoped to `targetThreadId`.
  *   - Preserves `snapshotRevision` as the row's `revision` so the user
- *     sees continuous version labels (e.g. "v26" in both branches);
- *     branching is a workspace fork, not a fresh start.
- *   - Always uses settled `snapshotContent` — never `streamingContent`.
- *   - Maps `createdByMessageId` to the branch's copy of that message;
- *     `lastEditedByMessageId` is mapped if the editor message was in the
- *     copied range, otherwise dropped to `undefined`.
- *   - Inserts one `artifactRevisions` row with `editKind: 'branch'` so the
- *     branch's revision history begins with an explicit fork marker.
- *
- * Plain helper (not a Convex `internalMutation`) so the caller's mutation
- * transaction wraps both the message copy and the artifact snapshots —
- * either everything succeeds or nothing is written.
+ *     sees continuous version labels.
+ *   - Copies the full `files[]` map and `entryFile`. Also mirrors entry
+ *     content to legacy `content` for rollback safety during Phase A.
+ *   - Inserts one `artifactRevisions` row with `editKind: 'branch'`.
  */
 export async function snapshotArtifactForBranch(
   ctx: MutationCtx,
   args: {
     source: Doc<'artifacts'>;
-    snapshotContent: string;
     snapshotRevision: number;
     targetThreadId: string;
     mappedCreatedByMessageId: string;
     mappedLastEditedByMessageId?: string;
+    /** Files snapshot captured at `snapshotRevision` (Phase A+ rows). */
+    revisionFiles?: ReadonlyArray<{ path: string; content: string }>;
+    /** Entry-file pointer at `snapshotRevision`. */
+    revisionEntryFile?: string;
+    /** Legacy single-file content at `snapshotRevision` (pre-Phase A, content-only rows). */
+    revisionContent?: string;
   },
 ): Promise<{ artifactId: Doc<'artifacts'>['_id'] }> {
   const { source } = args;
+  const sourceResolved = resolveArtifactFiles(source);
+  let files: Array<{ path: string; content: string }>;
+  let entryFile: string;
+  if (args.revisionFiles !== undefined && args.revisionFiles.length > 0) {
+    files = args.revisionFiles.map((f) => ({
+      path: f.path,
+      content: f.content,
+    }));
+    entryFile = args.revisionEntryFile ?? sourceResolved.entryFile;
+  } else if (args.revisionContent !== undefined) {
+    // Legacy `content`-only revision: synthesize a single-file artifact at
+    // the entry path captured at that revision (or the current entry as a
+    // last resort — only the entry pointer can drift, files cannot, since
+    // legacy rows only had one file).
+    entryFile = args.revisionEntryFile ?? sourceResolved.entryFile;
+    files = [{ path: entryFile, content: args.revisionContent }];
+  } else {
+    // No revision-level snapshot supplied — current state is in-scope.
+    files = sourceResolved.files.map((f) => ({
+      path: f.path,
+      content: f.content,
+    }));
+    entryFile = sourceResolved.entryFile;
+  }
+  const legacyContent = mirrorLegacyContent(files, entryFile);
   const now = Date.now();
   const artifactId = await ctx.db.insert('artifacts', {
     organizationId: source.organizationId,
@@ -45,7 +73,9 @@ export async function snapshotArtifactForBranch(
     type: source.type,
     title: source.title,
     language: source.language,
-    content: args.snapshotContent,
+    files,
+    entryFile,
+    content: legacyContent,
     revision: args.snapshotRevision,
     createdByMessageId: args.mappedCreatedByMessageId,
     lastEditedByMessageId: args.mappedLastEditedByMessageId,
@@ -56,7 +86,9 @@ export async function snapshotArtifactForBranch(
   await ctx.db.insert('artifactRevisions', {
     artifactId,
     revision: args.snapshotRevision,
-    content: args.snapshotContent,
+    content: legacyContent,
+    files,
+    entryFile,
     editedByMessageId:
       args.mappedLastEditedByMessageId ?? args.mappedCreatedByMessageId,
     editKind: 'branch',
diff --git a/services/platform/convex/crons.ts b/services/platform/convex/crons.ts
index 620ade8e9..28685e973 100644
--- a/services/platform/convex/crons.ts
+++ b/services/platform/convex/crons.ts
@@ -91,6 +91,19 @@ crons.cron(
   {},
 );
 
+// Sandbox watchdog — same shape as the transcription / video-link sweeps.
+// Convex hard-kills actions at the 30-min timeout without running the
+// action's finally; that leaves sandboxExecutions stuck at `status='running'`
+// and the slot they hold permanently shrinks the org's concurrent cap.
+// Heartbeat from `executeCode` keeps `heartbeatAt` fresh while the action
+// is alive; this cron flips rows older than 2× max-timeout to `failed`.
+crons.cron(
+  'recover stuck sandbox executions (every 5 min)',
+  '*/5 * * * *',
+  internal.sandbox.internal_mutations.recoverStuckSandboxes,
+  {},
+);
+
 // GDPR erasure watchdog (round-2 V5 P0-14) - the same shape as the
 // transcription watchdog above. Convex actions hard-stop at 30 min;
 // `gdprErasureRequests` rows whose subject has too many rows / RAG
diff --git a/services/platform/convex/file_metadata/schema.ts b/services/platform/convex/file_metadata/schema.ts
index 08f6c4e7f..7cf09b2f6 100644
--- a/services/platform/convex/file_metadata/schema.ts
+++ b/services/platform/convex/file_metadata/schema.ts
@@ -71,6 +71,14 @@ export const fileMetadataTable = defineTable({
   // short-circuits to the cached transcript when a prior row in the same
   // org has completed transcription of the same content.
   contentHash: v.optional(v.string()),
+  // SHA-256 (hex) of the raw bytes for sandbox-harvested output files.
+  // Set by `insertOutputFiles` from the spawner's harvest payload; used for
+  // pre-stage attestation when the same file is later re-injected into
+  // another run's `/workspace/output/`. Distinct from `contentHash` (audio
+  // transcript dedup) — different write source, different purpose. Optional
+  // because non-sandbox uploads (chat attachments, document imports) don't
+  // compute it.
+  sha256: v.optional(v.string()),
   uploadedBy: v.optional(v.string()),
   /**
    * For chat-uploaded files, the chat thread the file was attached to.
diff --git a/services/platform/convex/http.ts b/services/platform/convex/http.ts
index a1d6cf0e7..0203163a9 100644
--- a/services/platform/convex/http.ts
+++ b/services/platform/convex/http.ts
@@ -52,6 +52,10 @@ import {
   patchProduct,
   deleteProduct,
 } from './products/rest_api';
+import {
+  outputUploadUrlAction,
+  recordUploadedAction,
+} from './sandbox/sandbox_http';
 import {
   ssoDiscoverHandler,
   ssoAuthorizeHandler,
@@ -704,5 +708,28 @@ http.route({
   handler: apiGatewayOptions,
 });
 
+// ---------------------------------------------------------------------------
+// Sandbox callback endpoints (sandbox-wobbly-origami plan §2).
+//
+// The spawner POSTs here from inside docker compose to (a) request more
+// presigned upload URLs (EP1) and (b) report each successful upload's
+// storageId (EP2). Both are HMAC-authenticated using the same SANDBOX_TOKEN
+// the spawner uses for inbound `/v1/execute` — we reuse the secret rather
+// than mint a new one.
+//
+// Routed through Caddy `handle /api/sandbox/*` → convex:3211.
+// ---------------------------------------------------------------------------
+http.route({
+  path: '/api/sandbox/output_upload_url',
+  method: 'POST',
+  handler: outputUploadUrlAction,
+});
+
+http.route({
+  path: '/api/sandbox/record_uploaded',
+  method: 'POST',
+  handler: recordUploadedAction,
+});
+
 const _routes = http.getRoutes();
 export default http;
diff --git a/services/platform/convex/lib/agent_response/generate_response.ts b/services/platform/convex/lib/agent_response/generate_response.ts
index 043969cdc..ca377cd47 100644
--- a/services/platform/convex/lib/agent_response/generate_response.ts
+++ b/services/platform/convex/lib/agent_response/generate_response.ts
@@ -1075,7 +1075,7 @@ export async function generateAgentResponse(
               excludeToolMessages: true,
               searchOtherThreads: false,
             },
-            saveStreamDeltas: { throttleMs: 100, chunking: /[\p{P}\s]/u },
+            saveStreamDeltas: { throttleMs: 250, chunking: /[\p{P}\s]/u },
           },
         );
 
diff --git a/services/platform/convex/lib/agent_response/stream_throttle.test.ts b/services/platform/convex/lib/agent_response/stream_throttle.test.ts
index 5991c8f6b..02e9f5185 100644
--- a/services/platform/convex/lib/agent_response/stream_throttle.test.ts
+++ b/services/platform/convex/lib/agent_response/stream_throttle.test.ts
@@ -2,12 +2,24 @@ import { readFile } from 'node:fs/promises';
 /**
  * Verify the stream delta throttle configuration.
  *
- * The saveStreamDeltas.throttleMs value directly impacts perceived TTFT:
- * - 200ms (old): up to 200ms delay after LLM produces first token
- * - 100ms (new): halves the worst-case delay for first token persistence
+ * The saveStreamDeltas.throttleMs value trades off two concerns:
+ * - First-token latency: the SDK flushes the first delta immediately
+ *   (initial #latestWrite=0 makes the throttle check pass on the first
+ *    addParts call), so this knob does NOT affect TTFT.
+ * - Stream row volume + main-thread cost: each Convex push triggers a
+ *   full UIMessage rebuild from cursor=0 in the agent SDK's
+ *   `useStreamingUIMessages` hook. With huge tool inputs the per-push
+ *   cost becomes O(N²) over the delta count. A larger throttle reduces
+ *   N proportionally.
+ *
+ * Tale settled on 250ms (the SDK default) after a 2-round review found
+ * that 100ms produced enough rows for `useStreamingUIMessages` to stall
+ * the main thread on long artifact_create calls, while 500ms showed
+ * visible chunkiness because Tale has no inter-push smoothing layer
+ * (`useStreamBuffer` smooths within a buffer, not between Convex pushes).
  *
  * This test reads the source file to verify the configuration value,
- * ensuring it stays at the optimized level and isn't accidentally reverted.
+ * ensuring it stays at the chosen level and isn't accidentally reverted.
  */
 import { resolve } from 'node:path';
 
@@ -19,7 +31,7 @@ const GENERATE_RESPONSE_PATH = resolve(
 );
 
 describe('saveStreamDeltas throttle configuration', () => {
-  it('uses throttleMs of 100 for faster first-token delivery', async () => {
+  it('uses throttleMs of 250 to balance row volume and stream smoothness', async () => {
     const source = await readFile(GENERATE_RESPONSE_PATH, 'utf-8');
 
     // Match the saveStreamDeltas config line
@@ -29,10 +41,10 @@ describe('saveStreamDeltas throttle configuration', () => {
     expect(match).not.toBeNull();
 
     const throttleMs = Number(match?.[1]);
-    expect(throttleMs).toBe(100);
+    expect(throttleMs).toBe(250);
   });
 
-  it('does not exceed 150ms throttle to maintain TTFT target', async () => {
+  it('stays within the [100, 400] band — outside this range either row volume stalls the main thread or streaming feels chunky', async () => {
     const source = await readFile(GENERATE_RESPONSE_PATH, 'utf-8');
 
     const match = source.match(
@@ -41,6 +53,7 @@ describe('saveStreamDeltas throttle configuration', () => {
     expect(match).not.toBeNull();
 
     const throttleMs = Number(match?.[1]);
-    expect(throttleMs).toBeLessThanOrEqual(150);
+    expect(throttleMs).toBeGreaterThanOrEqual(100);
+    expect(throttleMs).toBeLessThanOrEqual(400);
   });
 });
diff --git a/services/platform/convex/lib/context_management/build_artifacts_context.ts b/services/platform/convex/lib/context_management/build_artifacts_context.ts
index 58516f9e0..cbab6ba50 100644
--- a/services/platform/convex/lib/context_management/build_artifacts_context.ts
+++ b/services/platform/convex/lib/context_management/build_artifacts_context.ts
@@ -1,29 +1,26 @@
 import { internal } from '../../_generated/api';
 import type { ActionCtx } from '../../_generated/server';
+import { isRunnableArtifactType } from '../../agent_tools/artifacts/shared';
+import { resolveArtifactFiles } from '../../artifacts/resolve_files';
 
 /**
- * Hard upper bound on the total characters injected as artifact context.
- * When the thread holds more than fits, the *oldest* artifacts collapse
- * into omitted stubs so the most recent state stays visible — the model
- * needs the latest revisions to patch correctly.
+ * Hard upper bound on the total file content injected as artifact context
+ * across the whole block, counted in characters (`string.length`), not bytes.
+ * The metadata header (artifact id/type/title/revision/entryFile/fileCount per
+ * row) is always emitted; only file bodies are truncated or omitted.
  */
-const MAX_TOTAL_BYTES = 80_000;
+const MAX_TOTAL_BODY_BYTES = 80_000;
 
-/**
- * Per-artifact body cap. Artifacts longer than this are truncated with
- * a sentinel; the model can still see the head of the document and call
- * `artifact_edit` against snippets it remembers from a prior turn.
- */
-const MAX_PER_ARTIFACT_BYTES = 30_000;
+/** Per-file body cap before truncation sentinel. */
+const MAX_PER_FILE_BYTES = 30_000;
 
 /**
  * Build the LLM-facing artifacts block for the current thread.
  *
- * The block is XML-shaped (not collapsible HTML) so the model can parse
- * IDs/types/revisions reliably. Returns `undefined` when the thread has
- * no artifacts so the caller can skip injecting an empty section, and
- * also when the underlying query fails — artifact context is enrichment,
- * not load-bearing, so a transient failure should not abort the turn.
+ * Each artifact becomes a `<artifact>` element listing its files as nested
+ * `<file>` blocks. Multi-file projects emit one `<file>` per path; legacy
+ * single-file artifacts (with only `content` on the row) emit one
+ * synthesized `<file path="defaultEntry">` via `resolveArtifactFiles`.
  */
 export async function buildArtifactsContext(
   ctx: ActionCtx,
@@ -47,27 +44,39 @@ export async function buildArtifactsContext(
 
   if (artifacts.length === 0) return undefined;
 
-  // Walk newest first so the latest artifacts always claim budget; emit
-  // omitted stubs for the *oldest* once full. We reverse the resulting
-  // blocks at the end so the prompt stays in chronological order.
+  // Walk newest first so the latest artifacts claim file-body budget first.
+  // Metadata is always emitted (it's cheap and important for the LLM to know
+  // what exists). We reverse blocks at the end to keep chronological order.
   const ordered = artifacts.toReversed();
-  let totalBytes = 0;
+  let totalBodyBytes = 0;
   const blocks: string[] = [];
   for (const artifact of ordered) {
-    const body = sanitizeArtifactBody(truncateArtifactBody(artifact.content));
-    const bytes = body.length;
-    if (totalBytes + bytes > MAX_TOTAL_BYTES) {
-      blocks.push(
-        `<artifact id="${artifact._id}" type="${artifact.type}" title=${JSON.stringify(artifact.title)} revision="${artifact.revision}" omitted="true" />`,
-      );
-      continue;
-    }
-    totalBytes += bytes;
+    const resolved = resolveArtifactFiles(artifact);
     const langAttr = artifact.language
       ? ` language=${JSON.stringify(artifact.language)}`
       : '';
+    const runAttr = buildRunAttrs(artifact);
+    const headerAttrs = `id="${artifact._id}" type="${artifact.type}"${langAttr}${runAttr} title=${JSON.stringify(
+      artifact.title,
+    )} revision="${artifact.revision}" entryFile=${JSON.stringify(resolved.entryFile)} fileCount="${resolved.files.length}"`;
+
+    const fileBlocks: string[] = [];
+    for (const file of resolved.files) {
+      const truncated = truncateFileBody(file.content);
+      if (totalBodyBytes + truncated.length > MAX_TOTAL_BODY_BYTES) {
+        fileBlocks.push(
+          `<file path=${JSON.stringify(file.path)} size="${file.content.length}" omitted="true" />`,
+        );
+        continue;
+      }
+      totalBodyBytes += truncated.length;
+      const body = sanitizeFileBody(truncated);
+      fileBlocks.push(
+        `<file path=${JSON.stringify(file.path)} size="${file.content.length}">\n${body}\n</file>`,
+      );
+    }
     blocks.push(
-      `<artifact id="${artifact._id}" type="${artifact.type}"${langAttr} title=${JSON.stringify(artifact.title)} revision="${artifact.revision}">\n${body}\n</artifact>`,
+      `<artifact ${headerAttrs}>\n${fileBlocks.join('\n')}\n</artifact>`,
     );
   }
   blocks.reverse();
@@ -75,28 +84,55 @@ export async function buildArtifactsContext(
   return [
     blocks.join('\n\n'),
     '',
-    'You may modify any of these via the `artifact_edit` tool — prefer `mode: "patch"` for small changes. Do NOT re-emit an artifact via `artifact_create`; that creates a duplicate. Snippets in <artifact> bodies appear verbatim and can be used as `search` blocks for patches.',
+    'You may modify any of these via the file-level CRUD tools: `artifact_file_create` (add a new file), `artifact_file_update` (overwrite an existing file in full), `artifact_file_delete` (remove a file — refused on entryFile and on the last file), `artifact_file_rename` (rename a file; auto-repoints entryFile if matched). Use `artifact_file_list` to enumerate paths and `artifact_file_read` to fetch content. For runnable artifacts, declare new dependencies via `artifact_packages_add` before `artifact_run`. Pass the artifact\'s `revision="N"` back as `expectedRevision` so a concurrent edit by another turn is detected (the call will return `code: "stale"` instead of overwriting). If you see `runStale="true"` on a runnable artifact, the source was edited after the last run — call `artifact_run` again to refresh outputs. To create a NEW artifact use `artifact_create`; calling create with an existing title returns the existing artifactId and does NOT overwrite.',
   ].join('\n');
 }
 
-function truncateArtifactBody(content: string): string {
-  if (content.length <= MAX_PER_ARTIFACT_BYTES) return content;
+function truncateFileBody(content: string): string {
+  if (content.length <= MAX_PER_FILE_BYTES) return content;
   return (
-    content.slice(0, MAX_PER_ARTIFACT_BYTES) +
-    `\n\n[...truncated; ${content.length - MAX_PER_ARTIFACT_BYTES} more characters elided. Re-read the artifact via search snippets you remember from earlier turns.]`
+    content.slice(0, MAX_PER_FILE_BYTES) +
+    `\n\n[...truncated; ${content.length - MAX_PER_FILE_BYTES} more characters elided. Call artifact_file_read({artifactId, path}) to fetch the rest.]`
   );
 }
 
-/**
- * Defuse delimiter-injection: a user/agent-authored artifact body could
- * contain `</artifact>` or `</details>` and prematurely close the wrapper
- * (the outer `<details>` block is added by `formatArtifactsContext`). The
- * model would then read whatever follows as if it were a top-level
- * instruction. Replacing the closing-tag form with a backslash-escaped
- * variant keeps the bytes the model sees readable but breaks the parse.
- */
-function sanitizeArtifactBody(body: string): string {
+interface ArtifactRowForContext {
+  type: string;
+  revision: number;
+  runStatus?: string;
+  runErrorCode?: string;
+  runOutputFiles?: { name: string }[];
+  runRevision?: number;
+}
+
+function buildRunAttrs(artifact: ArtifactRowForContext): string {
+  if (!isRunnableArtifactType(artifact.type)) {
+    return '';
+  }
+  if (
+    artifact.runRevision !== undefined &&
+    artifact.runRevision !== artifact.revision
+  ) {
+    return ' runStale="true"';
+  }
+  const parts: string[] = [];
+  if (artifact.runStatus) parts.push(`runStatus="${artifact.runStatus}"`);
+  if (artifact.runErrorCode) {
+    parts.push(`runErrorCode="${artifact.runErrorCode}"`);
+  }
+  if (artifact.runOutputFiles && artifact.runOutputFiles.length > 0) {
+    const names = artifact.runOutputFiles
+      .map((f) => f.name)
+      .join(',')
+      .slice(0, 200);
+    parts.push(`runOutputFiles=${JSON.stringify(names)}`);
+  }
+  return parts.length ? ' ' + parts.join(' ') : '';
+}
+
+function sanitizeFileBody(body: string): string {
   return body
+    .replace(/<\/file>/gi, '<\\/file>')
     .replace(/<\/artifact>/gi, '<\\/artifact>')
     .replace(/<\/details>/gi, '<\\/details>');
 }
diff --git a/services/platform/convex/lib/create_agent_config.test.ts b/services/platform/convex/lib/create_agent_config.test.ts
index b68932589..0f9131e46 100644
--- a/services/platform/convex/lib/create_agent_config.test.ts
+++ b/services/platform/convex/lib/create_agent_config.test.ts
@@ -17,7 +17,7 @@ function makeFakeModel() {
 
 describe('createAgentConfig', () => {
   describe('callSettings.maxOutputTokens default', () => {
-    it('defaults callSettings.maxOutputTokens to 8192 when maxTokens is not provided', () => {
+    it('defaults callSettings.maxOutputTokens to 32768 when maxTokens is not provided', () => {
       const config = createAgentConfig({
         name: 'test-agent',
         languageModel: makeFakeModel(),
@@ -27,7 +27,7 @@ describe('createAgentConfig', () => {
       const callSettings = config.callSettings as
         | Record<string, number>
         | undefined;
-      expect(callSettings?.maxOutputTokens).toBe(8192);
+      expect(callSettings?.maxOutputTokens).toBe(32768);
     });
 
     it('uses caller-provided maxTokens when explicitly set', () => {
@@ -106,7 +106,7 @@ describe('createAgentConfig', () => {
       expect(callSettings?.maxOutputTokens).toBe(1024);
     });
 
-    it('falls back to 8192 default when neither is provided', () => {
+    it('falls back to 32768 default when neither is provided', () => {
       const config = createAgentConfig({
         name: 'test-agent',
         languageModel: makeFakeModel(),
@@ -116,7 +116,7 @@ describe('createAgentConfig', () => {
       const callSettings = config.callSettings as
         | Record<string, number>
         | undefined;
-      expect(callSettings?.maxOutputTokens).toBe(8192);
+      expect(callSettings?.maxOutputTokens).toBe(32768);
     });
   });
 
diff --git a/services/platform/convex/lib/create_agent_config.ts b/services/platform/convex/lib/create_agent_config.ts
index bc3fc2751..bd881ae45 100644
--- a/services/platform/convex/lib/create_agent_config.ts
+++ b/services/platform/convex/lib/create_agent_config.ts
@@ -92,10 +92,12 @@ export function createAgentConfig(opts: {
   });
 
   // Call settings: cap output tokens via priority caller > model config >
-  // 8192 default. The default keeps OpenRouter from truncating responses
-  // with its much lower built-in cap. Temperature and frequencyPenalty are
-  // intentionally NOT set — reasoning models (e.g. DeepSeek V3.2) treat
-  // them as `0` and return empty content.
+  // 32768 default. The default keeps OpenRouter from truncating responses
+  // with its much lower built-in cap, and leaves enough headroom for tool
+  // calls whose arguments include large `content` strings (e.g. file-write
+  // tools); 8192 was too tight and got truncated mid-string on ~22KB writes.
+  // Temperature and frequencyPenalty are intentionally NOT set — reasoning
+  // models (e.g. DeepSeek V3.2) treat them as `0` and return empty content.
   //
   // `0` from caller / model config is treated as "omit" — sending
   // `max_tokens: 0` to OpenAI/OpenRouter generates zero tokens, not
@@ -109,7 +111,7 @@ export function createAgentConfig(opts: {
         ? opts.modelMaxOutputTokens
         : opts.maxTokens === 0 || opts.modelMaxOutputTokens === 0
           ? undefined
-          : 8192;
+          : 32768;
   const callSettings: Record<string, number> =
     resolvedMax === undefined ? {} : { maxOutputTokens: resolvedMax };
 
diff --git a/services/platform/convex/lib/helpers/public_storage_url.ts b/services/platform/convex/lib/helpers/public_storage_url.ts
index 45b25cd52..87ba5eef2 100644
--- a/services/platform/convex/lib/helpers/public_storage_url.ts
+++ b/services/platform/convex/lib/helpers/public_storage_url.ts
@@ -109,3 +109,37 @@ export function isStorageUrl(url: string): boolean {
     return url.includes(STORAGE_PATH);
   }
 }
+
+/**
+ * Rewrite an internal Convex URL so a sandbox spawner container can reach it
+ * through the Caddy proxy on the internal Docker network.
+ *
+ * Sister function of {@link toPublicUrl}. They differ in audience:
+ *  - `toPublicUrl()` builds the **browser-facing** URL (SITE_URL public host).
+ *  - `toSandboxStorageUrl()` builds the **sandbox-bound** URL using
+ *    `SANDBOX_STORAGE_INTERNAL_BASE_URL` (defaults to the internal proxy
+ *    alias e.g. `http://proxy` in docker compose). Spawner containers can
+ *    fetch / POST through this without going out to the public hostname.
+ *
+ * Falls back to `toPublicUrl()` when `SANDBOX_STORAGE_INTERNAL_BASE_URL`
+ * isn't set, so local `bun dev` (where the env var may be undefined) keeps
+ * working — the sandbox is still reachable via the public URL form.
+ *
+ * Idempotent: a URL already on the prefix (prefix followed by end of string,
+ * `/`, `?` or `#`) is returned as-is; non-http(s) input is left untouched.
+ */
+export function toSandboxStorageUrl(internalUrl: string): string {
+  const base = process.env.SANDBOX_STORAGE_INTERNAL_BASE_URL;
+  if (!base) {
+    // Fallback for `bun dev` / deploys without the env var: the public URL
+    // still reaches the spawner, via Caddy's public listener instead.
+    return toPublicUrl(internalUrl);
+  }
+  const prefix = base.replace(/\/$/, '');
+  // Require a URL boundary so `http://proxy` never matches `http://proxy2`.
+  const tail = internalUrl.slice(prefix.length);
+  const onBoundary = tail === '' || /^[/?#]/.test(tail);
+  if (onBoundary && internalUrl.startsWith(prefix)) return internalUrl;
+  const origin = internalUrl.match(/^https?:\/\/[^/]+/)?.[0];
+  return origin ? `${prefix}${internalUrl.slice(origin.length)}` : internalUrl;
+}
diff --git a/services/platform/convex/lib/rate_limiter/index.ts b/services/platform/convex/lib/rate_limiter/index.ts
index 95520b306..7b80b451e 100644
--- a/services/platform/convex/lib/rate_limiter/index.ts
+++ b/services/platform/convex/lib/rate_limiter/index.ts
@@ -301,6 +301,17 @@ export const rateLimiter = new RateLimiter(components.rateLimiter, {
     period: HOUR,
     capacity: 1,
   },
+  // Per-org lazy cleanup of sandboxExecutions audit rows. Gates the
+  // opportunistic delete-old-rows sweep in reserveSlotAndInsert so a
+  // busy org performs at most one sweep per hour. Audit retention is
+  // 90 days; older terminal rows are reclaimed here instead of via a
+  // crons.ts entry (see feedback_lazy_cleanup_over_cron).
+  'cleanup:sandbox': {
+    kind: 'token bucket',
+    rate: 1,
+    period: HOUR,
+    capacity: 1,
+  },
 
   // ============================================
   // TIER 7: Governance (Fixed Window)
diff --git a/services/platform/convex/lib/rls/helpers/access_control.ts b/services/platform/convex/lib/rls/helpers/access_control.ts
index dc115ab57..cd3c8f8ed 100644
--- a/services/platform/convex/lib/rls/helpers/access_control.ts
+++ b/services/platform/convex/lib/rls/helpers/access_control.ts
@@ -25,7 +25,24 @@ type PlatformTable =
   | 'workflowProcessingRecords'
   | 'promptTemplates'
   | 'promptCategories'
-  | 'auditLogs';
+  | 'auditLogs'
+  // Sandbox / artifact tables — added round-2 R2-B8. Previously the
+  // `rls_rules.ts` entries for these tables gated on bare org membership
+  // and bypassed `authorizeRls`, which meant a `member` (read-only) user
+  // could still write to artifacts and trigger billable sandbox runs.
+  | 'artifacts'
+  | 'artifactRevisions'
+  | 'auditLogChainGenesis'
+  | 'sandboxExecutions'
+  // Multi-file artifact tables — added audit follow-up F14. Writes go
+  // exclusively through internalMutation (handlers/*.ts); reads need
+  // an explicit READ_ONLY role-matrix entry so the new rls_rules.ts
+  // rules can defense-in-depth via `authorizeRls()` (otherwise the
+  // deny-by-default permissions would silently 0-result the canvas).
+  | 'artifactFiles'
+  | 'artifactRuns'
+  | 'artifactRunFiles'
+  | 'artifactOutputs';
 
 type PlatformAction = 'read' | 'write';
 
@@ -65,6 +82,18 @@ const platformPermissions: Record<
     promptTemplates: ALL,
     promptCategories: ALL,
     auditLogs: ALL,
+    artifacts: ALL,
+    artifactRevisions: ALL,
+    // Genesis row is an internal sentinel — no client-facing reads/writes.
+    auditLogChainGenesis: NONE,
+    // Audit table; user-facing access is read-only across all roles.
+    sandboxExecutions: READ_ONLY,
+    // Multi-file artifact tables: writes are internal-only (handlers/*.ts);
+    // reads through RLS-wrapped queries get READ_ONLY across all org roles.
+    artifactFiles: READ_ONLY,
+    artifactRuns: READ_ONLY,
+    artifactRunFiles: READ_ONLY,
+    artifactOutputs: READ_ONLY,
   },
   developer: {
     agentBindings: ALL,
@@ -87,6 +116,14 @@ const platformPermissions: Record<
     promptTemplates: ALL,
     promptCategories: ALL,
     auditLogs: ALL,
+    artifacts: ALL,
+    artifactRevisions: ALL,
+    auditLogChainGenesis: NONE,
+    sandboxExecutions: READ_ONLY,
+    artifactFiles: READ_ONLY,
+    artifactRuns: READ_ONLY,
+    artifactRunFiles: READ_ONLY,
+    artifactOutputs: READ_ONLY,
   },
   editor: {
     agentBindings: ALL,
@@ -109,6 +146,14 @@ const platformPermissions: Record<
     promptTemplates: ALL,
     promptCategories: ALL,
     auditLogs: ALL,
+    artifacts: ALL,
+    artifactRevisions: ALL,
+    auditLogChainGenesis: NONE,
+    sandboxExecutions: READ_ONLY,
+    artifactFiles: READ_ONLY,
+    artifactRuns: READ_ONLY,
+    artifactRunFiles: READ_ONLY,
+    artifactOutputs: READ_ONLY,
   },
   member: {
     agentBindings: READ_ONLY,
@@ -131,6 +176,18 @@ const platformPermissions: Record<
     promptTemplates: ALL,
     promptCategories: ALL,
     auditLogs: READ_ONLY,
+    // Members can READ artifacts (so the chat surface keeps working in
+    // shared threads) but NOT write — artifact_create / file_* /
+    // artifact_run all trigger billable sandbox executions. Aligns with
+    // the `documents` table's own member-as-read-only contract.
+    artifacts: READ_ONLY,
+    artifactRevisions: READ_ONLY,
+    auditLogChainGenesis: NONE,
+    sandboxExecutions: READ_ONLY,
+    artifactFiles: READ_ONLY,
+    artifactRuns: READ_ONLY,
+    artifactRunFiles: READ_ONLY,
+    artifactOutputs: READ_ONLY,
   },
   disabled: {
     agentBindings: NONE,
@@ -153,6 +210,14 @@ const platformPermissions: Record<
     promptTemplates: NONE,
     promptCategories: NONE,
     auditLogs: NONE,
+    artifacts: NONE,
+    artifactRevisions: NONE,
+    auditLogChainGenesis: NONE,
+    sandboxExecutions: NONE,
+    artifactFiles: NONE,
+    artifactRuns: NONE,
+    artifactRunFiles: NONE,
+    artifactOutputs: NONE,
   },
 };
 
diff --git a/services/platform/convex/lib/rls/helpers/rls_rules.ts b/services/platform/convex/lib/rls/helpers/rls_rules.ts
index bcfe85dca..5bb01fe1b 100644
--- a/services/platform/convex/lib/rls/helpers/rls_rules.ts
+++ b/services/platform/convex/lib/rls/helpers/rls_rules.ts
@@ -597,22 +597,15 @@ export async function rlsRules(
     },
 
     // Audit Log Chain Genesis - internal per-org serialization sentinel for
-    // the audit hash chain (see audit_logs/schema.ts). Carries no user data;
-    // any org member who can produce an audit-logged write must be able to
-    // upsert and patch this row, so gate purely on org membership.
+    // the audit hash chain (see audit_logs/schema.ts). Carries no user data.
+    // Writes happen exclusively through internalMutation (createAuditLog),
+    // which bypasses RLS, so the user-facing gate is deny-all. Surfacing
+    // this sentinel to clients would leak per-org write-rate metadata
+    // (round-2 R2-B8).
     auditLogChainGenesis: {
-      read: async (_, row) => {
-        if (!user) return false;
-        return userOrgIds.has(row.organizationId);
-      },
-      insert: async ({ user: ruleUser }, row) => {
-        if (!ruleUser) return false;
-        return userOrgIds.has(row.organizationId);
-      },
-      modify: async (_, row) => {
-        if (!user) return false;
-        return userOrgIds.has(row.organizationId);
-      },
+      read: async () => false,
+      insert: async () => false,
+      modify: async () => false,
     },
 
     // Audit Logs - organization-scoped, allow inserts for org members
@@ -646,6 +639,158 @@ export async function rlsRules(
       },
     },
 
+    // Artifacts - organization-scoped + role-gated (round-2 R2-B8). A
+    // `member` (read-only role per access_control) can SEE shared
+    // artifacts but cannot create / edit / re-run them — those paths
+    // trigger billable sandbox executions, matching the contract the
+    // sibling `documents` table already enforces.
+    artifacts: {
+      read: async (_, artifact) => {
+        if (!user) return false;
+        if (!userOrgIds.has(artifact.organizationId)) return false;
+        const membership = userOrganizations.find(
+          (m) => m.organizationId === artifact.organizationId,
+        );
+        return authorizeRls(membership?.role, 'artifacts', 'read');
+      },
+      modify: async (_, artifact) => {
+        if (!user) return false;
+        if (!userOrgIds.has(artifact.organizationId)) return false;
+        const membership = userOrganizations.find(
+          (m) => m.organizationId === artifact.organizationId,
+        );
+        return authorizeRls(membership?.role, 'artifacts', 'write');
+      },
+      insert: async ({ user: ruleUser }, artifact) => {
+        if (!ruleUser) return false;
+        if (!userOrgIds.has(artifact.organizationId)) return false;
+        const membership = userOrganizations.find(
+          (m) => m.organizationId === artifact.organizationId,
+        );
+        return authorizeRls(membership?.role, 'artifacts', 'write');
+      },
+    },
+
+    // Artifact Revisions - linked to artifacts via artifactId; the
+    // revision row itself doesn't carry organizationId, so we resolve
+    // membership through the parent artifact. Append-only in practice
+    // (writes go through internalMutation which bypasses RLS); the
+    // role-gated modify/insert are defense-in-depth (round-2 R2-B8).
+    artifactRevisions: {
+      read: async (_, revision) => {
+        if (!user) return false;
+        const parent = await ctx.db.get(revision.artifactId);
+        if (!parent) return false;
+        if (!userOrgIds.has(parent.organizationId)) return false;
+        const membership = userOrganizations.find(
+          (m) => m.organizationId === parent.organizationId,
+        );
+        return authorizeRls(membership?.role, 'artifactRevisions', 'read');
+      },
+      modify: async (_, revision) => {
+        if (!user) return false;
+        const parent = await ctx.db.get(revision.artifactId);
+        if (!parent) return false;
+        if (!userOrgIds.has(parent.organizationId)) return false;
+        const membership = userOrganizations.find(
+          (m) => m.organizationId === parent.organizationId,
+        );
+        return authorizeRls(membership?.role, 'artifactRevisions', 'write');
+      },
+      insert: async ({ user: ruleUser }, revision) => {
+        if (!ruleUser) return false;
+        const parent = await ctx.db.get(revision.artifactId);
+        if (!parent) return false;
+        if (!userOrgIds.has(parent.organizationId)) return false;
+        const membership = userOrganizations.find(
+          (m) => m.organizationId === parent.organizationId,
+        );
+        return authorizeRls(membership?.role, 'artifactRevisions', 'write');
+      },
+    },
+
+    // Sandbox Executions - audit table. Reads go through the role
+    // matrix (members can READ their org's history); writes happen
+    // exclusively through internalMutation (reserveSlotAndInsert /
+    // finalize / recoverStuckSandboxes) which bypasses RLS, so the
+    // user-facing modify/insert remain deny-all.
+    sandboxExecutions: {
+      read: async (_, exec) => {
+        if (!user) return false;
+        if (!userOrgIds.has(exec.organizationId)) return false;
+        const membership = userOrganizations.find(
+          (m) => m.organizationId === exec.organizationId,
+        );
+        return authorizeRls(membership?.role, 'sandboxExecutions', 'read');
+      },
+      modify: async () => false,
+      insert: async () => false,
+    },
+
+    // Multi-file artifact tables (audit follow-up F14). Writes are
+    // internal-mutation only (handlers/content_edits.ts,
+    // handlers/run_state.ts, output_mutations.ts) which bypasses RLS;
+    // user-facing modify/insert remain deny-all. Reads resolve org
+    // membership through the parent `artifactId` row, mirroring the
+    // `artifactRevisions` pattern above (the child rows don't carry
+    // `organizationId` themselves).
+    artifactFiles: {
+      read: async (_, file) => {
+        if (!user) return false;
+        const parent = await ctx.db.get(file.artifactId);
+        if (!parent) return false;
+        if (!userOrgIds.has(parent.organizationId)) return false;
+        const membership = userOrganizations.find(
+          (m) => m.organizationId === parent.organizationId,
+        );
+        return authorizeRls(membership?.role, 'artifactFiles', 'read');
+      },
+      modify: async () => false,
+      insert: async () => false,
+    },
+    artifactRuns: {
+      read: async (_, run) => {
+        if (!user) return false;
+        const parent = await ctx.db.get(run.artifactId);
+        if (!parent) return false;
+        if (!userOrgIds.has(parent.organizationId)) return false;
+        const membership = userOrganizations.find(
+          (m) => m.organizationId === parent.organizationId,
+        );
+        return authorizeRls(membership?.role, 'artifactRuns', 'read');
+      },
+      modify: async () => false,
+      insert: async () => false,
+    },
+    artifactRunFiles: {
+      read: async (_, runFile) => {
+        if (!user) return false;
+        const parent = await ctx.db.get(runFile.artifactId);
+        if (!parent) return false;
+        if (!userOrgIds.has(parent.organizationId)) return false;
+        const membership = userOrganizations.find(
+          (m) => m.organizationId === parent.organizationId,
+        );
+        return authorizeRls(membership?.role, 'artifactRunFiles', 'read');
+      },
+      modify: async () => false,
+      insert: async () => false,
+    },
+    artifactOutputs: {
+      read: async (_, output) => {
+        if (!user) return false;
+        const parent = await ctx.db.get(output.artifactId);
+        if (!parent) return false;
+        if (!userOrgIds.has(parent.organizationId)) return false;
+        const membership = userOrganizations.find(
+          (m) => m.organizationId === parent.organizationId,
+        );
+        return authorizeRls(membership?.role, 'artifactOutputs', 'read');
+      },
+      modify: async () => false,
+      insert: async () => false,
+    },
+
     // Workflow Step Audit Logs - organization-scoped, allow inserts for org members
     wfStepAuditLogs: {
       read: async (_, log) => {
diff --git a/services/platform/convex/migrations.ts b/services/platform/convex/migrations.ts
index ab0ad56ba..794b8f9e4 100644
--- a/services/platform/convex/migrations.ts
+++ b/services/platform/convex/migrations.ts
@@ -15,6 +15,18 @@ export const runAll = internalAction({
     await ctx.runMutation(
       internal.migrations.backfill_ledger_granularity.apply,
     );
+    // Multi-file artifact refactor — Phase A. Synthesizes `files`/`entryFile`
+    // for legacy single-`content` artifact rows. Idempotent (skip-if-set).
+    await ctx.runMutation(internal.migrations.backfill_artifact_files.apply);
+    // Multi-file artifact refactor — Phase B. Backfills the dedicated
+    // `artifactFiles` / `artifactRuns` / `artifactRunFiles` tables from
+    // the legacy embedded fields. Depends on Phase A (reads the
+    // synthesized `files[]`). Sentinel-gated idempotent — partially-done
+    // artifacts roll back atomically per batch and retry skips completed
+    // ones at O(1).
+    await ctx.runMutation(
+      internal.migrations.backfill_artifact_files_table.apply,
+    );
     // Idempotent: orgs that already carry an applied-bounds snapshot are
     // skipped inside `seedInitialBoundsInternal`, so re-running on every
     // deploy is safe. Without this seed, retention_cleanup silently no-ops
diff --git a/services/platform/convex/migrations/backfill_artifact_files.ts b/services/platform/convex/migrations/backfill_artifact_files.ts
new file mode 100644
index 000000000..c3cf0be78
--- /dev/null
+++ b/services/platform/convex/migrations/backfill_artifact_files.ts
@@ -0,0 +1,74 @@
+/**
+ * Migration: Backfill files/entryFile on legacy single-content artifacts.
+ *
+ * Phase A of the multi-file refactor: rows created before this deploy have
+ * `content: string` but no `files` / `entryFile`. Synthesize them in place:
+ *
+ *   files: [{ path: defaultEntryFileFor(type, language), content }]
+ *   entryFile: defaultEntryFileFor(type, language)
+ *
+ * Idempotent — skips rows already carrying both `files` and `entryFile`.
+ *
+ * Live-streaming rows are NOT skipped. Backfill writes synthesized values
+ * with the current `content`; subsequent settle under new code will overwrite
+ * with the canonical post-edit state.
+ */
+
+import { internalMutation } from '../_generated/server';
+import { defaultEntryFileFor } from '../agent_tools/artifacts/shared';
+
+const BATCH_SIZE = 50;
+
+export const apply = internalMutation({
+  args: {}, // takes no arguments; every run scans `artifacts` from the start
+  handler: async (ctx) => {
+    let totalUpdated = 0;
+    let totalSkipped = 0; // already-backfilled rows AND rows whose patch threw
+    let cursor: string | null = null;
+    let isDone = false;
+    // NOTE(review): paginate() runs repeatedly in ONE mutation — confirm OK.
+    while (!isDone) {
+      let updated = 0;
+      let skipped = 0;
+
+      const result = await ctx.db
+        .query('artifacts')
+        .paginate({ cursor, numItems: BATCH_SIZE });
+
+      for (const row of result.page) {
+        if (row.files !== undefined && row.entryFile !== undefined) { // skip only when BOTH exist
+          skipped++;
+          continue;
+        }
+        const entryFile = defaultEntryFileFor(row.type, row.language);
+        const content = row.content ?? ''; // missing legacy content becomes an empty file
+        const files = [{ path: entryFile, content }];
+        try {
+          await ctx.db.patch(row._id, {
+            files,
+            entryFile,
+            // Leave `content` in place for rollback safety (Phase A).
+          });
+          updated++;
+        } catch (err) {
+          console.error(
+            `[backfill_artifact_files] Error processing artifact ${String(row._id)}:`,
+            err,
+          );
+          skipped++; // best-effort: the failure is logged and counted, not rethrown
+        }
+      }
+
+      console.log(
+        `[backfill_artifact_files] Batch: updated=${updated}, skipped=${skipped}, done=${result.isDone}`,
+      );
+
+      totalUpdated += updated;
+      totalSkipped += skipped;
+      cursor = result.continueCursor; // resume the next page where this one ended
+      isDone = result.isDone;
+    }
+
+    return { updated: totalUpdated, skipped: totalSkipped }; // totals across all pages
+  },
+});
diff --git a/services/platform/convex/migrations/backfill_artifact_files_table.test.ts b/services/platform/convex/migrations/backfill_artifact_files_table.test.ts
new file mode 100644
index 000000000..678f74772
--- /dev/null
+++ b/services/platform/convex/migrations/backfill_artifact_files_table.test.ts
@@ -0,0 +1,289 @@
+// Regression gate for the Phase B backfill orphan-row fix (P0-4 from the
+// crispy-curry review). Mocks the convex generated layer like
+// `sandbox/internal_mutations.test.ts` so the mutation body is unit-testable
+// without a running backend.
+
+import { describe, it, expect, vi } from 'vitest';
+
+vi.mock('../_generated/server', async (importOriginal) => {
+  const mod = await importOriginal<Record<string, unknown>>();
+  return {
+    ...mod,
+    internalMutation: (config: Record<string, unknown>) => config,
+  };
+});
+
+import { apply } from './backfill_artifact_files_table';
+
+interface MutHandler<TArgs, TReturn> { // shape of the config the mocked internalMutation hands back
+  handler: (ctx: unknown, args: TArgs) => Promise<TReturn> | TReturn;
+}
+
+interface ArtifactRow { // minimal artifact fixture: only the fields these tests set or assert on
+  _id: string;
+  files?: Array<{ path: string; content: string }>;
+  runStatus?: string; // tests use 'completed' and 'running'
+  runOutputFiles?: Array<{
+    name: string;
+    storageId?: string;
+    size: number;
+    contentType?: string;
+  }>;
+  revision: number;
+  _phaseB_complete?: boolean; // sentinel the tests expect the migration to patch to true
+}
+
+function makeCtx(artifacts: ArtifactRow[]) { // builds an in-memory fake `ctx` plus logs of every write
+  const inserted: Array<{ table: string; payload: Record<string, unknown> }> =
+    [];
+  const patched: Array<{ id: string; patch: Record<string, unknown> }> = [];
+  // Per-table row stores so later lookups (and re-runs) can observe prior inserts.
+  const artifactFiles: Record<string, unknown>[] = [];
+  const artifactRuns: Array<{ _id: string; artifactId: string }> = [];
+  const artifactRunFiles: Array<{
+    _id: string;
+    runId: string;
+    artifactId: string;
+    name: string;
+  }> = [];
+
+  function makeBuilder(table: string) { // fake query builder: records eq() filters, resolves in first()
+    let whereArtifactId: string | undefined;
+    let whereRunId: string | undefined;
+    let wherePath: string | undefined;
+    let whereName: string | undefined;
+    const builder: Record<string, unknown> = {};
+    builder.withIndex = vi.fn((_name: string, cb: (q: unknown) => unknown) => {
+      const q = {
+        eq: (field: string, value: unknown) => {
+          if (field === 'artifactId') whereArtifactId = value as string;
+          if (field === 'runId') whereRunId = value as string;
+          if (field === 'path') wherePath = value as string;
+          if (field === 'name') whereName = value as string;
+          return q;
+        },
+      };
+      cb(q);
+      return builder;
+    });
+    builder.filter = vi.fn((cb: (q: unknown) => unknown) => {
+      const q = {
+        eq: (_field: unknown, value: unknown) => { // field is ignored: any filter eq() is treated as a `name` match
+          whereName = value as string;
+          return q;
+        },
+        field: (name: string) => name,
+      };
+      cb(q);
+      return builder;
+    });
+    builder.first = vi.fn(async () => { // resolve against the in-memory store for this table
+      if (table === 'artifactFiles') {
+        return (
+          artifactFiles.find(
+            (r) => r.artifactId === whereArtifactId && r.path === wherePath,
+          ) ?? null
+        );
+      }
+      if (table === 'artifactRuns') {
+        return (
+          artifactRuns.find((r) => r.artifactId === whereArtifactId) ?? null
+        );
+      }
+      if (table === 'artifactRunFiles') {
+        return (
+          artifactRunFiles.find(
+            (r) => r.runId === whereRunId && r.name === whereName,
+          ) ?? null
+        );
+      }
+      return null;
+    });
+    return builder;
+  }
+
+  let nextId = 1; // shared counter: generated ids look like `<table>_<n>`
+  const ctx = {
+    db: {
+      query: vi.fn((table: string) => {
+        if (table === 'artifacts') {
+          return {
+            paginate: async () => ({ // always a single, final page holding every fixture row
+              page: artifacts,
+              continueCursor: null,
+              isDone: true,
+            }),
+          };
+        }
+        return makeBuilder(table);
+      }),
+      insert: vi.fn(async (table: string, payload: Record<string, unknown>) => {
+        const id = `${table}_${nextId++}`;
+        inserted.push({ table, payload });
+        if (table === 'artifactFiles') {
+          artifactFiles.push({ ...payload, _id: id });
+        } else if (table === 'artifactRuns') {
+          artifactRuns.push({
+            ...(payload as Record<string, never>),
+            _id: id,
+            artifactId: payload.artifactId as string,
+          });
+        } else if (table === 'artifactRunFiles') {
+          artifactRunFiles.push({
+            _id: id,
+            runId: payload.runId as string,
+            artifactId: payload.artifactId as string,
+            name: payload.name as string,
+          });
+        }
+        return id;
+      }),
+      patch: vi.fn(async (id: string, patch: Record<string, unknown>) => {
+        patched.push({ id, patch });
+        const target = artifacts.find((a) => a._id === id);
+        if (target !== undefined) Object.assign(target, patch); // mutate the fixture so the patch is visible afterwards
+      }),
+    },
+  };
+  return { ctx, inserted, patched, artifacts, artifactRunFiles };
+}
+
+describe('backfill_artifact_files_table.apply', () => {
+  const mut = apply as unknown as MutHandler<
+    Record<string, never>,
+    {
+      artifacts: number;
+      filesCreated: number;
+      runsCreated: number;
+      runFilesCreated: number;
+      skipped: number;
+    }
+  >;
+
+  it('writes files+runs+runFiles AND then patches the sentinel as last write', async () => {
+    const artifacts: ArtifactRow[] = [
+      {
+        _id: 'a_1',
+        files: [{ path: 'main.py', content: 'print("hi")' }],
+        runStatus: 'completed',
+        runOutputFiles: [
+          {
+            name: 'out.png',
+            storageId: 'kg_1',
+            size: 100,
+            contentType: 'image/png',
+          },
+        ],
+        revision: 1,
+      },
+    ];
+    const { ctx, inserted, patched } = makeCtx(artifacts);
+    const out = await mut.handler(ctx, {});
+    expect(out.filesCreated).toBe(1);
+    expect(out.runsCreated).toBe(1);
+    expect(out.runFilesCreated).toBe(1);
+    // Sentinel patch happens AFTER all inserts (vitest global call order).
+    const sentinelIndex = patched.findIndex(
+      (p) => p.id === 'a_1' && p.patch._phaseB_complete === true,
+    );
+    expect(sentinelIndex).toBeGreaterThan(-1);
+    const lastInsert = Math.max(...ctx.db.insert.mock.invocationCallOrder);
+    const patchOrder = ctx.db.patch.mock.invocationCallOrder[sentinelIndex];
+    expect(patchOrder).toBeGreaterThan(lastInsert);
+    expect(inserted.length).toBe(3); // one each of files, runs, runFiles
+  });
+
+  it('skips artifacts whose sentinel is already true (O(1) on retry)', async () => {
+    const artifacts: ArtifactRow[] = [
+      {
+        _id: 'a_1',
+        _phaseB_complete: true, // marks the row as already migrated
+        files: [{ path: 'main.py', content: 'print("hi")' }],
+        runStatus: 'completed',
+        runOutputFiles: [{ name: 'out.png', storageId: 'kg_1', size: 100 }],
+        revision: 1,
+      },
+    ];
+    const { ctx, inserted, patched } = makeCtx(artifacts);
+    const out = await mut.handler(ctx, {});
+
+    expect(out.skipped).toBe(1);
+    expect(out.filesCreated).toBe(0);
+    expect(out.runsCreated).toBe(0);
+    expect(out.runFilesCreated).toBe(0);
+    expect(inserted).toHaveLength(0); // no writes of any kind for a completed row
+    expect(patched).toHaveLength(0);
+  });
+
+  it('on partial-prior orphan: re-uses existing artifactRuns row and fills missing artifactRunFiles', async () => {
+    // Simulate a pre-sentinel partial attempt: artifactRuns row exists for
+    // a_1 (orphaned because the inner artifactRunFiles loop failed mid-way),
+    // but only 1 of 2 runFiles landed. Sentinel is absent. Expected:
+    // re-use the existing run, insert the missing runFile, patch sentinel.
+    const artifacts: ArtifactRow[] = [
+      {
+        _id: 'a_1',
+        files: [], // empty on purpose: no artifactFiles inserts expected in this scenario
+        runStatus: 'completed',
+        runOutputFiles: [
+          { name: 'out1.png', storageId: 'kg_1', size: 100 },
+          { name: 'out2.png', storageId: 'kg_2', size: 200 },
+        ],
+        revision: 1,
+      },
+    ];
+    const { ctx, inserted, patched, artifactRunFiles } = makeCtx(artifacts);
+    // Seed the orphan state: one artifactRuns row + one of its runFiles.
+    await ctx.db.insert('artifactRuns', {
+      artifactId: 'a_1',
+      status: 'completed',
+      startedAt: 0,
+      revision: 1,
+    });
+    await ctx.db.insert('artifactRunFiles', {
+      runId: 'artifactRuns_1', // id the fake assigned to the seeded run above (first insert)
+      artifactId: 'a_1',
+      name: 'out1.png',
+      storageId: 'kg_1',
+      size: 100,
+      createdAt: 0,
+    });
+    const insertedBeforeRun = inserted.length; // baseline so the seed inserts are not counted
+    const out = await mut.handler(ctx, {});
+
+    // No new artifactRuns row (existing was reused), one new runFile.
+    expect(out.runsCreated).toBe(0);
+    expect(out.runFilesCreated).toBe(1);
+    expect(artifactRunFiles.map((r) => r.name).sort()).toEqual([
+      'out1.png',
+      'out2.png',
+    ]);
+    // Sentinel did land.
+    expect(
+      patched.some((p) => p.id === 'a_1' && p.patch._phaseB_complete === true),
+    ).toBe(true);
+    // We only added the one missing runFile, no other extras.
+    expect(inserted.length - insertedBeforeRun).toBe(1);
+  });
+
+  it('skips run synthesis for in-flight (non-terminal) status', async () => {
+    const artifacts: ArtifactRow[] = [
+      {
+        _id: 'a_1',
+        files: [{ path: 'main.py', content: 'x' }],
+        runStatus: 'running', // non-terminal: no run rows should be synthesized
+        runOutputFiles: [{ name: 'wip.txt', storageId: 'kg_1', size: 1 }],
+        revision: 1,
+      },
+    ];
+    const { ctx, patched } = makeCtx(artifacts);
+    const out = await mut.handler(ctx, {});
+    expect(out.filesCreated).toBe(1); // source files are still backfilled
+    expect(out.runsCreated).toBe(0);
+    expect(out.runFilesCreated).toBe(0);
+    // Sentinel still patches (artifact is "done" for migration purposes;
+    // in-flight rows have no durable run state to capture).
+    expect(
+      patched.some((p) => p.id === 'a_1' && p.patch._phaseB_complete === true),
+    ).toBe(true);
+  });
+});
diff --git a/services/platform/convex/migrations/backfill_artifact_files_table.ts b/services/platform/convex/migrations/backfill_artifact_files_table.ts
new file mode 100644
index 000000000..8db8edbad
--- /dev/null
+++ b/services/platform/convex/migrations/backfill_artifact_files_table.ts
@@ -0,0 +1,193 @@
+/**
+ * Migration: Backfill `artifactFiles` / `artifactRuns` / `artifactRunFiles`
+ * dedicated tables from the legacy embedded `artifacts.files[]` and
+ * `artifacts.runOutputFiles[]` fields.
+ *
+ * Part of the refactor described in plan llm-majestic-hamming.md. The
+ * legacy fields stay on `artifactsTable` as `@deprecated` (per
+ * [feedback_deprecate_dont_delete_schema_fields]) — this script only
+ * POPULATES the new tables; nothing is deleted from `artifacts`.
+ *
+ * **Atomicity contract**:
+ * `apply` is a single Convex mutation (transactional); every page of the
+ * paginate loop runs inside it. The per-artifact write block (files + run +
+ * runFiles) has no per-step try/catch, so any throw propagates and rolls
+ * the whole invocation back — partial state is impossible. The
+ * `_phaseB_complete` sentinel is patched as the LAST write for each
+ * artifact; a later invocation skips rows that already carry it.
+ *
+ *   files     → `artifactFiles` (one row per (artifactId, path))
+ *   run state → `artifactRuns` + `artifactRunFiles` IF status is terminal
+ *               (completed / failed / cancelled). In-flight statuses
+ *               aren't synthesized — they weren't durable history anyway.
+ *
+ * Live-streaming rows: backfilled with the current `files[]` snapshot;
+ * subsequent settle under new code will upsert via the regular write path.
+ *
+ * Auto-invoked from `migrations.runAll` after Phase A (which synthesizes
+ * `files[]` / `entryFile` for legacy single-`content` rows). Manual
+ * invocation also supported:
+ *   `npx convex run migrations/backfill_artifact_files_table:apply`
+ */
+
+import { internalMutation } from '../_generated/server';
+
+const BATCH_SIZE = 50; // artifacts fetched per paginate() page
+
+/**
+ * Copies legacy `artifacts.files[]` / terminal run state into the dedicated
+ * tables, page by page. `skipped` counts sentinel hits AND existing rows.
+ */
+export const apply = internalMutation({
+  args: {},
+  handler: async (ctx) => {
+    let totalArtifacts = 0;
+    let totalFilesCreated = 0;
+    let totalRunsCreated = 0;
+    let totalRunFilesCreated = 0;
+    let totalSkipped = 0;
+    let cursor: string | null = null;
+    let isDone = false;
+
+    // NOTE(review): `.paginate()` runs once per loop pass inside this ONE
+    // mutation — confirm the Convex runtime in use permits repeated calls.
+    while (!isDone) {
+      const result = await ctx.db
+        .query('artifacts')
+        .paginate({ cursor, numItems: BATCH_SIZE });
+
+      for (const row of result.page) {
+        totalArtifacts += 1;
+
+        // Sentinel-based idempotency: the sentinel is patched as the LAST
+        // write for each artifact below, so a row that carries it already
+        // has all of its artifactFiles / artifactRuns / artifactRunFiles.
+        if (row._phaseB_complete === true) {
+          totalSkipped += 1;
+          continue;
+        }
+
+        const now = Date.now();
+
+        // 1. Backfill artifactFiles from legacy artifacts.files[]. Each
+        //    insert is gated by a by_artifact_path lookup so rows left by
+        //    an earlier backfill version that never wrote the sentinel are
+        //    not duplicated.
+        const legacyFiles = row.files ?? [];
+        for (const f of legacyFiles) {
+          const existing = await ctx.db
+            .query('artifactFiles')
+            .withIndex('by_artifact_path', (q) =>
+              q.eq('artifactId', row._id).eq('path', f.path),
+            )
+            .first();
+          if (existing !== null) {
+            totalSkipped += 1;
+            continue;
+          }
+          await ctx.db.insert('artifactFiles', {
+            artifactId: row._id,
+            path: f.path,
+            content: f.content,
+            createdAt: now,
+            updatedAt: now,
+          });
+          totalFilesCreated += 1;
+        }
+
+        // 2. Backfill artifactRuns + artifactRunFiles from terminal
+        //    run state. In-flight statuses (queued/installing/running)
+        //    aren't synthesized — they have no durable meaning post-refactor.
+        const runStatus = row.runStatus;
+        const isTerminal =
+          runStatus === 'completed' ||
+          runStatus === 'failed' ||
+          runStatus === 'cancelled';
+        if (isTerminal) {
+          // A pre-sentinel partial attempt may have left an artifactRuns
+          // row without all of its artifactRunFiles. An existing row (first
+          // by_artifact match) is reused as the run header; run files that
+          // are missing under it are still inserted below.
+          const existingRun = await ctx.db
+            .query('artifactRuns')
+            .withIndex('by_artifact', (q) => q.eq('artifactId', row._id))
+            .first();
+          let runId = existingRun?._id;
+          if (existingRun === null) {
+            const startedAt = row.runStartedAt ?? now;
+            runId = await ctx.db.insert('artifactRuns', {
+              artifactId: row._id,
+              status: runStatus,
+              ...(row.runExitCode !== undefined && {
+                exitCode: row.runExitCode,
+              }),
+              ...(row.runErrorCode !== undefined && {
+                errorCode: row.runErrorCode,
+              }),
+              ...(row.runErrorMessage !== undefined && {
+                errorMessage: row.runErrorMessage,
+              }),
+              startedAt,
+              ...(row.runCompletedAt !== undefined && {
+                endedAt: row.runCompletedAt,
+              }),
+              revision: row.runRevision ?? row.revision,
+              ...(row.runExecutionId !== undefined && {
+                executionId: row.runExecutionId,
+              }),
+            });
+            totalRunsCreated += 1;
+          }
+
+          if (runId !== undefined) {
+            const finalRunId = runId;
+            for (const out of row.runOutputFiles ?? []) {
+              if (out.storageId === undefined) continue;
+              const existingFile = await ctx.db
+                .query('artifactRunFiles')
+                .withIndex('by_run', (q) => q.eq('runId', finalRunId))
+                .filter((q) => q.eq(q.field('name'), out.name))
+                .first();
+              if (existingFile !== null) {
+                totalSkipped += 1;
+                continue;
+              }
+              await ctx.db.insert('artifactRunFiles', {
+                runId,
+                artifactId: row._id,
+                name: out.name,
+                storageId: out.storageId,
+                size: out.size,
+                ...(out.contentType !== undefined && {
+                  contentType: out.contentType,
+                }),
+                createdAt: now,
+              });
+              totalRunFilesCreated += 1;
+            }
+          }
+        }
+
+        // 3. LAST write: mark this artifact done. If anything above threw,
+        //    the mutation rolls back and this never lands, so a retry
+        //    re-does the artifact (the existence checks above keep it safe).
+        await ctx.db.patch(row._id, { _phaseB_complete: true });
+      }
+
+      console.log(
+        `[backfill_artifact_files_table] Batch: artifacts=${result.page.length}, filesCreated=${totalFilesCreated}, runsCreated=${totalRunsCreated}, runFilesCreated=${totalRunFilesCreated}, done=${result.isDone}`,
+      );
+
+      cursor = result.continueCursor;
+      isDone = result.isDone;
+    }
+
+    return {
+      artifacts: totalArtifacts,
+      filesCreated: totalFilesCreated,
+      runsCreated: totalRunsCreated,
+      runFilesCreated: totalRunFilesCreated,
+      skipped: totalSkipped,
+    };
+  },
+});
diff --git a/services/platform/convex/node_only/sandbox/helpers/spawner_client.ts b/services/platform/convex/node_only/sandbox/helpers/spawner_client.ts
new file mode 100644
index 000000000..b865ef954
--- /dev/null
+++ b/services/platform/convex/node_only/sandbox/helpers/spawner_client.ts
@@ -0,0 +1,614 @@
+'use node';
+
+// HTTP client for the sandbox spawner.
+//
+// HMAC-signs each request with SANDBOX_TOKEN when one is set (mirrors services/
+// sandbox/src/auth.ts); with no token, requests go unsigned. 401 = bad signature.
+
+import { createHash, createHmac } from 'node:crypto';
+
+import {
+  sandboxErrorCodeLiterals,
+  sandboxPhaseEventLiterals,
+  sandboxStepStatusLiterals,
+  type SandboxErrorCode,
+  type SandboxLanguage,
+  type SandboxPhaseEvent,
+  type SandboxStepResult,
+} from '../../../sandbox/wire';
+
+const SIGNATURE_HEADER = 'x-tale-sandbox-signature'; // hex HMAC from signRequest
+const TIMESTAMP_HEADER = 'x-tale-sandbox-timestamp'; // the timestamp that was signed
+
+interface SandboxFileBody {
+  path: string; // staged at /workspace/code/<path> (see SpawnerExecuteBody.files)
+  content: string; // file body, carried inline in the JSON request
+}
+
+interface SpawnerExecuteBody {
+  executionId: string;
+  organizationId: string;
+  language: SandboxLanguage;
+  /**
+   * Files staged at /workspace/code/<path>. Required for both single-script
+   * and multi-script modes. Mirrors `services/sandbox/src/types.ts:ExecuteRequest.files`.
+   * This is a duplicated declaration of the cross-service wire shape; the
+   * platform `executeCode` action that builds the body is typechecked
+   * against this copy. NOTE(review): confirm what guards spawner-side drift.
+   */
+  files: SandboxFileBody[];
+  /**
+   * Single-script mode: relative path inside `files[]` to exec. Mutually
+   * exclusive with `steps`; the spawner rejects payloads where both (or
+   * neither) are present.
+   */
+  entryPath?: string;
+  /**
+   * Multi-script mode body field. Paths in `files[]` that the spawner-
+   * generated wrapper invokes sequentially in the same container. See
+   * `services/sandbox/src/types.ts:ExecuteRequest.steps` for the full
+   * contract.
+   */
+  steps?: string[];
+  /**
+   * Legacy single-bucket package list. Used for single-language requests
+   * (`language: 'python' | 'node'`). Polyglot requests should use
+   * {@link packagesByLang} instead so the spawner knows which install
+   * tool to run for each bucket.
+   */
+  packages?: string[];
+  /**
+   * Per-runtime package buckets. Sent when `language === 'polyglot'` to
+   * route installs to `uv pip install` (python) and / or `npm install`
+   * (node) independently. Either bucket may be omitted; an empty or
+   * absent bucket means "skip that install".
+   */
+  packagesByLang?: {
+    python?: string[];
+    node?: string[];
+  };
+  timeoutMs?: number; // user-code budget in ms; spawnerExecute assumes 30s if omitted
+  /**
+   * Prior-run output downloads. Each entry carries a name (filename to
+   * write inside /workspace/output/) and a URL the spawner GETs to pull
+   * the bytes. URLs are pre-rewritten through `toSandboxStorageUrl()` so
+   * they target the internal Caddy alias (`http://proxy/...`) and never
+   * have to round-trip through the public hostname. Replaces the legacy
+   * inline-base64 `priorOutputFiles[]` field — see plan §1.
+   */
+  priorOutputDownloads?: Array<{ name: string; url: string }>;
+  /**
+   * Pre-allocated upload slots the spawner POSTs harvested output files
+   * to. Length = N (defaults to 2; see plan §3). When the spawner needs
+   * more slots than were pre-allocated it lazily requests additional
+   * URLs via {@link outputUrlEndpoint}.
+   */
+  outputUploadSlots: Array<{ url: string }>;
+  /**
+   * HMAC-signed callback the spawner POSTs to when it needs more upload
+   * slots than the pre-allocated pool. Server-side per-run quota counter
+   * gates how many can be granted; see plan §3.
+   */
+  outputUrlEndpoint: string;
+  /**
+   * HMAC-signed callback the spawner POSTs to AFTER each output upload
+   * succeeds. The platform records `{fileName, storageId, size,
+   * contentType}` against the audit row's `uploadedStorageIds` set so a
+   * spawner crash mid-harvest doesn't orphan blobs. See plan §3.
+   */
+  reportUploadedEndpoint: string;
+}
+
+interface SpawnerExecuteResponse {
+  status: 'completed' | 'failed' | 'cancelled';
+  exitCode: number | null; // NOTE: not shape-checked by validateExecuteResponse
+  errorCode?: SandboxErrorCode;
+  errorMessage?: string;
+  stdoutBase64: string; // canonical stdout buffer, base64-encoded
+  stderrBase64: string; // canonical stderr buffer, base64-encoded
+  durationMs: number;
+  truncated: { stdout: boolean; stderr: boolean; files: number };
+  outputFiles: {
+    name: string;
+    /**
+     * Convex `_storage` id. Replaces the legacy `contentBase64` field —
+     * the spawner now POSTs bytes directly to a pre-signed upload URL and
+     * returns the storageId Convex allocated. See plan §3.
+     */
+    storageId: string;
+    size: number;
+    contentType: string;
+    /**
+     * sha256 (hex) of the harvested bytes — populated by the spawner
+     * during `harvestOutputDir` (crispy-curry plan §1). Used to seed the
+     * cumulative `artifactOutputs` manifest entry for the next pre-stage
+     * attestation. Required (parity-guarded by `HarvestOutputFile` in
+     * `services/platform/convex/sandbox/wire.ts`); `validateExecuteResponse`
+     * rejects payloads where it is missing or empty so a wire-drift
+     * surfaces as a hard failure, not a silently-undefined sha256 downstream.
+     */
+    sha256: string;
+  }[];
+  /** Per-step results populated only for multi-step requests. */
+  steps?: SandboxStepResult[];
+  /**
+   * Optional upload telemetry. Older spawner images (built before the
+   * presigned-URL plan landed) will omit this; new ones populate it with
+   * attempted / succeeded counts plus per-failure detail. Treat as a
+   * diagnostic — not a correctness signal.
+   */
+  uploadStats?: {
+    attempted: number;
+    succeeded: number;
+    failures: Array<{
+      slotIndex: number;
+      fileName: string;
+      httpStatus: number;
+      errorSnippet: string;
+    }>;
+  };
+  /**
+   * Optional per-phase timing breakdown (ms). Helpful for tracking where
+   * the round-trip budget goes; surface to audit so we can compare TTL
+   * pressure vs the 1h `generateUploadUrl` window.
+   */
+  timing?: {
+    stageMs: number;
+    executeMs: number;
+    harvestMs: number;
+    uploadMs: number;
+  };
+  /**
+   * Pre-stage attestation (crispy-curry plan §3). For every entry in
+   * `priorOutputDownloads` the spawner reports back whether it landed on
+   * `/workspace/output/` (`staged[]`) or was skipped (`skipped[]` with a
+   * structured reason). The action diffs `staged[]` against the manifest
+   * it sent and aborts the run with `PRE_STAGE_FAILED` if any expected
+   * file is missing — BEFORE the spawner's outputFiles are promoted to
+   * fileMetadata. Omitted when the request had no `priorOutputDownloads`.
+   */
+  priorStage?: {
+    staged: Array<{ name: string; bytes: number; sha256: string }>;
+    skipped: Array<{
+      name: string;
+      reason:
+        | 'unsafe_path'
+        | 'fetch_failed'
+        | 'http_error'
+        | 'url_expired'
+        | 'write_failed';
+      detail: string;
+    }>;
+  };
+}
+
+const SANDBOX_ERROR_CODE_SET: ReadonlySet<string> = new Set(
+  sandboxErrorCodeLiterals, // accepted `errorCode` values on a result event
+);
+const SANDBOX_PHASE_SET: ReadonlySet<string> = new Set(
+  sandboxPhaseEventLiterals, // accepted `phase` values on a phase event
+);
+const SANDBOX_STEP_STATUS_SET: ReadonlySet<string> = new Set(
+  sandboxStepStatusLiterals, // accepted per-step `status` values
+);
+
+// Builds the hex HMAC-SHA256 request signature. The signed payload is the
+// newline-joined tuple [METHOD, path, timestamp, sha256Hex(body)], keyed by
+// `token` (format mirrors services/sandbox/src/auth.ts). Covering method,
+// path and timestamp keeps a captured /v1/execute signature from being
+// replayed against /v1/cancel/:id, and caps the replay window to the
+// spawner's 60s clock-skew tolerance.
+function signRequest(
+  method: string,
+  path: string,
+  timestamp: string,
+  body: string,
+  token: string,
+): string {
+  const digest = createHash('sha256').update(body).digest('hex');
+  const payload = [method.toUpperCase(), path, timestamp, digest].join('\n');
+  return createHmac('sha256', token).update(payload).digest('hex');
+}
+
+function getSpawnerUrl(): string {
+  // Base URL of the spawner, without a trailing slash. Defaults to host
+  // loopback (the RAG_URL / CRAWLER_URL convention) so a host-run convex
+  // backend reaches the published port; Docker compose sets SANDBOX_URL.
+  // Empty/blank counts as unset, matching getSpawnerToken.
+  const configured = process.env.SANDBOX_URL?.trim();
+  return (configured || 'http://localhost:8003').replace(/\/+$/, '');
+}
+
+function getSpawnerToken(): string | null {
+  // HMAC signing is opt-in. A missing or empty SANDBOX_TOKEN yields null,
+  // in which case this client sends unsigned requests and the spawner
+  // skips signature verification. `tale deploy` auto-mints a token via
+  // ensure-env for production deploys.
+  const raw = process.env.SANDBOX_TOKEN;
+  return raw === undefined || raw === '' ? null : raw;
+}
+
+interface SpawnerExecuteCallbacks {
+  /** Awaited for every `phase` SSE event whose value is a known phase. */
+  onPhase?: (phase: SandboxPhaseEvent) => Promise<void> | void;
+  /**
+   * Live stdout tail. Called for each `stdout` SSE event with non-empty
+   * text; per the spawner contract that is one line with PHASE markers
+   * stripped and the trailing newline preserved. The action appends it to
+   * the canvas's `runStdoutPreview`, coalescing several invocations into a
+   * single mutation per ~250 ms (or threshold bytes).
+   */
+  onStdout?: (text: string) => void;
+  /** Live stderr tail. Fires per spawner-side chunk (not line-buffered). */
+  onStderr?: (text: string) => void;
+}
+
+// Spawner overhead budget above the user-code timeout: container pull/start,
+// pip/npm install streaming, harvest + bytes-out. Keeps the fetch ceiling
+// above the spawner-side wall clock so a healthy long run isn't aborted by
+// the client. Anything beyond this is genuinely stuck (the SSE stream has
+// stalled past any plausible processing), so abort and let the caller route
+// through `failExecution` → `SPAWNER_UNAVAILABLE` rather than wait for the
+// 30-min Convex action ceiling.
+const SPAWNER_FETCH_OVERHEAD_MS = 60_000;
+const SPAWNER_DEFAULT_TIMEOUT_MS = 30_000;
+
+/**
+ * POST /v1/execute and consume the SSE reply. `phase`, `stdout` and
+ * `stderr` events are forwarded to the matching callback; the last valid
+ * `result` event is returned once the stream ends.
+ *
+ * Throws on transport failure, abort / client timeout, any non-2xx status,
+ * a malformed `result`, an `error` event, or a stream that ends without a
+ * `result`. A well-formed result is returned as-is (even `failed` /
+ * `cancelled`) so the caller can decide failure semantics.
+ */
+export async function spawnerExecute(
+  body: SpawnerExecuteBody,
+  signal: AbortSignal,
+  callbacks: SpawnerExecuteCallbacks = {},
+): Promise<SpawnerExecuteResponse> {
+  const baseUrl = getSpawnerUrl();
+  const url = `${baseUrl}/v1/execute`;
+  const path = new URL(url).pathname;
+  const token = getSpawnerToken();
+  const bodyJson = JSON.stringify(body);
+  const timestamp = String(Date.now());
+
+  const headers: Record<string, string> = {
+    'content-type': 'application/json',
+    accept: 'text/event-stream',
+  };
+  if (token !== null) {
+    headers[SIGNATURE_HEADER] = signRequest(
+      'POST',
+      path,
+      timestamp,
+      bodyJson,
+      token,
+    );
+    headers[TIMESTAMP_HEADER] = timestamp;
+  }
+
+  // Independent client-side timeout. Without this a stalled SSE stream
+  // (network or spawner hang) would block the Convex action until its 30-min
+  // hard limit, wasting the slot. Combine with the caller's abort signal so
+  // user-stop still aborts immediately.
+  const fetchTimeoutMs =
+    (body.timeoutMs ?? SPAWNER_DEFAULT_TIMEOUT_MS) + SPAWNER_FETCH_OVERHEAD_MS;
+  const fetchAbort = AbortSignal.any([
+    signal,
+    AbortSignal.timeout(fetchTimeoutMs),
+  ]);
+
+  let res: Response;
+  try {
+    res = await fetch(url, {
+      method: 'POST',
+      headers,
+      body: bodyJson,
+      signal: fetchAbort,
+    });
+  } catch (err) {
+    throw new Error(
+      `sandbox spawner unreachable at ${url}: ${err instanceof Error ? err.message : String(err)}`,
+      { cause: err },
+    );
+  }
+
+  if (res.status === 401) {
+    throw new Error(
+      'sandbox spawner rejected request (401) — SANDBOX_TOKEN mismatch between Convex and spawner',
+    );
+  }
+  if (res.status === 429) {
+    throw new Error('sandbox spawner busy (429) — concurrency cap reached');
+  }
+  if (res.status === 413) {
+    throw new Error(
+      'sandbox spawner refused payload (413) — request body exceeds spawner cap',
+    );
+  }
+  if (!res.ok) {
+    const text = await res.text().catch((err) => {
+      console.warn(`[spawnerExecute] failed to read error body:`, err);
+      return '';
+    });
+    throw new Error(`sandbox spawner ${res.status}: ${text || res.statusText}`);
+  }
+  if (!res.body) {
+    throw new Error('sandbox spawner returned no body');
+  }
+
+  // SSE parser: events are separated by `\n\n`; each event has `event:` and
+  // `data:` lines. CRLF is folded to LF on the accumulated buffer (not per
+  // chunk) so a `\r\n` pair split across two reads still normalizes. Complete
+  // events are dispatched as they arrive; the final result is captured.
+  const reader = res.body.getReader();
+  const decoder = new TextDecoder('utf-8');
+  let buf = '';
+  let finalResult: SpawnerExecuteResponse | null = null;
+  let errorEvent: string | null = null;
+
+  while (true) {
+    const { done, value } = await reader.read();
+    if (done) break;
+    buf += decoder.decode(value, { stream: true });
+    buf = buf.replace(/\r\n/g, '\n');
+    let boundary: number;
+    while ((boundary = buf.indexOf('\n\n')) !== -1) {
+      const eventText = buf.slice(0, boundary);
+      buf = buf.slice(boundary + 2);
+      const parsed = parseSseEvent(eventText);
+      if (!parsed) continue;
+      if (parsed.event === 'phase') {
+        const rawPhase = parsed.data.phase;
+        if (
+          typeof rawPhase === 'string' &&
+          SANDBOX_PHASE_SET.has(rawPhase) &&
+          callbacks.onPhase
+        ) {
+          try {
+            // The SANDBOX_PHASE_SET.has(rawPhase) guard above proves the
+            // string is one of the literals the callback expects; the lint
+            // rule still flags the assertion at this wire-shape boundary.
+            // oxlint-disable-next-line typescript-eslint/no-unsafe-type-assertion
+            await callbacks.onPhase(rawPhase as SandboxPhaseEvent);
+          } catch (err) {
+            // Log but don't abort the run — the artifact patch is a UX
+            // nice-to-have; the audit + final result still complete.
+            console.warn(`[spawnerExecute] onPhase callback failed:`, err);
+          }
+        }
+      } else if (parsed.event === 'stdout') {
+        const text = parsed.data.text;
+        if (typeof text === 'string' && text.length > 0 && callbacks.onStdout) {
+          try {
+            callbacks.onStdout(text);
+          } catch (err) {
+            // Same posture as `onPhase`: log, don't abort — the live tail is
+            // a UX enhancement; the final `result` event still carries the
+            // canonical base64'd stdout/stderr buffer.
+            console.warn(`[spawnerExecute] onStdout callback failed:`, err);
+          }
+        }
+      } else if (parsed.event === 'stderr') {
+        const text = parsed.data.text;
+        if (typeof text === 'string' && text.length > 0 && callbacks.onStderr) {
+          try {
+            callbacks.onStderr(text);
+          } catch (err) {
+            console.warn(`[spawnerExecute] onStderr callback failed:`, err);
+          }
+        }
+      } else if (parsed.event === 'result') {
+        const validated = validateExecuteResponse(parsed.data);
+        if (validated) {
+          finalResult = validated;
+        } else {
+          // Stop pulling bytes before bailing so the connection is released.
+          await reader.cancel().catch(() => undefined);
+          throw new Error('sandbox spawner result event has malformed payload');
+        }
+      } else if (parsed.event === 'error') {
+        const rawMessage = parsed.data.message;
+        errorEvent =
+          typeof rawMessage === 'string' && rawMessage.length > 0
+            ? rawMessage
+            : 'sandbox spawner error';
+      }
+    }
+  }
+
+  if (errorEvent !== null) {
+    throw new Error(`sandbox spawner SSE error: ${errorEvent}`);
+  }
+  if (finalResult === null) {
+    throw new Error('sandbox spawner stream ended without a result event');
+  }
+  return finalResult;
+}
+
+function parseSseEvent(
+  block: string,
+): { event: string; data: Record<string, unknown> } | null {
+  // One SSE block → { event (default 'message'), data: JSON object }; null when unusable.
+  let name = 'message';
+  let payload: string | null = null;
+  for (const line of block.split('\n')) {
+    if (line.startsWith('event:')) {
+      name = line.slice('event:'.length).trim();
+      continue;
+    }
+    if (!line.startsWith('data:')) continue;
+    const piece = line.slice('data:'.length).trimStart();
+    payload = payload === null ? piece : `${payload}\n${piece}`;
+  }
+  if (payload === null) return null;
+  let decoded: unknown;
+  try {
+    decoded = JSON.parse(payload);
+  } catch (err) {
+    console.warn(`[spawnerExecute] SSE event parse failed:`, err);
+    return null;
+  }
+  const isPlainObject =
+    typeof decoded === 'object' && decoded !== null && !Array.isArray(decoded);
+  if (!isPlainObject) return null;
+  // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- wire JSON; the object guard above rules out null/array, so indexing string keys is sound
+  return { event: name, data: decoded as Record<string, unknown> };
+}
+
+/** True for a JSON object on the wire: a non-null, non-array `object`. */
+function isWireObject(v: unknown): v is Record<string, unknown> {
+  return v !== null && typeof v === 'object' && !Array.isArray(v);
+}
+
+/**
+ * Narrow the result event payload to `SpawnerExecuteResponse`. Returns
+ * null on shape mismatch — caller throws so the action fails through the
+ * normal failExecution path rather than producing partial state.
+ *
+ * Optional sections (`errorCode`, `steps`, `uploadStats`, `timing`,
+ * `priorStage`) are checked only when present; `exitCode`, `errorMessage`
+ * and `truncated` are not inspected here.
+ */
+function validateExecuteResponse(
+  raw: Record<string, unknown>,
+): SpawnerExecuteResponse | null {
+  if (
+    raw.status !== 'completed' &&
+    raw.status !== 'failed' &&
+    raw.status !== 'cancelled'
+  ) {
+    return null;
+  }
+  if (
+    raw.errorCode !== undefined &&
+    (typeof raw.errorCode !== 'string' ||
+      !SANDBOX_ERROR_CODE_SET.has(raw.errorCode))
+  ) {
+    return null;
+  }
+  if (typeof raw.stdoutBase64 !== 'string') return null;
+  if (typeof raw.stderrBase64 !== 'string') return null;
+  if (typeof raw.durationMs !== 'number') return null;
+  if (!Array.isArray(raw.outputFiles)) return null;
+  // Each outputFile must carry a Convex storageId (the spawner POSTed the
+  // bytes to a pre-signed upload URL during harvest) and a sha256, so the
+  // downstream insert can write both without ambiguity.
+  for (const f of raw.outputFiles) {
+    if (!isWireObject(f)) return null;
+    if (typeof f.name !== 'string') return null;
+    if (typeof f.storageId !== 'string' || f.storageId.length === 0) {
+      return null;
+    }
+    if (typeof f.size !== 'number') return null;
+    if (typeof f.contentType !== 'string') return null;
+    if (typeof f.sha256 !== 'string' || f.sha256.length === 0) return null;
+  }
+  // steps is optional, but if present must be a typed array of step
+  // results — refuse the payload otherwise so a wire-drift surfaces as
+  // a hard failure rather than a silently-typecast garbage object.
+  if (raw.steps !== undefined) {
+    if (!Array.isArray(raw.steps)) return null;
+    for (const s of raw.steps) {
+      if (!isWireObject(s)) return null;
+      if (typeof s.path !== 'string') return null;
+      if (
+        typeof s.status !== 'string' ||
+        !SANDBOX_STEP_STATUS_SET.has(s.status)
+      ) {
+        return null;
+      }
+      if (s.exitCode !== null && typeof s.exitCode !== 'number') return null;
+      if (typeof s.durationMs !== 'number') return null;
+    }
+  }
+  // uploadStats / timing are optional diagnostic fields. If present they
+  // must be well-formed objects so a wire-drift surfaces as a hard fail.
+  if (raw.uploadStats !== undefined) {
+    const us = raw.uploadStats;
+    if (!isWireObject(us)) return null;
+    if (typeof us.attempted !== 'number') return null;
+    if (typeof us.succeeded !== 'number') return null;
+    if (!Array.isArray(us.failures)) return null;
+    for (const fe of us.failures) {
+      if (!isWireObject(fe)) return null;
+      if (typeof fe.slotIndex !== 'number') return null;
+      if (typeof fe.fileName !== 'string') return null;
+      if (typeof fe.httpStatus !== 'number') return null;
+      if (typeof fe.errorSnippet !== 'string') return null;
+    }
+  }
+  if (raw.timing !== undefined) {
+    const t = raw.timing;
+    if (!isWireObject(t)) return null;
+    if (typeof t.stageMs !== 'number') return null;
+    if (typeof t.executeMs !== 'number') return null;
+    if (typeof t.harvestMs !== 'number') return null;
+    if (typeof t.uploadMs !== 'number') return null;
+  }
+  // priorStage feeds the caller's pre-stage attestation, so it gets the
+  // same present-means-well-formed treatment as the sections above.
+  if (raw.priorStage !== undefined) {
+    const ps = raw.priorStage;
+    if (!isWireObject(ps)) return null;
+    if (!Array.isArray(ps.staged) || !Array.isArray(ps.skipped)) return null;
+    for (const st of ps.staged) {
+      if (!isWireObject(st)) return null;
+      if (typeof st.name !== 'string') return null;
+      if (typeof st.bytes !== 'number') return null;
+      if (typeof st.sha256 !== 'string') return null;
+    }
+    for (const sk of ps.skipped) {
+      if (!isWireObject(sk)) return null;
+      if (typeof sk.name !== 'string') return null;
+      if (typeof sk.reason !== 'string') return null;
+      if (typeof sk.detail !== 'string') return null;
+    }
+  }
+  // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- shape-checked above; remaining nullable fields default at caller
+  return raw as unknown as SpawnerExecuteResponse;
+}
+
+export async function spawnerCancel(executionId: string): Promise<void> {
+  // Best-effort POST /v1/cancel/:id. Transport errors and non-2xx answers
+  // are logged, not thrown — the watchdog cron reaps stuck rows regardless.
+  const url = `${getSpawnerUrl()}/v1/cancel/${encodeURIComponent(executionId)}`;
+  const path = new URL(url).pathname;
+  const token = getSpawnerToken();
+  const body = '';
+  const timestamp = String(Date.now());
+  const headers: Record<string, string> = {
+    'content-type': 'application/json',
+  };
+  if (token !== null) {
+    headers[SIGNATURE_HEADER] = signRequest(
+      'POST',
+      path,
+      timestamp,
+      body,
+      token,
+    );
+    headers[TIMESTAMP_HEADER] = timestamp;
+  }
+  try {
+    // 5s cap: an unreachable spawner must not stall user-Stop for minutes.
+    const res = await fetch(url, {
+      method: 'POST',
+      headers,
+      body,
+      signal: AbortSignal.timeout(5_000),
+    });
+    if (!res.ok) {
+      console.warn(`[spawnerCancel] ${executionId}: spawner replied ${res.status}`);
+    }
+    await res.body?.cancel(); // drop the unread body so the socket is freed
+  } catch (err) {
+    console.warn(
+      `[spawnerCancel] best-effort cancel failed for ${executionId}:`,
+      err,
+    );
+  }
+}
diff --git a/services/platform/convex/node_only/sandbox/internal_actions.ts b/services/platform/convex/node_only/sandbox/internal_actions.ts
new file mode 100644
index 000000000..c35787480
--- /dev/null
+++ b/services/platform/convex/node_only/sandbox/internal_actions.ts
@@ -0,0 +1,1303 @@
+'use node';
+
+// `executeCode` — the action the `artifact_run` agent tool calls.
+//
+// Owns the spawner round-trip + storage transactionality:
+//   1. reserveSlotAndInsert mutation (atomic quota + audit row insert).
+//   2. setRunning('installing') mutation + start a 60s heartbeat loop.
+//   3. POST /v1/execute on the spawner with AbortSignal wired through.
+//   4. Upload every output blob; if all succeed, single batched
+//      `insertOutputFiles` mutation. On any storage failure, delete the
+//      blobs we already wrote so we don't orphan `_storage`.
+//   5. Upload stdout/stderr to `_storage` when over the preview cap.
+//   6. finalize mutation with the structured result.
+//
+// Every failure path goes through the same `failExecution` helper which
+// finalizes the audit row, finalizes the artifact row if one was tied to
+// this run, and rolls back any uploaded storage blobs. This makes the
+// "canvas spinner stuck forever" failure mode (R1 finding) structurally
+// impossible — there is one terminate-and-clean code path, not six.
+//
+// Error rule:
+//   - Infrastructure failures (spawner unreachable, action timeout, quota
+//     mutation throw) → finalize + THROW so the agent SDK surfaces them.
+//   - User-code failures (exit ≠ 0, sandbox timeout, OOM, install failure)
+//     → finalize + RETURN structured result so the LLM can read and react.
+
+import { ConvexError, v } from 'convex/values';
+
+import { internal } from '../../_generated/api';
+import type { Id } from '../../_generated/dataModel';
+import { internalAction, type ActionCtx } from '../../_generated/server';
+import { toSandboxStorageUrl } from '../../lib/helpers/public_storage_url';
+import {
+  SANDBOX_CODE_PREVIEW_MAX,
+  SANDBOX_DEFAULT_TIMEOUT_MS,
+  SANDBOX_MAX_OUTPUT_FILES_PER_RUN,
+  SANDBOX_MAX_TIMEOUT_MS,
+  SANDBOX_OUTPUT_UPLOAD_SLOTS_PREALLOC,
+  SANDBOX_STDERR_PREVIEW_MAX,
+  SANDBOX_STDOUT_PREVIEW_MAX,
+} from '../../sandbox/schema';
+import {
+  sandboxErrorCodeValidator,
+  sandboxLanguageValidator,
+  sandboxStepResultValidator,
+  type SandboxErrorCode,
+  type SandboxRunProgressKind,
+  type SandboxStepResult,
+} from '../../sandbox/wire';
+import { spawnerCancel, spawnerExecute } from './helpers/spawner_client';
+
+const HEARTBEAT_INTERVAL_MS = 60 * 1_000; // ms between heartbeat mutations
+
+// Explicit handler return type. Required to break a self-referential type
+// cycle: without it, the inferred type of `executeCode` depends on its own
+// handler's return type (which reaches `internal.sandbox.*` through
+// `_generated/api.d.ts`). The cycle collapses every Convex consumer in the
+// codebase to `any` — see PR #1727 CI breakage.
+type ExecuteCodeResult = {
+  executionId: Id<'sandboxExecutions'>;
+  success: boolean;
+  status: 'completed' | 'failed' | 'cancelled';
+  exitCode: number | null; // null when no exit code was captured
+  errorCode?: SandboxErrorCode;
+  errorMessage?: string;
+  stdoutPreview: string;
+  stderrPreview: string;
+  durationMs: number;
+  truncated: { stdout: boolean; stderr: boolean; files: number };
+  files: Array<{
+    name: string;
+    fileMetadataId: Id<'fileMetadata'>;
+    storageId: Id<'_storage'>;
+    size: number;
+    contentType: string;
+    sha256?: string; // optional, kept in step with the `returns` validator
+  }>;
+  steps?: SandboxStepResult[];
+  /**
+   * Pre-stage attestation summary surfaced from the spawner. Populated on
+   * every artifact-bound run that had prior-output downloads; omitted
+   * otherwise. The agent tool re-shapes it so the LLM sees which prior files
+   * reached `/workspace/output/` and which were skipped (structured reason).
+   */
+  preStage?: {
+    staged: string[];
+    skipped: Array<{ name: string; reason: string; detail: string }>;
+  };
+};
+
+interface FailContext {
+  ctx: ActionCtx; // used for the finalize mutations and storage deletes
+  executionId: Id<'sandboxExecutions'>; // audit row to finalize
+  artifactId?: Id<'artifacts'>; // artifact row to finalize, when bound
+  uploadedStorageIds: Set<string>; // blobs written so far; rolled back
+  startedAt: number; // epoch ms; `durationMs` is measured from here
+}
+
+/**
+ * Roll back `_storage` blobs recorded in the action's in-memory set. Used by
+ * `failExecution` AND by the success path when `insertOutputFiles` reports
+ * `skippedTerminal` (race with user-cancel). A failed delete is logged and
+ * skipped; the set is then cleared so the finally block doesn't double-free.
+ */
+async function rollbackUploadedBlobs(
+  ctx: { storage: { delete: (id: Id<'_storage'>) => Promise<void> } },
+  ids: Set<string>,
+  context: string,
+): Promise<void> {
+  for (const sid of ids) {
+    try {
+      // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- delete needs Id<'_storage'>
+      await ctx.storage.delete(sid as unknown as Id<'_storage'>);
+    } catch (err) {
+      console.warn(`[${context}] storage.delete(${sid}) failed:`, err);
+    }
+  }
+  ids.clear();
+}
+
+/**
+ * One-stop failure handler. Rolls back any `_storage` blobs we already wrote,
+ * finalizes the audit row, then finalizes the artifact row when one is bound
+ * (so the canvas spinner stops). Finalize errors are logged, never thrown;
+ * always returns the structured result the caller can `return` directly.
+ */
+async function failExecution(
+  fc: FailContext,
+  status: 'failed' | 'cancelled',
+  errorCode: SandboxErrorCode,
+  errorMessage: string,
+  extra?: {
+    stdoutPreview?: string;
+    stderrPreview?: string;
+    exitCode?: number | null;
+  },
+): Promise<ExecuteCodeResult> {
+  const durationMs = Date.now() - fc.startedAt;
+  await rollbackUploadedBlobs(
+    fc.ctx,
+    fc.uploadedStorageIds,
+    'sandbox.failExecution',
+  );
+
+  try {
+    await fc.ctx.runMutation(internal.sandbox.internal_mutations.finalize, {
+      executionId: fc.executionId,
+      status,
+      errorCode,
+      errorMessage,
+      ...(extra?.stdoutPreview !== undefined && {
+        stdoutPreview: extra.stdoutPreview,
+      }),
+      ...(extra?.stderrPreview !== undefined && {
+        stderrPreview: extra.stderrPreview,
+      }),
+      ...(extra?.exitCode !== undefined &&
+        extra.exitCode !== null && { exitCode: extra.exitCode }),
+      outputFiles: [], // blobs were rolled back above; nothing to record
+      durationMs,
+      actualSeconds: durationMs / 1000, // same elapsed time, in seconds
+    });
+  } catch (err) {
+    console.warn(`[sandbox.failExecution] audit finalize failed:`, err);
+  }
+
+  if (fc.artifactId) {
+    try {
+      await fc.ctx.runMutation(
+        internal.artifacts.internal_mutations.finalizeArtifactRun,
+        {
+          artifactId: fc.artifactId,
+          runStatus: status,
+          runErrorCode: errorCode,
+          runErrorMessage: errorMessage,
+          ...(extra?.exitCode !== undefined &&
+            extra.exitCode !== null && { runExitCode: extra.exitCode }),
+          ...(extra?.stdoutPreview !== undefined && {
+            runStdoutPreview: extra.stdoutPreview,
+          }),
+          ...(extra?.stderrPreview !== undefined && {
+            runStderrPreview: extra.stderrPreview,
+          }),
+          runOutputFiles: [], // mirrors the audit row: no surviving files
+          runExecutionId: fc.executionId,
+        },
+      );
+    } catch (err) {
+      console.warn(`[sandbox.failExecution] artifact finalize failed:`, err);
+    }
+  }
+
+  return {
+    executionId: fc.executionId,
+    success: false,
+    status,
+    exitCode: extra?.exitCode ?? null, // absent and null both surface as null
+    errorCode,
+    errorMessage,
+    stdoutPreview: extra?.stdoutPreview ?? '',
+    stderrPreview: extra?.stderrPreview ?? '',
+    durationMs,
+    truncated: { stdout: false, stderr: false, files: 0 },
+    files: [],
+  };
+}
+
+/**
+ * First package spec → install-banner progress: `python-pptx==1.0.2` and
+ * `@scope/pkg@1.2.3` give name + version, `requests>=2` the name only;
+ * a missing, blank, or unparseable spec falls back to plain `installing`.
+ */
+function buildInstallProgress(packages: string[] | undefined): {
+  kind: SandboxRunProgressKind;
+  package?: string;
+  version?: string;
+} {
+  const first = packages?.[0]?.trim();
+  if (!first) return { kind: 'installing' };
+  // Optional `@scope/` prefix keeps scoped npm names (`@types/node`) whole.
+  const m = /^((?:@[^@/\s]+\/)?[^@=<>!~]+)(?:[@=]=?([^@=<>!~ ]+))?/.exec(first);
+  const name = m?.[1]?.trim();
+  if (!name) return { kind: 'installing' };
+  const version = m?.[2]?.trim();
+  return {
+    kind: 'installingPackage',
+    package: name,
+    ...(version !== undefined && { version }),
+  };
+}
+
+export const executeCode = internalAction({
+  args: {
+    organizationId: v.string(),
+    uploadedBy: v.string(),
+    threadId: v.optional(v.string()),
+    messageId: v.optional(v.string()),
+    toolCallId: v.optional(v.string()),
+    agentSlug: v.optional(v.string()),
+
+    language: sandboxLanguageValidator,
+    /**
+     * Files to stage under /workspace/code/<path>. Required for both
+     * modes — single-script needs the entry file, multi-script needs every
+     * step's file. Forwarded verbatim to the spawner; the spawner
+     * re-validates path safety.
+     */
+    files: v.array(v.object({ path: v.string(), content: v.string() })),
+    /**
+     * Single-script mode: relative path inside `files[]` to exec. The
+     * runtime entrypoint receives this and exec()s `/workspace/code/<entryPath>`
+     * directly — no synthetic mirror. Mutually exclusive with `steps`;
+     * the mutex is enforced below before the reservation mutation, and
+     * re-enforced at the spawner boundary.
+     */
+    entryPath: v.optional(v.string()),
+    /**
+     * Multi-script mode: paths inside `files[]` to execute sequentially
+     * in the same container. See artifact_run_tool / spawner ExecuteRequest
+     * for the full contract. Mutually exclusive with `entryPath`.
+     */
+    steps: v.optional(v.array(v.string())),
+    /**
+     * Legacy single-bucket package list. For `language: 'python' | 'node'`
+     * requests, this routes to whichever installer matches. Mutually
+     * compatible with {@link packagesByLang} — when both are set, the
+     * action sends both fields verbatim and the spawner picks the right
+     * one per language.
+     */
+    packages: v.optional(v.array(v.string())),
+    /**
+     * Per-language package buckets. Required for `language: 'polyglot'`
+     * (the spawner installs both buckets in one container). For single-
+     * language requests, the bucket matching `language` is used and the
+     * other is ignored.
+     */
+    packagesByLang: v.optional(
+      v.object({
+        python: v.optional(v.array(v.string())),
+        node: v.optional(v.array(v.string())),
+      }),
+    ),
+    timeoutMs: v.optional(v.number()),
+    // NOTE: `allowSdist` / `allowInstallScripts` are intentionally NOT
+    // accepted as action args. The spawner-side install guards (`pip
+    // --only-binary=:all:` and `npm --ignore-scripts`) are hardcoded
+    // server-side here so a prompt-injected LLM cannot disable them
+    // (round-2 R2-B4). To grant a per-org carve-out, add an
+    // `orgs.sandboxPolicy` table and gate the override there instead of
+    // surfacing the knob to the LLM.
+    purpose: v.string(),
+    // When set, the action wires PHASE events from the spawner SSE to
+    // patchArtifactRunProgress and finalizeArtifactRun — canvas shows
+    // live progress instead of a frozen spinner.
+    artifactId: v.optional(v.id('artifacts')),
+    /**
+     * Pre-stage source override. Default behaviour ("latest succeeded
+     * run") applies when omitted or when `fromRun === 'latest'`. Pass a
+     * specific `artifactRuns` row id to pin pre-staging to that run.
+     */
+    inputs: v.optional(
+      v.object({
+        fromRun: v.string(),
+      }),
+    ),
+  },
+  returns: v.object({
+    executionId: v.id('sandboxExecutions'),
+    success: v.boolean(),
+    status: v.union(
+      v.literal('completed'),
+      v.literal('failed'),
+      v.literal('cancelled'),
+    ),
+    exitCode: v.union(v.number(), v.null()),
+    errorCode: v.optional(sandboxErrorCodeValidator),
+    errorMessage: v.optional(v.string()),
+    stdoutPreview: v.string(),
+    stderrPreview: v.string(),
+    durationMs: v.number(),
+    truncated: v.object({
+      stdout: v.boolean(),
+      stderr: v.boolean(),
+      files: v.number(),
+    }),
+    files: v.array(
+      v.object({
+        name: v.string(),
+        fileMetadataId: v.id('fileMetadata'),
+        storageId: v.id('_storage'),
+        size: v.number(),
+        contentType: v.string(),
+        sha256: v.optional(v.string()),
+      }),
+    ),
+    steps: v.optional(v.array(sandboxStepResultValidator)),
+    // Pre-stage attestation surfaced from the spawner — present whenever
+    // the request had `priorOutputDownloads`. `staged[]` is the list of
+    // names that actually landed in /workspace/output/ before user code
+    // ran; `skipped[]` carries any expected files the spawner couldn't
+    // stage, with a structured reason. When skipped[] is non-empty, the
+    // action takes the PRE_STAGE_FAILED path; this field still lets the
+    // LLM-facing tool show what worked vs what didn't.
+    preStage: v.optional(
+      v.object({
+        staged: v.array(v.string()),
+        skipped: v.array(
+          v.object({
+            name: v.string(),
+            reason: v.string(),
+            detail: v.string(),
+          }),
+        ),
+      }),
+    ),
+  }),
+  handler: async (ctx, args): Promise<ExecuteCodeResult> => {
+    // Exactly one of `entryPath` or `steps` must be set. The spawner
+    // enforces this at the wire boundary, but we re-check here so a
+    // misuse from another caller (e.g. a future free-form executor)
+    // fails fast with a useful diagnostic instead of confusing 400s
+    // from the spawner.
+    const entryProvided = args.entryPath !== undefined;
+    const stepsProvided = args.steps !== undefined && args.steps.length > 0;
+    if (entryProvided === stepsProvided) {
+      throw new ConvexError({
+        code: 'INPUT_REJECTED',
+        message:
+          'executeCode requires exactly one of `entryPath` (single-script) or `steps[]` (multi-script).',
+      });
+    }
+    if (args.files.length === 0) {
+      throw new ConvexError({
+        code: 'INPUT_REJECTED',
+        message: 'executeCode requires `files[]` carrying the script contents.',
+      });
+    }
+
+    const timeoutMs = Math.min(
+      Math.max(args.timeoutMs ?? SANDBOX_DEFAULT_TIMEOUT_MS, 1_000),
+      SANDBOX_MAX_TIMEOUT_MS,
+    );
+    const estimatedSeconds = Math.ceil(timeoutMs / 1000);
+
+    // ---- codePreview / codeStorageId split ----
+    // Single-script mode: persist the entry file's content as the executed
+    // source. Multi-step mode: the spawner generates the executed wrapper
+    // itself, so persist a stable synthesized preview keyed off the step
+    // list — the audit row still shows what was requested without
+    // falsely advertising any of the user's individual scripts as "the
+    // executed code".
+    const sourceForPreview = entryProvided
+      ? (args.files.find((f) => f.path === args.entryPath)?.content ?? '')
+      : `[multi-step] ${args.steps?.join(' → ') ?? ''}`;
+    const codeBytes = Buffer.byteLength(sourceForPreview, 'utf8');
+    let codePreview = sourceForPreview;
+    let codeStorageId: Id<'_storage'> | undefined;
+    if (codeBytes > SANDBOX_CODE_PREVIEW_MAX) {
+      const blob = new Blob([sourceForPreview], { type: 'text/plain' });
+      codeStorageId = await ctx.storage.store(blob);
+      codePreview = sourceForPreview.slice(0, SANDBOX_CODE_PREVIEW_MAX);
+    }
+
+    // ---- atomic reservation (concurrent cap + daily CPU budget + insert) ----
+    // If reservation throws (QUOTA_EXCEEDED, daily budget, etc.) the blob we
+    // just stored is orphaned — it never lands on an audit row to be owned.
+    // The wider `failExecution`-driven rollback set isn't yet constructed at
+    // this point, so we delete here in the catch (audit finding R2-B7 #1).
+    let executionId: Id<'sandboxExecutions'>;
+    try {
+      executionId = await ctx.runMutation(
+        internal.sandbox.internal_mutations.reserveSlotAndInsert,
+        {
+          organizationId: args.organizationId,
+          uploadedBy: args.uploadedBy,
+          ...(args.threadId !== undefined && { threadId: args.threadId }),
+          ...(args.messageId !== undefined && { messageId: args.messageId }),
+          ...(args.toolCallId !== undefined && {
+            toolCallId: args.toolCallId,
+          }),
+          ...(args.agentSlug !== undefined && { agentSlug: args.agentSlug }),
+          ...(args.artifactId !== undefined && { artifactId: args.artifactId }),
+          // Audit-row attribution: single-script → the executed file;
+          // multi-step → the first step (still a meaningful pointer into
+          // the artifact tree for forensic grep).
+          path: args.entryPath ?? args.steps?.[0] ?? '<unknown>',
+          language: args.language,
+          purpose: args.purpose,
+          codePreview,
+          ...(codeStorageId !== undefined && { codeStorageId }),
+          // Audit-row attribution: flatten polyglot buckets back into a
+          // single list so historical grep ("which runs installed
+          // markitdown?") still works regardless of which language route
+          // they took. Order: legacy `packages` first, then python bucket,
+          // then node bucket — preserves the "first spec wins" semantics
+          // that `buildInstallProgress` relies on for the install banner.
+          packages: [
+            ...(args.packages ?? []),
+            ...(args.packagesByLang?.python ?? []),
+            ...(args.packagesByLang?.node ?? []),
+          ],
+          // installOptions is intentionally NOT forwarded: install-safety
+          // is hardcoded server-side (round-2 R2-B4). The schema field
+          // remains optional for backward compatibility with old rows.
+          estimatedSeconds,
+        },
+      );
+    } catch (err) {
+      // Reservation failed — the codeStorageId blob is now orphaned. Delete
+      // it before propagating so a quota-bounce-loop doesn't accrete
+      // unowned `_storage` rows (audit finding R2-B7 #1).
+      if (codeStorageId !== undefined) {
+        try {
+          await ctx.storage.delete(codeStorageId);
+        } catch (deleteErr) {
+          console.warn(
+            '[sandbox.executeCode] codeStorageId rollback after reservation failure failed:',
+            deleteErr,
+          );
+        }
+      }
+      // Quota errors are user-facing — surface as ConvexError. The tool's
+      // wrapper translates this into structured agent-visible output.
+      if (
+        err instanceof ConvexError &&
+        typeof err.data === 'object' &&
+        err.data !== null &&
+        // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- ConvexError data shape is loose
+        (err.data as { code?: string }).code === 'QUOTA_EXCEEDED'
+      ) {
+        const dataMessage =
+          err.data && typeof err.data === 'object' && 'message' in err.data
+            ? // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- ConvexError data shape is loose; we just type-narrowed the message key
+              String((err.data as { message?: string }).message)
+            : 'Sandbox quota exceeded';
+        throw new ConvexError({
+          code: 'QUOTA_EXCEEDED',
+          message: dataMessage,
+        });
+      }
+      throw err;
+    }
+
+    const startedAt = Date.now();
+    const uploadedStorageIds = new Set<string>();
+    const fc: FailContext = {
+      ctx,
+      executionId,
+      ...(args.artifactId !== undefined && { artifactId: args.artifactId }),
+      uploadedStorageIds,
+      startedAt,
+    };
+
+    // ---- flip status to installing, start heartbeat ----
+    // The spawner emits a real `installing` phase event later, but flipping
+    // to `installing` here means the watchdog can also reap rows that get
+    // stuck before the spawner ever responds (the `queued` sweep handles
+    // throws between this point and reserveSlotAndInsert, but `installing`
+    // also signals the canvas to show a progress spinner immediately).
+    try {
+      await ctx.runMutation(internal.sandbox.internal_mutations.setRunning, {
+        executionId,
+        status: 'installing',
+      });
+    } catch (err) {
+      return failExecution(
+        fc,
+        'failed',
+        'SPAWNER_UNAVAILABLE',
+        `failed to flip audit row to installing: ${err instanceof Error ? err.message : String(err)}`,
+      );
+    }
+
+    // Fire heartbeat from a separate function so we can also call it inline
+    // around long blocking work (storage uploads of multi-MB output files
+    // can otherwise hog the event loop long enough that the interval timer's
+    // fires get coalesced and `heartbeatAt` ages past the watchdog cutoff,
+    // causing the watchdog to wrongly mark this live run as stuck —
+    // audit finding R2-B6 #3).
+    const tickHeartbeat = async (): Promise<void> => {
+      try {
+        await ctx.runMutation(internal.sandbox.internal_mutations.heartbeat, {
+          executionId,
+        });
+      } catch (err) {
+        // Don't swallow silently — a stalled heartbeat path is exactly the
+        // failure mode the watchdog mis-classifies as "stuck execution"
+        // (R2-B6 #2). Logging it makes the regression visible in production
+        // before users notice the wrong-side ghost result.
+        console.warn('[sandbox.executeCode] heartbeat mutation failed:', err);
+      }
+    };
+    const heartbeat = setInterval(() => {
+      void tickHeartbeat();
+    }, HEARTBEAT_INTERVAL_MS);
+
+    const abort = new AbortController();
+
+    // ---- live stdout/stderr tail coalescer ----
+    // The spawner emits `event: stdout` / `event: stderr` per-line (stdout)
+    // and per-chunk (stderr). We buffer them and flush via one mutation per
+    // ~250 ms or once the buffer exceeds the threshold, whichever first —
+    // so a chatty `pip install` doesn't fire one Convex mutation per line.
+    // Drift between the live tail and the canonical preview written at
+    // `finalizeArtifactRun` is bounded by the same 16-KB cap on each side.
+    const OUTPUT_FLUSH_DEBOUNCE_MS = 250;
+    const OUTPUT_FLUSH_THRESHOLD_BYTES = 2048;
+    let pendingStdout = '';
+    let pendingStderr = '';
+    let outputFlushTimer: ReturnType<typeof setTimeout> | null = null;
+    let outputFlushInFlight = false;
+    let outputBufferingStopped = false;
+    const flushOutputBuffer = async (): Promise<void> => {
+      if (outputFlushInFlight) return;
+      if (!pendingStdout && !pendingStderr) return;
+      if (!args.artifactId) {
+        pendingStdout = '';
+        pendingStderr = '';
+        return;
+      }
+      const stdoutDelta = pendingStdout;
+      const stderrDelta = pendingStderr;
+      pendingStdout = '';
+      pendingStderr = '';
+      outputFlushInFlight = true;
+      try {
+        await ctx.runMutation(
+          internal.artifacts.internal_mutations.appendArtifactRunOutput,
+          {
+            artifactId: args.artifactId,
+            executionId,
+            ...(stdoutDelta && { stdoutDelta }),
+            ...(stderrDelta && { stderrDelta }),
+          },
+        );
+      } catch (err) {
+        // Tail is UX-only; never block the run on a failed append.
+        console.warn(
+          '[sandbox.executeCode] appendArtifactRunOutput failed:',
+          err,
+        );
+      } finally {
+        outputFlushInFlight = false;
+        if (
+          !outputBufferingStopped &&
+          (pendingStdout || pendingStderr) &&
+          !outputFlushTimer
+        ) {
+          outputFlushTimer = setTimeout(() => {
+            outputFlushTimer = null;
+            void flushOutputBuffer();
+          }, OUTPUT_FLUSH_DEBOUNCE_MS);
+        }
+      }
+    };
+    const scheduleOutputFlush = (): void => {
+      if (outputBufferingStopped) return;
+      if (outputFlushTimer || outputFlushInFlight) return;
+      outputFlushTimer = setTimeout(() => {
+        outputFlushTimer = null;
+        void flushOutputBuffer();
+      }, OUTPUT_FLUSH_DEBOUNCE_MS);
+    };
+    const maybeFlushIfLarge = (): void => {
+      if (
+        pendingStdout.length + pendingStderr.length >=
+        OUTPUT_FLUSH_THRESHOLD_BYTES
+      ) {
+        if (outputFlushTimer) {
+          clearTimeout(outputFlushTimer);
+          outputFlushTimer = null;
+        }
+        void flushOutputBuffer();
+      }
+    };
+    const onStdoutTail = args.artifactId
+      ? (text: string) => {
+          if (outputBufferingStopped) return;
+          pendingStdout += text;
+          maybeFlushIfLarge();
+          scheduleOutputFlush();
+        }
+      : undefined;
+    const onStderrTail = args.artifactId
+      ? (text: string) => {
+          if (outputBufferingStopped) return;
+          pendingStderr += text;
+          maybeFlushIfLarge();
+          scheduleOutputFlush();
+        }
+      : undefined;
+
+    // ---- pre-stage prior run outputs ----
+    // Sandbox-wobbly-origami plan §1: instead of base64-inlining prior outputs
+    // into the spawner request body, we hand the spawner a list of
+    // download URLs (rewritten through `toSandboxStorageUrl()` so they
+    // resolve against the internal Caddy alias) and let it fetch each in
+    // parallel. Avoids the 10 MiB cap on prior outputs and the JSON-over-
+    // base64 wire encoding entirely.
+    let priorOutputDownloads: Array<{ name: string; url: string }> = [];
+    let priorOutputSkippedNote: string | undefined;
+    // Captured here so the post-spawner attestation step (see §3 of the
+    // crispy-curry plan) can diff `priorStage.staged[]` against what we
+    // actually asked for. `sha256` is undefined for entries derived from
+    // legacy `artifactRunFiles` rows; the attestation treats those as
+    // "presence only" rather than "byte-exact".
+    const priorOutputExpected: Array<{ name: string; sha256?: string }> = [];
+    if (args.artifactId !== undefined) {
+      try {
+        const latest = await ctx.runQuery(
+          internal.artifacts.internal_queries.getLatestRunOutputs,
+          {
+            artifactId: args.artifactId,
+            expectedOrganizationId: args.organizationId,
+            ...(args.inputs?.fromRun !== undefined && {
+              fromRun: args.inputs.fromRun,
+            }),
+          },
+        );
+        const candidates = latest.files;
+        const totalBytes = candidates.reduce((sum, f) => sum + f.size, 0);
+        console.info(
+          `[sandbox.preStage] artifact=${args.artifactId} source=${latest.source} candidates=${candidates.length} totalBytes=${totalBytes} fromRun=${args.inputs?.fromRun ?? 'default-latest'}`,
+        );
+        // Best-effort lazy migration: if the query had to fall back to the
+        // walk-back path, run the derive mutation so the next pre-stage
+        // hits the manifest in O(1). Never blocks the current run on
+        // failure — the walk-back already supplied the data we need.
+        if (latest.needsManifestDerive) {
+          try {
+            const r = await ctx.runMutation(
+              internal.artifacts.internal_mutations
+                .deriveOutputManifestFromHistory,
+              { artifactId: args.artifactId },
+            );
+            console.info(
+              `[sandbox.preStage] manifest-derived artifact=${args.artifactId} inserted=${r.inserted} alreadyPresent=${r.alreadyPresent}`,
+            );
+          } catch (deriveErr) {
+            console.warn(
+              `[sandbox.preStage] manifest derive failed (non-fatal):`,
+              deriveErr,
+            );
+          }
+        }
+        const skipped: string[] = [];
+        for (const file of candidates) {
+          // Build a sandbox-bound download URL. `getUrl()` returns the
+          // public form; rewrite it through `toSandboxStorageUrl()` so the
+          // spawner's fetch goes through the internal Caddy alias rather
+          // than the publicly-resolvable hostname.
+          let rawUrl: string | null;
+          try {
+            rawUrl = await ctx.storage.getUrl(file.storageId);
+          } catch (urlErr) {
+            console.warn(
+              `[sandbox.preStage] getUrl(${file.storageId}) failed for ${file.name}:`,
+              urlErr,
+            );
+            skipped.push(file.name);
+            continue;
+          }
+          if (rawUrl === null) {
+            skipped.push(file.name);
+            continue;
+          }
+          priorOutputDownloads.push({
+            name: file.name,
+            url: toSandboxStorageUrl(rawUrl),
+          });
+          priorOutputExpected.push({
+            name: file.name,
+            ...(file.sha256 !== undefined && { sha256: file.sha256 }),
+          });
+        }
+        if (skipped.length > 0) {
+          priorOutputSkippedNote = `[tale-sandbox] prior-output blobs missing in storage, skipped: ${skipped.join(', ')}\n`;
+          console.warn(
+            `[sandbox.preStage] SKIP-MISSING artifact=${args.artifactId} skipped=${JSON.stringify(skipped)}`,
+          );
+        }
+        if (priorOutputDownloads.length > 0) {
+          console.info(
+            `[sandbox.preStage] STAGED artifact=${args.artifactId} files=${JSON.stringify(priorOutputDownloads.map((f) => f.name))}`,
+          );
+        }
+      } catch (err) {
+        // Pre-staging is best-effort — never block the run on a load
+        // failure. Surface a one-liner so users notice the regression in
+        // CI but the script still gets its chance.
+        console.warn(
+          '[sandbox.executeCode] prior-output pre-stage failed:',
+          err,
+        );
+        priorOutputDownloads = [];
+        priorOutputSkippedNote = `[tale-sandbox] prior-output pre-stage failed: ${err instanceof Error ? err.message : String(err)}\n`;
+      }
+    }
+    if (priorOutputSkippedNote !== undefined && onStderrTail !== undefined) {
+      // Route the note through the live-tail channel so it lands in the
+      // canvas stderr panel alongside the script's own output.
+      onStderrTail(priorOutputSkippedNote);
+    }
+
+    // ---- pre-allocate upload slots + persist quota counter ----
+    // Plan §3: hand the spawner N pre-signed upload URLs up front (median
+    // run = 1 file, p90 = 2; pre-alloc 2 to cover both without round-trip).
+    // The remaining quota lives server-side so the spawner can lazily ask
+    // for more via EP1 without us pre-vending all 16 URLs every run.
+    const preAllocSlots: Array<{ url: string }> = [];
+    try {
+      for (let i = 0; i < SANDBOX_OUTPUT_UPLOAD_SLOTS_PREALLOC; i += 1) {
+        const raw = await ctx.storage.generateUploadUrl();
+        preAllocSlots.push({ url: toSandboxStorageUrl(raw) });
+      }
+    } catch (err) {
+      return failExecution(
+        fc,
+        'failed',
+        'SPAWNER_UNAVAILABLE',
+        `failed to pre-allocate output upload slots: ${err instanceof Error ? err.message : String(err)}`,
+      );
+    }
+    const remainingQuota =
+      SANDBOX_MAX_OUTPUT_FILES_PER_RUN - preAllocSlots.length;
+    try {
+      await ctx.runMutation(
+        internal.sandbox.internal_mutations.applyInitOutputSlots,
+        {
+          executionId,
+          slots: preAllocSlots.map((s) => s.url),
+          quotaRemaining: remainingQuota,
+        },
+      );
+    } catch (err) {
+      console.warn(`[sandbox.executeCode] applyInitOutputSlots failed:`, err);
+      // Non-fatal: the run can still proceed using the pre-allocated
+      // slots; only the lazy EP1 path needs the quota counter.
+    }
+
+    // Resolve the sandbox-facing callback endpoints. The spawner uses
+    // these to (a) request additional upload URLs via EP1 and (b) report
+    // each successful storageId via EP2.
+    //
+    // Two ports are involved: storage upload/download is on convex:3210
+    // (the admin/storage API, what `generateUploadUrl()` returns), while
+    // user-defined httpActions live on convex:3211 (the HTTP API). Caddy
+    // routes `/api/storage/*` → 3210 and `/api/*` → 3211. When we bypass
+    // Caddy by talking directly to convex (`SANDBOX_STORAGE_INTERNAL_BASE_URL=
+    // http://convex:3210`), the storage URLs work on the configured base
+    // but the sandbox callbacks need an explicit port swap to 3211 — or
+    // the operator overrides via SANDBOX_HTTP_API_BASE_URL.
+    const storageBase = (
+      process.env.SANDBOX_STORAGE_INTERNAL_BASE_URL ??
+      process.env.SITE_URL ??
+      'http://127.0.0.1:3210'
+    ).replace(/\/$/, '');
+    const httpApiBase = (
+      process.env.SANDBOX_HTTP_API_BASE_URL ??
+      storageBase.replace(/:3210(\/|$)/, ':3211$1')
+    ).replace(/\/$/, '');
+    const outputUrlEndpoint = `${httpApiBase}/api/sandbox/output_upload_url`;
+    const reportUploadedEndpoint = `${httpApiBase}/api/sandbox/record_uploaded`;
+
+    try {
+      const spawnerResult = await spawnerExecute(
+        {
+          executionId: String(executionId),
+          organizationId: args.organizationId,
+          language: args.language,
+          // The mutual-exclusion gate at the top of the handler guarantees
+          // exactly one of `entryPath` / `steps` lands in the body. We
+          // forward both possibilities; the spawner's own validator
+          // enforces the wire contract a second time.
+          files: args.files,
+          ...(args.entryPath !== undefined && { entryPath: args.entryPath }),
+          ...(args.steps !== undefined &&
+            args.steps.length > 0 && { steps: args.steps }),
+          ...(args.packages !== undefined && { packages: args.packages }),
+          ...(args.packagesByLang !== undefined && {
+            packagesByLang: args.packagesByLang,
+          }),
+          ...(priorOutputDownloads.length > 0 && { priorOutputDownloads }),
+          outputUploadSlots: preAllocSlots,
+          outputUrlEndpoint,
+          reportUploadedEndpoint,
+          timeoutMs,
+        },
+        abort.signal,
+        {
+          ...(onStdoutTail && { onStdout: onStdoutTail }),
+          ...(onStderrTail && { onStderr: onStderrTail }),
+          onPhase: args.artifactId
+            ? async (phase) => {
+                // Structured progress — UI renders the localized text via
+                // the `chat.runnable.progress.*` i18n keys. We never write
+                // English literals into the artifact row anymore.
+                const runProgress =
+                  phase === 'installing'
+                    ? buildInstallProgress([
+                        ...(args.packages ?? []),
+                        ...(args.packagesByLang?.python ?? []),
+                        ...(args.packagesByLang?.node ?? []),
+                      ])
+                    : phase === 'running'
+                      ? { kind: 'running' as const }
+                      : phase === 'preparing'
+                        ? { kind: 'preparing' as const }
+                        : undefined;
+                const runStatus =
+                  phase === 'installing'
+                    ? 'installing'
+                    : phase === 'running'
+                      ? 'running'
+                      : phase === 'preparing'
+                        ? 'installing'
+                        : undefined;
+                if (!runStatus) return;
+                await ctx.runMutation(
+                  internal.artifacts.internal_mutations
+                    .patchArtifactRunProgress,
+                  {
+                    // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- narrowed by args.artifactId guard
+                    artifactId: args.artifactId as NonNullable<
+                      typeof args.artifactId
+                    >,
+                    runStatus,
+                    ...(runProgress && { runProgress }),
+                    runExecutionId: executionId,
+                  },
+                );
+              }
+            : undefined,
+        },
+      );
+
+      // Stop accepting more live-tail deltas. Any in-flight or pending
+      // flush completes; subsequent SSE-callback invocations no-op. The
+      // canonical preview is about to be written by `finalize` /
+      // `finalizeArtifactRun`, so further appends would only race that
+      // write to no benefit.
+      outputBufferingStopped = true;
+      if (outputFlushTimer) {
+        clearTimeout(outputFlushTimer);
+        outputFlushTimer = null;
+      }
+
+      // ---- pre-stage attestation (crispy-curry plan §3) ----
+      // The spawner ships back `priorStage.staged[]` listing every file
+      // it actually wrote to /workspace/output/ before user code ran.
+      // Diff against what we asked it to inject; any expected file that
+      // didn't land → fail the run BEFORE we promote the spawner's output
+      // blobs to fileMetadata, so the LLM can never see `success:true`
+      // alongside a missing prior file. The skipped[] reasons (url_expired,
+      // http_error, write_failed, etc.) are surfaced in the structured
+      // errorMessage so the agent can decide whether to retry, pin
+      // `inputs.from_run` to an older snapshot, or surface the issue.
+      //
+      // We add the spawner's outputFiles to uploadedStorageIds first so
+      // failExecution cleans them — the bytes already landed in storage
+      // via EP2 even though user code ran against a corrupted workspace.
+      if (
+        spawnerResult.priorStage !== undefined &&
+        spawnerResult.priorStage.skipped.length > 0
+      ) {
+        for (const f of spawnerResult.outputFiles) {
+          uploadedStorageIds.add(f.storageId);
+        }
+        const stagedNames = new Set(
+          spawnerResult.priorStage.staged.map((s) => s.name),
+        );
+        const expectedMissing = priorOutputExpected.filter(
+          (e) => !stagedNames.has(e.name),
+        );
+        const missingNames = expectedMissing.map((e) => e.name);
+        console.warn(
+          `[sandbox.preStage] PRE_STAGE_FAILED artifact=${args.artifactId ?? '(none)'} missing=${JSON.stringify(missingNames)} skipped=${JSON.stringify(spawnerResult.priorStage.skipped)}`,
+        );
+        return failExecution(
+          fc,
+          'failed',
+          'PRE_STAGE_FAILED',
+          JSON.stringify({
+            missing: missingNames,
+            skipped: spawnerResult.priorStage.skipped,
+            message:
+              'pre-stage attestation: spawner did not stage every expected prior-output file before user code ran',
+          }),
+          {
+            stdoutPreview: Buffer.from(spawnerResult.stdoutBase64, 'base64')
+              .toString('utf8')
+              .slice(0, SANDBOX_STDOUT_PREVIEW_MAX),
+            stderrPreview: Buffer.from(spawnerResult.stderrBase64, 'base64')
+              .toString('utf8')
+              .slice(0, SANDBOX_STDERR_PREVIEW_MAX),
+            ...(spawnerResult.exitCode !== null && {
+              exitCode: spawnerResult.exitCode,
+            }),
+          },
+        );
+      }
+
+      // ---- upload-pipeline completeness gate (crispy-curry plan §4) ----
+      // `uploadStats.failures` non-empty means at least one harvested file
+      // either failed its upload POST or its EP2 record-back. The audit
+      // row's `uploadedStorageIds[]` already cleaned the partials; treat
+      // this as a fatal run so the LLM doesn't trust a workspace state
+      // that doesn't match what's in the manifest after finalize.
+      if (
+        spawnerResult.uploadStats !== undefined &&
+        spawnerResult.uploadStats.failures.length > 0 &&
+        // Only escalate to UPLOAD_INCOMPLETE when the spawner didn't
+        // already classify this as a specific upload-pipeline error. The
+        // spawner's classifyFailure path may have already emitted
+        // UPLOAD_FAILED / UPLOAD_QUOTA_EXCEEDED / UPLOAD_REPORT_FAILED;
+        // preserve those rather than relabeling.
+        spawnerResult.errorCode === undefined
+      ) {
+        for (const f of spawnerResult.outputFiles) {
+          uploadedStorageIds.add(f.storageId);
+        }
+        const failed = spawnerResult.uploadStats.failures.map((f) => ({
+          fileName: f.fileName,
+          httpStatus: f.httpStatus,
+          errorSnippet: f.errorSnippet,
+        }));
+        console.warn(
+          `[sandbox.upload] UPLOAD_INCOMPLETE artifact=${args.artifactId ?? '(none)'} failures=${JSON.stringify(failed)}`,
+        );
+        return failExecution(
+          fc,
+          'failed',
+          'UPLOAD_INCOMPLETE',
+          JSON.stringify({
+            failures: failed,
+            message:
+              'output-upload completeness: at least one harvested file failed its upload POST or EP2 record-back',
+          }),
+          {
+            stdoutPreview: Buffer.from(spawnerResult.stdoutBase64, 'base64')
+              .toString('utf8')
+              .slice(0, SANDBOX_STDOUT_PREVIEW_MAX),
+            stderrPreview: Buffer.from(spawnerResult.stderrBase64, 'base64')
+              .toString('utf8')
+              .slice(0, SANDBOX_STDERR_PREVIEW_MAX),
+            ...(spawnerResult.exitCode !== null && {
+              exitCode: spawnerResult.exitCode,
+            }),
+          },
+        );
+      }
+
+      // ---- register file metadata (presigned upload pipeline) ----
+      // Sandbox-wobbly-origami: the spawner POSTed each output blob to a
+      // presigned URL itself, so by the time we reach here the bytes are
+      // already in `_storage` and we have the allocated storageId on each
+      // outputFiles entry. We just need to insert the sibling fileMetadata
+      // rows. Track every storageId we accept so `failExecution` can roll
+      // them back if a subsequent mutation throws.
+      const stagedForInsert: Array<{
+        name: string;
+        storageId: Id<'_storage'>;
+        size: number;
+        contentType: string;
+        sha256: string;
+      }> = [];
+      for (const f of spawnerResult.outputFiles) {
+        // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- spawner-side validator already enforced the storageId is a non-empty string; cast to the branded id for the mutation arg
+        const storageId = f.storageId as unknown as Id<'_storage'>;
+        uploadedStorageIds.add(String(storageId));
+        stagedForInsert.push({
+          name: f.name,
+          storageId,
+          size: f.size,
+          contentType: f.contentType,
+          sha256: f.sha256,
+        });
+      }
+
+      const insertResult = await ctx.runMutation(
+        internal.sandbox.output_mutations.insertOutputFiles,
+        {
+          executionId,
+          organizationId: args.organizationId,
+          ...(args.threadId !== undefined && { threadId: args.threadId }),
+          uploadedBy: args.uploadedBy,
+          files: stagedForInsert,
+        },
+      );
+
+      // If the audit row was terminalized between the spawner's SSE result
+      // and this mutation (e.g., user clicked Stop near completion), the
+      // mutation refuses to insert fileMetadata rows. Roll back the blobs
+      // we already wrote — without this they orphan since neither the
+      // audit row nor the artifactRunFiles will reference them (audit
+      // follow-up F6 — cancel-race blob leak).
+      if (insertResult.skippedTerminal) {
+        console.warn(
+          `[sandbox.executeCode] insertOutputFiles skipped — audit row already terminal; rolling back ${uploadedStorageIds.size} blob(s)`,
+        );
+        await rollbackUploadedBlobs(
+          ctx,
+          uploadedStorageIds,
+          'sandbox.executeCode.cancel-race',
+        );
+        const cancelDurationMs = Date.now() - startedAt;
+        return {
+          executionId,
+          success: false,
+          status: 'cancelled',
+          exitCode: spawnerResult.exitCode,
+          errorCode: 'CANCELLED',
+          errorMessage:
+            'Run was cancelled while harvesting outputs; uploaded blobs rolled back.',
+          stdoutPreview: '',
+          stderrPreview: '',
+          durationMs: cancelDurationMs,
+          truncated: { stdout: false, stderr: false, files: 0 },
+          files: [],
+        };
+      }
+      const insertedFiles = insertResult.insertedFiles;
+
+      // ---- stdout/stderr previews + overflow storage ----
+      const stdoutText = Buffer.from(
+        spawnerResult.stdoutBase64,
+        'base64',
+      ).toString('utf8');
+      const stderrText = Buffer.from(
+        spawnerResult.stderrBase64,
+        'base64',
+      ).toString('utf8');
+      const stdoutPreview = stdoutText.slice(0, SANDBOX_STDOUT_PREVIEW_MAX);
+      const stderrPreview = stderrText.slice(0, SANDBOX_STDERR_PREVIEW_MAX);
+      let stdoutStorageId: Id<'_storage'> | undefined;
+      let stderrStorageId: Id<'_storage'> | undefined;
+      if (stdoutText.length > SANDBOX_STDOUT_PREVIEW_MAX) {
+        await tickHeartbeat();
+        const blob = new Blob([stdoutText], { type: 'text/plain' });
+        stdoutStorageId = await ctx.storage.store(blob);
+        uploadedStorageIds.add(String(stdoutStorageId));
+      }
+      if (stderrText.length > SANDBOX_STDERR_PREVIEW_MAX) {
+        await tickHeartbeat();
+        const blob = new Blob([stderrText], { type: 'text/plain' });
+        stderrStorageId = await ctx.storage.store(blob);
+        uploadedStorageIds.add(String(stderrStorageId));
+      }
+
+      const durationMs = spawnerResult.durationMs;
+      const actualSeconds = durationMs / 1000;
+
+      await ctx.runMutation(internal.sandbox.internal_mutations.finalize, {
+        executionId,
+        status: spawnerResult.status,
+        ...(spawnerResult.exitCode !== null && {
+          exitCode: spawnerResult.exitCode,
+        }),
+        ...(spawnerResult.errorCode !== undefined && {
+          errorCode: spawnerResult.errorCode,
+        }),
+        ...(spawnerResult.errorMessage !== undefined && {
+          errorMessage: spawnerResult.errorMessage,
+        }),
+        stdoutPreview,
+        stderrPreview,
+        ...(stdoutStorageId !== undefined && { stdoutStorageId }),
+        ...(stderrStorageId !== undefined && { stderrStorageId }),
+        outputFiles: insertedFiles.map((f) => ({
+          name: f.name,
+          fileMetadataId: f.fileMetadataId,
+          size: f.size,
+          contentType: f.contentType,
+          sha256: f.sha256,
+        })),
+        truncated: spawnerResult.truncated,
+        durationMs,
+        actualSeconds,
+        ...(spawnerResult.steps !== undefined && {
+          steps: spawnerResult.steps,
+        }),
+        ...(spawnerResult.uploadStats !== undefined && {
+          uploadStats: spawnerResult.uploadStats,
+        }),
+        ...(spawnerResult.timing !== undefined && {
+          timing: spawnerResult.timing,
+        }),
+      });
+
+      // When this run is tied to a runnable artifact, finalize the artifact
+      // row so the canvas-runnable-code-renderer sees the completed state
+      // + output file chips. The audit row above already holds the
+      // per-execution forensics; the artifact row holds the *latest* state
+      // for fast canvas reads.
+      if (args.artifactId) {
+        await ctx.runMutation(
+          internal.artifacts.internal_mutations.finalizeArtifactRun,
+          {
+            artifactId: args.artifactId,
+            runStatus: spawnerResult.status,
+            ...(spawnerResult.exitCode !== null && {
+              runExitCode: spawnerResult.exitCode,
+            }),
+            ...(spawnerResult.errorCode !== undefined && {
+              runErrorCode: spawnerResult.errorCode,
+            }),
+            ...(spawnerResult.errorMessage !== undefined && {
+              runErrorMessage: spawnerResult.errorMessage,
+            }),
+            runStdoutPreview: stdoutPreview,
+            runStderrPreview: stderrPreview,
+            ...(stdoutStorageId !== undefined && {
+              runStdoutStorageId: stdoutStorageId,
+            }),
+            ...(stderrStorageId !== undefined && {
+              runStderrStorageId: stderrStorageId,
+            }),
+            runOutputFiles: insertedFiles.map((f) => ({
+              name: f.name,
+              fileMetadataId: f.fileMetadataId,
+              storageId: f.storageId,
+              size: f.size,
+              contentType: f.contentType,
+              sha256: f.sha256,
+            })),
+            runExecutionId: executionId,
+          },
+        );
+      }
+
+      // Successful path — the storage IDs are now owned by mutations; drop
+      // them from the rollback set so the finally block doesn't double-free.
+      uploadedStorageIds.clear();
+
+      return {
+        executionId,
+        success: spawnerResult.status === 'completed',
+        status: spawnerResult.status,
+        exitCode: spawnerResult.exitCode,
+        ...(spawnerResult.errorCode !== undefined && {
+          errorCode: spawnerResult.errorCode,
+        }),
+        ...(spawnerResult.errorMessage !== undefined && {
+          errorMessage: spawnerResult.errorMessage,
+        }),
+        stdoutPreview,
+        stderrPreview,
+        durationMs,
+        truncated: spawnerResult.truncated,
+        files: insertedFiles,
+        ...(spawnerResult.steps !== undefined && {
+          steps: spawnerResult.steps,
+        }),
+        ...(spawnerResult.priorStage !== undefined && {
+          preStage: {
+            staged: spawnerResult.priorStage.staged.map((s) => s.name),
+            skipped: spawnerResult.priorStage.skipped.map((s) => ({
+              name: s.name,
+              reason: s.reason,
+              detail: s.detail,
+            })),
+          },
+        }),
+      };
+    } catch (err) {
+      // Infra failure: best-effort spawner cancel (idempotent if container
+      // already gone) and route through failExecution so the audit + artifact
+      // rows both terminate AND any uploaded blobs are reclaimed.
+      const message = err instanceof Error ? err.message : String(err);
+      try {
+        await spawnerCancel(String(executionId));
+      } catch (cancelErr) {
+        console.warn(
+          `[sandbox.executeCode] best-effort spawnerCancel failed:`,
+          cancelErr,
+        );
+      }
+      await failExecution(fc, 'failed', 'SPAWNER_UNAVAILABLE', message);
+      throw new Error(`Sandbox spawner failed: ${message}`, { cause: err });
+    } finally {
+      clearInterval(heartbeat);
+      // Stop accepting/scheduling live-tail flushes — finalize has already
+      // written (or is about to write) the canonical preview, and a pending
+      // setTimeout here would keep the action alive past its useful work.
+      outputBufferingStopped = true;
+      if (outputFlushTimer) {
+        clearTimeout(outputFlushTimer);
+        outputFlushTimer = null;
+      }
+      // Abort any in-flight fetch from spawnerExecute so the spawner-side
+      // request can tear down promptly when the action exits (success,
+      // structured failure, OR thrown infra error).
+      abort.abort('action-exit');
+    }
+  },
+});
+
+/**
+ * User-Stop cascade — kills every in-flight sandbox execution on a thread.
+ *
+ * Without this, clicking the chat's "Stop" button aborts the SDK stream but
+ * leaves the spawner happily executing whatever the LLM started: container
+ * burns CPU for up to `SANDBOX_MAX_TIMEOUT_MS`, quota keeps draining, canvas
+ * spinner persists, and the eventually-arriving result silently overwrites
+ * what the user wanted to cancel.
+ *
+ * Wiring: `convex/threads/cancel_generation.ts` schedules this via
+ * `ctx.scheduler.runAfter(0, ...)` after abortStream'ing the SDK streams.
+ * Scheduler (not direct runAction) because the calling mutation can't await
+ * an action — and shouldn't, since the user is owed an immediate
+ * Stop-acknowledged response.
+ *
+ * For each non-terminal execution:
+ *  1. POST /v1/cancel/:id to the spawner — SIGKILLs the container and
+ *     (per the same-PR change in server.ts/spawn.ts) writes a final SSE
+ *     `event: result` with status:'cancelled' to the still-listening
+ *     `executeCode` action, which then routes through its normal finalize.
+ *  2. Also call `cancelExecutionRecord` directly — closes the window where
+ *     the spawner-side cancel fails (network blip, container already gone)
+ *     and the audit/artifact rows would otherwise stay non-terminal until
+ *     the 15-min watchdog reap. The mutation is terminal-state-guarded so
+ *     racing with `executeCode`'s own finalize is safe.
+ */
+export const cancelExecutionsForThread = internalAction({
+  // The upstream `threads` table belongs to `@convex-dev/agent`, and the
+  // platform schema keeps its id as a plain string on every reference
+  // (see `sandboxExecutions.threadId`) — hence the `v.string()` validator.
+  args: { threadId: v.string() },
+  returns: v.number(),
+  handler: async (ctx: ActionCtx, { threadId }) => {
+    const inFlight = await ctx.runQuery(
+      internal.sandbox.internal_mutations.listNonTerminalByThread,
+      { threadId },
+    );
+    let recordsCancelled = 0;
+    for (const row of inFlight) {
+      try {
+        await spawnerCancel(String(row._id));
+      } catch (spawnerErr) {
+        // Deliberately swallowed: an unreachable spawner or an already-gone
+        // container (404 on an unknown id is the usual, harmless case) must
+        // not stop the row below from being marked cancelled for the canvas.
+        console.warn(
+          `[sandbox.cancelExecutionsForThread] spawnerCancel(${row._id}) failed (continuing):`,
+          spawnerErr,
+        );
+      }
+      try {
+        await ctx.runMutation(
+          internal.sandbox.internal_mutations.cancelExecutionRecord,
+          { executionId: row._id, reason: 'Execution cancelled by user' },
+        );
+        recordsCancelled += 1;
+      } catch (recordErr) {
+        console.warn(
+          `[sandbox.cancelExecutionsForThread] cancelExecutionRecord(${row._id}) failed:`,
+          recordErr,
+        );
+      }
+    }
+    return recordsCancelled;
+  },
+});
diff --git a/services/platform/convex/sandbox/internal_mutations.test.ts b/services/platform/convex/sandbox/internal_mutations.test.ts
new file mode 100644
index 000000000..913dc5df9
--- /dev/null
+++ b/services/platform/convex/sandbox/internal_mutations.test.ts
@@ -0,0 +1,409 @@
+// R1.22 #3 — atomic quota mutation regression gate. Mocks the convex
+// generated layer (same pattern as file_metadata/internal_mutations.test.ts)
+// so the mutation body is unit-testable without a running backend.
+
+import { ConvexError } from 'convex/values';
+import { describe, it, expect, vi } from 'vitest';
+
+vi.mock('../_generated/server', async (importOriginal) => {
+  const actual = await importOriginal<Record<string, unknown>>();
+  // Swap only the mutation factory for an identity function: each
+  // exported mutation then *is* its config object, which lets the tests
+  // call `.handler(ctx, args)` directly. Every other export of the
+  // generated module is passed through unchanged.
+  const internalMutation = (config: Record<string, unknown>) => config;
+  return { ...actual, internalMutation };
+});
+
+import {
+  reserveSlotAndInsert,
+  recoverStuckSandboxes,
+  finalize,
+} from './internal_mutations';
+import { insertOutputFiles } from './output_mutations';
+import { SANDBOX_MAX_CONCURRENT_PER_ORG } from './schema';
+
+interface MutHandler<TArgs, TReturn> { // cast target for mocked mutations
+  handler: (ctx: unknown, args: TArgs) => Promise<TReturn> | TReturn;
+}
+
+function asyncIter<T>(rows: T[]): AsyncIterable<T> {
+  // Every iteration request starts a fresh in-order pass over `rows`.
+  async function* walk() {
+    for (const row of rows) yield row;
+  }
+  return { [Symbol.asyncIterator]: walk };
+}
+
+interface FakeRow { // execution-row fixture; only fields these tests set
+  estimatedSeconds: number; // seconds; the daily-budget test uses 500/row
+  _creationTime: number; // epoch ms, derived from `Date.now()`
+  status: string; // e.g. 'queued', 'running', 'completed'
+  actualSeconds?: number; // set only on the `completed` fixtures
+  _id: string;
+  heartbeatAt: number; // epoch ms; the watchdog tests age this value
+}
+
+interface MockCtxOptions { // fixture rows, keyed by the status scan they feed
+  runningRows?: FakeRow[]; // served for eq('status', 'running')
+  queuedRows?: FakeRow[]; // served for eq('status', 'queued')
+  installingRows?: FakeRow[]; // served for eq('status', 'installing')
+  completedTodayRows?: FakeRow[]; // added only to status-less scans
+}
+
+function createMockCtx(opts: MockCtxOptions = {}) {
+  const running = opts.runningRows ?? [];
+  const queued = opts.queuedRows ?? [];
+  const installing = opts.installingRows ?? [];
+  const completedToday = opts.completedTodayRows ?? [];
+  const insertedRows: Record<string, unknown>[] = [];
+
+  // Picks the fixture rows for one query from the `eq()` filters it
+  // recorded; each known `status` value maps to its own fixture list.
+  function rowsFor(filters: Array<Record<string, unknown>>): FakeRow[] {
+    switch (filters.find((f) => f.field === 'status')?.value) {
+      case 'running':
+        return running;
+      case 'queued':
+        return queued;
+      case 'installing':
+        return installing;
+      default:
+        // No status filter → completedToday daily-budget scan
+        return [...completedToday, ...running, ...queued, ...installing];
+    }
+  }
+
+  // One fluent query builder per `db.query()` call. `withIndex` runs the
+  // index callback against a recording `eq()`; `order` just chains.
+  function makeBuilder() {
+    const filters: Array<Record<string, unknown>> = [];
+    const recorder = {
+      eq: (field: string, value: unknown) => {
+        filters.push({ field, value });
+        return recorder;
+      },
+    };
+    const builder: Record<string | symbol, unknown> = {
+      withIndex: vi.fn((_name: string, cb: (q: unknown) => unknown) => {
+        cb(recorder);
+        return builder;
+      }),
+      order: vi.fn(() => builder),
+      // Watchdog path: `.take(N)` bounds the per-status scan (cap=200
+      // production value); fixtures hold tens of rows, so N is ignored.
+      take: vi.fn(async (_n: number) => rowsFor(filters)),
+      // Quota-scan path: `reserveSlotAndInsert` iterates the builder
+      // directly with `for await`.
+      [Symbol.asyncIterator]: () =>
+        asyncIter(rowsFor(filters))[Symbol.asyncIterator](),
+    };
+    return builder;
+  }
+
+  const db = {
+    query: vi.fn(() => makeBuilder()),
+    insert: vi.fn(async (_table: string, payload: Record<string, unknown>) => {
+      insertedRows.push(payload);
+      return `exec_${insertedRows.length}`;
+    }),
+    get: vi.fn(),
+    patch: vi.fn(),
+  };
+  return {
+    ctx: { db },
+    insertedRows,
+  };
+}
+
+describe('reserveSlotAndInsert', () => {
+  // Arguments shared by every case below. The 30s `estimatedSeconds` is
+  // what the insert assertion and the budget case both rely on.
+  const baseArgs = {
+    organizationId: 'org_alpha',
+    uploadedBy: 'user_1',
+    language: 'python' as const,
+    codePreview: 'print("hi")',
+    packages: [],
+    estimatedSeconds: 30,
+  };
+  // The `internalMutation` mock at the top of this file returns the config
+  // object unchanged, so the export can be cast once to reach `.handler`.
+  const reserve = reserveSlotAndInsert as unknown as MutHandler<
+    typeof baseArgs,
+    string
+  >;
+
+  it('inserts a row when no in-flight and budget has room', async () => {
+    // Empty fixtures: no in-flight rows and no prior usage today, so the
+    // reservation must go through and yield the mock's first id.
+    const { ctx, insertedRows } = createMockCtx();
+    const newId = await reserve.handler(ctx, baseArgs);
+    expect(newId).toBe('exec_1');
+    expect(insertedRows[0]).toMatchObject({
+      organizationId: 'org_alpha',
+      status: 'queued',
+      estimatedSeconds: 30,
+    });
+    // lifecycleStatus is no longer persisted — guard against a future
+    // regression smuggling it back into the inserted row.
+    expect(insertedRows[0]).not.toHaveProperty('lifecycleStatus');
+  });
+
+  it(`rejects when running count is already at the cap (${SANDBOX_MAX_CONCURRENT_PER_ORG})`, async () => {
+    // Fill every slot with `running` rows so the new reservation finds
+    // the org already at SANDBOX_MAX_CONCURRENT_PER_ORG.
+    const atCap: FakeRow[] = Array.from(
+      { length: SANDBOX_MAX_CONCURRENT_PER_ORG },
+      (_v, i) => ({
+        _id: `r${i}`,
+        status: 'running',
+        heartbeatAt: Date.now(),
+        _creationTime: Date.now() - 1000,
+        estimatedSeconds: 30,
+      }),
+    );
+    const { ctx } = createMockCtx({ runningRows: atCap });
+    await expect(reserve.handler(ctx, baseArgs)).rejects.toBeInstanceOf(
+      ConvexError,
+    );
+  });
+
+  it('rejects when queued rows alone fill the cap (leaked-slot defence)', async () => {
+    // Same cap, but held purely by `queued` rows: nothing is `running`,
+    // yet the queued fixtures alone must be enough to reject.
+    const atCap: FakeRow[] = Array.from(
+      { length: SANDBOX_MAX_CONCURRENT_PER_ORG },
+      (_v, i) => ({
+        _id: `q${i}`,
+        status: 'queued',
+        heartbeatAt: Date.now(),
+        _creationTime: Date.now() - 500,
+        estimatedSeconds: 30,
+      }),
+    );
+    const { ctx } = createMockCtx({ queuedRows: atCap });
+    await expect(reserve.handler(ctx, baseArgs)).rejects.toBeInstanceOf(
+      ConvexError,
+    );
+  });
+
+  it('rejects when daily CPU budget pre-debit overflows', async () => {
+    // 4 prior runs of 500s each = 2000s; cap is 1800s → next call should reject.
+    // `completedTodayRows` is what the mock serves for the status-less
+    // scan, alongside any in-flight fixture rows (none here).
+    const priorRuns: FakeRow[] = Array.from({ length: 4 }, (_v, i) => ({
+      _id: `c${i}`,
+      status: 'completed',
+      heartbeatAt: Date.now(),
+      _creationTime: Date.now() - 60_000,
+      estimatedSeconds: 500,
+      actualSeconds: 500,
+    }));
+    const { ctx } = createMockCtx({ completedTodayRows: priorRuns });
+    await expect(
+      reserve.handler(ctx, { ...baseArgs, estimatedSeconds: 30 }),
+    ).rejects.toThrow(/budget/i);
+  });
+});
+
+describe('recoverStuckSandboxes', () => {
+  // Cutoff = max_timeout (300s) + 10 min upload tail = 900s = 15 min. Tests
+  // use 20 min to comfortably clear the threshold.
+  const STALE_HEARTBEAT_AGE_MS = 20 * 60_000;
+  // Shared cast: the mocked `internalMutation` factory leaves the raw
+  // config in place, so `.handler` is reachable through `MutHandler`.
+  const watchdog = recoverStuckSandboxes as unknown as MutHandler<
+    Record<string, unknown>,
+    number
+  >;
+
+  it('flips running rows whose heartbeat is older than the watchdog cutoff', async () => {
+    // Heartbeat 20 min old: beyond the cutoff, so this row must be flipped.
+    const stuckRun: FakeRow = {
+      _id: 'stuck1',
+      status: 'running',
+      heartbeatAt: Date.now() - STALE_HEARTBEAT_AGE_MS,
+      _creationTime: Date.now() - 3_600_000,
+      estimatedSeconds: 120,
+    };
+    // Heartbeat 5 s old: still live, so this row must be left untouched.
+    const liveRun: FakeRow = {
+      _id: 'live1',
+      status: 'running',
+      heartbeatAt: Date.now() - 5_000,
+      _creationTime: Date.now() - 60_000,
+      estimatedSeconds: 60,
+    };
+    const { ctx } = createMockCtx({ runningRows: [stuckRun, liveRun] });
+    const flipped = await watchdog.handler(ctx, {});
+    expect(flipped).toBe(1);
+    expect(ctx.db.patch).toHaveBeenCalledWith(
+      'stuck1',
+      expect.objectContaining({
+        status: 'failed',
+        errorCode: 'SPAWNER_UNAVAILABLE',
+      }),
+    );
+    expect(ctx.db.patch).not.toHaveBeenCalledWith('live1', expect.anything());
+  });
+
+  it('also flips queued rows whose heartbeat is older than the watchdog cutoff', async () => {
+    // Captures the "throw between reserveSlotAndInsert and setRunning" leak.
+    const orphanedQueued: FakeRow = {
+      _id: 'queuedStuck',
+      status: 'queued',
+      heartbeatAt: Date.now() - STALE_HEARTBEAT_AGE_MS,
+      _creationTime: Date.now() - 3_600_000,
+      estimatedSeconds: 60,
+    };
+    const { ctx } = createMockCtx({ queuedRows: [orphanedQueued] });
+    const flipped = await watchdog.handler(ctx, {});
+    expect(flipped).toBe(1);
+    expect(ctx.db.patch).toHaveBeenCalledWith(
+      'queuedStuck',
+      expect.objectContaining({
+        status: 'failed',
+        errorCode: 'SPAWNER_UNAVAILABLE',
+      }),
+    );
+  });
+});
+
+describe('finalize', () => {
+  const baseArgs = {
+    executionId: 'exec_1' as never,
+    status: 'completed' as const,
+    outputFiles: [],
+    durationMs: 1000,
+    actualSeconds: 1,
+  };
+  // Cast once: the mocked factory leaves `finalize` as its raw config.
+  const mut = finalize as unknown as MutHandler<typeof baseArgs, null>;
+  // Minimal ctx for `finalize`: `db.get` resolves to a fresh copy of `row`
+  // on every call and `db.patch` is a bare spy the tests assert against.
+  const ctxWithRow = (row: Record<string, unknown>) => ({
+    db: { get: vi.fn(async () => ({ ...row })), patch: vi.fn() },
+  });
+
+  it('refuses to overwrite a terminal row (watchdog-vs-action race)', async () => {
+    const ctx = ctxWithRow({
+      _id: 'exec_1',
+      status: 'failed',
+      errorCode: 'SPAWNER_UNAVAILABLE',
+    });
+    const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {});
+    // Restore in `finally` so a failing assertion cannot leave
+    // `console.warn` stubbed for the tests that run after this one.
+    try {
+      const result = await mut.handler(ctx, baseArgs);
+      expect(result).toBeNull();
+      expect(ctx.db.patch).not.toHaveBeenCalled();
+      expect(warnSpy).toHaveBeenCalled();
+    } finally {
+      warnSpy.mockRestore();
+    }
+  });
+
+  it('patches when the row is still in-flight', async () => {
+    const ctx = ctxWithRow({ _id: 'exec_1', status: 'running' });
+    await mut.handler(ctx, baseArgs);
+    expect(ctx.db.patch).toHaveBeenCalledWith(
+      'exec_1',
+      expect.objectContaining({ status: 'completed' }),
+    );
+  });
+});
+
+describe('insertOutputFiles', () => {
+  // Regression gate for the P0 fix (commit A). The sha256 carried in the
+  // spawner's harvest payload has to survive `insertOutputFiles` twice over:
+  // on the persisted `fileMetadata` row and in the returned `insertedFiles`.
+  // That lets the action's `runOutputFiles` mapping stop re-joining by
+  // filename, which is where sha256 used to get dropped silently.
+  const pngDigest = 'a'.repeat(64);
+  const csvDigest = 'b'.repeat(64);
+  const baseArgs = {
+    executionId: 'exec_1' as never,
+    organizationId: 'org_alpha',
+    threadId: 'thr_a',
+    uploadedBy: 'user_1',
+    files: [
+      {
+        name: 'chart.png',
+        storageId: 'kg_blob_1' as never,
+        size: 1024,
+        contentType: 'image/png',
+        sha256: pngDigest,
+      },
+      {
+        name: 'data.csv',
+        storageId: 'kg_blob_2' as never,
+        size: 2048,
+        contentType: 'text/csv',
+        sha256: csvDigest,
+      },
+    ],
+  };
+
+  // Fake ctx: `db.get` reports an audit row in `rowStatus`; `db.insert`
+  // records every call in `writes` and hands back ids `fm_1`, `fm_2`, ...
+  function makeCtx(rowStatus: string) {
+    const writes: Array<{ table: string; payload: Record<string, unknown> }> =
+      [];
+    const get = vi.fn(async () => ({ _id: 'exec_1', status: rowStatus }));
+    const insert = vi.fn(
+      async (table: string, payload: Record<string, unknown>) => {
+        writes.push({ table, payload });
+        return `fm_${writes.length}` as never;
+      },
+    );
+    return { ctx: { db: { get, insert } }, inserted: writes };
+  }
+
+  it('persists sha256 onto each fileMetadata row and returns it', async () => {
+    const mut = insertOutputFiles as unknown as MutHandler<
+      typeof baseArgs,
+      {
+        skippedTerminal: boolean;
+        insertedFiles: Array<{ name: string; sha256: string }>;
+      }
+    >;
+    const { ctx, inserted } = makeCtx('running');
+    const result = await mut.handler(ctx, baseArgs);
+
+    expect(result.skippedTerminal).toBe(false);
+    expect(result.insertedFiles).toHaveLength(2);
+    const [pngOut, csvOut] = result.insertedFiles;
+    expect(pngOut.sha256).toBe(pngDigest);
+    expect(csvOut.sha256).toBe(csvDigest);
+    // Regression gate: both persisted payloads carry the digest (the prior
+    // bug dropped it on the floor here).
+    expect(inserted).toHaveLength(2);
+    const [pngRow, csvRow] = inserted;
+    expect(pngRow.payload).toMatchObject({
+      fileName: 'chart.png',
+      sha256: pngDigest,
+    });
+    expect(csvRow.payload).toMatchObject({
+      fileName: 'data.csv',
+      sha256: csvDigest,
+    });
+  });
+
+  it('returns skippedTerminal:true when the audit row is already terminal', async () => {
+    const mut = insertOutputFiles as unknown as MutHandler<
+      typeof baseArgs,
+      { skippedTerminal: boolean; insertedFiles: unknown[] }
+    >;
+    const { ctx, inserted } = makeCtx('cancelled');
+    const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {});
+    const result = await mut.handler(ctx, baseArgs);
+
+    expect(result.skippedTerminal).toBe(true);
+    expect(result.insertedFiles).toEqual([]);
+    expect(inserted).toHaveLength(0);
+    expect(warnSpy).toHaveBeenCalled();
+    warnSpy.mockRestore();
+  });
+});
diff --git a/services/platform/convex/sandbox/internal_mutations.ts b/services/platform/convex/sandbox/internal_mutations.ts
new file mode 100644
index 000000000..e36ebb19b
--- /dev/null
+++ b/services/platform/convex/sandbox/internal_mutations.ts
@@ -0,0 +1,693 @@
+import { ConvexError, v } from 'convex/values';
+
+import type { Id } from '../_generated/dataModel';
+import {
+  internalMutation,
+  internalQuery,
+  type MutationCtx,
+} from '../_generated/server';
+import { applyFinalizeArtifactRun } from '../artifacts/internal_mutations';
+import { rateLimiter } from '../lib/rate_limiter';
+import {
+  SANDBOX_DAILY_CPU_BUDGET_SECONDS,
+  SANDBOX_MAX_CONCURRENT_PER_ORG,
+  SANDBOX_WATCHDOG_CUTOFF_MS,
+} from './schema';
+import {
+  sandboxErrorCodeValidator,
+  sandboxLanguageValidator,
+  sandboxOutputFileValidator,
+  sandboxStepResultValidator,
+  sandboxTerminalStatuses,
+  sandboxTruncatedValidator,
+} from './wire';
+
+const ONE_DAY_MS = 24 * 60 * 60 * 1000; // 24 hours in milliseconds
+const AUDIT_RETENTION_MS = 90 * ONE_DAY_MS; // GC threshold: 90 days
+const AUDIT_GC_PER_SWEEP = 100; // max audit rows deleted per cleanup sweep
+
+/**
+ * Opportunistic per-org garbage collection for `sandboxExecutions` audit
+ * rows past the retention window. A `cleanup:sandbox` rate-limit gate,
+ * keyed by org, spares busy orgs the scan cost on every insert, and each
+ * sweep deletes at most `AUDIT_GC_PER_SWEEP` rows so the mutation stays
+ * bounded — whatever is left over waits for a later sweep.
+ */
+async function maybeRunSandboxAuditCleanup(
+  ctx: MutationCtx,
+  organizationId: string,
+): Promise<void> {
+  // The gate is best-effort. When the rate-limiter component cannot be
+  // reached (e.g. a unit-test ctx mock that lacks `runMutation`) the sweep
+  // is skipped rather than crashing the parent reservation; cleanup is
+  // opportunistic, so a missed window costs nothing.
+  let gate: { ok: boolean };
+  try {
+    gate = await rateLimiter.limit(ctx, 'cleanup:sandbox', {
+      key: organizationId,
+      throws: false,
+    });
+  } catch (err) {
+    console.warn('[sandbox.cleanup] rate-limiter gate failed:', err);
+    return;
+  }
+  if (!gate.ok) return;
+
+  const expiryCutoff = Date.now() - AUDIT_RETENTION_MS;
+  const oldestFirst = ctx.db
+    .query('sandboxExecutions')
+    .withIndex('by_organizationId', (q) =>
+      q.eq('organizationId', organizationId),
+    )
+    .order('asc');
+  let deletesLeft = AUDIT_GC_PER_SWEEP;
+  for await (const auditRow of oldestFirst) {
+    // Ascending scan: the first row inside the retention window ends it.
+    if (auditRow._creationTime >= expiryCutoff) break;
+    // Rows that never reached a terminal status are skipped, however old.
+    if (!sandboxTerminalStatuses.has(auditRow.status)) continue;
+    // Release the code/stdout/stderr blobs first; deleting only the row
+    // would orphan them in `_storage` (audit finding R2-B7 #2). Blobs
+    // behind `outputFiles[*].storageId` are deliberately kept: sibling
+    // `fileMetadata` rows own them and govern their lifetime.
+    await deleteSandboxRowStorage(ctx, auditRow);
+    await ctx.db.delete(auditRow._id);
+    deletesLeft -= 1;
+    if (deletesLeft <= 0) break;
+  }
+}
+
+/**
+ * Best-effort removal of the `_storage` blobs an audit row owns, for rows
+ * about to be deleted (90-day retention sweep) or reaped (watchdog). Every
+ * delete has its own try/catch, so one missing blob only logs a warning
+ * and never aborts the parent mutation.
+ *
+ * Output-file blobs are deliberately left alone — `fileMetadata` rows own
+ * them, and that lifecycle handles their cleanup.
+ */
+async function deleteSandboxRowStorage(
+  ctx: MutationCtx,
+  row: {
+    codeStorageId?: Id<'_storage'>;
+    stdoutStorageId?: Id<'_storage'>;
+    stderrStorageId?: Id<'_storage'>;
+  },
+): Promise<void> {
+  const { codeStorageId, stdoutStorageId, stderrStorageId } = row;
+  const ownedBlobs = [codeStorageId, stdoutStorageId, stderrStorageId].filter(
+    (id): id is Id<'_storage'> => id !== undefined,
+  );
+  for (const blobId of ownedBlobs) {
+    try {
+      await ctx.storage.delete(blobId);
+    } catch (err) {
+      console.warn(`[sandbox.cleanup] storage.delete ${blobId} failed:`, err);
+    }
+  }
+}
+
+/**
+ * Deletes the orphan blobs that were reported via EP2
+ * (`applyRecordUploaded`). Used when the watchdog reaps a stuck row, OR
+ * when `failExecution` rolls back a failed run; mirrors the existing
+ * `uploadedStorageIds` rollback in the action's fail path — see plan §3.
+ * Best-effort: a failed delete is logged and the loop carries on.
+ */
+async function deleteReportedUploadedBlobs(
+  ctx: MutationCtx,
+  uploaded: ReadonlyArray<Id<'_storage'>> | undefined,
+): Promise<void> {
+  for (const blobId of uploaded ?? []) {
+    try {
+      await ctx.storage.delete(blobId);
+    } catch (err) {
+      console.warn(
+        `[sandbox.cleanup] uploadedStorageIds delete ${blobId} failed:`,
+        err,
+      );
+    }
+  }
+}
+
+/**
+ * Atomic concurrency-cap + daily-CPU-budget reservation: inserts a `queued`
+ * audit row and returns its id, or throws `ConvexError` (`QUOTA_EXCEEDED`).
+ * Convex mutations are serializable with OCC: the by_organizationId_and_status
+ * index range read here is recorded in the read set, so two parallel
+ * reservations that both see "3/4 in flight" cannot both insert — one
+ * retries. This closes the TOCTOU race R1.8/R1.10 flagged.
+ *
+ * Daily budget = seconds charged to terminal rows of the last 24h
+ * (actualSeconds, else estimatedSeconds) + estimatedSeconds of every
+ * in-flight row + this call's estimate, pre-debited so concurrent runs
+ * cannot collectively overshoot.
+ */
+export const reserveSlotAndInsert = internalMutation({
+  args: {
+    organizationId: v.string(),
+    uploadedBy: v.string(),
+    threadId: v.optional(v.string()),
+    messageId: v.optional(v.string()),
+    toolCallId: v.optional(v.string()),
+    agentSlug: v.optional(v.string()),
+    artifactId: v.optional(v.id('artifacts')),
+    /** For artifact-bound runs: which file in the project was executed. */
+    path: v.optional(v.string()),
+    language: sandboxLanguageValidator,
+    purpose: v.optional(v.string()),
+    codePreview: v.string(),
+    codeStorageId: v.optional(v.id('_storage')),
+    packages: v.array(v.string()),
+    installOptions: v.optional(
+      v.object({
+        allowSdist: v.optional(v.boolean()),
+        allowInstallScripts: v.optional(v.boolean()),
+      }),
+    ),
+    estimatedSeconds: v.number(),
+  },
+  returns: v.id('sandboxExecutions'),
+  handler: async (ctx, args) => {
+    const now = Date.now();
+
+    // Concurrent cap. Short-circuit at the cap; never materialise the full set.
+    // `running`, `queued`, and `installing` rows all count: the cap is
+    // "in-flight", not "actively executing". This must agree with the
+    // watchdog (below), which sweeps the same three states — otherwise a
+    // leaked row in an unswept state would hold its slot indefinitely.
+    let inFlight = 0;
+    let runningSecondsProjected = 0;
+    for (const status of ['running', 'queued', 'installing'] as const) {
+      for await (const row of ctx.db
+        .query('sandboxExecutions')
+        .withIndex('by_organizationId_and_status', (q) =>
+          q.eq('organizationId', args.organizationId).eq('status', status),
+        )) {
+        inFlight += 1;
+        runningSecondsProjected += row.estimatedSeconds;
+        if (inFlight >= SANDBOX_MAX_CONCURRENT_PER_ORG) {
+          throw new ConvexError({
+            code: 'QUOTA_EXCEEDED',
+            message: `At most ${SANDBOX_MAX_CONCURRENT_PER_ORG} sandboxes can run concurrently for this organization.`,
+          });
+        }
+      }
+    }
+
+    // Daily CPU-second budget. Today = last 24h sliding window keyed by
+    // `_creationTime`. Reusing `by_organizationId` index (per `videoLinkJobs`
+    // convention) keeps the scan bounded for typical orgs (≤dozens/day).
+    const dayCutoff = now - ONE_DAY_MS;
+    let completedToday = 0;
+    for await (const row of ctx.db
+      .query('sandboxExecutions')
+      .withIndex('by_organizationId', (q) =>
+        q.eq('organizationId', args.organizationId),
+      )
+      .order('desc')) {
+      if (row._creationTime < dayCutoff) break;
+      // Cancelled rows count too: the spawner still spent CPU bringing the
+      // container up before the cancel landed, and treating cancels as
+      // "free" would let an abusive caller burst spawn/abort the same
+      // execution to bypass the budget. If we ever want to refund early
+      // cancels (e.g. cancelled in the queued state with no work done),
+      // do it explicitly on the cancel path, not implicitly here.
+      if (
+        row.status === 'completed' ||
+        row.status === 'failed' ||
+        row.status === 'cancelled'
+      ) {
+        completedToday += row.actualSeconds ?? row.estimatedSeconds;
+      }
+    }
+    if (
+      completedToday + runningSecondsProjected + args.estimatedSeconds >
+      SANDBOX_DAILY_CPU_BUDGET_SECONDS
+    ) {
+      throw new ConvexError({
+        code: 'QUOTA_EXCEEDED',
+        message: `Daily CPU-second budget exceeded (${SANDBOX_DAILY_CPU_BUDGET_SECONDS}s/org). Try again tomorrow or split the work.`,
+      });
+    }
+
+    const executionId = await ctx.db.insert('sandboxExecutions', {
+      organizationId: args.organizationId,
+      uploadedBy: args.uploadedBy,
+      ...(args.threadId !== undefined && { threadId: args.threadId }),
+      ...(args.messageId !== undefined && { messageId: args.messageId }),
+      ...(args.toolCallId !== undefined && { toolCallId: args.toolCallId }),
+      ...(args.agentSlug !== undefined && { agentSlug: args.agentSlug }),
+      ...(args.artifactId !== undefined && { artifactId: args.artifactId }),
+      ...(args.path !== undefined && { path: args.path }),
+      // Normalize the audit field: always store an object with explicit
+      // booleans (default false) so a future read-side default-divergence
+      // can't quietly invert the meaning. The legacy conditional-spread
+      // stored either `undefined` or a partial object, depending on the
+      // caller's args shape.
+      installOptions: {
+        allowSdist: args.installOptions?.allowSdist ?? false,
+        allowInstallScripts: args.installOptions?.allowInstallScripts ?? false,
+      },
+      language: args.language,
+      ...(args.purpose !== undefined && { purpose: args.purpose }),
+      codePreview: args.codePreview,
+      ...(args.codeStorageId !== undefined && {
+        codeStorageId: args.codeStorageId,
+      }),
+      packages: args.packages,
+      status: 'queued',
+      statusChangedAt: now,
+      heartbeatAt: now,
+      estimatedSeconds: args.estimatedSeconds,
+      outputFiles: [],
+      startedAt: now,
+    });
+    // Opportunistic per-org GC of audit rows older than 90 days. Gated by
+    // a 1/hour rate limiter so we don't scan on every insert. Done AFTER
+    // the insert (vs. before) so a quota-rejected insert doesn't waste
+    // the GC window.
+    await maybeRunSandboxAuditCleanup(ctx, args.organizationId);
+    return executionId;
+  },
+});
+
+/**
+ * Moves an audit row from `queued` to `installing`, stamping
+ * `statusChangedAt` and `heartbeatAt`. Any other state (or a missing row)
+ * is a no-op: the transition never rolls back or resurrects a row.
+ */
+export const setRunning = internalMutation({
+  args: {
+    executionId: v.id('sandboxExecutions'),
+    // `installing` is the only status written here. The spawner emits a
+    // separate `running` SSE event later, but the audit row is not patched
+    // for it: the lifecycle is queued → installing → terminal. Keeping the
+    // validator this tight stops a regression from reintroducing `running`.
+    status: v.optional(v.literal('installing')),
+  },
+  returns: v.null(),
+  handler: async (ctx, { executionId, status }) => {
+    const current = await ctx.db.get(executionId);
+    if (!current || current.status !== 'queued') return null;
+    const stamp = Date.now();
+    await ctx.db.patch(executionId, {
+      status: status ?? 'installing',
+      statusChangedAt: stamp,
+      heartbeatAt: stamp,
+    });
+    return null;
+  },
+});
+
+/** Liveness ping: refreshes `heartbeatAt` on a `running`/`installing` row. */
+export const heartbeat = internalMutation({
+  args: { executionId: v.id('sandboxExecutions') },
+  returns: v.null(),
+  handler: async (ctx, { executionId }) => {
+    const status = (await ctx.db.get(executionId))?.status;
+    const isLive = status === 'running' || status === 'installing';
+    if (isLive) await ctx.db.patch(executionId, { heartbeatAt: Date.now() });
+    return null;
+  },
+});
+
+/**
+ * Settles an audit row into a terminal state; always returns null. A
+ * missing row is a silent no-op. Idempotent w.r.t. duplicate Convex retries
+ * AND races with the watchdog: a row that is already terminal is left alone
+ * (no-op + warn), so a late-arriving result from the action cannot clobber
+ * the `SPAWNER_UNAVAILABLE` audit data a watchdog reap wrote.
+ */
+export const finalize = internalMutation({
+  args: {
+    executionId: v.id('sandboxExecutions'),
+    status: v.union(
+      v.literal('completed'),
+      v.literal('failed'),
+      v.literal('cancelled'),
+    ),
+    exitCode: v.optional(v.number()),
+    errorCode: v.optional(sandboxErrorCodeValidator),
+    errorMessage: v.optional(v.string()),
+    stdoutPreview: v.optional(v.string()),
+    stderrPreview: v.optional(v.string()),
+    stdoutStorageId: v.optional(v.id('_storage')),
+    stderrStorageId: v.optional(v.id('_storage')),
+    outputFiles: v.array(sandboxOutputFileValidator),
+    truncated: v.optional(sandboxTruncatedValidator),
+    durationMs: v.number(),
+    actualSeconds: v.number(),
+    /**
+     * Per-step results when the underlying run was multi-step. Single-step
+     * runs leave this undefined; the column is sparse and only patched
+     * when present.
+     */
+    steps: v.optional(v.array(sandboxStepResultValidator)),
+    /**
+     * Presigned-URL upload telemetry from the spawner (sandbox-wobbly-
+     * origami plan §5). Optional + sparse — older spawner builds don't
+     * emit these fields; new builds populate them with per-file outcome
+     * + per-phase timing.
+     */
+    uploadStats: v.optional(
+      v.object({
+        attempted: v.number(),
+        succeeded: v.number(),
+        failures: v.array(
+          v.object({
+            slotIndex: v.number(),
+            fileName: v.string(),
+            httpStatus: v.number(),
+            errorSnippet: v.string(),
+          }),
+        ),
+      }),
+    ),
+    timing: v.optional(
+      v.object({
+        stageMs: v.number(),
+        executeMs: v.number(),
+        harvestMs: v.number(),
+        uploadMs: v.number(),
+      }),
+    ),
+  },
+  returns: v.null(),
+  handler: async (ctx, args) => {
+    const row = await ctx.db.get(args.executionId);
+    if (!row) return null;
+    if (sandboxTerminalStatuses.has(row.status)) {
+      // Late-arriving result vs. an earlier terminal write (e.g. a watchdog
+      // reap). Authority belongs to whoever wrote first — keep theirs.
+      console.warn(
+        `[sandbox.finalize] no-op: row ${row._id} already terminal as ${row.status}; dropping incoming ${args.status}`,
+      );
+      return null;
+    }
+    const now = Date.now();
+    await ctx.db.patch(args.executionId, {
+      status: args.status,
+      statusChangedAt: now,
+      completedAt: now,
+      durationMs: args.durationMs,
+      actualSeconds: args.actualSeconds,
+      ...(args.exitCode !== undefined && { exitCode: args.exitCode }),
+      ...(args.errorCode !== undefined && { errorCode: args.errorCode }),
+      ...(args.errorMessage !== undefined && {
+        errorMessage: args.errorMessage,
+      }),
+      ...(args.stdoutPreview !== undefined && {
+        stdoutPreview: args.stdoutPreview,
+      }),
+      ...(args.stderrPreview !== undefined && {
+        stderrPreview: args.stderrPreview,
+      }),
+      ...(args.stdoutStorageId !== undefined && {
+        stdoutStorageId: args.stdoutStorageId,
+      }),
+      ...(args.stderrStorageId !== undefined && {
+        stderrStorageId: args.stderrStorageId,
+      }),
+      outputFiles: args.outputFiles,
+      ...(args.truncated !== undefined && { truncated: args.truncated }),
+      ...(args.steps !== undefined && { steps: args.steps }),
+      ...(args.uploadStats !== undefined && { uploadStats: args.uploadStats }),
+      ...(args.timing !== undefined && { timing: args.timing }),
+    });
+    return null;
+  },
+});
+
+// Upper bound on how many rows per status one watchdog run loads. Convex
+// mutations have a doc-read/-write budget; an unbounded scan can hit it
+// and abort mid-sweep, leaving the trailing rows stuck (audit finding
+// R2-B6 #1). Cron re-runs every 5 min, so leftovers get picked up.
+const WATCHDOG_REAP_PER_STATUS = 200;
+
+/**
+ * Watchdog cron target: flips long-stuck rows to failed/SPAWNER_UNAVAILABLE
+ * and returns how many rows it reaped.
+ *
+ * A Convex 30-min hard-kill skips the action's `try/finally`; without
+ * this sweep the audit row would stay non-terminal forever and the slot
+ * it holds would permanently shrink the org's concurrent cap. The action
+ * keeps `heartbeatAt` fresh, so a row is declared stuck once that stamp
+ * is older than `SANDBOX_WATCHDOG_CUTOFF_MS` (documented as 2×max_timeout).
+ * `queued`, `installing`, AND `running` are all swept — a throw between
+ * `reserveSlotAndInsert` and `setRunning` strands a row in `queued`,
+ * which would otherwise leak a quota slot.
+ */
+export const recoverStuckSandboxes = internalMutation({
+  args: {},
+  returns: v.number(),
+  handler: async (ctx) => {
+    const staleBefore = Date.now() - SANDBOX_WATCHDOG_CUTOFF_MS;
+    let reapedCount = 0;
+    for (const status of ['running', 'installing', 'queued'] as const) {
+      const batch = await ctx.db
+        .query('sandboxExecutions')
+        .withIndex('by_status', (q) => q.eq('status', status))
+        .take(WATCHDOG_REAP_PER_STATUS);
+      for (const candidate of batch) {
+        const heartbeatIsFresh = candidate.heartbeatAt >= staleBefore;
+        if (heartbeatIsFresh) continue;
+        const reapedAt = Date.now();
+        await ctx.db.patch(candidate._id, {
+          status: 'failed',
+          statusChangedAt: reapedAt,
+          completedAt: reapedAt,
+          errorCode: 'SPAWNER_UNAVAILABLE',
+          errorMessage: `Watchdog reaped a stuck ${status} row`,
+          actualSeconds: candidate.estimatedSeconds,
+        });
+        // Best-effort blob cleanup, so a reap doesn't leave the
+        // code/stdout/stderr blobs orphaned for the whole 90-day audit
+        // retention window (audit finding R2-B7 #2 follow-up).
+        await deleteSandboxRowStorage(ctx, candidate);
+        // Sandbox-wobbly-origami: output blobs the spawner reported via
+        // EP2 (`applyRecordUploaded`) before crashing never reached a
+        // `fileMetadata` row; this audit row's `uploadedStorageIds` is
+        // their only owner, so reclaim them here as well.
+        await deleteReportedUploadedBlobs(ctx, candidate.uploadedStorageIds);
+        // Artifact-bound run: cascade the failure so the canvas spinner
+        // stops as soon as the watchdog runs, instead of spinning until
+        // the audit row TTLs out.
+        if (candidate.artifactId) {
+          await applyFinalizeArtifactRun(ctx, {
+            artifactId: candidate.artifactId,
+            runStatus: 'failed',
+            runErrorCode: 'SPAWNER_UNAVAILABLE',
+            runErrorMessage: `Watchdog reaped a stuck ${status} sandbox execution`,
+            runOutputFiles: [],
+            runExecutionId: candidate._id,
+          });
+        }
+        reapedCount += 1;
+      }
+    }
+    return reapedCount;
+  },
+});
+
+/**
+ * Finds every non-terminal `sandboxExecutions` row tied to a thread. Part
+ * of the user-Stop cascade: when `cancel_generation` fires, the
+ * `cancelExecutionsForThread` action asks this query what to kill, then
+ * issues `spawnerCancel` + `cancelExecutionRecord` for each hit.
+ *
+ * Returns a trimmed projection (`_id`, plus `artifactId` when the row has
+ * one) because the caller doesn't need the full doc — that keeps the
+ * query cheap.
+ */
+export const listNonTerminalByThread = internalQuery({
+  // `threadId` is stored as `v.string()` on `sandboxExecutions`: the
+  // upstream `threads` table comes from `@convex-dev/agent`, so the
+  // platform schema never sees its branded `Id<'threads'>` directly.
+  // Accept the same `v.string()` here to match.
+  args: { threadId: v.string() },
+  returns: v.array(
+    v.object({
+      _id: v.id('sandboxExecutions'),
+      artifactId: v.optional(v.id('artifacts')),
+    }),
+  ),
+  handler: async (ctx, { threadId }) => {
+    const threadRows = await ctx.db
+      .query('sandboxExecutions')
+      .withIndex('by_threadId', (q) => q.eq('threadId', threadId))
+      .collect();
+    return (
+      threadRows
+        // Rows already in a terminal state have nothing left to cancel.
+        .filter((row) => !sandboxTerminalStatuses.has(row.status))
+        // Omit the `artifactId` key entirely when the row has none,
+        // rather than emitting it as `undefined`.
+        .map((row) =>
+          row.artifactId !== undefined
+            ? { _id: row._id, artifactId: row.artifactId }
+            : { _id: row._id },
+        )
+    );
+  },
+});
+
+/**
+ * Seeds the presigned-URL upload slots and the quota counter on the audit
+ * row. The action calls this right after `reserveSlotAndInsert`, before
+ * it dispatches the request to the spawner. A missing or already-terminal
+ * row is left untouched. Idempotent: writing the same slots twice is
+ * harmless, but mid-flight slot rotation isn't supported (the spawner
+ * already holds the URLs in memory).
+ *
+ * `quotaRemaining` is how many additional URLs EP1 may still grant once
+ * the pre-allocated slots are subtracted — e.g. 14 when 2 slots are
+ * pre-allocated and SANDBOX_MAX_OUTPUT_FILES_PER_RUN=16.
+ */
+export const applyInitOutputSlots = internalMutation({
+  args: {
+    executionId: v.id('sandboxExecutions'),
+    slots: v.array(v.string()),
+    quotaRemaining: v.number(),
+  },
+  returns: v.null(),
+  handler: async (ctx, { executionId, slots, quotaRemaining }) => {
+    const row = await ctx.db.get(executionId);
+    if (row && !sandboxTerminalStatuses.has(row.status)) {
+      await ctx.db.patch(executionId, {
+        outputUploadSlots: slots,
+        outputUrlQuotaRemaining: quotaRemaining,
+      });
+    }
+    return null;
+  },
+});
+
+/**
+ * Server-side per-run quota counter. Spawner POSTs to EP1
+ * (`/api/sandbox/output_upload_url`) when its local slot pool runs dry;
+ * the httpAction calls this mutation to atomically decrement and reports
+ * how many URLs were granted (always a whole number ≥ 0). Returns
+ * `granted: 0` if the row is missing or terminal, the quota is exhausted,
+ * or `count` is NaN/below 1 — caller responds with 412 in that case.
+ */
+export const applyConsumeUrlQuota = internalMutation({
+  args: {
+    executionId: v.id('sandboxExecutions'),
+    count: v.number(),
+  },
+  returns: v.object({
+    granted: v.number(),
+    remaining: v.number(),
+  }),
+  handler: async (ctx, args) => {
+    const row = await ctx.db.get(args.executionId);
+    if (!row) return { granted: 0, remaining: 0 };
+    const remaining = row.outputUrlQuotaRemaining ?? 0;
+    if (sandboxTerminalStatuses.has(row.status)) {
+      return { granted: 0, remaining };
+    }
+    // `v.number()` admits any float64. Floor the request and treat NaN as
+    // zero so a grant is always a whole, non-negative number of URLs.
+    const wanted = Math.floor(Math.min(args.count, remaining));
+    const granted = Number.isNaN(wanted) ? 0 : Math.max(0, wanted);
+    if (granted === 0) return { granted: 0, remaining };
+    const nextRemaining = remaining - granted;
+    await ctx.db.patch(args.executionId, {
+      outputUrlQuotaRemaining: nextRemaining,
+    });
+    return { granted, remaining: nextRemaining };
+  },
+});
+
+/**
+ * Adds a storage id to the audit row's `uploadedStorageIds` rollback set
+ * and refreshes `heartbeatAt`. Spawner POSTs to EP2
+ * (`/api/sandbox/record_uploaded`) after each successful per-file upload;
+ * the httpAction calls this. Terminal-state rows are refused (the run is
+ * over, no point recording new uploads).
+ *
+ * Note: no `outputFiles` entry is written here — those are written
+ * transactionally by `output_mutations.insertOutputFiles` when the spawner
+ * result event lands. EP2 only feeds the rollback set, so a spawner crash
+ * between a successful EP2 and the final SSE result doesn't orphan the blob.
+ */
+export const applyRecordUploaded = internalMutation({
+  args: {
+    executionId: v.id('sandboxExecutions'),
+    fileName: v.string(),
+    storageId: v.id('_storage'),
+    size: v.number(),
+    contentType: v.string(),
+  },
+  returns: v.null(),
+  handler: async (ctx, { executionId, fileName, storageId }) => {
+    const row = await ctx.db.get(executionId);
+    if (!row) return null;
+    if (sandboxTerminalStatuses.has(row.status)) {
+      // Too late: the run is already terminal. Leave the rollback set
+      // alone — the final state may already have been computed, and an
+      // extra id could let a stale `failExecution` delete a blob that is
+      // now expected to be kept.
+      console.warn(
+        `[sandbox.applyRecordUploaded] late EP2 for terminal row ${row._id} (status=${row.status}); ignoring ${fileName}`,
+      );
+      return null;
+    }
+    const recorded = row.uploadedStorageIds ?? [];
+    // Idempotency: the spawner may retry EP2 after a network blip, so
+    // skip ids that are already recorded. The set is small (cap =
+    // MAX_OUTPUT_FILES_PER_RUN), which makes the linear scan fine.
+    if (recorded.includes(storageId)) return null;
+    await ctx.db.patch(executionId, {
+      uploadedStorageIds: recorded.concat(storageId),
+      heartbeatAt: Date.now(),
+    });
+    return null;
+  },
+});
+
+/**
+ * User-Stop terminal transition. Kept apart from `finalize` because there
+ * is no spawner result to merge: the row is simply marked `cancelled`
+ * with the canonical error code, and the cancellation cascades to the
+ * bound artifact so the canvas spinner clears in the same Convex tick.
+ * Idempotent: a row already in a terminal state is left alone (the
+ * watchdog or a spawner result may have raced ahead).
+ */
+export const cancelExecutionRecord = internalMutation({
+  args: {
+    executionId: v.id('sandboxExecutions'),
+    reason: v.optional(v.string()),
+  },
+  returns: v.null(),
+  handler: async (ctx, { executionId, reason }) => {
+    const row = await ctx.db.get(executionId);
+    if (!row || sandboxTerminalStatuses.has(row.status)) return null;
+
+    const cancelledAt = Date.now();
+    const errorMessage = reason ?? 'Execution cancelled by user';
+    // Seconds charged for the run: wall-clock time since `startedAt`, but
+    // never less than the up-front estimate.
+    const elapsedSeconds = (cancelledAt - row.startedAt) / 1000;
+    await ctx.db.patch(executionId, {
+      status: 'cancelled',
+      statusChangedAt: cancelledAt,
+      completedAt: cancelledAt,
+      errorCode: 'CANCELLED',
+      errorMessage,
+      actualSeconds: Math.max(elapsedSeconds, row.estimatedSeconds),
+    });
+    // Only artifact-bound executions have a run state to cascade into.
+    if (!row.artifactId) return null;
+    await applyFinalizeArtifactRun(ctx, {
+      artifactId: row.artifactId,
+      runStatus: 'cancelled',
+      runErrorCode: 'CANCELLED',
+      runErrorMessage: errorMessage,
+      runOutputFiles: [],
+      runExecutionId: row._id,
+    });
+    return null;
+  },
+});
diff --git a/services/platform/convex/sandbox/output_mutations.ts b/services/platform/convex/sandbox/output_mutations.ts
new file mode 100644
index 000000000..aa350d06f
--- /dev/null
+++ b/services/platform/convex/sandbox/output_mutations.ts
@@ -0,0 +1,100 @@
+// Internal mutations the sandbox Node action uses to commit storage uploads
+// transactionally. Kept in the non-`use node` module because mutations don't
+// run in the Node runtime.
+
+import { v } from 'convex/values';
+
+import type { Id } from '../_generated/dataModel';
+import { internalMutation } from '../_generated/server';
+import { sandboxTerminalStatuses } from './wire';
+
+const outputFileValidator = v.object({
+  name: v.string(), // persisted as `fileMetadata.fileName`
+  storageId: v.id('_storage'), // blob the action already uploaded
+  size: v.number(),
+  contentType: v.string(),
+  // SHA-256 (hex) computed by the spawner during harvest. Required at this
+  // hop — the spawner always emits it for new uploads (parity-guarded by
+  // `HarvestOutputFile` in wire.ts). Persisted onto the `fileMetadata` row
+  // so downstream readers (artifactOutputs, attestation) don't have to
+  // re-fetch it from the spawner result.
+  sha256: v.string(),
+});
+
+/**
+ * Atomically records `fileMetadata` rows for output blobs the action has
+ * already uploaded to `_storage`. Inserts are all-or-nothing: if one fails
+ * the whole mutation aborts and the caller deletes the orphaned blobs.
+ *
+ * Terminal-state guard (same posture as `finalize`, audit follow-up F6):
+ * if the audit row went terminal between the spawner's SSE `result` event
+ * and this mutation (e.g. the user clicked Stop just before the harvest
+ * landed), nothing is inserted and `{skippedTerminal: true}` is returned
+ * so the caller skips `uploadedStorageIds.clear()` and the
+ * `failExecution`-style rollback can delete the orphan blobs.
+ */
+export const insertOutputFiles = internalMutation({
+  args: {
+    executionId: v.id('sandboxExecutions'),
+    organizationId: v.string(),
+    threadId: v.optional(v.string()),
+    uploadedBy: v.string(),
+    files: v.array(outputFileValidator),
+  },
+  returns: v.object({
+    skippedTerminal: v.boolean(),
+    insertedFiles: v.array(
+      v.object({
+        name: v.string(),
+        fileMetadataId: v.id('fileMetadata'),
+        storageId: v.id('_storage'),
+        size: v.number(),
+        contentType: v.string(),
+        sha256: v.string(),
+      }),
+    ),
+  }),
+  handler: async (ctx, args) => {
+    const { organizationId, threadId, uploadedBy, files } = args;
+    const execution = await ctx.db.get(args.executionId);
+    if (execution !== null && sandboxTerminalStatuses.has(execution.status)) {
+      console.warn(
+        `[sandbox.insertOutputFiles] no-op: row ${execution._id} already terminal as ${execution.status}; caller must roll back ${files.length} blob(s)`,
+      );
+      return { skippedTerminal: true, insertedFiles: [] };
+    }
+    const insertedAt = Date.now();
+    const insertedFiles: {
+      name: string;
+      fileMetadataId: Id<'fileMetadata'>;
+      storageId: Id<'_storage'>;
+      size: number;
+      contentType: string;
+      sha256: string;
+    }[] = [];
+    for (const { name, storageId, size, contentType, sha256 } of files) {
+      const fileMetadataId = await ctx.db.insert('fileMetadata', {
+        organizationId,
+        storageId,
+        ...(threadId === undefined ? {} : { threadId }),
+        uploadedBy,
+        fileName: name,
+        contentType,
+        size,
+        sha256,
+        source: 'agent',
+        lifecycleStatus: 'active',
+        statusChangedAt: insertedAt,
+      });
+      insertedFiles.push({
+        name,
+        fileMetadataId,
+        storageId,
+        size,
+        contentType,
+        sha256,
+      });
+    }
+    return { skippedTerminal: false, insertedFiles };
+  },
+});
diff --git a/services/platform/convex/sandbox/sandbox_http.ts b/services/platform/convex/sandbox/sandbox_http.ts
new file mode 100644
index 000000000..a18d9641d
--- /dev/null
+++ b/services/platform/convex/sandbox/sandbox_http.ts
@@ -0,0 +1,339 @@
+// HTTP callback endpoints the sandbox spawner uses to negotiate
+// presigned upload URLs and report each successful storage write.
+//
+// Routes (registered in `convex/http.ts`, proxied through Caddy
+// `handle /api/sandbox/*` → convex:3211):
+//
+//   EP1: POST /api/sandbox/output_upload_url
+//     Body:   {executionId: string, count: number}
+//     200:    {urls: string[], remainingQuota: number}
+//     412:    {code: "QUOTA_EXCEEDED"}            — per-run quota exhausted
+//     401:    {error: "unauthorized"}             — HMAC verify failed
+//     400:    {error: "bad_request", ...}
+//
+//   EP2: POST /api/sandbox/record_uploaded
+//     Body:   {executionId, fileName, storageId, size, contentType}
+//     200:    {ok: true}
+//     401/400 as above.
+//
+// HMAC contract (mirrors services/sandbox/src/auth.ts):
+//   signedString = `${METHOD}\n${path}\n${timestamp}\n${sha256Hex(body)}`
+//   signature    = HMAC-SHA256(SANDBOX_TOKEN, signedString)
+// Both sides share the same SANDBOX_TOKEN so we don't introduce a new
+// secret-management surface (see plan §2).
+
+// Web Crypto API (V8 runtime, no `'use node'` directive needed). The
+// spawner-side mirror in services/sandbox/src/sandbox-callback.ts uses
+// node:crypto, but the produced hex digests are byte-identical so the
+// two sides interoperate. Using Web Crypto here keeps the httpAction in
+// the fast V8 isolate path instead of paying Node-runtime cold-start
+// overhead per upload-slot RPC.
+import { internal } from '../_generated/api';
+import { httpAction } from '../_generated/server';
+import { toSandboxStorageUrl } from '../lib/helpers/public_storage_url';
+import { toId } from '../lib/type_cast_helpers';
+
+const SIGNATURE_HEADER = 'x-tale-sandbox-signature';
+const TIMESTAMP_HEADER = 'x-tale-sandbox-timestamp'; // epoch milliseconds
+// Matches the spawner-side window in services/sandbox/src/auth.ts:29.
+// Keeping the two sides symmetric simplifies the threat model (replay
+// surface is the same in either direction) and 30s is enough for any
+// realistic Convex action latency + Caddy hop.
+const TIMESTAMP_TOLERANCE_MS = 30_000;
+
+// Nonce cache mirrors services/sandbox/src/auth.ts:36-52 — bounds the
+// replay window even within the skew tolerance. Module-level state lives
+// for the lifetime of the V8 isolate; on isolate recycle the cache
+// resets, but the spawner-side cache is authoritative for the
+// Convex→spawner direction anyway. This is defense-in-depth on the
+// spawner→Convex direction (EP1 quota drain / EP2 storageId planting).
+const NONCE_TTL_MS = TIMESTAMP_TOLERANCE_MS + 5_000;
+const NONCE_SWEEP_INTERVAL = 100; // signature-valid calls between sweeps
+const seenSignatures = new Map<string, number>(); // signature → expiry (ms)
+let verifyCallsSinceSweep = 0;
+
+function maybeSweepNonces(now: number): void {
+  verifyCallsSinceSweep = (verifyCallsSinceSweep + 1) % NONCE_SWEEP_INTERVAL;
+  // Counter wrapped to 0 → this is the NONCE_SWEEP_INTERVAL-th call: sweep.
+  if (verifyCallsSinceSweep !== 0) return;
+  seenSignatures.forEach((expiresAt, signature) => {
+    if (expiresAt <= now) seenSignatures.delete(signature);
+  });
+}
+
+function jsonResponse(body: unknown, status: number): Response {
+  // Every endpoint reply is JSON; `body` is serialized as-is.
+  const headers = { 'content-type': 'application/json' };
+  const payload = JSON.stringify(body);
+  return new Response(payload, { status, headers });
+}
+
+function toHex(bytes: ArrayBuffer): string {
+  // Lower-case hex, two digits per byte (e.g. 0x0a -> "0a").
+  const pairs: string[] = [];
+  for (const byte of new Uint8Array(bytes)) {
+    pairs.push(byte.toString(16).padStart(2, '0'));
+  }
+  return pairs.join('');
+}
+
+async function sha256Hex(input: string): Promise<string> {
+  // SHA-256 over the UTF-8 bytes of `input`, rendered as lower-case hex.
+  const utf8Bytes = new TextEncoder().encode(input);
+  return toHex(await crypto.subtle.digest('SHA-256', utf8Bytes));
+}
+
+async function hmacSha256Hex(token: string, payload: string): Promise<string> {
+  const utf8 = new TextEncoder();
+  // Import the shared secret as a non-extractable, sign-only HMAC key.
+  const key = await crypto.subtle.importKey(
+    'raw',
+    utf8.encode(token),
+    { name: 'HMAC', hash: 'SHA-256' },
+    false,
+    ['sign'],
+  );
+  return toHex(await crypto.subtle.sign('HMAC', key, utf8.encode(payload)));
+}
+
+/**
+ * Constant-time, case-sensitive equality for hex digests; stands in for
+ * Node's `crypto.timingSafeEqual`, which the V8 runtime doesn't expose.
+ * A length mismatch returns false early — non-secret, since the signature
+ * header length is attacker-controlled anyway. Equal-length inputs are
+ * always scanned in full: no early exit at the first differing char.
+ */
+function timingSafeHexEqual(a: string, b: string): boolean {
+  if (a.length !== b.length) return false;
+  let acc = 0;
+  for (let i = 0; i < a.length; i += 1) {
+    acc |= a.charCodeAt(i) ^ b.charCodeAt(i); // non-zero once any pair differs
+  }
+  return acc === 0;
+}
+
+// Authenticates one spawner→Convex request. Checks run in order: header
+// presence, timestamp sanity and skew, HMAC equality, then the replay
+// cache. `reason` is a discriminator intended for server-side logs.
+async function verifyHmac(
+  method: string,
+  path: string,
+  body: string,
+  signatureHeader: string | null,
+  timestampHeader: string | null,
+  token: string,
+  nowMs: number = Date.now(),
+): Promise<{ ok: true } | { ok: false; reason: string }> {
+  const reject = (reason: string) => ({ ok: false as const, reason });
+  if (!signatureHeader) return reject('missing_signature');
+  if (!timestampHeader) return reject('missing_timestamp');
+  const ts = Number(timestampHeader);
+  if (!Number.isFinite(ts) || ts <= 0) return reject('bad_timestamp');
+  if (Math.abs(nowMs - ts) > TIMESTAMP_TOLERANCE_MS) {
+    return reject('timestamp_skew');
+  }
+  const bodyHash = await sha256Hex(body);
+  const signedString = `${method.toUpperCase()}\n${path}\n${timestampHeader}\n${bodyHash}`;
+  const expected = await hmacSha256Hex(token, signedString);
+  if (!timingSafeHexEqual(expected, signatureHeader)) {
+    return reject('bad_signature');
+  }
+  // Replay guard. A timestamp ahead of our clock keeps passing the skew
+  // check until ts + tolerance, so the cache entry is anchored at
+  // max(nowMs, ts); anchoring at nowMs alone would let it expire too early.
+  maybeSweepNonces(nowMs);
+  const cached = seenSignatures.get(signatureHeader);
+  if (cached !== undefined && cached > nowMs) return reject('replay');
+  seenSignatures.set(signatureHeader, Math.max(nowMs, ts) + NONCE_TTL_MS);
+  return { ok: true };
+}
+
+function getSandboxToken(): string | null {
+  // Unset and empty-string values both count as "no token configured".
+  return process.env.SANDBOX_TOKEN || null;
+}
+
+async function readBody(req: Request): Promise<string> {
+  return req.text(); // raw text: hashed for the HMAC check, then JSON-parsed
+}
+
+function parsePathFromUrl(rawUrl: string): string {
+  let pathname = rawUrl;
+  try {
+    pathname = new URL(rawUrl).pathname;
+  } catch {
+    // Malformed Request.url (unexpected): keep raw string, never throw.
+  }
+  return pathname;
+}
+
+/**
+ * EP1: presigned-URL upload-slot vendor.
+ *
+ * Spawner asks for `count` more upload URLs (`count` must be an integer in
+ * 1..16). We debit the per-run quota via `applyConsumeUrlQuota`, then call
+ * `ctx.storage.generateUploadUrl()` `granted` times, rewriting each URL
+ * through `toSandboxStorageUrl()` so the spawner can POST through the
+ * internal Caddy alias. Returns 412 + QUOTA_EXCEEDED when the run has hit
+ * its per-run output-file cap (`SANDBOX_MAX_OUTPUT_FILES_PER_RUN`).
+ */
+export const outputUploadUrlAction = httpAction(async (ctx, req) => {
+  const path = parsePathFromUrl(req.url);
+  const body = await readBody(req);
+  // NOTE(review): fail-open — no SANDBOX_TOKEN means no HMAC check at all.
+  const token = getSandboxToken();
+  if (token !== null) {
+    const verifyResult = await verifyHmac(
+      req.method,
+      path,
+      body,
+      req.headers.get(SIGNATURE_HEADER),
+      req.headers.get(TIMESTAMP_HEADER),
+      token,
+    );
+    if (!verifyResult.ok) {
+      // Log the discriminator server-side; surface only "unauthorized"
+      // so an attacker can't probe the failure mode.
+      console.warn(`[sandbox_http.EP1] unauthorized (${verifyResult.reason})`);
+      return jsonResponse({ error: 'unauthorized' }, 401);
+    }
+  }
+
+  let parsed: unknown;
+  try {
+    parsed = JSON.parse(body);
+  } catch (err) {
+    return jsonResponse({ error: 'bad_request', message: String(err) }, 400);
+  }
+  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
+    return jsonResponse(
+      { error: 'bad_request', message: 'body must be an object' },
+      400,
+    );
+  }
+  // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- shape-checked above
+  const b = parsed as Record<string, unknown>;
+  if (typeof b.executionId !== 'string' || b.executionId.length === 0) {
+    return jsonResponse(
+      { error: 'bad_request', message: 'executionId required' },
+      400,
+    );
+  }
+  if (
+    typeof b.count !== 'number' ||
+    !Number.isInteger(b.count) || // whole slots only; also rejects NaN/±Infinity
+    b.count <= 0 ||
+    b.count > 16
+  ) {
+    return jsonResponse(
+      { error: 'bad_request', message: 'count must be 1..16' },
+      400,
+    );
+  }
+
+  const executionId = toId<'sandboxExecutions'>(b.executionId);
+  const { granted, remaining } = await ctx.runMutation(
+    internal.sandbox.internal_mutations.applyConsumeUrlQuota,
+    { executionId, count: b.count },
+  );
+  if (granted === 0) {
+    return jsonResponse(
+      { code: 'QUOTA_EXCEEDED', remainingQuota: remaining },
+      412,
+    );
+  }
+  const urls: string[] = [];
+  for (let i = 0; i < granted; i += 1) {
+    const raw = await ctx.storage.generateUploadUrl();
+    urls.push(toSandboxStorageUrl(raw));
+  }
+  return jsonResponse({ urls, remainingQuota: remaining }, 200);
+});
+
+/**
+ * EP2: incremental storageId report-back.
+ *
+ * After each successful presigned-URL upload the spawner POSTs the blob's
+ * details here and they are forwarded to `applyRecordUploaded`, so the
+ * audit row's `uploadedStorageIds` rollback set tracks the live blob set
+ * before the SSE result event finalizes the run. Without this, a spawner
+ * crash mid-harvest would orphan the already-uploaded blobs (see plan §3).
+ *
+ * Replies 200 `{ok: true}` on success, 401 `{error: "unauthorized"}` when
+ * HMAC verification fails, and 400 `{error: "bad_request", message}` for
+ * an unparsable or incomplete body.
+ */
+export const recordUploadedAction = httpAction(async (ctx, req) => {
+  // Shorthand for the 400 `bad_request` reply every validation step uses.
+  const badRequest = (message: string): Response =>
+    jsonResponse({ error: 'bad_request', message }, 400);
+
+  const requestPath = parsePathFromUrl(req.url);
+  const rawBody = await readBody(req);
+
+  // The signature is verified only when a sandbox token is configured.
+  const sandboxToken = getSandboxToken();
+  if (sandboxToken !== null) {
+    const auth = await verifyHmac(
+      req.method,
+      requestPath,
+      rawBody,
+      req.headers.get(SIGNATURE_HEADER),
+      req.headers.get(TIMESTAMP_HEADER),
+      sandboxToken,
+    );
+    if (!auth.ok) {
+      // The exact failure reason is logged here; the caller only sees 401.
+      console.warn(`[sandbox_http.EP2] unauthorized (${auth.reason})`);
+      return jsonResponse({ error: 'unauthorized' }, 401);
+    }
+  }
+
+  // Parse only after the auth step, then require a non-array JSON object.
+  let payload: unknown;
+  try {
+    payload = JSON.parse(rawBody);
+  } catch (err) {
+    return badRequest(String(err));
+  }
+  const isJsonObject =
+    payload !== null && typeof payload === 'object' && !Array.isArray(payload);
+  if (!isJsonObject) {
+    return badRequest('body must be an object');
+  }
+  // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- shape-checked above
+  const fields = payload as Record<string, unknown>;
+  const { executionId, fileName, storageId, size, contentType } = fields;
+
+  // Field checks run in this fixed order; the first failure wins.
+  if (typeof executionId !== 'string' || executionId.length === 0) {
+    return badRequest('executionId required');
+  }
+  if (typeof fileName !== 'string' || fileName.length === 0) {
+    return badRequest('fileName required');
+  }
+  if (typeof storageId !== 'string' || storageId.length === 0) {
+    return badRequest('storageId required');
+  }
+  // `size` must be a finite, non-negative number.
+  if (typeof size !== 'number' || !Number.isFinite(size) || size < 0) {
+    return badRequest('size required');
+  }
+  // An empty `contentType` string is accepted; only the type is checked.
+  if (typeof contentType !== 'string') {
+    return badRequest('contentType required');
+  }
+
+  // All fields validated — forward the report to the mutation.
+  await ctx.runMutation(
+    internal.sandbox.internal_mutations.applyRecordUploaded,
+    {
+      executionId: toId<'sandboxExecutions'>(executionId),
+      fileName,
+      storageId: toId<'_storage'>(storageId),
+      size,
+      contentType,
+    },
+  );
+  return jsonResponse({ ok: true }, 200);
+});
diff --git a/services/platform/convex/sandbox/schema.ts b/services/platform/convex/sandbox/schema.ts
new file mode 100644
index 000000000..a30aa0af7
--- /dev/null
+++ b/services/platform/convex/sandbox/schema.ts
@@ -0,0 +1,231 @@
+import { defineTable } from 'convex/server';
+import { v } from 'convex/values';
+
+import {
+  sandboxErrorCodeValidator,
+  sandboxLanguageValidator,
+  sandboxOutputFileValidator,
+  sandboxRunStatusValidator,
+  sandboxStepResultValidator,
+  sandboxTruncatedValidator,
+} from './wire';
+
+/**
+ * Audit row for one `artifact_run` invocation (one tool call → one row,
+ * append-only).
+ *
+ * Lifecycle (validator union = `sandboxRunStatusValidator`):
+ *   queued     — inserted atomically inside reserveSlotAndInsert (concurrent
+ *                cap + daily CPU budget both checked in the same mutation).
+ *   installing — pip / npm install is fetching dependencies; this is a real
+ *                phase the spawner emits an SSE event for. The audit row
+ *                stays in `installing` for the entire spawner round-trip;
+ *                the artifact row mirrors a finer `installing → running`
+ *                progression for the canvas UI, but the audit row only
+ *                tracks the coarse `installing → terminal` transition.
+ *   completed  — exitCode === 0 and the file harvest succeeded.
+ *   failed     — any non-success outcome; `errorCode` carries the cause.
+ *   cancelled  — client aborted via /v1/cancel or LLM-side abort signal.
+ *
+ * The schema validator still accepts `running` as a historical literal so
+ * legacy rows from earlier deploys read cleanly; new writes never use it.
+ *
+ * The watchdog (see `internal_mutations.ts:recoverStuckSandboxes`) sweeps
+ * `queued`, `installing`, AND any legacy `running` rows past
+ * `SANDBOX_WATCHDOG_CUTOFF_MS` so a throw between `reserveSlotAndInsert`
+ * and any subsequent patch cannot leak a quota slot forever. When the
+ * watchdog reaps a row that's bound to a runnable artifact (artifactId
+ * non-null), it cascades the failure to the artifact row so the canvas
+ * spinner terminates immediately.
+ *
+ * Indexes:
+ *   by_organizationId_and_status — quota counting (reserveSlot scan)
+ *   by_organizationId            — daily CPU sum, org history, 90-day GC sweep
+ *   by_status                    — watchdog sweep across all orgs
+ *   by_artifactId                — watchdog cascade lookup
+ *   by_threadId                  — user-Stop cascade (rows on one thread)
+ *
+ * This is an audit table; user-facing soft-delete / trash UI is intentionally
+ * NOT wired up. Retention is 90 days; cleanup runs opportunistically
+ * inside `reserveSlotAndInsert` via the `cleanup:sandbox` rate limiter
+ * (1/hour/org), not via a `crons.ts` entry.
+ */
+export const sandboxExecutionsTable = defineTable({
+  organizationId: v.string(),
+  threadId: v.optional(v.string()),
+  messageId: v.optional(v.string()),
+  toolCallId: v.optional(v.string()),
+  uploadedBy: v.string(),
+  agentSlug: v.optional(v.string()),
+  // Back-link to the runnable artifact this execution belongs to. Optional
+  // because not every sandbox execution is artifact-bound (future free-form
+  // sandbox callers would leave this unset). Watchdog uses this to cascade
+  // failure to the artifact row when it reaps a stuck execution — otherwise
+  // the canvas spinner stays spinning until the audit row is GC'd.
+  artifactId: v.optional(v.id('artifacts')),
+  // For artifact-bound runs: which file path the LLM asked the sandbox to
+  // execute (`main.js`, `verify.py`, …). Lets the canvas render the
+  // latest-run-per-file panel so a verify run no longer clobbers the
+  // generator's output chip. Optional for back-compat with rows written
+  // before the column existed.
+  path: v.optional(v.string()),
+
+  language: sandboxLanguageValidator,
+  purpose: v.optional(v.string()),
+
+  // Preview kept inline so the chat-pane card can render without an extra
+  // round-trip; full code persists in `_storage` when over ~8 KB.
+  codePreview: v.string(),
+  codeStorageId: v.optional(v.id('_storage')),
+  packages: v.array(v.string()),
+  // @deprecated post R2-B4: install options are no longer caller-controlled;
+  // the action hardcodes `{allowSdist: false, allowInstallScripts: false}`
+  // before invoking the spawner. Field retained for read-validation on legacy
+  // rows; new writes never set it to anything else.
+  installOptions: v.optional(
+    v.object({
+      allowSdist: v.optional(v.boolean()),
+      allowInstallScripts: v.optional(v.boolean()),
+    }),
+  ),
+
+  status: sandboxRunStatusValidator,
+  // Every status patch must update `statusChangedAt`. The watchdog reads
+  // `now - heartbeatAt` instead (not statusChangedAt), so a long-running
+  // but healthy job isn't reaped.
+  statusChangedAt: v.number(),
+  heartbeatAt: v.number(),
+
+  // For daily CPU-second budget enforcement we pre-debit with this
+  // estimate at reservation time; finalize replaces it with actualSeconds.
+  estimatedSeconds: v.number(),
+  actualSeconds: v.optional(v.number()),
+
+  exitCode: v.optional(v.number()),
+  durationMs: v.optional(v.number()),
+
+  stdoutPreview: v.optional(v.string()), // ≤16 KB
+  stderrPreview: v.optional(v.string()), // ≤16 KB
+  stdoutStorageId: v.optional(v.id('_storage')),
+  stderrStorageId: v.optional(v.id('_storage')),
+
+  outputFiles: v.array(sandboxOutputFileValidator),
+  // Spawner reports per-call caps were hit; the tool result mirrors these
+  // so the LLM can react ("re-run with smaller scope").
+  truncated: v.optional(sandboxTruncatedValidator),
+
+  // Populated only for multi-step runs (`artifact_run({steps:[...]})`),
+  // one entry per requested step in submission order. Single-step runs
+  // leave this undefined — the existing `path` / `exitCode` columns
+  // already carry the outcome. Optional per the
+  // [feedback_deprecate_dont_delete_schema_fields] rule so existing rows
+  // read cleanly through the validator after schema deploy.
+  steps: v.optional(v.array(sandboxStepResultValidator)),
+
+  // -----------------------------------------------------------------
+  // Presigned-URL upload telemetry (sandbox-wobbly-origami plan §5).
+  // All optional + sparse — old audit rows read cleanly through the
+  // validator. New writes from the rewritten `internal_actions.ts`
+  // populate these fields.
+  // -----------------------------------------------------------------
+  /**
+   * Pre-allocated upload-slot URLs handed to the spawner at request time.
+   * Plain strings (URLs already contain the 1h Convex upload token), kept
+   * for forensic grep when investigating partial-upload failures.
+   */
+  outputUploadSlots: v.optional(v.array(v.string())),
+  /**
+   * Server-side per-run quota counter for incremental URL allocation.
+   * Initialized to `MAX_OUTPUT_FILES_PER_RUN - <pre-alloc N>`; decremented
+   * by `applyConsumeUrlQuota`. Reaches 0 → EP1 returns 412 and the spawner
+   * stops trying to harvest more files.
+   */
+  outputUrlQuotaRemaining: v.optional(v.number()),
+  /**
+   * Storage ids reported back by the spawner via EP2 after a successful
+   * upload. Used as the rollback set in `failExecution` — anything in this
+   * list gets `ctx.storage.delete()` if the run fails. Watchdog also reads
+   * this on stuck-row reap.
+   */
+  uploadedStorageIds: v.optional(v.array(v.id('_storage'))),
+  /**
+   * Spawner-side upload outcomes (per-file). Populated by the harvest
+   * pipeline; surfaced through the audit row so a partial-upload run is
+   * forensically debuggable without trawling SSE event logs.
+   */
+  uploadStats: v.optional(
+    v.object({
+      attempted: v.number(),
+      succeeded: v.number(),
+      failures: v.array(
+        v.object({
+          slotIndex: v.number(),
+          fileName: v.string(),
+          httpStatus: v.number(),
+          errorSnippet: v.string(),
+        }),
+      ),
+    }),
+  ),
+  /**
+   * Per-phase timing breakdown (ms) — `stageMs` covers prior-output
+   * download + file write; `executeMs` the inner docker run; `harvestMs`
+   * the post-run directory walk; `uploadMs` the bytes-out pipeline. Used
+   * to track TTL pressure against the 1h `generateUploadUrl` window.
+   */
+  timing: v.optional(
+    v.object({
+      stageMs: v.number(),
+      executeMs: v.number(),
+      harvestMs: v.number(),
+      uploadMs: v.number(),
+    }),
+  ),
+
+  startedAt: v.number(), // epoch ms
+  completedAt: v.optional(v.number()), // epoch ms; set on terminal transition
+
+  errorCode: v.optional(sandboxErrorCodeValidator),
+  errorMessage: v.optional(v.string()),
+})
+  .index('by_organizationId_and_status', ['organizationId', 'status'])
+  .index('by_organizationId', ['organizationId'])
+  .index('by_status', ['status'])
+  .index('by_artifactId', ['artifactId'])
+  // For the user-Stop cascade in `cancel_generation.ts` — locates every
+  // non-terminal execution on the cancelled thread so the action can call
+  // `spawnerCancel` on each before the SDK abort would leave them running
+  // until their own SANDBOX_MAX_TIMEOUT_MS. `threadId` is already on the
+  // row; this just lets the query be O(k) instead of org-wide scan.
+  .index('by_threadId', ['threadId']);
+
+export const SANDBOX_MAX_CONCURRENT_PER_ORG = 4;
+export const SANDBOX_DAILY_CPU_BUDGET_SECONDS = 1800; // 30 minutes
+export const SANDBOX_MAX_TIMEOUT_MS = 300_000; // 5 minutes
+export const SANDBOX_DEFAULT_TIMEOUT_MS = 30_000; // 30 seconds
+// Watchdog cutoff = execution wall-clock max + a 10-minute tail for storage
+// uploads and finalize mutations (15 minutes in total). The previous
+// `2 × max_timeout` formula only covered execution time; multi-MB output
+// blob uploads after the spawner returned could push heartbeats past the
+// cutoff and trigger a false-positive watchdog reap (audit finding R2-B6 #3).
+export const SANDBOX_WATCHDOG_CUTOFF_MS = SANDBOX_MAX_TIMEOUT_MS + 600_000;
+
+export const SANDBOX_CODE_PREVIEW_MAX = 8 * 1024; // 8 KiB
+export const SANDBOX_STDOUT_PREVIEW_MAX = 16 * 1024; // 16 KiB
+export const SANDBOX_STDERR_PREVIEW_MAX = 16 * 1024; // 16 KiB
+
+/**
+ * Maximum number of output files a single sandbox execution can publish to
+ * `_storage` via the presigned-upload pipeline. Combined cap across the
+ * pre-allocated slots AND any lazy EP1 requests. Migrated from
+ * `services/sandbox/src/config.ts` to keep the policy single-source on the
+ * Convex side (the spawner is stateless w.r.t. quotas — see plan §3).
+ */
+export const SANDBOX_MAX_OUTPUT_FILES_PER_RUN = 16;
+/**
+ * Number of upload slots pre-allocated at request dispatch time. Set so
+ * the median run (1 file) and p90 run (2 files) avoid the EP1 round-trip
+ * entirely; only the long-tail "many small outputs" path pays the lazy
+ * cost. See plan decision table § "Upload slot count".
+ */
+export const SANDBOX_OUTPUT_UPLOAD_SLOTS_PREALLOC = 2;
diff --git a/services/platform/convex/sandbox/wire.ts b/services/platform/convex/sandbox/wire.ts
new file mode 100644
index 000000000..4499b86a4
--- /dev/null
+++ b/services/platform/convex/sandbox/wire.ts
@@ -0,0 +1,366 @@
+import { v } from 'convex/values';
+
+// Type-only import of the spawner's harvest output-file shape so the
+// compile-time parity guard at the bottom of this file catches any drift
+// between the bytes the spawner emits and the shape Convex consumes.
+import type { OutputFile as SpawnerOutputFile } from '../../../sandbox/src/types';
+// Type-only imports from the spawner's wire module — purely structural,
+// nothing of this lands in the convex runtime bundle. We use these in the
+// compile-time parity assertions at the bottom of the file so a literal
+// drift on EITHER side fails CI typecheck. Audit finding R2-B3 caught
+// that the docstring claimed this guard existed when it didn't.
+import type {
+  sandboxErrorCodeLiterals as SpawnerErrorCodes,
+  sandboxLanguageLiterals as SpawnerLanguages,
+  sandboxPhaseEventLiterals as SpawnerPhases,
+  sandboxSseEventLiterals as SpawnerSseEvents,
+  sandboxStepStatusLiterals as SpawnerStepStatuses,
+} from '../../../sandbox/src/wire';
+
+/**
+ * Single source of truth for the sandbox runtime's wire protocol on the
+ * Convex side. Both the audit row (`sandboxExecutions`) and the artifact
+ * runnable run-state (`artifacts.run*` fields) build their validators from
+ * the exports here — adding or removing a code never requires touching
+ * multiple schema files. In this file each `v.union(...)` validator repeats
+ * its literal array by hand, so update both together. The spawner-side
+ * mirror lives at `services/sandbox/src/wire.ts`; the `Equal<>` checks at
+ * the bottom of this file keep the mirrored literal sets from drifting.
+ * Pattern mirrors `services/platform/convex/tts/error_codes.ts`.
+ */
+
+export const sandboxRunStatusLiterals = [
+  'queued',
+  // `installing` is set while pip / npm install is fetching deps. The audit
+  // row stays in `queued` until the spawner reports a phase event; the
+  // artifact row mirrors `installing` so the canvas can distinguish
+  // "waiting for slot" from "downloading torch". The audit-row lifecycle is
+  // queued → installing → terminal — `running` is never persisted there;
+  // see the comment on `setRunning` in `internal_mutations.ts`. The `running`
+  // literal is retained for read-validation of legacy rows and for the
+  // artifact-side `runStatus` field (which DOES use `running` to drive the
+  // canvas spinner). Watchdog reaps queued, installing, and running.
+  'installing',
+  'running',
+  'completed',
+  'failed',
+  'cancelled',
+] as const;
+
+export type SandboxRunStatus = (typeof sandboxRunStatusLiterals)[number];
+
+export const sandboxRunStatusValidator = v.union(
+  v.literal('queued'),
+  v.literal('installing'),
+  // 'running' retained for legacy audit rows pre-refactor and for the
+  // artifact `runStatus` field; new audit-row writes emit 'installing' only.
+  v.literal('running'),
+  v.literal('completed'),
+  v.literal('failed'),
+  v.literal('cancelled'),
+);
+
+export const sandboxTerminalStatuses: ReadonlySet<SandboxRunStatus> = new Set([
+  'completed',
+  'failed',
+  'cancelled',
+]);
+
+export const sandboxErrorCodeLiterals = [
+  'TIMEOUT',
+  'OOM',
+  'EGRESS_DENIED',
+  'INSTALL_FAILED',
+  'PACKAGE_NOT_FOUND',
+  'QUOTA_EXCEEDED',
+  'RUNTIME_ERROR',
+  'SPAWNER_UNAVAILABLE',
+  'CANCELLED',
+  // The action validated the input but rejected it (file missing,
+  // not in the requested thread, IDOR check failed). Distinct from
+  // SPAWNER_UNAVAILABLE so the agent's recovery hint is "fix the args",
+  // not "retry the transient infra".
+  'INPUT_REJECTED',
+  // Output-pipeline error codes (sandbox-wobbly-origami plan §5). Split out
+  // of the legacy catch-all `HARVEST_FAILED` so the LLM-side recovery hint
+  // can be specific. See artifact_run_tool.ts for the per-code recovery
+  // table; the spawner-side mirror is in services/sandbox/src/wire.ts.
+  'HARVEST_READ_FAILED',
+  'UPLOAD_FAILED',
+  'UPLOAD_QUOTA_EXCEEDED',
+  'UPLOAD_REPORT_FAILED',
+  // Pre-stage attestation failure: the spawner reported `priorStage.skipped`
+  // entries for files the platform expected to inject into
+  // `/workspace/output/` before user code ran. Abort BEFORE the container
+  // starts so the LLM cannot run against a corrupted workspace. The
+  // `errorMessage` payload carries a JSON `{skipped: [{name, reason}], ...}`
+  // breakdown so the LLM can decide whether to retry with
+  // `inputs.from_run: <runId>` or surface the issue.
+  'PRE_STAGE_FAILED',
+  // Output-pipeline completeness gate: `uploadStats.failures` came back
+  // non-empty (either an upload POST or the EP2 record-uploaded callback
+  // dropped). The bytes that made it to `_storage` are cleaned via the
+  // existing `uploadedStorageIds[]` rollback; the run is failed so the
+  // LLM doesn't trust a partial workspace state. Distinct from the
+  // per-failure codes above because this is the action-side decision
+  // that "any failure → fatal", not a single transport-layer cause.
+  'UPLOAD_INCOMPLETE',
+] as const;
+
+export type SandboxErrorCode = (typeof sandboxErrorCodeLiterals)[number];
+
+export const sandboxErrorCodeValidator = v.union(
+  v.literal('TIMEOUT'),
+  v.literal('OOM'),
+  v.literal('EGRESS_DENIED'),
+  v.literal('INSTALL_FAILED'),
+  v.literal('PACKAGE_NOT_FOUND'),
+  v.literal('QUOTA_EXCEEDED'),
+  v.literal('RUNTIME_ERROR'),
+  v.literal('SPAWNER_UNAVAILABLE'),
+  v.literal('CANCELLED'),
+  v.literal('INPUT_REJECTED'),
+  v.literal('HARVEST_READ_FAILED'),
+  v.literal('UPLOAD_FAILED'),
+  v.literal('UPLOAD_QUOTA_EXCEEDED'),
+  v.literal('UPLOAD_REPORT_FAILED'),
+  v.literal('PRE_STAGE_FAILED'),
+  v.literal('UPLOAD_INCOMPLETE'),
+);
+
+/**
+ * Wire-level phase events (`sandboxPhaseEventLiterals`, declared after the SSE
+ * event list below) emitted on the spawner SSE stream. The Convex action maps
+ * them to `runStatus` / `runPhase` patches on the artifact row. `preparing`
+ * corresponds to docker-pull / workspace setup; `installing` to dependency
+ * install; `running` to user-code execution; `completed` to terminal
+ * (success or failure — the result body carries the outcome).
+ */
+/**
+ * SSE event-type vocabulary emitted by the spawner's `POST /v1/execute`.
+ * Mirror of `services/sandbox/src/wire.ts:sandboxSseEventLiterals`. The
+ * compile-time `Equal<>` parity check below catches drift in either
+ * direction. Adding a new event type requires updating both wire files
+ * AND the `spawner_client.ts` SSE-parser switch (the parser is the actual
+ * consumer; this constant is the documentation contract).
+ */
+export const sandboxSseEventLiterals = [
+  'phase',
+  'stdout',
+  'stderr',
+  'result',
+  'error',
+] as const;
+
+export type SandboxSseEvent = (typeof sandboxSseEventLiterals)[number];
+
+export const sandboxPhaseEventLiterals = [
+  'preparing',
+  'installing',
+  'running',
+  'completed',
+] as const;
+
+export type SandboxPhaseEvent = (typeof sandboxPhaseEventLiterals)[number];
+
+/**
+ * Structured progress payload persisted on the artifact row alongside the
+ * phase. Replaces the legacy `runProgress` string field — keys come from
+ * a stable enum and locale-specific text is composed in the UI via the
+ * `chat.runnable.progress.*` message keys, so the server never writes
+ * English literals that the UI cannot translate.
+ */
+export const sandboxRunProgressLiterals = [
+  'queued',
+  'preparing',
+  'installingPackage',
+  'installing',
+  'running',
+] as const;
+
+export type SandboxRunProgressKind =
+  (typeof sandboxRunProgressLiterals)[number];
+
+export const sandboxRunProgressValidator = v.object({
+  kind: v.union(
+    v.literal('queued'),
+    v.literal('preparing'),
+    v.literal('installingPackage'),
+    v.literal('installing'),
+    v.literal('running'),
+  ),
+  // Populated only for `installingPackage` — `{ package: 'python-pptx',
+  // version: '1.0.2' }`. Empty / omitted for the other kinds.
+  package: v.optional(v.string()),
+  version: v.optional(v.string()),
+});
+
+/**
+ * Output-file shape used by both `sandboxExecutions.outputFiles` (audit
+ * row, no denormalized storageId) and `artifacts.runOutputFiles` (canvas
+ * fast-path, denormalized storageId). `storageId` is optional so the same
+ * validator covers both call sites; callers that need it must check.
+ */
+export const sandboxOutputFileValidator = v.object({
+  name: v.string(),
+  size: v.number(),
+  contentType: v.string(),
+  fileMetadataId: v.id('fileMetadata'),
+  storageId: v.optional(v.id('_storage')),
+  // Optional so historical rows (and the audit-row projection that doesn't
+  // need it) continue to validate. New harvests always populate sha256 —
+  // it's set by the spawner during `harvestOutputDir` and used for the
+  // cumulative manifest (artifactOutputs) + pre-stage attestation.
+  sha256: v.optional(v.string()),
+});
+
+export interface SandboxOutputFile {
+  name: string;
+  size: number;
+  contentType: string;
+  fileMetadataId: string;
+  storageId?: string;
+  sha256?: string;
+}
+
+/**
+ * Spawner-emitted harvest output-file shape. Always populated by the
+ * spawner's `harvestOutputDir`; `storageId` and `sha256` are required here
+ * because the spawner has just uploaded the bytes and computed the hash.
+ * Convex transforms this into {@link SandboxOutputFile} when persisting to
+ * the audit row (allocates `fileMetadataId`; `storageId` / `sha256` flow
+ * through verbatim).
+ *
+ * The compile-time parity guard at the bottom of this file keeps this
+ * mutually assignable with `services/sandbox/src/types.ts:OutputFile`: a
+ * required field added, removed, or retyped on either side fails the
+ * typecheck. A newly added OPTIONAL field is not caught — review by hand.
+ */
+export interface HarvestOutputFile {
+  name: string;
+  storageId: string;
+  size: number;
+  contentType: string;
+  sha256: string;
+}
+
+export const sandboxTruncatedValidator = v.object({
+  stdout: v.boolean(),
+  stderr: v.boolean(),
+  files: v.number(),
+});
+
+export const sandboxLanguageLiterals = ['python', 'node', 'polyglot'] as const;
+export type SandboxLanguage = (typeof sandboxLanguageLiterals)[number];
+
+export const sandboxLanguageValidator = v.union(
+  v.literal('python'),
+  v.literal('node'),
+  // Polyglot mode: per-step interpreter is chosen by file extension
+  // (.py → python3, .js/.cjs/.mjs → node). Packages are split into
+  // python/node buckets via `packagesByLang` on the wire.
+  v.literal('polyglot'),
+);
+
+/**
+ * Per-step outcome populated only for multi-step runs (where
+ * `artifact_run` was invoked with `steps: [{path}]`). One row per
+ * requested step, in the requested order. `status` is:
+ *   `completed` — exit 0
+ *   `failed`    — exit ≠ 0; the wrapper aborts subsequent steps
+ *   `skipped`   — a prior step failed or the wrapper never reached this one
+ *
+ * `exitCode` is `null` for `skipped` (no process was started).
+ */
+export const sandboxStepStatusLiterals = [
+  'completed',
+  'failed',
+  'skipped',
+] as const;
+
+export type SandboxStepStatus = (typeof sandboxStepStatusLiterals)[number];
+
+export const sandboxStepStatusValidator = v.union(
+  v.literal('completed'),
+  v.literal('failed'),
+  v.literal('skipped'),
+);
+
+export const sandboxStepResultValidator = v.object({
+  path: v.string(),
+  status: sandboxStepStatusValidator,
+  exitCode: v.union(v.number(), v.null()),
+  durationMs: v.number(),
+});
+
+export type SandboxStepResult = {
+  path: string;
+  status: SandboxStepStatus;
+  exitCode: number | null;
+  durationMs: number;
+};
+
+// ---------------------------------------------------------------------------
+// Spawner ↔ Convex literal parity (audit finding R2-B3)
+// ---------------------------------------------------------------------------
+// Compile-time double-extension checks: each literal-set on this side
+// must be both a superset AND a subset of the spawner-side set (i.e.
+// equal). Adding a literal on only one side fails CI typecheck with a
+// clear error pointing at the assigning line, before the divergence
+// ever ships. Purely type-level — no runtime cost.
+//
+// `Equal<ConvexSide, SpawnerSide>` returns `true` iff the two unions
+// match. If the spawner has an extra literal, ConvexSide ⊊ SpawnerSide
+// breaks the second clause. If Convex has an extra, the first clause
+// breaks. The error object is a fake type whose key surfaces a
+// readable diagnostic next to the failing literal-array name.
+type Equal<A, B> = [A] extends [B]
+  ? [B] extends [A]
+    ? true
+    : {
+        __wireDrift: 'Spawner has literal(s) missing from Convex side — add them here too';
+      }
+  : {
+      __wireDrift: 'Convex has literal(s) missing from spawner side — add them in services/sandbox/src/wire.ts';
+    };
+
+// eslint-disable-next-line @typescript-eslint/no-unused-vars
+const _errorCodeParity: Equal<
+  (typeof sandboxErrorCodeLiterals)[number],
+  (typeof SpawnerErrorCodes)[number]
+> = true;
+
+// eslint-disable-next-line @typescript-eslint/no-unused-vars
+const _phaseEventParity: Equal<
+  (typeof sandboxPhaseEventLiterals)[number],
+  (typeof SpawnerPhases)[number]
+> = true;
+
+// eslint-disable-next-line @typescript-eslint/no-unused-vars
+const _languageParity: Equal<
+  (typeof sandboxLanguageLiterals)[number],
+  (typeof SpawnerLanguages)[number]
+> = true;
+
+// eslint-disable-next-line @typescript-eslint/no-unused-vars
+const _stepStatusParity: Equal<
+  (typeof sandboxStepStatusLiterals)[number],
+  (typeof SpawnerStepStatuses)[number]
+> = true;
+
+// eslint-disable-next-line @typescript-eslint/no-unused-vars
+const _sseEventParity: Equal<
+  (typeof sandboxSseEventLiterals)[number],
+  (typeof SpawnerSseEvents)[number]
+> = true;
+
+// Harvest output-file shape parity. Both sides declare:
+//   { name, storageId, size, contentType, sha256 }
+// — all required, all primitive strings/numbers. `Equal<>` tests mutual
+// assignability: a required field added/removed on the spawner's
+// `OutputFile` fails here, but an added OPTIONAL field still passes and
+// needs manual review. (The audit-row `sandboxOutputFileValidator` keeps
+// storageId/sha256 optional indefinitely so legacy rows pass — see plan §A.)
+// eslint-disable-next-line @typescript-eslint/no-unused-vars
+const _harvestOutputFileParity: Equal<HarvestOutputFile, SpawnerOutputFile> =
+  true;
diff --git a/services/platform/convex/schema.ts b/services/platform/convex/schema.ts
index 2a9877d92..080c426f7 100644
--- a/services/platform/convex/schema.ts
+++ b/services/platform/convex/schema.ts
@@ -10,7 +10,14 @@ import {
   agentWebhookUserThreadsTable,
 } from './agents/webhooks/schema';
 import { approvalsTable } from './approvals/schema';
-import { artifactRevisionsTable, artifactsTable } from './artifacts/schema';
+import {
+  artifactFilesTable,
+  artifactOutputsTable,
+  artifactRevisionsTable,
+  artifactRunFilesTable,
+  artifactRunsTable,
+  artifactsTable,
+} from './artifacts/schema';
 import { auditLogChainGenesisTable, auditLogsTable } from './audit_logs/schema';
 import {
   brandingBindingsTable,
@@ -54,6 +61,7 @@ import { notificationsTable } from './notifications/schema';
 import { onedriveSyncConfigsTable } from './onedrive/schema';
 import { productsTable } from './products/schema';
 import { promptCategoriesTable, promptTemplatesTable } from './prompts/schema';
+import { sandboxExecutionsTable } from './sandbox/schema';
 import { ssoProvidersTable } from './sso_providers/schema';
 import { messageMetadataTable } from './streaming/schema';
 import { threadTodosTable } from './thread_todos/schema';
@@ -89,7 +97,11 @@ import {
 
 export default defineSchema({
   approvals: approvalsTable,
+  artifactFiles: artifactFilesTable,
+  artifactOutputs: artifactOutputsTable,
   artifactRevisions: artifactRevisionsTable,
+  artifactRunFiles: artifactRunFilesTable,
+  artifactRuns: artifactRunsTable,
   artifacts: artifactsTable,
   auditLogs: auditLogsTable,
   auditLogChainGenesis: auditLogChainGenesisTable,
@@ -152,6 +164,7 @@ export default defineSchema({
   products: productsTable,
   ssoProviders: ssoProvidersTable,
   vendors: vendorsTable,
+  sandboxExecutions: sandboxExecutionsTable,
   videoLinkJobs: videoLinkJobsTable,
   websites: websitesTable,
   wfApiKeys: wfApiKeysTable,
diff --git a/services/platform/convex/threads/cancel_generation.test.ts b/services/platform/convex/threads/cancel_generation.test.ts
index 21f28b195..c9b987477 100644
--- a/services/platform/convex/threads/cancel_generation.test.ts
+++ b/services/platform/convex/threads/cancel_generation.test.ts
@@ -1,4 +1,4 @@
-import { describe, it, expect, vi, beforeEach } from 'vitest';
+import { beforeEach, describe, expect, it, vi } from 'vitest';
 
 import type { MutationCtx } from '../_generated/server';
 
@@ -67,12 +67,10 @@ describe('cancelGeneration — happy path', () => {
 
     await cancelGeneration(ctx as unknown as MutationCtx, 'user_1', 'thread_1');
 
-    // Thread lookup
     expect(ctx.runQuery).toHaveBeenCalledWith('mock-getThread', {
       threadId: 'thread_1',
     });
 
-    // Should list active streams
     expect(mockListStreams).toHaveBeenCalledWith(
       ctx,
       expect.anything(),
@@ -82,7 +80,6 @@ describe('cancelGeneration — happy path', () => {
       }),
     );
 
-    // Should abort both streams
     expect(mockAbortStream).toHaveBeenCalledTimes(2);
     expect(mockAbortStream).toHaveBeenCalledWith(
       ctx,
@@ -102,7 +99,7 @@ describe('cancelGeneration — happy path', () => {
     );
   });
 
-  it('marks assistant message as success with displayedContent (ChatGPT-style)', async () => {
+  it('truncates string content to displayedLength (ChatGPT-style)', async () => {
     const ctx = createMockCtx({ userId: 'user_1', status: 'active' });
     mockListMessages.mockResolvedValue({
       page: [
@@ -118,29 +115,72 @@ describe('cancelGeneration — happy path', () => {
       ctx as unknown as MutationCtx,
       'user_1',
       'thread_1',
-      'Full long',
+      9,
     );
 
     expect(ctx.runMutation).toHaveBeenCalledWith('mock-updateMessage', {
       messageId: 'msg_1',
       patch: {
+        status: 'success',
+        message: { role: 'assistant', content: 'Full long' },
+      },
+    });
+  });
+
+  it('truncates array content while preserving non-text parts', async () => {
+    const ctx = createMockCtx({ userId: 'user_1', status: 'active' });
+    const filePart = {
+      type: 'file',
+      data: 'data:image/png;base64,xxx',
+      mediaType: 'image/png',
+    };
+    const reasoningPart = { type: 'reasoning', text: 'thinking' };
+    mockListMessages.mockResolvedValue({
+      page: [
+        {
+          _id: 'msg_1',
+          message: {
+            role: 'assistant',
+            content: [
+              filePart,
+              { type: 'text', text: 'Here is the image you asked for' },
+              reasoningPart,
+            ],
+          },
+          text: 'Here is the image you asked for',
+        },
+      ],
+    });
+
+    await cancelGeneration(
+      ctx as unknown as MutationCtx,
+      'user_1',
+      'thread_1',
+      7,
+    );
+
+    expect(ctx.runMutation).toHaveBeenCalledWith('mock-updateMessage', {
+      messageId: 'msg_1',
+      patch: {
+        status: 'success',
         message: {
           role: 'assistant',
-          content: 'Full long',
+          content: [filePart, { type: 'text', text: 'Here is' }, reasoningPart],
         },
-        status: 'success',
       },
     });
   });
 
-  it('sets status to failed when displayedContent is null (no content shown)', async () => {
+  it('keeps streamed content when displayedLength is null but text was persisted', async () => {
+    // Snapshot raced (refs unregistered, e.g. mid-remount). Don't vaporise
+    // already-streamed deltas — preserve them.
     const ctx = createMockCtx({ userId: 'user_1', status: 'active' });
     mockListMessages.mockResolvedValue({
       page: [
         {
           _id: 'msg_1',
-          message: { role: 'assistant', content: 'Some response' },
-          text: 'Some response',
+          message: { role: 'assistant', content: 'Some streamed reply' },
+          text: 'Some streamed reply',
         },
       ],
     });
@@ -152,14 +192,13 @@ describe('cancelGeneration — happy path', () => {
       null,
     );
 
-    expect(mockListMessages).toHaveBeenCalled();
     expect(ctx.runMutation).toHaveBeenCalledWith('mock-updateMessage', {
       messageId: 'msg_1',
-      patch: { status: 'failed' },
+      patch: { status: 'success' },
     });
   });
 
-  it('sets status to failed when displayedContent is undefined', async () => {
+  it('keeps streamed content when displayedLength is undefined but text was persisted', async () => {
     const ctx = createMockCtx({ userId: 'user_1', status: 'active' });
     mockListMessages.mockResolvedValue({
       page: [
@@ -178,7 +217,31 @@ describe('cancelGeneration — happy path', () => {
       undefined,
     );
 
-    expect(mockListMessages).toHaveBeenCalled();
+    expect(ctx.runMutation).toHaveBeenCalledWith('mock-updateMessage', {
+      messageId: 'msg_1',
+      patch: { status: 'success' },
+    });
+  });
+
+  it('marks failed when no displayedLength AND no streamed text (true early cancel)', async () => {
+    const ctx = createMockCtx({ userId: 'user_1', status: 'active' });
+    mockListMessages.mockResolvedValue({
+      page: [
+        {
+          _id: 'msg_1',
+          message: { role: 'assistant', content: '' },
+          text: '',
+        },
+      ],
+    });
+
+    await cancelGeneration(
+      ctx as unknown as MutationCtx,
+      'user_1',
+      'thread_1',
+      null,
+    );
+
     expect(ctx.runMutation).toHaveBeenCalledWith('mock-updateMessage', {
       messageId: 'msg_1',
       patch: { status: 'failed' },
@@ -222,10 +285,9 @@ describe('cancelGeneration — happy path', () => {
       ctx as unknown as MutationCtx,
       'user_1',
       'thread_1',
-      'Latest',
+      6,
     );
 
-    // Should update the FIRST assistant message found (latest in page order)
     expect(ctx.runMutation).toHaveBeenCalledWith(
       'mock-updateMessage',
       expect.objectContaining({ messageId: 'msg_3' }),
@@ -282,7 +344,7 @@ describe('cancelGeneration — edge cases', () => {
     ).rejects.toThrow('Thread not found');
   });
 
-  it('marks as failed with empty string displayedContent (no visible text)', async () => {
+  it('treats displayedLength=0 as no snapshot (preserve streamed text if any)', async () => {
     const ctx = createMockCtx({ userId: 'user_1', status: 'active' });
     mockListMessages.mockResolvedValue({
       page: [
@@ -294,17 +356,16 @@ describe('cancelGeneration — edge cases', () => {
       ],
     });
 
-    // Empty string has no trim content — treated as no displayed content
     await cancelGeneration(
       ctx as unknown as MutationCtx,
       'user_1',
       'thread_1',
-      '',
+      0,
     );
 
     expect(ctx.runMutation).toHaveBeenCalledWith('mock-updateMessage', {
       messageId: 'msg_1',
-      patch: { status: 'failed' },
+      patch: { status: 'success' },
     });
   });
 
@@ -324,12 +385,10 @@ describe('cancelGeneration — edge cases', () => {
       ctx as unknown as MutationCtx,
       'user_1',
       'thread_1',
-      'some content',
+      12,
     );
 
-    // No updateMessage — no assistant message to update
     expect(ctx.runMutation).not.toHaveBeenCalled();
-    // No saveMessage — cancelledAt signal replaces sentinel messages
   });
 
   it('does not create message when no messages exist at all', async () => {
@@ -342,7 +401,7 @@ describe('cancelGeneration — edge cases', () => {
     expect(ctx.runMutation).not.toHaveBeenCalled();
   });
 
-  it('does not create message when no messages exist and displayedContent is null', async () => {
+  it('does not create message when no messages exist and displayedLength is null', async () => {
     const ctx = createMockCtx({ userId: 'user_1', status: 'active' });
     mockListMessages.mockResolvedValue({ page: [] });
 
@@ -356,37 +415,6 @@ describe('cancelGeneration — edge cases', () => {
     expect(ctx.runMutation).not.toHaveBeenCalled();
   });
 
-  it('finds the first assistant message even without text property', async () => {
-    const ctx = createMockCtx({ userId: 'user_1', status: 'active' });
-    mockListMessages.mockResolvedValue({
-      page: [
-        {
-          _id: 'msg_1',
-          message: { role: 'assistant', content: 'tool call result' },
-          text: undefined,
-        },
-        {
-          _id: 'msg_2',
-          message: { role: 'assistant', content: 'Visible response' },
-          text: 'Visible response',
-        },
-      ],
-    });
-
-    await cancelGeneration(
-      ctx as unknown as MutationCtx,
-      'user_1',
-      'thread_1',
-      null,
-    );
-
-    // Should find first assistant message (msg_1) and update its status
-    expect(ctx.runMutation).toHaveBeenCalledWith(
-      'mock-updateMessage',
-      expect.objectContaining({ messageId: 'msg_1' }),
-    );
-  });
-
   it('aborts a single stream', async () => {
     const ctx = createMockCtx({ userId: 'user_1', status: 'active' });
     mockListStreams.mockResolvedValue([{ streamId: 'stream_solo' }]);
@@ -404,7 +432,7 @@ describe('cancelGeneration — edge cases', () => {
     );
   });
 
-  it('handles very long displayedContent', async () => {
+  it('handles very long displayedLength', async () => {
     const ctx = createMockCtx({ userId: 'user_1', status: 'active' });
     const longContent = 'A'.repeat(50000);
     mockListMessages.mockResolvedValue({
@@ -421,30 +449,32 @@ describe('cancelGeneration — edge cases', () => {
       ctx as unknown as MutationCtx,
       'user_1',
       'thread_1',
-      longContent,
+      50000,
     );
 
     expect(ctx.runMutation).toHaveBeenCalledWith('mock-updateMessage', {
       messageId: 'msg_1',
       patch: {
-        message: { role: 'assistant', content: longContent },
         status: 'success',
+        message: { role: 'assistant', content: longContent },
       },
     });
   });
 
-  it('preserves multi-byte characters in displayedContent without corruption', async () => {
+  it('preserves multi-byte characters at the truncation boundary', async () => {
     const ctx = createMockCtx({ userId: 'user_1', status: 'active' });
-    const unicodeContent = 'Hello 🌍 世界! Here is some text with emoji 🎉🚀';
+    // Snapshot length on the client is also UTF-16; the backend's slice
+    // is symmetric so the result is whatever the client saw.
+    const fullText = 'Hello 🌍 世界! Here is some text with emoji 🎉🚀';
     mockListMessages.mockResolvedValue({
       page: [
         {
           _id: 'msg_1',
           message: {
             role: 'assistant',
-            content: unicodeContent + ' and more...',
+            content: fullText + ' and more...',
           },
-          text: unicodeContent + ' and more...',
+          text: fullText + ' and more...',
         },
       ],
     });
@@ -453,14 +483,14 @@ describe('cancelGeneration — edge cases', () => {
       ctx as unknown as MutationCtx,
       'user_1',
       'thread_1',
-      unicodeContent,
+      fullText.length,
     );
 
     expect(ctx.runMutation).toHaveBeenCalledWith('mock-updateMessage', {
       messageId: 'msg_1',
       patch: {
-        message: { role: 'assistant', content: unicodeContent },
         status: 'success',
+        message: { role: 'assistant', content: fullText },
       },
     });
   });
@@ -503,14 +533,13 @@ describe('cancelGeneration — edge cases', () => {
       ctx as unknown as MutationCtx,
       'user_1',
       'thread_1',
-      'Partial',
+      7,
     );
 
-    // Should NOT update the existing successful message
     expect(ctx.runMutation).not.toHaveBeenCalled();
   });
 
-  it('skips message creation when latest is successful and no displayedContent', async () => {
+  it('skips message creation when latest is successful and no displayedLength', async () => {
     const ctx = createMockCtx({ userId: 'user_1', status: 'active' });
     mockListMessages.mockResolvedValue({
       page: [
diff --git a/services/platform/convex/threads/cancel_generation.ts b/services/platform/convex/threads/cancel_generation.ts
index 4ba356199..1a1eddc0f 100644
--- a/services/platform/convex/threads/cancel_generation.ts
+++ b/services/platform/convex/threads/cancel_generation.ts
@@ -1,7 +1,8 @@
 import { abortStream, listMessages, listStreams } from '@convex-dev/agent';
 
-import { components } from '../_generated/api';
+import { components, internal } from '../_generated/api';
 import type { MutationCtx } from '../_generated/server';
+import { truncateAssistantContent } from './truncate_message_content';
 
 /**
  * Cancel an active AI generation for a thread.
@@ -9,14 +10,21 @@ import type { MutationCtx } from '../_generated/server';
  * 1. Validates thread ownership.
  * 2. Aborts all active (streaming) SDK streams.
  * 3. Sets cancelledAt on threadMetadata so the running action detects it.
- * 4. If displayedContent is provided, marks the latest assistant message as
- *    "success" with that content (ChatGPT-style clean stop).
+ * 4. Updates the latest assistant message:
+ *    - If `displayedLength > 0`: truncate the message content in-place to
+ *      that length, preserving every non-text part (reasoning, tool-call,
+ *      tool-result, file, source). Marks status=success — the user sees
+ *      exactly what the typewriter had revealed.
+ *    - If no displayed length but the message already has streamed text:
+ *      mark status=success without touching content (don't lose deltas).
+ *    - Otherwise (truly empty): mark status=failed → rendered as a clean
+ *      "aborted" bubble by the UI.
  */
 export async function cancelGeneration(
   ctx: MutationCtx,
   userId: string,
   threadId: string,
-  displayedContent?: string | null,
+  displayedLength?: number | null,
 ): Promise<void> {
   const thread = await ctx.runQuery(components.agent.threads.getThread, {
     threadId,
@@ -38,7 +46,7 @@ export async function cancelGeneration(
     });
   }
 
-  // Mark the latest assistant message based on displayed content
+  // Find the latest assistant message and decide how to finalise it.
   const messagesResult = await listMessages(ctx, components.agent, {
     threadId,
     paginationOpts: { numItems: 5, cursor: null },
@@ -50,17 +58,36 @@ export async function cancelGeneration(
   );
 
   if (latestAssistant && latestAssistant.status !== 'success') {
-    if (displayedContent?.trim()) {
-      // ChatGPT-style: preserve displayed content as a successful message
+    const message = latestAssistant.message;
+    const hasDisplayedLength =
+      typeof displayedLength === 'number' && displayedLength > 0;
+
+    if (hasDisplayedLength && message?.role === 'assistant') {
+      // ChatGPT-style: keep exactly what the user saw. Truncate text
+      // content to displayedLength while preserving structured parts.
+      const truncated = truncateAssistantContent(
+        message.content,
+        displayedLength,
+      );
       await ctx.runMutation(components.agent.messages.updateMessage, {
         messageId: latestAssistant._id,
         patch: {
           status: 'success',
-          message: { role: 'assistant', content: displayedContent },
+          message: { ...message, content: truncated },
         },
       });
+    } else if (latestAssistant.text?.trim()) {
+      // No displayed-length signal (snapshot raced / refs unregistered),
+      // but content was already streamed. Preserve what's persisted rather
+      // than discarding it — better to show "more than the user saw" than
+      // to vaporise their reply.
+      await ctx.runMutation(components.agent.messages.updateMessage, {
+        messageId: latestAssistant._id,
+        patch: { status: 'success' },
+      });
     } else {
-      // No content was displayed — mark as failed so frontend shows clean state
+      // Truly empty (cancel fired before any token was streamed).
+      // Mark failed so the UI renders the clean aborted bubble.
       await ctx.runMutation(components.agent.messages.updateMessage, {
         messageId: latestAssistant._id,
         patch: { status: 'failed' },
@@ -83,4 +110,44 @@ export async function cancelGeneration(
       streamId: undefined,
     });
   }
+
+  // Discard any in-flight artifact streams on this thread. Without this,
+  // a stop during `artifact_create` mid-input-delta leaves a `revision:0`
+  // placeholder row in the canvas sidebar with a streaming badge until
+  // `cleanupStaleStreams` cron sweeps it (up to ~6 min). We do this inline
+  // because the mutation just deletes/patches the artifact row — no
+  // external services involved.
+  if (threadMeta?.organizationId) {
+    try {
+      await ctx.runMutation(
+        internal.artifacts.internal_mutations.discardActiveStreamsForThread,
+        { organizationId: threadMeta.organizationId, threadId },
+      );
+    } catch (err) {
+      // Best-effort — never fail the cancel because of cleanup hiccups.
+      // The 60 s + 5 min watchdog still sweeps anything we miss here.
+      console.warn(
+        '[cancelGeneration] discardActiveStreamsForThread failed:',
+        err,
+      );
+    }
+  }
+
+  // Cascade Stop to any running sandbox executions on this thread. The action
+  // is scheduled, not run inline (only the enqueue is awaited): it calls the
+  // spawner over HTTP and must not hold up the user's Stop-acknowledged
+  // response. The mutation that finalizes each execution is terminal-state
+  // guarded so racing with `executeCode`'s own finalize is safe.
+  try {
+    await ctx.scheduler.runAfter(
+      0,
+      internal.node_only.sandbox.internal_actions.cancelExecutionsForThread,
+      { threadId },
+    );
+  } catch (err) {
+    console.warn(
+      '[cancelGeneration] scheduler.runAfter(cancelExecutionsForThread) failed:',
+      err,
+    );
+  }
 }
diff --git a/services/platform/convex/threads/create_branch_thread.ts b/services/platform/convex/threads/create_branch_thread.ts
index 7605bf3b0..da82a28b5 100644
--- a/services/platform/convex/threads/create_branch_thread.ts
+++ b/services/platform/convex/threads/create_branch_thread.ts
@@ -153,13 +153,21 @@ export const createBranchThread = internalMutation({
       // message is in scope (or 'user' edits, which carry no messageId but
       // by revision-order monotonicity must have happened between the
       // surrounding assistant edits). Stop at the first out-of-scope edit.
+      // While walking, keep the most recent in-scope file/content snapshot
+      // so we can branch at the revision the user actually forked at, not
+      // the source row's current state (which may include later edits made
+      // on the parent after the fork point).
       let snapshotRev:
         | {
             revision: number;
-            content: string;
             editedByMessageId?: string;
           }
         | undefined;
+      let snapshotFiles:
+        | ReadonlyArray<{ path: string; content: string }>
+        | undefined;
+      let snapshotEntryFile: string | undefined;
+      let snapshotContent: string | undefined;
       for await (const rev of ctx.db
         .query('artifactRevisions')
         .withIndex('by_artifact', (q) => q.eq('artifactId', source._id))
@@ -170,14 +178,27 @@ export const createBranchThread = internalMutation({
         if (!inScope) break;
         snapshotRev = {
           revision: rev.revision,
-          content: rev.content,
           editedByMessageId: rev.editedByMessageId,
         };
+        // Capture file/content state at this revision. `set_entry` rows
+        // omit `files` AND `content` (only entryFile changes) — for those
+        // we keep the previously-captured file state but update the entry
+        // pointer. `files` and legacy `content` are mutually exclusive in
+        // current writes (post-Phase A); legacy rows have only `content`.
+        if (rev.files !== undefined) {
+          snapshotFiles = rev.files;
+          if (rev.entryFile !== undefined) snapshotEntryFile = rev.entryFile;
+          // Don't carry a stale legacy `content` past a `files` revision.
+          snapshotContent = undefined;
+        } else if (rev.content !== undefined) {
+          snapshotContent = rev.content;
+          if (rev.entryFile !== undefined) snapshotEntryFile = rev.entryFile;
+        } else if (rev.entryFile !== undefined) {
+          // set_entry: only the entry pointer changed.
+          snapshotEntryFile = rev.entryFile;
+        }
       }
 
-      // Fall back to the source row when no revision rows exist (e.g.
-      // legacy data). Should not normally happen.
-      const finalContent = snapshotRev?.content ?? source.content;
       const finalRevision = snapshotRev?.revision ?? source.revision;
       const mappedLastEditedByMessageId = snapshotRev?.editedByMessageId
         ? messageIdMap.get(snapshotRev.editedByMessageId)
@@ -185,11 +206,19 @@ export const createBranchThread = internalMutation({
 
       await snapshotArtifactForBranch(ctx, {
         source,
-        snapshotContent: finalContent,
         snapshotRevision: finalRevision,
         targetThreadId: branchThreadId,
         mappedCreatedByMessageId,
-        mappedLastEditedByMessageId,
+        ...(mappedLastEditedByMessageId !== undefined && {
+          mappedLastEditedByMessageId,
+        }),
+        ...(snapshotFiles !== undefined && { revisionFiles: snapshotFiles }),
+        ...(snapshotEntryFile !== undefined && {
+          revisionEntryFile: snapshotEntryFile,
+        }),
+        ...(snapshotContent !== undefined && {
+          revisionContent: snapshotContent,
+        }),
       });
     }
 
diff --git a/services/platform/convex/threads/mutations.ts b/services/platform/convex/threads/mutations.ts
index 960c65bb2..b5b46e5a2 100644
--- a/services/platform/convex/threads/mutations.ts
+++ b/services/platform/convex/threads/mutations.ts
@@ -204,7 +204,7 @@ export const updateChatThread = mutation({
 export const cancelGeneration = mutation({
   args: {
     threadId: v.string(),
-    displayedContent: v.optional(v.union(v.string(), v.null())),
+    displayedLength: v.optional(v.union(v.number(), v.null())),
   },
   returns: v.null(),
   handler: async (ctx, args) => {
@@ -217,7 +217,7 @@ export const cancelGeneration = mutation({
       ctx,
       String(authUser._id),
       args.threadId,
-      args.displayedContent,
+      args.displayedLength,
     );
     return null;
   },
diff --git a/services/platform/convex/threads/truncate_message_content.test.ts b/services/platform/convex/threads/truncate_message_content.test.ts
new file mode 100644
index 000000000..2db21158a
--- /dev/null
+++ b/services/platform/convex/threads/truncate_message_content.test.ts
@@ -0,0 +1,126 @@
+import { describe, expect, it } from 'vitest';
+
+import {
+  type AssistantContent,
+  truncateAssistantContent,
+} from './truncate_message_content';
+
+describe('truncateAssistantContent — string content', () => {
+  it('truncates to the requested length', () => {
+    expect(truncateAssistantContent('Hello world', 5)).toBe('Hello');
+  });
+
+  it('returns the full string when length exceeds content', () => {
+    expect(truncateAssistantContent('Hi', 50)).toBe('Hi');
+  });
+
+  it('returns empty string when length is 0', () => {
+    expect(truncateAssistantContent('Hello', 0)).toBe('');
+  });
+
+  it('preserves multi-byte characters (slices by UTF-16 unit, like the snapshot)', () => {
+    // Client snapshot length is UTF-16 code units too; this cut lands before the emoji.
+    const text = 'Hi 🌍';
+    expect(truncateAssistantContent(text, 3)).toBe('Hi ');
+  });
+
+  it('throws on negative length', () => {
+    expect(() => truncateAssistantContent('Hello', -1)).toThrow(/>= 0/);
+  });
+});
+
+describe('truncateAssistantContent — array content', () => {
+  it('truncates a single text part', () => {
+    const content: AssistantContent = [{ type: 'text', text: 'Hello world' }];
+    const result = truncateAssistantContent(content, 5);
+    expect(result).toEqual([{ type: 'text', text: 'Hello' }]);
+  });
+
+  it('preserves a file part placed before a text part', () => {
+    const content: AssistantContent = [
+      {
+        type: 'file',
+        data: 'data:image/png;base64,xxx',
+        mediaType: 'image/png',
+      },
+      { type: 'text', text: 'Here is the image you asked for' },
+    ];
+    const result = truncateAssistantContent(content, 7);
+    expect(result).toEqual([
+      {
+        type: 'file',
+        data: 'data:image/png;base64,xxx',
+        mediaType: 'image/png',
+      },
+      { type: 'text', text: 'Here is' },
+    ]);
+  });
+
+  it('preserves a tool-call placed after the truncation point', () => {
+    const content: AssistantContent = [
+      { type: 'text', text: 'Let me check.' },
+      {
+        type: 'tool-call',
+        toolCallId: 't1',
+        toolName: 'search',
+        input: { query: 'x' },
+      },
+    ];
+    // The cut (7) falls inside the text part; the later tool-call must survive.
+    const result = truncateAssistantContent(content, 7);
+    expect(result).toEqual([
+      { type: 'text', text: 'Let me ' },
+      {
+        type: 'tool-call',
+        toolCallId: 't1',
+        toolName: 'search',
+        input: { query: 'x' },
+      },
+    ]);
+  });
+
+  it('drops subsequent text parts past the cumulative limit but keeps non-text parts', () => {
+    const content: AssistantContent = [
+      { type: 'text', text: 'Hello' },
+      { type: 'reasoning', text: 'thinking...' },
+      { type: 'text', text: 'world' },
+    ];
+    // Limit 5 is used up by the first text part, so the second text part is
+    // dropped; the reasoning part between them is not text and stays.
+    const result = truncateAssistantContent(content, 5);
+    expect(result).toEqual([
+      { type: 'text', text: 'Hello' },
+      { type: 'reasoning', text: 'thinking...' },
+    ]);
+  });
+
+  it('preserves all non-text parts when displayedLength is 0', () => {
+    const content: AssistantContent = [
+      {
+        type: 'file',
+        data: 'data:image/png;base64,xxx',
+        mediaType: 'image/png',
+      },
+      { type: 'text', text: 'Hello' },
+      { type: 'reasoning', text: 'thinking' },
+    ];
+    const result = truncateAssistantContent(content, 0);
+    expect(result).toEqual([
+      {
+        type: 'file',
+        data: 'data:image/png;base64,xxx',
+        mediaType: 'image/png',
+      },
+      { type: 'reasoning', text: 'thinking' },
+    ]);
+  });
+
+  it('returns the full content when displayedLength exceeds total text', () => {
+    const content: AssistantContent = [
+      { type: 'text', text: 'Hello' },
+      { type: 'text', text: 'world' },
+    ];
+    const result = truncateAssistantContent(content, 1000);
+    expect(result).toEqual(content);
+  });
+});
diff --git a/services/platform/convex/threads/truncate_message_content.ts b/services/platform/convex/threads/truncate_message_content.ts
new file mode 100644
index 000000000..27e05b878
--- /dev/null
+++ b/services/platform/convex/threads/truncate_message_content.ts
@@ -0,0 +1,94 @@
+import type { vAssistantContent } from '@convex-dev/agent/validators';
+import type { Infer } from 'convex/values';
+
+export type AssistantContent = Infer<typeof vAssistantContent>;
+type AssistantContentParts = Exclude<AssistantContent, string>;
+type AssistantContentPart = AssistantContentParts[number];
+
+/**
+ * Slice `text` to at most `limit` UTF-16 code units, backing off by one unit
+ * when the cut would land between the two halves of a surrogate pair.
+ *
+ * A plain `slice(0, limit)` there leaves a lone high surrogate at the end of
+ * the string, which is not well-formed Unicode; dropping the half-shown
+ * character keeps the persisted text valid.
+ */
+function sliceOnCharacterBoundary(text: string, limit: number): string {
+  if (limit >= text.length) return text;
+  const before = text.charCodeAt(limit - 1);
+  const after = text.charCodeAt(limit);
+  const splitsPair =
+    limit > 0 &&
+    before >= 0xd800 &&
+    before <= 0xdbff &&
+    after >= 0xdc00 &&
+    after <= 0xdfff;
+  return text.slice(0, splitsPair ? limit - 1 : limit);
+}
+
+/**
+ * Truncate an assistant message's `content` to the first `displayedLength`
+ * characters of its text, **preserving every non-text part in place**.
+ *
+ * Why this exists: the cancel-generation flow used to overwrite a message
+ * with `{ role: 'assistant', content: '<string>' }`, which collapses
+ * structured parts (reasoning, tool-call, tool-result, file, source) into
+ * a single text part — wiping any image/file/tool cards the user had
+ * already seen. This helper rebuilds `content` with all non-text parts
+ * intact and only the text parts truncated.
+ *
+ * For multiple text parts (uncommon — typically text is split by an
+ * intervening tool-call), the cumulative truncation ignores the single
+ * space `joinText()` inserts between text parts; off-by-(n-1) chars in
+ * that edge case is acceptable.
+ *
+ * Lengths are UTF-16 code units. A cut that would split a surrogate pair
+ * is moved back one unit, so the result never ends in half a character.
+ *
+ * @param content - Assistant content: a plain string or an array of parts.
+ * @param displayedLength - Number of text code units to keep (>= 0).
+ * @returns The string prefix, or a new parts array (input is not mutated).
+ * @throws Error when `displayedLength` is negative.
+ */
+export function truncateAssistantContent(
+  content: AssistantContent,
+  displayedLength: number,
+): AssistantContent {
+  if (displayedLength < 0) {
+    throw new Error(
+      `truncateAssistantContent: displayedLength must be >= 0, got ${displayedLength}`,
+    );
+  }
+
+  if (typeof content === 'string') {
+    return sliceOnCharacterBoundary(content, displayedLength);
+  }
+
+  let textConsumed = 0;
+  const out: AssistantContentPart[] = [];
+
+  for (const part of content) {
+    if (part.type !== 'text') {
+      // Structured parts (file, tool-call, reasoning, …) are always kept.
+      out.push(part);
+      continue;
+    }
+    if (textConsumed >= displayedLength) {
+      // Budget already spent: drop text the user never saw.
+      continue;
+    }
+    const remaining = displayedLength - textConsumed;
+    if (part.text.length <= remaining) {
+      out.push(part);
+      textConsumed += part.text.length;
+      continue;
+    }
+    const visible = sliceOnCharacterBoundary(part.text, remaining);
+    // Backing off a split pair can leave nothing; skip the empty part, as
+    // fully hidden text parts are skipped above.
+    if (visible.length > 0) out.push({ ...part, text: visible });
+    textConsumed = displayedLength;
+  }
+
+  return out;
+}
diff --git a/services/platform/convex/video_links/mutations.ts b/services/platform/convex/video_links/mutations.ts
index af1a223f2..d4926a976 100644
--- a/services/platform/convex/video_links/mutations.ts
+++ b/services/platform/convex/video_links/mutations.ts
@@ -290,17 +290,24 @@ export const ingestVideoUrl = mutation({
 });
 
 /**
- * Cancel an in-flight or completed video link.
+ * Cancel / dismiss a video link.
  *
  * Semantics:
- *   - Non-terminal: flip to 'skipped'. Orchestrator's next phase-boundary
- *     check sees this and early-exits without persisting more.
+ *   - Any non-skipped status: flip to 'skipped'. For non-terminal rows the
+ *     orchestrator's next phase-boundary check sees this and early-exits
+ *     without persisting more; for terminal rows (completed/failed) the
+ *     flip is what makes the user's X dismissal survive a page refresh —
+ *     the composer filters `displayStatus==='skipped'` out, so without
+ *     the DB write the unbound query would re-emit the chip on next load.
  *   - 'transcribing_handoff': ALSO patch the linked fileMetadata's
  *     transcriptionStatus='skipped' so the existing transcribe_audio.ts
  *     early-exit at lines 317-337 fires; without this, Whisper completes
  *     in the background and writes a transcript/RAG entry the user
  *     thought they cancelled.
- *   - Schedules cleanup action (storage + RAG + maybe-fileMetadata).
+ *   - Schedules cleanup action (storage + RAG + maybe-fileMetadata). The
+ *     cleanup itself is guarded against message-bound rows, so dismissing
+ *     a terminal completed row from the composer (always unbound there)
+ *     is safe.
  *
  * Auth: uploader-only for v1. Org-admin override is a tracked follow-up
  * issue — see the PR description for the link.
@@ -328,9 +335,9 @@ export const cancelVideoLink = mutation({
       throw new Error('Only the uploader can cancel this video link');
     }
 
-    if (job.status === 'completed' || job.status === 'failed') {
-      // No-op — terminal states stay terminal. The chip will dismiss
-      // client-side via the hook's local state.
+    if (job.status === 'skipped') {
+      // Already dismissed — nothing to do. Avoids redundant patches /
+      // audit-log rows from double-clicks or retried mutations.
       return;
     }
 
diff --git a/services/platform/env.sh b/services/platform/env.sh
index 1e09ec921..107e8af68 100644
--- a/services/platform/env.sh
+++ b/services/platform/env.sh
@@ -46,6 +46,7 @@ env_normalize_common() {
 	  # They can be overridden via environment variables in .env when needed.
 	  export RAG_URL="${RAG_URL:-http://rag:8001}"
 	  export CRAWLER_URL="${CRAWLER_URL:-http://crawler:8002}"
+	  export SANDBOX_URL="${SANDBOX_URL:-http://sandbox:8003}"
 	  export SEARCH_SERVICE_URL="${SEARCH_SERVICE_URL:-http://search:8080}"
 
 	  # Convex instance configuration
diff --git a/services/platform/lib/i18n/keys-dynamic.txt b/services/platform/lib/i18n/keys-dynamic.txt
index a402fd0ce..b95fb0efc 100644
--- a/services/platform/lib/i18n/keys-dynamic.txt
+++ b/services/platform/lib/i18n/keys-dynamic.txt
@@ -51,3 +51,14 @@ websites.searchPlaceholder
 # sees `tTypes(key)` with `key` as a runtime variable, so every label entry
 # under the `piiTypes` namespace is dynamic from its perspective.
 piiTypes
+
+# Canvas runnable renderer + icon-map: `t(CANVAS_TYPE_LABEL_KEYS[type])` and
+# `t(`canvas.runStatus.${runStatus}`)` / `canvas.runErrorCode.${runErrorCode}`
+# / `canvas.runProgress.${runProgress.kind}` are all driven by the
+# CanvasContentType / SandboxRunStatus / SandboxErrorCode unions in
+# convex/sandbox/wire.ts. Adding new union members forces a rebuild that
+# covers the corresponding label key automatically.
+chat.canvas.typeLabel
+chat.canvas.runStatus
+chat.canvas.runErrorCode
+chat.canvas.runProgress
diff --git a/services/platform/lib/shared/video-url.test.ts b/services/platform/lib/shared/video-url.test.ts
index 4f21e44b3..cdde182a5 100644
--- a/services/platform/lib/shared/video-url.test.ts
+++ b/services/platform/lib/shared/video-url.test.ts
@@ -289,6 +289,30 @@ describe('extractVideoUrls', () => {
     expect(out).toHaveLength(0);
   });
 
+  it('skips URLs from non-video hosts (closed allowlist)', () => {
+    // GitHub, docs sites, and any other ordinary page must not trigger
+    // the video-link pipeline — pasting them used to spawn a yt-dlp job
+    // that always failed with "site isn't supported".
+    expect(
+      extractVideoUrls(
+        'https://github.com/anthropics/skills/tree/main/skills/pptx',
+      ),
+    ).toHaveLength(0);
+    expect(extractVideoUrls('https://example.com/article')).toHaveLength(0);
+    expect(
+      extractVideoUrls('https://docs.python.org/3/library/os.html'),
+    ).toHaveLength(0);
+  });
+
+  it('keeps only the known-platform URL in mixed text', () => {
+    const out = extractVideoUrls(
+      'see the repo https://github.com/foo/bar and the demo https://youtu.be/abc',
+    );
+    expect(out).toHaveLength(1);
+    expect(out[0].url).toBe('https://youtu.be/abc');
+    expect(out[0].platform).toBe('youtube');
+  });
+
   it('accepts watch?v=X&list=Y (video-in-playlist)', () => {
     const out = extractVideoUrls(
       'https://www.youtube.com/watch?v=abc&list=PL123',
diff --git a/services/platform/lib/shared/video-url.ts b/services/platform/lib/shared/video-url.ts
index 91dd6f05c..2e06049cf 100644
--- a/services/platform/lib/shared/video-url.ts
+++ b/services/platform/lib/shared/video-url.ts
@@ -16,9 +16,15 @@
  * intentionally redundant: the frontend gives instant UX feedback on a
  * mistyped URL; the server gates the actual spawn.
  *
- * Open: any https URL → yt-dlp. We do NOT allowlist hosts — yt-dlp's own
- * extractor list is canonical. `detectPlatform` returns a coarse string
- * for telemetry/chip-icon only, never gates processing.
+ * Closed allowlist: only hosts in `KNOWN_PLATFORMS` flow through
+ * `extractVideoUrls` and become chips. Any other https URL (GitHub,
+ * docs links, plain web pages) is ignored at extraction time — the
+ * paste handler does not `preventDefault`, so the URL stays in the
+ * textarea as plain text. Previously this layer admitted every https
+ * URL and leaned on yt-dlp's extractor table; that produced a red
+ * "This site isn't supported" chip for every non-video paste. The
+ * server's `ingestVideoUrl` mutation still accepts any https URL —
+ * the allowlist lives in the chat-input flow, not the ingest contract.
  */
 
 interface ExtractedVideoUrl {
@@ -258,6 +264,11 @@ export function extractVideoUrls(
     if (cleanedUrl.length === 0) continue;
     if (!isSafeVideoUrl(cleanedUrl)) continue;
     if (isPlaylistUrl(cleanedUrl)) continue;
+    const platform = detectPlatform(cleanedUrl);
+    // Closed allowlist: skip anything that isn't a recognized video host.
+    // Prevents the chat composer from spawning a yt-dlp job (and red
+    // "site isn't supported" chip) for ordinary links like GitHub URLs.
+    if (platform === 'generic') continue;
     const dedupKey = normalizeUrlForHash(cleanedUrl);
     if (seen.has(dedupKey)) continue;
     seen.add(dedupKey);
@@ -267,7 +278,7 @@ export function extractVideoUrls(
       // stripped trailing punctuation) so use-send-message.ts can do a
       // literal String.replace on the textarea content.
       pastedToken: original,
-      platform: detectPlatform(cleanedUrl),
+      platform,
     });
   }
 
diff --git a/services/platform/messages/de.json b/services/platform/messages/de.json
index c557ea21b..d41741f4c 100644
--- a/services/platform/messages/de.json
+++ b/services/platform/messages/de.json
@@ -2408,13 +2408,82 @@
       "streamingWriting": "KI schreibt…",
       "streamingPatch": "KI bearbeitet…",
       "cancel": "Bearbeitung abbrechen",
-      "streamingDuringEdit": "Der Agent aktualisiert dieses Artefakt — dein Entwurf bleibt gespeichert. Klicke auf Abbrechen, um ihn zu verwerfen."
+      "streamingDuringEdit": "Der Agent aktualisiert dieses Artefakt — dein Entwurf bleibt gespeichert. Klicke auf Abbrechen, um ihn zu verwerfen.",
+      "runDone": "Fertig",
+      "runStarted": "Gestartet",
+      "runStale": "Quellcode geändert",
+      "runFiles": "Dateien",
+      "runStdout": "stdout ({chars} Zeichen)",
+      "runStderr": "stderr ({chars} Zeichen)",
+      "runOpenFile": "Datei {name} öffnen",
+      "runResultEntryLabel": "Ausgabe",
+      "runResultSecondaryLabel": "Ausgabe für {path}",
+      "runResultSecondaryCount": "{count, plural, one {# weitere Datei} other {# weitere Dateien}}",
+      "runStatus": {
+        "queued": "In Warteschlange",
+        "installing": "Abhängigkeiten installieren",
+        "running": "Läuft",
+        "completed": "Abgeschlossen",
+        "failed": "Fehlgeschlagen",
+        "cancelled": "Abgebrochen"
+      },
+      "runErrorCode": {
+        "TIMEOUT": "Zeitüberschreitung",
+        "OOM": "Speicher voll",
+        "EGRESS_DENIED": "Netzwerk blockiert",
+        "INSTALL_FAILED": "Installation fehlgeschlagen",
+        "PACKAGE_NOT_FOUND": "Paket nicht gefunden",
+        "QUOTA_EXCEEDED": "Kontingent überschritten",
+        "RUNTIME_ERROR": "Laufzeitfehler",
+        "SPAWNER_UNAVAILABLE": "Sandbox nicht erreichbar",
+        "CANCELLED": "Abgebrochen",
+        "INPUT_REJECTED": "Eingabe abgelehnt",
+        "HARVEST_READ_FAILED": "Ausgabe konnte nicht gelesen werden",
+        "UPLOAD_FAILED": "Upload fehlgeschlagen",
+        "UPLOAD_QUOTA_EXCEEDED": "Upload-Kontingent überschritten",
+        "UPLOAD_REPORT_FAILED": "Upload-Bestätigung fehlgeschlagen",
+        "PRE_STAGE_FAILED": "Vorbereitung fehlgeschlagen",
+        "UPLOAD_INCOMPLETE": "Upload unvollständig"
+      },
+      "runProgress": {
+        "queued": "In Warteschlange",
+        "preparing": "Sandbox wird vorbereitet",
+        "installingPackage": "{package}{version, select, undefined {} other { {version}}} wird installiert",
+        "installing": "Abhängigkeiten installieren",
+        "running": "Läuft"
+      },
+      "typeLabel": {
+        "code": "Code",
+        "html": "HTML",
+        "mermaid": "Mermaid",
+        "svg": "SVG",
+        "markdown": "Markdown",
+        "script_runnable": "Skript (Sandbox)",
+        "python_runnable": "Python (Sandbox)",
+        "node_runnable": "Node (Sandbox)"
+      },
+      "fileSidebar": {
+        "label": "Projektdateien",
+        "title": "Dateien",
+        "expand": "Dateien anzeigen",
+        "collapse": "Dateien ausblenden",
+        "entryBadge": "Einstieg",
+        "streamingDot": "Diese Datei wird geschrieben…",
+        "addFile": "Datei hinzufügen",
+        "addFilePlaceholder": "pfad/zur/datei.ext",
+        "addFileConfirm": "Hinzufügen",
+        "addFileCancel": "Abbrechen",
+        "errorPathRequired": "Pfad ist erforderlich.",
+        "errorPathExists": "Eine Datei mit diesem Pfad existiert bereits.",
+        "errorAddFailed": "Datei konnte nicht hinzugefügt werden."
+      }
     },
     "artifacts": {
       "barLabel": "Artefakte in diesem Thread",
       "barTitle": "Artefakte",
       "openCard": "Artefakt öffnen: {title}",
-      "touchedByMessage": "{title} (Revision {revision})"
+      "touchedByMessage": "{title}",
+      "fileCount": "{count, plural, one {# Datei} other {# Dateien}}"
     },
     "branchNavigator": {
       "previous": "Vorheriger Zweig",
diff --git a/services/platform/messages/en.json b/services/platform/messages/en.json
index f9ee3a877..6863352c5 100644
--- a/services/platform/messages/en.json
+++ b/services/platform/messages/en.json
@@ -2408,13 +2408,82 @@
       "streamingWriting": "AI is writing…",
       "streamingPatch": "AI is editing…",
       "cancel": "Cancel edit",
-      "streamingDuringEdit": "Agent is updating this artifact — your draft is saved. Cancel to discard."
+      "streamingDuringEdit": "Agent is updating this artifact — your draft is saved. Cancel to discard.",
+      "runDone": "Done",
+      "runStarted": "Started",
+      "runStale": "Source edited",
+      "runFiles": "Files",
+      "runStdout": "stdout ({chars} chars)",
+      "runStderr": "stderr ({chars} chars)",
+      "runOpenFile": "Open file {name}",
+      "runResultEntryLabel": "Run output",
+      "runResultSecondaryLabel": "Output for {path}",
+      "runResultSecondaryCount": "{count, plural, one {# other file} other {# other files}}",
+      "runStatus": {
+        "queued": "Queued",
+        "installing": "Installing dependencies",
+        "running": "Running",
+        "completed": "Completed",
+        "failed": "Failed",
+        "cancelled": "Cancelled"
+      },
+      "runErrorCode": {
+        "TIMEOUT": "Timed out",
+        "OOM": "Out of memory",
+        "EGRESS_DENIED": "Network blocked",
+        "INSTALL_FAILED": "Install failed",
+        "PACKAGE_NOT_FOUND": "Package not found",
+        "QUOTA_EXCEEDED": "Quota exceeded",
+        "RUNTIME_ERROR": "Runtime error",
+        "SPAWNER_UNAVAILABLE": "Sandbox unavailable",
+        "CANCELLED": "Cancelled",
+        "INPUT_REJECTED": "Input rejected",
+        "HARVEST_READ_FAILED": "Output read failed",
+        "UPLOAD_FAILED": "Upload failed",
+        "UPLOAD_QUOTA_EXCEEDED": "Upload quota exceeded",
+        "UPLOAD_REPORT_FAILED": "Upload report failed",
+        "PRE_STAGE_FAILED": "Pre-stage failed",
+        "UPLOAD_INCOMPLETE": "Upload incomplete"
+      },
+      "runProgress": {
+        "queued": "Queued",
+        "preparing": "Preparing sandbox",
+        "installingPackage": "Installing {package}{version, select, undefined {} other { {version}}}",
+        "installing": "Installing dependencies",
+        "running": "Running"
+      },
+      "typeLabel": {
+        "code": "Code",
+        "html": "HTML",
+        "mermaid": "Mermaid",
+        "svg": "SVG",
+        "markdown": "Markdown",
+        "script_runnable": "Script (sandbox)",
+        "python_runnable": "Python (sandbox)",
+        "node_runnable": "Node (sandbox)"
+      },
+      "fileSidebar": {
+        "label": "Project files",
+        "title": "Files",
+        "expand": "Show files",
+        "collapse": "Hide files",
+        "entryBadge": "entry",
+        "streamingDot": "Writing this file…",
+        "addFile": "Add file",
+        "addFilePlaceholder": "path/to/file.ext",
+        "addFileConfirm": "Add",
+        "addFileCancel": "Cancel",
+        "errorPathRequired": "Path is required.",
+        "errorPathExists": "A file with this path already exists.",
+        "errorAddFailed": "Could not add the file."
+      }
     },
     "artifacts": {
       "barLabel": "Artifacts in this thread",
       "barTitle": "Artifacts",
       "openCard": "Open artifact: {title}",
-      "touchedByMessage": "{title} (revision {revision})"
+      "touchedByMessage": "{title}",
+      "fileCount": "{count, plural, one {# file} other {# files}}"
     },
     "branchNavigator": {
       "previous": "Previous branch",
diff --git a/services/platform/messages/fr.json b/services/platform/messages/fr.json
index c6ac0e532..9c7014799 100644
--- a/services/platform/messages/fr.json
+++ b/services/platform/messages/fr.json
@@ -2408,13 +2408,82 @@
       "streamingWriting": "L'IA écrit…",
       "streamingPatch": "L'IA modifie…",
       "cancel": "Annuler la modification",
-      "streamingDuringEdit": "L'agent met à jour cet artéfact — ton brouillon est conservé. Clique sur Annuler pour le rejeter."
+      "streamingDuringEdit": "L'agent met à jour cet artéfact — ton brouillon est conservé. Clique sur Annuler pour le rejeter.",
+      "runDone": "Terminé",
+      "runStarted": "Démarré",
+      "runStale": "Code modifié",
+      "runFiles": "Fichiers",
+      "runStdout": "stdout ({chars} car.)",
+      "runStderr": "stderr ({chars} car.)",
+      "runOpenFile": "Ouvrir le fichier {name}",
+      "runResultEntryLabel": "Sortie",
+      "runResultSecondaryLabel": "Sortie pour {path}",
+      "runResultSecondaryCount": "{count, plural, one {# autre fichier} other {# autres fichiers}}",
+      "runStatus": {
+        "queued": "En file d'attente",
+        "installing": "Installation des dépendances",
+        "running": "En cours",
+        "completed": "Terminé",
+        "failed": "Échec",
+        "cancelled": "Annulé"
+      },
+      "runErrorCode": {
+        "TIMEOUT": "Délai dépassé",
+        "OOM": "Mémoire saturée",
+        "EGRESS_DENIED": "Réseau bloqué",
+        "INSTALL_FAILED": "Échec d'installation",
+        "PACKAGE_NOT_FOUND": "Paquet introuvable",
+        "QUOTA_EXCEEDED": "Quota dépassé",
+        "RUNTIME_ERROR": "Erreur d'exécution",
+        "SPAWNER_UNAVAILABLE": "Sandbox indisponible",
+        "CANCELLED": "Annulé",
+        "INPUT_REJECTED": "Entrée refusée",
+        "HARVEST_READ_FAILED": "Lecture de la sortie échouée",
+        "UPLOAD_FAILED": "Téléversement échoué",
+        "UPLOAD_QUOTA_EXCEEDED": "Quota de téléversement dépassé",
+        "UPLOAD_REPORT_FAILED": "Confirmation de téléversement échouée",
+        "PRE_STAGE_FAILED": "Préparation échouée",
+        "UPLOAD_INCOMPLETE": "Téléversement incomplet"
+      },
+      "runProgress": {
+        "queued": "En file d'attente",
+        "preparing": "Préparation de la sandbox",
+        "installingPackage": "Installation de {package}{version, select, undefined {} other { {version}}}",
+        "installing": "Installation des dépendances",
+        "running": "En cours"
+      },
+      "typeLabel": {
+        "code": "Code",
+        "html": "HTML",
+        "mermaid": "Mermaid",
+        "svg": "SVG",
+        "markdown": "Markdown",
+        "script_runnable": "Script (sandbox)",
+        "python_runnable": "Python (sandbox)",
+        "node_runnable": "Node (sandbox)"
+      },
+      "fileSidebar": {
+        "label": "Fichiers du projet",
+        "title": "Fichiers",
+        "expand": "Afficher les fichiers",
+        "collapse": "Masquer les fichiers",
+        "entryBadge": "entrée",
+        "streamingDot": "Écriture de ce fichier…",
+        "addFile": "Ajouter un fichier",
+        "addFilePlaceholder": "chemin/vers/fichier.ext",
+        "addFileConfirm": "Ajouter",
+        "addFileCancel": "Annuler",
+        "errorPathRequired": "Le chemin est requis.",
+        "errorPathExists": "Un fichier avec ce chemin existe déjà.",
+        "errorAddFailed": "Impossible d'ajouter le fichier."
+      }
     },
     "artifacts": {
       "barLabel": "Artéfacts dans ce fil",
       "barTitle": "Artéfacts",
       "openCard": "Ouvrir l'artéfact : {title}",
-      "touchedByMessage": "{title} (révision {revision})"
+      "touchedByMessage": "{title}",
+      "fileCount": "{count, plural, one {# fichier} other {# fichiers}}"
     },
     "branchNavigator": {
       "previous": "Branche précédente",
diff --git a/services/platform/scripts/dev.ts b/services/platform/scripts/dev.ts
index 7ea61fea7..94bb022a7 100644
--- a/services/platform/scripts/dev.ts
+++ b/services/platform/scripts/dev.ts
@@ -69,6 +69,29 @@ function envNormalizeCommon() {
     process.env.SITE_URL = `http://${host}${host === 'localhost' ? `:${port}` : ''}`;
   }
 
+  // Sandbox-wobbly-origami plan §4: the spawner runs inside docker (compose)
+  // while Convex runs on the host in `bun dev` mode, so storage URLs the
+  // action sends to the spawner must use a hostname that resolves to the
+  // host from inside the container. `host.docker.internal` is the standard
+  // cross-platform alias (Docker Desktop ships it; Linux Docker requires
+  // `extra_hosts: ["host.docker.internal:host-gateway"]` which compose.dev.yml
+  // already sets on the sandbox service).
+  //
+  // Override in `services/platform/.env.local` only if your network stack
+  // breaks the default — e.g. a VPN/proxy (singbox-tun, tailscale, ...) that
+  // hijacks RFC1918 traffic and blocks docker-bridge → host. In that case
+  // set the host's LAN IP:
+  //
+  //   SANDBOX_STORAGE_INTERNAL_BASE_URL=http://192.168.x.y:3210
+  //   SANDBOX_HTTP_API_BASE_URL=http://192.168.x.y:3211
+  if (!process.env.SANDBOX_STORAGE_INTERNAL_BASE_URL) {
+    process.env.SANDBOX_STORAGE_INTERNAL_BASE_URL =
+      'http://host.docker.internal:3210';
+  }
+  if (!process.env.SANDBOX_HTTP_API_BASE_URL) {
+    process.env.SANDBOX_HTTP_API_BASE_URL = 'http://host.docker.internal:3211';
+  }
+
   // Root config directory only — Convex derives sub-dirs (agents/workflows/
   // integrations/providers) from TALE_CONFIG_DIR via `convex/*/file_utils.ts`.
   if (!process.env.TALE_CONFIG_DIR) {
diff --git a/services/proxy/Caddyfile b/services/proxy/Caddyfile
index 97a3b10e0..fbc9708e4 100644
--- a/services/proxy/Caddyfile
+++ b/services/proxy/Caddyfile
@@ -145,8 +145,24 @@
 		reverse_proxy convex:3210
 	}
 
+	# HTTP: Sandbox callback API (/api/sandbox/* -> convex:3211)
+	# Must come BEFORE the generic /api/* block so the more specific path
+	# wins. Access logging is skipped: the path is HMAC-authenticated, so
+	# access logs add nothing diagnostically. The spawner is the only
+	# legitimate caller; runtime containers cannot reach the proxy.
+	handle /api/sandbox/* {
+		log_skip
+		reverse_proxy convex:3211
+	}
+
 	# HTTP: Convex storage upload/download (/api/storage/* -> convex:3210)
+	# `log_skip`: Convex's `generateUploadUrl()` embeds a 1-hour upload
+	# token in the URL's query string. Default INFO-level access logs
+	# would write that token to stdout (audit finding R2V7). The path
+	# itself is auth-bound by the token; access logging adds no security
+	# value here.
 	handle /api/storage/* {
+		log_skip
 		reverse_proxy convex:3210
 	}
 
diff --git a/services/sandbox-egress/Dockerfile b/services/sandbox-egress/Dockerfile
new file mode 100644
index 000000000..9dbed6ff0
--- /dev/null
+++ b/services/sandbox-egress/Dockerfile
@@ -0,0 +1,46 @@
+# Tale Sandbox Egress Proxy
+#
+# HTTPS forward proxy filtering by CONNECT host. Sits on `tale-sandbox-net`
+# (an internal-only Docker bridge); sandbox runtime containers reach pypi/npm
+# via this proxy, all other internet is unreachable.
+#
+# See plan §2. Verified by R2.1: pip / npm / uv all honor HTTPS_PROXY and
+# fail loud when the proxy denies a host or is unreachable.
+#
+# The Dockerfile-level user stays root so the entrypoint can chown the log
+# file before exec, AND so iptables can install the SSRF firewall rules
+# in entrypoint.sh — tinyproxy itself drops privileges to `nobody` after
+# bind (configured in tinyproxy.conf.template).
+#
+# REQUIRED CAPABILITY: this container MUST be run with `cap_add: [NET_ADMIN]`
+# (set in compose.yml and the CLI compose generator) so the entrypoint's
+# `iptables -I OUTPUT -j REJECT` rules can install. Without NET_ADMIN the
+# entrypoint fails closed (exits 1) rather than run with only the hostname
+# allowlist; set TALE_SKIP_SSRF_FIREWALL=1 to bypass for dev only.
+
+# trivy:ignore:AVD-DS-0002 -- entrypoint needs root to chown log + install iptables; tinyproxy drops privs at bind time
+FROM alpine:3.20
+
+# - tinyproxy:      the proxy daemon
+# - gettext:        provides envsubst for the conf template
+# - ca-certificates: tinyproxy TLS validation when filtering
+# - iptables:       SSRF firewall (IMDS + RFC1918 REJECT rules in entrypoint)
+# - curl:           CONNECT probe tooling (the HEALTHCHECK below uses busybox nc, not curl)
+RUN apk add --no-cache tinyproxy gettext ca-certificates iptables curl && \
+    mkdir -p /etc/tinyproxy /var/log/tinyproxy && \
+    chown -R nobody:nobody /var/log/tinyproxy
+
+COPY services/sandbox-egress/tinyproxy.conf.template /etc/tinyproxy/tinyproxy.conf.template
+COPY services/sandbox-egress/entrypoint.sh /entrypoint.sh
+RUN chmod +x /entrypoint.sh
+
+EXPOSE 3128
+
+# Local readiness probe only — confirms tinyproxy is bound and accepting
+# TCP. We deliberately do NOT call out to pypi on every probe (every 30s)
+# from every host (allow-list regressions are caught by the smoke test).
+# `nc` is part of busybox in alpine; no extra apk install is needed.
+HEALTHCHECK --interval=30s --timeout=3s --retries=3 \
+  CMD nc -z 127.0.0.1 3128 || exit 1
+
+ENTRYPOINT ["/entrypoint.sh"]
diff --git a/services/sandbox-egress/entrypoint.sh b/services/sandbox-egress/entrypoint.sh
new file mode 100644
index 000000000..1a90fcf74
--- /dev/null
+++ b/services/sandbox-egress/entrypoint.sh
@@ -0,0 +1,136 @@
+#!/bin/sh
+# services/sandbox-egress/entrypoint.sh
+# Render allow-list + config, install IP-layer egress firewall, exec tinyproxy.
+
+set -e
+
+# ----------------------------------------------------------------------------
+# SSRF firewall (defense-in-depth)
+# ----------------------------------------------------------------------------
+# The tinyproxy allowlist is a hostname-regex filter applied AFTER the proxy
+# resolves the CONNECT target. A short-TTL DNS rebind on an allowlisted host
+# could flip resolution to 169.254.169.254 (cloud IMDS) or RFC1918 (corp VPN,
+# host bridge) between tinyproxy's lookup and the kernel connect(). Block
+# those targets at the IP layer so the entire tunnel surface is fenced
+# regardless of what hostname squeaked past the allowlist.
+#
+# Mirrors services/convex/docker-entrypoint.sh lines 59-83. Requires
+# NET_ADMIN; cap_add: ['NET_ADMIN'] is set in compose.yml and the CLI
+# compose generator. Fails closed (exit 1) when iptables is missing or the
+# capability isn't granted; dev can opt out via TALE_SKIP_SSRF_FIREWALL=1.
+SKIP_FIREWALL="${TALE_SKIP_SSRF_FIREWALL:-0}"
+
+if [ "$SKIP_FIREWALL" = "1" ]; then
+  echo "[sandbox-egress] WARN: TALE_SKIP_SSRF_FIREWALL=1 — SSRF firewall explicitly skipped"
+elif ! command -v iptables >/dev/null 2>&1; then
+  # Fail-closed: iptables is part of the image, so a missing binary means
+  # someone broke the build. Refuse to start rather than silently shipping
+  # the runtime containers a wide-open egress path.
+  echo "[sandbox-egress] FATAL: iptables binary missing; refusing to start without the SSRF firewall (set TALE_SKIP_SSRF_FIREWALL=1 to override for dev only)"
+  exit 1
+elif ! iptables -L OUTPUT >/dev/null 2>&1; then
+  # Fail-closed: NET_ADMIN is what compose.yml + the CLI compose generator
+  # grant; if it's not effective, the IP-layer DNS-rebind defense is
+  # absent and only the hostname allowlist stands between runtime code
+  # and the cloud IMDS. Don't ship that silently.
+  echo "[sandbox-egress] FATAL: NET_ADMIN unavailable; SSRF firewall cannot install (set TALE_SKIP_SSRF_FIREWALL=1 to override for dev only, or cap_add: [NET_ADMIN] in compose.yml)"
+  exit 1
+else
+  echo "[sandbox-egress] installing SSRF egress firewall (REJECT IMDS + link-local + RFC1918, v4 + v6)"
+  # Cloud instance metadata service (AWS/GCP/Azure IMDSv1 footprint).
+  iptables -I OUTPUT -d 169.254.169.254/32 -j REJECT --reject-with icmp-net-prohibited 2>/dev/null || \
+    echo "[sandbox-egress] WARN: failed to reject 169.254.169.254/32"
+  # All link-local — covers Azure 168.63.129.16 and other variants.
+  iptables -I OUTPUT -d 169.254.0.0/16 -j REJECT --reject-with icmp-net-prohibited 2>/dev/null || true
+  # RFC1918 ranges that aren't part of this container's own attached
+  # docker network. The kernel routes intra-network traffic via the
+  # bridge driver before OUTPUT is consulted for external-bound packets,
+  # so peer containers on the same docker network are not affected by
+  # these rules — only attempts to reach private ranges that route OUT
+  # of the bridge are dropped. If the operator deploys on a non-default
+  # docker-network topology where this assumption breaks, set
+  # TALE_SKIP_SSRF_FIREWALL=1 to bypass.
+  iptables -I OUTPUT -d 10.0.0.0/8 -j REJECT --reject-with icmp-net-prohibited 2>/dev/null || true
+  iptables -I OUTPUT -d 172.16.0.0/12 -j REJECT --reject-with icmp-net-prohibited 2>/dev/null || true
+  iptables -I OUTPUT -d 192.168.0.0/16 -j REJECT --reject-with icmp-net-prohibited 2>/dev/null || true
+
+  # Stateful ACCEPT for response traffic. Without this, the REJECT rules
+  # above also drop the SYN-ACK and data segments tinyproxy sends back to
+  # peer runtime containers — their IPs sit in 172.30.0.0/24 ⊂ 172.16/12,
+  # so the kernel rejects egress's reply with icmp-net-prohibited and the
+  # runtime's connect() times out. The header comment above optimistically
+  # assumed bridge-to-bridge traffic skips OUTPUT; on modern kernels with
+  # bridge-nf-call-iptables=1 it does NOT, so we explicitly let return
+  # traffic through. NEW outbound to RFC1918 is still rejected because
+  # this rule only matches ESTABLISHED/RELATED conntrack states.
+  iptables -I OUTPUT -m conntrack --ctstate ESTABLISHED,RELATED -j ACCEPT 2>/dev/null || \
+    iptables -I OUTPUT -m state --state ESTABLISHED,RELATED -j ACCEPT 2>/dev/null || \
+    echo "[sandbox-egress] WARN: failed to install stateful ACCEPT — runtime callers will time out connecting to the proxy"
+
+  # IPv6 mirror: if a future tale-sandbox-net is created with IPv6 enabled
+  # (or the host kernel exposes a v6 default route into one of the
+  # sensitive private ranges), the v4-only rules above would leave a hole.
+  # ip6tables is best-effort — alpine kernels without ip6_tables loaded
+  # just log a warn and continue; on hosts with v6 enabled the rules
+  # bind and provide parity with the v4 defenses.
+  if command -v ip6tables >/dev/null 2>&1 && ip6tables -L OUTPUT >/dev/null 2>&1; then
+    # AWS IMDS IPv6 endpoint (fd00:ec2::254), the v6 counterpart of 169.254.169.254.
+    ip6tables -I OUTPUT -d fd00:ec2::254/128 -j REJECT 2>/dev/null || true
+    # IPv4-mapped IMDS — `curl -g http://[::ffff:169.254.169.254]/` hits
+    # the v4 stack through the v6 socket; block both the v4-mapped form
+    # and the bare v6 address space that overlaps.
+    ip6tables -I OUTPUT -d ::ffff:169.254.0.0/112 -j REJECT 2>/dev/null || true
+    ip6tables -I OUTPUT -d ::1/128 -j REJECT 2>/dev/null || true
+    # Link-local + unique-local (RFC4193) — covers any router-advertised
+    # private v6 fabric.
+    ip6tables -I OUTPUT -d fe80::/10 -j REJECT 2>/dev/null || true
+    ip6tables -I OUTPUT -d fc00::/7 -j REJECT 2>/dev/null || true
+    # Mirror the v4 stateful ACCEPT (see explanation above) so any IPv6
+    # peer runtime can also receive return packets.
+    ip6tables -I OUTPUT -m conntrack --ctstate ESTABLISHED,RELATED -j ACCEPT 2>/dev/null || \
+      ip6tables -I OUTPUT -m state --state ESTABLISHED,RELATED -j ACCEPT 2>/dev/null || true
+  else
+    echo "[sandbox-egress] WARN: ip6tables unavailable; IPv6 SSRF defense not installed (harmless on IPv4-only hosts)"
+  fi
+fi
+
+DEFAULT_ALLOWLIST='^pypi\.org$
+^files\.pythonhosted\.org$
+^registry\.npmjs\.org$
+^objects\.githubusercontent\.com$
+^codeload\.github\.com$'
+
+# Operator override: one regex per line, or `|`-separated for compose-friendly
+# single-line env values.
+if [ -n "$SANDBOX_EGRESS_ALLOWLIST" ]; then
+  echo "$SANDBOX_EGRESS_ALLOWLIST" | tr '|' '\n' > /etc/tinyproxy/allowlist
+else
+  printf '%s\n' "$DEFAULT_ALLOWLIST" > /etc/tinyproxy/allowlist
+fi
+
+envsubst < /etc/tinyproxy/tinyproxy.conf.template > /etc/tinyproxy/tinyproxy.conf
+
+echo "[sandbox-egress] starting tinyproxy on :3128"
+echo "[sandbox-egress] CONNECT allow-list:"
+sed 's/^/  /' /etc/tinyproxy/allowlist
+echo "[sandbox-egress] config:"
+sed 's/^/  /' /etc/tinyproxy/tinyproxy.conf
+
+# tinyproxy logs to file by default; a background tail copies the log to
+# stdout so docker logs surfaces it. Chown to nobody so tinyproxy (which
+# drops privs) can write to it.
+touch /var/log/tinyproxy/tinyproxy.log
+chown nobody:nobody /var/log/tinyproxy/tinyproxy.log
+
+# Keep this shell as PID 1 with tail and tinyproxy as background children.
+# `exec tail` would discard the trap (traps do not survive exec) and leave
+# tail as PID 1, where an unhandled SIGTERM is ignored, so `docker stop`
+# would stall until SIGKILL. The trap forwards INT/TERM to tinyproxy; a
+# trapped signal interrupts the first `wait`, so the second one collects
+# tinyproxy's real exit. A tinyproxy crash exits non-zero instead of
+# leaving a proxy-less container tailing the log.
+tail -n0 -F /var/log/tinyproxy/tinyproxy.log &
+tinyproxy -d -c /etc/tinyproxy/tinyproxy.conf &
+TINYPROXY_PID=$!
+trap 'kill -TERM "$TINYPROXY_PID" 2>/dev/null || true' INT TERM
+wait "$TINYPROXY_PID" || wait "$TINYPROXY_PID" || exit 1
diff --git a/services/sandbox-egress/tinyproxy.conf.template b/services/sandbox-egress/tinyproxy.conf.template
new file mode 100644
index 000000000..5ee39855c
--- /dev/null
+++ b/services/sandbox-egress/tinyproxy.conf.template
@@ -0,0 +1,52 @@
+# Tale Sandbox Egress — tinyproxy config
+# Rendered at startup by entrypoint.sh (no template vars currently in use,
+# but keep envsubst-ready so we can introduce them without re-tooling).
+
+User nobody
+Group nobody
+
+Port 3128
+Listen 0.0.0.0
+Timeout 600
+DefaultErrorFile "/usr/share/tinyproxy/default.html"
+# Notice (not Info) — Info logs full request lines including query strings,
+# which can leak tokens/secrets a sandboxed user pastes into a URL.
+LogLevel Notice
+LogFile "/var/log/tinyproxy/tinyproxy.log"
+PidFile "/tmp/tinyproxy.pid"
+MaxClients 100
+ViaProxyName "tale-sandbox-egress"
+
+# CONNECT method (HTTPS tunneling) — required for pip/npm/uv installs.
+# Only the standard TLS port; nothing else.
+ConnectPort 443
+
+# Host-name allow-list (default-deny). Allowlist contents are rewritten
+# by entrypoint.sh from SANDBOX_EGRESS_ALLOWLIST or the default registry set.
+FilterDefaultDeny Yes
+FilterCaseSensitive No
+FilterExtended Yes
+FilterURLs Off
+Filter "/etc/tinyproxy/allowlist"
+
+# Client allow-list (network-layer): only loopback (healthcheck) and the
+# Docker user-network CIDRs that the sandbox runtime containers attach to.
+# 172.16.0.0/12 covers the default Docker bridge-driver range. This list is
+# defense-in-depth ONLY — it's a client ACL, NOT the primary network
+# boundary. The primary boundary is the `--internal` flag on tale-sandbox-net
+# (runtime containers can't reach the host bridge) plus the iptables OUTPUT
+# rules installed by entrypoint.sh (REJECT IMDS + RFC1918). The allowlist
+# entry would only matter if a future topology change exposed the egress
+# proxy to additional networks; until then it's a belt-and-braces guard.
+# ::1 covers IPv6 loopback for the healthcheck on dual-stack hosts.
+Allow 127.0.0.1
+Allow ::1
+Allow 172.16.0.0/12
+
+# DisableViaHeader replaces the default `No` (which emitted
+# `Via: 1.1 tale-sandbox-egress`). Note: tinyproxy's Anonymous block does
+# NOT apply to CONNECT / HTTPS traffic — the tunnel is end-to-end
+# encrypted between the runtime container and the upstream, so tinyproxy
+# cannot see or rewrite request headers. Header stripping would only
+# matter for plaintext HTTP, which CONNECT-only mode rejects anyway.
+DisableViaHeader Yes
diff --git a/services/sandbox-runtime/Dockerfile b/services/sandbox-runtime/Dockerfile
new file mode 100644
index 000000000..3883ff7e7
--- /dev/null
+++ b/services/sandbox-runtime/Dockerfile
@@ -0,0 +1,43 @@
+# Tale Sandbox Runtime
+#
+# Executed inside an ephemeral container per `artifact_run` tool call.
+# See the sandbox plan (presentation-generation-from-prompts-delightful-aho.md) §3
+#
+# Layers: python:3.12-slim-bookworm + uv + Node 24 + fontconfig (for Pillow).
+# Runs as uid 65534 under --read-only with all caps dropped; spawner forces
+# these via `docker run` flags but the image baseline matches.
+#
+FROM python:3.12-slim-bookworm
+
+# Runtime additions only — fontconfig + DejaVu so Pillow/matplotlib render
+# text correctly, jq so the entrypoint can read packages.json/options.json,
+# ca-certificates for HTTPS to pypi/npm via the egress proxy.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+      fonts-dejavu-core \
+      fontconfig \
+      ca-certificates \
+      jq \
+    && rm -rf /var/lib/apt/lists/* \
+    && fc-cache -f
+
+# uv — fast Python package installer/resolver. See https://github.com/astral-sh/uv
+COPY --from=ghcr.io/astral-sh/uv:0.5 /uv /usr/local/bin/uv
+
+# Node 24 LTS. Copy /usr/local from node:24-bookworm-slim into /opt/node.
+COPY --from=node:24-bookworm-slim /usr/local /opt/node
+
+ENV PATH=/opt/node/bin:/usr/local/bin:/usr/bin:/bin
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV NPM_CONFIG_UPDATE_NOTIFIER=false
+
+COPY services/sandbox-runtime/entrypoint.sh /entrypoint.sh
+RUN chmod +x /entrypoint.sh
+
+# Default user is nobody; spawner pins --user 65534:65534 to make this
+# explicit at the runtime call site.
+USER 65534:65534
+
+WORKDIR /workspace
+
+ENTRYPOINT ["/entrypoint.sh"]
diff --git a/services/sandbox-runtime/entrypoint.sh b/services/sandbox-runtime/entrypoint.sh
new file mode 100644
index 000000000..a3bca4e72
--- /dev/null
+++ b/services/sandbox-runtime/entrypoint.sh
@@ -0,0 +1,204 @@
+#!/bin/sh
+# services/sandbox-runtime/entrypoint.sh
+#
+# Per-call entrypoint inside an ephemeral sandbox container.
+#
+# Args (from spawner's docker run):
+#   $1 = language ('python' | 'node' | 'polyglot')
+#   $2 = path to packages.json (JSON array of pip/npm specs).
+#        Polyglot mode IGNORES this file and reads
+#        /workspace/code/packages-python.json + /workspace/code/packages-node.json
+#        instead (either may be missing or empty).
+#   $3 = path to options.json   ({ allowSdist?: bool, allowInstallScripts?: bool })
+#   $4 = entry path: either a relative POSIX path resolved under
+#        /workspace/code/, or an absolute path under /workspace/code/ or
+#        /workspace/.tale/ (the latter is the spawner-generated multi-step
+#        wrapper). Anything else exits 65.
+#
+# Env (set by spawner via --env):
+#   HTTPS_PROXY / HTTP_PROXY  -> http://sandbox-egress:3128
+#   PIP_CACHE_DIR             -> /cache/pip (per-org named volume)
+#   NPM_CONFIG_CACHE          -> /cache/npm
+#
+# Conventions:
+#   - User code at /workspace/code/<path> — staged 1:1 from the spawner's
+#     `files[]`. The runtime exec()s the file at $4; no synthetic mirror.
+#   - Multi-step wrapper (when used) at /workspace/.tale/runner.{py,js} —
+#     dotfile segment is unreachable from user-supplied paths, so user files
+#     can be named anything (including main.py).
+#   - Output files in /workspace/output/
+#   - install-stderr.log at /workspace/install-stderr.log — captured stderr
+#     from the package install step, tailed to container stderr on failure
+#     (exit 64) so the spawner can surface it. Nothing reads stdout: install
+#     stdout flows directly to the container stdout for live streaming.
+#   - PHASE markers on stdout so the spawner can split install vs run timing.
+#
+# Exit codes:
+#   0   = user code completed successfully
+#   64  = install failed (spawner classifies as INSTALL_FAILED / PACKAGE_NOT_FOUND)
+#   65  = bad invocation (unknown language / missing args / bad entry path)
+#   >0  = user code exit code (RUNTIME_ERROR)
+
+set -e
+
+LANG_NAME="$1"
+PACKAGES_FILE="${2:-/workspace/code/packages.json}"
+OPTIONS_FILE="${3:-/workspace/code/options.json}"
+ENTRY_ARG="${4:?sandbox-runtime: missing entry path (positional arg 4)}"
+
+# Resolve entry path. Accept either an absolute path under one of the two
+# allowed roots, or a relative path interpreted under /workspace/code/.
+case "$ENTRY_ARG" in
+  /workspace/.tale/*|/workspace/code/*)
+    ENTRY_FILE="$ENTRY_ARG"
+    ;;
+  /*)
+    echo "sandbox-runtime: entry path outside /workspace: $ENTRY_ARG" >&2
+    exit 65
+    ;;
+  *)
+    ENTRY_FILE="/workspace/code/$ENTRY_ARG"
+    ;;
+esac
+case "$ENTRY_FILE" in
+  *..*)
+    echo "sandbox-runtime: traversal segment in entry path: $ENTRY_ARG" >&2
+    exit 65
+    ;;
+esac
+
+# Workspace is delivered via host bind-mount (spawner.ts:stageWorkspace
+# writes /var/lib/tale-sandbox/sessions/<id>/{code,input,output}/ on the
+# host and mounts it 1:1 at /workspace inside this container). The mkdir
+# below is defensive — the bind-mount source already contains these dirs
+# when the spawner is happy, but a malformed call should still see
+# usable /workspace/output to write into.
+mkdir -p /workspace/code /workspace/input /workspace/output
+
+echo "PHASE: installing"
+
+ALLOW_SDIST="false"
+ALLOW_INSTALL_SCRIPTS="false"
+if [ -f "$OPTIONS_FILE" ]; then
+  ALLOW_SDIST=$(jq -r '.allowSdist // false' "$OPTIONS_FILE" 2>/dev/null || echo false)
+  ALLOW_INSTALL_SCRIPTS=$(jq -r '.allowInstallScripts // false' "$OPTIONS_FILE" 2>/dev/null || echo false)
+fi
+
+PACKAGES_ARGV=""
+if [ -f "$PACKAGES_FILE" ]; then
+  # jq @sh escapes each package spec safely for shell expansion. The PACKAGES_FILE
+  # was written by the spawner (a trusted, typed pipeline) — not user shell input.
+  PACKAGES_ARGV=$(jq -r '. | map(@sh) | join(" ")' "$PACKAGES_FILE" 2>/dev/null || echo "")
+fi
+
+# Polyglot extras — each bucket lives in its own file written by the
+# spawner. Either may be absent or carry an empty array, in which case
+# the matching install pass is skipped.
+PY_PACKAGES_FILE="/workspace/code/packages-python.json"
+NODE_PACKAGES_FILE="/workspace/code/packages-node.json"
+PY_PACKAGES_ARGV=""
+NODE_PACKAGES_ARGV=""
+if [ -f "$PY_PACKAGES_FILE" ]; then
+  PY_PACKAGES_ARGV=$(jq -r '. | map(@sh) | join(" ")' "$PY_PACKAGES_FILE" 2>/dev/null || echo "")
+fi
+if [ -f "$NODE_PACKAGES_FILE" ]; then
+  NODE_PACKAGES_ARGV=$(jq -r '. | map(@sh) | join(" ")' "$NODE_PACKAGES_FILE" 2>/dev/null || echo "")
+fi
+
+mkdir -p /workspace/output
+
+# Shared pip install. Used by both single-language Python runs and by the
+# polyglot bucket. Caller passes `$1`: the @sh-escaped argv string to install.
+install_python() {
+  PIP_ARGS="--target /workspace/.deps/python --no-progress"
+  if [ "$ALLOW_SDIST" != "true" ]; then
+    PIP_ARGS="$PIP_ARGS --only-binary=:all:"
+  fi
+  if [ -n "$1" ]; then
+    eval "uv pip install $PIP_ARGS $1" \
+      2> /workspace/install-stderr.log \
+      || { tail -c 64000 /workspace/install-stderr.log >&2; exit 64; }
+  fi
+}
+
+# Shared npm install. Same contract as install_python.
+install_node() {
+  NPM_ARGS="--prefix /workspace/.deps/node --no-audit --no-fund --no-progress --loglevel=error"
+  if [ "$ALLOW_INSTALL_SCRIPTS" != "true" ]; then
+    NPM_ARGS="$NPM_ARGS --ignore-scripts"
+  fi
+  if [ -n "$1" ]; then
+    mkdir -p /workspace/.deps/node
+    (cd /workspace/.deps/node && npm init -y > /dev/null 2> /workspace/install-stderr.log) \
+      || { tail -c 64000 /workspace/install-stderr.log >&2; exit 64; }
+    eval "npm install $NPM_ARGS $1" \
+      2> /workspace/install-stderr.log \
+      || { tail -c 64000 /workspace/install-stderr.log >&2; exit 64; }
+  fi
+}
+
+run_python() {
+  PIP_ARGS="--target /workspace/.deps/python --no-progress"
+  if [ "$ALLOW_SDIST" != "true" ]; then
+    # Block sdist installs by default — closes setup.py ACE vector (R2.7).
+    PIP_ARGS="$PIP_ARGS --only-binary=:all:"
+  fi
+  if [ -n "$PACKAGES_ARGV" ]; then
+    # Install stdout flows through to the container stdout so the spawner can
+    # surface progress live; stderr is captured to a file and tailed back on
+    # failure (exit 64). Do NOT redirect stderr to /dev/null — that would
+    # hide the only diagnostic on a broken install.
+    eval "uv pip install $PIP_ARGS $PACKAGES_ARGV" \
+      2> /workspace/install-stderr.log \
+      || { tail -c 64000 /workspace/install-stderr.log >&2; exit 64; }
+  fi
+  export PYTHONPATH=/workspace/.deps/python
+  echo "PHASE: running"
+  exec python3 "$ENTRY_FILE"
+}
+
+run_node() {
+  NPM_ARGS="--prefix /workspace/.deps/node --no-audit --no-fund --no-progress --loglevel=error"
+  if [ "$ALLOW_INSTALL_SCRIPTS" != "true" ]; then
+    # Block lifecycle scripts by default — closes Shai-Hulud-class postinstall ACE (R2.7).
+    NPM_ARGS="$NPM_ARGS --ignore-scripts"
+  fi
+  if [ -n "$PACKAGES_ARGV" ]; then
+    mkdir -p /workspace/.deps/node
+    # `npm init -y`'s only side effect is the package.json scaffold; its
+    # output is noise but its stderr is the only signal if (e.g.) the dir
+    # isn't writable. Capture stderr so a real failure is recoverable.
+    (cd /workspace/.deps/node && npm init -y > /dev/null 2> /workspace/install-stderr.log) \
+      || { tail -c 64000 /workspace/install-stderr.log >&2; exit 64; }
+    # Same pattern as run_python: stdout streams through, stderr is captured
+    # for failure-path harvest.
+    eval "npm install $NPM_ARGS $PACKAGES_ARGV" \
+      2> /workspace/install-stderr.log \
+      || { tail -c 64000 /workspace/install-stderr.log >&2; exit 64; }
+  fi
+  export NODE_PATH=/workspace/.deps/node/node_modules
+  echo "PHASE: running"
+  exec node "$ENTRY_FILE"
+}
+
+run_polyglot() {
+  # Polyglot mode: install both buckets when present (either pass exits 64
+  # on failure), export both interpreter resolution paths, then exec the
+  # spawner-generated Python dispatcher (which subprocesses python3 / node per step).
+  install_python "$PY_PACKAGES_ARGV"
+  install_node "$NODE_PACKAGES_ARGV"
+  export PYTHONPATH=/workspace/.deps/python
+  export NODE_PATH=/workspace/.deps/node/node_modules
+  echo "PHASE: running"
+  exec python3 "$ENTRY_FILE"
+}
+
+case "$LANG_NAME" in
+  python)   run_python ;;
+  node)     run_node ;;
+  polyglot) run_polyglot ;;
+  *)
+    echo "sandbox-runtime: unknown language: $LANG_NAME" >&2
+    exit 65
+    ;;
+esac
diff --git a/services/sandbox/Dockerfile b/services/sandbox/Dockerfile
new file mode 100644
index 000000000..d98119839
--- /dev/null
+++ b/services/sandbox/Dockerfile
@@ -0,0 +1,93 @@
+# Tale Sandbox Spawner
+#
+# Thin stateless HTTP service. Accepts HMAC-signed /v1/execute calls and
+# spawns one ephemeral runtime container per call by talking to the host
+# docker daemon.
+#
+# Security model — `/var/run/docker.sock` is bind-mounted in (see compose.yml).
+# Anyone with write access to the socket is effectively root on the host, so
+# the spawner runs as root by design: that is the security boundary, not the
+# in-container UID. The HMAC on every API call + the loopback-only host port
+# (127.0.0.1:8003) keep unauthenticated callers off the socket; trivy is told
+# to ignore the non-root warning at the FROM line.
+#
+# Build (from repo root):
+#   docker compose build sandbox
+# or directly (CI uses context=., so all COPY paths are repo-root relative):
+#   docker build -f services/sandbox/Dockerfile .
+
+ARG VERSION=dev
+ARG BUN_VERSION=1.3.12
+ARG DOCKER_CLI_VERSION=27
+
+# docker CLI stage — aliased so the runner stage can `COPY --from=docker-cli`
+# without variable expansion in `--from=` (BuildKit forbids that and fails the
+# build; the workaround is a global-ARG-referencing FROM with a named stage).
+FROM docker:${DOCKER_CLI_VERSION}-cli AS docker-cli
+
+# =============================================================================
+# Stage 1: BUILDER — install full deps (incl. devDeps) for typecheck/tests
+# =============================================================================
+FROM oven/bun:${BUN_VERSION}-debian AS builder
+
+WORKDIR /app
+
+# Lockfile + manifest first so the dep layer caches across source edits.
+COPY services/sandbox/package.json services/sandbox/bun.lock ./
+
+RUN bun install --frozen-lockfile
+
+COPY services/sandbox/tsconfig.json ./
+COPY services/sandbox/src/ ./src/
+
+# =============================================================================
+# Stage 2: RUNNER — production deps only + docker CLI for spawning siblings
+# =============================================================================
+# trivy:ignore:AVD-DS-0002 -- runs as root by design; needs /var/run/docker.sock
+FROM oven/bun:${BUN_VERSION}-debian AS runner
+
+WORKDIR /app
+
+# docker CLI for spawning sibling containers via the mounted socket. The
+# Debian-shipped `docker.io` package is too old (API 1.41; current daemons
+# require >=1.44); pull the official static CLI binary instead.
+COPY --from=docker-cli /usr/local/bin/docker /usr/local/bin/docker
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+      ca-certificates \
+      curl \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /usr/share/doc/* /usr/share/man/* /usr/share/info/*
+
+# Production install: skip devDependencies to keep the runtime image small.
+# Lockfile is already validated in the builder stage; --frozen-lockfile here
+# guards against a drifted package.json slipping into the runner image.
+COPY services/sandbox/package.json services/sandbox/bun.lock ./
+RUN bun install --frozen-lockfile --production
+
+COPY --from=builder /app/src ./src
+COPY --from=builder /app/tsconfig.json ./tsconfig.json
+
+ARG VERSION
+LABEL org.opencontainers.image.version="${VERSION}" \
+      org.opencontainers.image.title="tale-sandbox" \
+      org.opencontainers.image.description="Tale Sandbox Spawner — stateless docker-run service for artifact_run" \
+      org.opencontainers.image.source="https://github.com/tale-project/tale" \
+      org.opencontainers.image.vendor="Tale" \
+      org.opencontainers.image.licenses="MIT"
+
+ENV TALE_VERSION=${VERSION} \
+    SANDBOX_PORT=8003 \
+    DO_NOT_TRACK=1
+
+EXPOSE 8003
+
+# Healthcheck mirrors compose.yml's external probe so direct `docker run`
+# (without compose) gets the same liveness signal.
+HEALTHCHECK --interval=10s --timeout=5s --retries=3 --start-period=15s \
+  CMD curl -fsS http://127.0.0.1:8003/health || exit 1
+
+# Root by design — see header comment. The docker socket is the boundary.
+USER root
+
+CMD ["bun", "src/server.ts"]
diff --git a/services/sandbox/bun.lock b/services/sandbox/bun.lock
new file mode 100644
index 000000000..59a260293
--- /dev/null
+++ b/services/sandbox/bun.lock
@@ -0,0 +1,24 @@
+{
+  "lockfileVersion": 1,
+  "configVersion": 1,
+  "workspaces": {
+    "": {
+      "name": "@tale/sandbox",
+      "devDependencies": {
+        "@types/bun": "^1.1.0",
+        "typescript": "^5.6.0",
+      },
+    },
+  },
+  "packages": {
+    "@types/bun": ["@types/bun@1.3.14", "", { "dependencies": { "bun-types": "1.3.14" } }, "sha512-h1hFqFVcvAvD9j9K7ZW7vd82aSA+rTdznZa+5bwvCwqSB1jmmfLcbIWhOLx1/+boy/xmjgCs/OMUL8hRJSmnPw=="],
+
+    "@types/node": ["@types/node@25.9.0", "", { "dependencies": { "undici-types": ">=7.24.0 <7.24.7" } }, "sha512-AOQwYUNolgy3VosiRqXrACUXTN8nJUtPl7FJXMqZVyxiiCLhQuG3jXKvCS1ALr+Y2OmZhzzLVlYPEqJaiqkaJQ=="],
+
+    "bun-types": ["bun-types@1.3.14", "", { "dependencies": { "@types/node": "*" } }, "sha512-4N0ig0fEomHt5R0KCFWjovxow98rIoRwKolrYdCcknNwMekCXRnWEUvgu5soYV8QXtVsrUD8B95MBOZGPvr6KQ=="],
+
+    "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
+
+    "undici-types": ["undici-types@7.24.6", "", {}, "sha512-WRNW+sJgj5OBN4/0JpHFqtqzhpbnV0GuB+OozA9gCL7a993SmU+1JBZCzLNxYsbMfIeDL+lTsphD5jN5N+n0zg=="],
+  }
+}
diff --git a/services/sandbox/package.json b/services/sandbox/package.json
new file mode 100644
index 000000000..ea8247adf
--- /dev/null
+++ b/services/sandbox/package.json
@@ -0,0 +1,22 @@
+{
+  "name": "@tale/sandbox",
+  "version": "0.1.0",
+  "private": true,
+  "description": "Tale sandbox spawner — thin stateless docker-run service for artifact_run",
+  "type": "module",
+  "scripts": {
+    "dev": "bun --hot src/server.ts",
+    "start": "bun src/server.ts",
+    "lint": "bunx oxlint --type-aware",
+    "lint:fix": "bunx oxlint --type-aware --fix",
+    "format": "bunx oxfmt",
+    "format:check": "bunx oxfmt --check",
+    "typecheck": "tsc --noEmit",
+    "test": "bun test"
+  },
+  "dependencies": {},
+  "devDependencies": {
+    "@types/bun": "^1.1.0",
+    "typescript": "^5.6.0"
+  }
+}
diff --git a/services/sandbox/src/auth.test.ts b/services/sandbox/src/auth.test.ts
new file mode 100644
index 000000000..77df8786d
--- /dev/null
+++ b/services/sandbox/src/auth.test.ts
@@ -0,0 +1,159 @@
+// HMAC verify tests — covers the 30s window, replay rejection via the nonce
+// cache, and the `reason` discriminator.
+
+import { afterEach, describe, expect, test } from 'bun:test';
+
+import {
+  TIMESTAMP_TOLERANCE_MS,
+  _resetNonceCacheForTests,
+  sign,
+  verify,
+} from './auth.ts';
+
+const TOKEN = 'test-token';
+const METHOD = 'POST';
+const PATH = '/v1/execute';
+const BODY = JSON.stringify({ hello: 'world' });
+
+afterEach(() => {
+  _resetNonceCacheForTests();
+});
+
+// Sign the shared POST /v1/execute fixture with `nowMs` as its timestamp.
+function buildHeaders(nowMs: number): { signature: string; timestamp: string } {
+  const timestamp = `${nowMs}`;
+  return { timestamp, signature: sign(METHOD, PATH, timestamp, BODY, TOKEN) };
+}
+
+describe('verify — happy path', () => {
+  // Verify the shared fixture request (METHOD/PATH/BODY signed with TOKEN)
+  // as observed by the spawner at `atMs`.
+  const verifyAt = (signature: string, timestamp: string, atMs: number) =>
+    verify(METHOD, PATH, BODY, signature, timestamp, TOKEN, atMs);
+
+  test('accepts a freshly signed request', () => {
+    const sentAt = Date.now();
+    const headers = buildHeaders(sentAt);
+    const outcome = verifyAt(headers.signature, headers.timestamp, sentAt);
+    expect(outcome.ok).toBe(true);
+    expect(outcome.reason).toBeUndefined();
+  });
+
+  test('window is exactly 30s — accepts at +29.999s', () => {
+    // Observed 1ms short of the full tolerance, so the skew check must
+    // still pass.
+    const sentAt = Date.now();
+    const lastGoodMs = sentAt + TIMESTAMP_TOLERANCE_MS - 1;
+    const headers = buildHeaders(sentAt);
+    const outcome = verifyAt(headers.signature, headers.timestamp, lastGoodMs);
+    expect(outcome.ok).toBe(true);
+  });
+});
+
+describe('verify — replay protection', () => {
+  // Present one signed request twice — first at `firstMs` (which is also
+  // the signed timestamp), then at `secondMs` — and return both outcomes
+  // in order. The first presentation populates the nonce cache; the
+  // file-level afterEach clears it again between tests.
+  function presentTwice(
+    path: string,
+    body: string,
+    firstMs: number,
+    secondMs: number,
+  ) {
+    const stamp = String(firstMs);
+    const signature = sign('POST', path, stamp, body, TOKEN);
+    const check = (atMs: number) =>
+      verify('POST', path, body, signature, stamp, TOKEN, atMs);
+    return [check(firstMs), check(secondMs)] as const;
+  }
+
+  test('second use of the same signature within the window is rejected', () => {
+    const now = Date.now();
+    // One second later is well inside the 30s skew window, so only the
+    // nonce cache can reject the second presentation.
+    const [first, second] = presentTwice(PATH, BODY, now, now + 1_000);
+    expect(first.ok).toBe(true);
+    expect(second.ok).toBe(false);
+    expect(second.reason).toBe('replay');
+  });
+
+  test('cancel-style empty-body request also dedups by signature', () => {
+    const now = Date.now();
+    // Same path, empty body and timestamp both times, hence the same
+    // signature — which is the key the nonce cache dedups on.
+    const [first, second] = presentTwice('/v1/cancel/abc', '', now, now + 500);
+    expect(first.ok).toBe(true);
+    expect(second.ok).toBe(false);
+    expect(second.reason).toBe('replay');
+  });
+});
+
+describe('verify — failure discriminators', () => {
+  // Most cases below target the shared fixture request (METHOD, PATH,
+  // BODY, TOKEN); only the two header values and the observation time
+  // vary, so `check` takes just those.
+  const check = (
+    signature: string | null,
+    timestamp: string | null,
+    atMs: number,
+  ) => verify(METHOD, PATH, BODY, signature, timestamp, TOKEN, atMs);
+
+  test('missing signature header', () => {
+    const now = Date.now();
+    const outcome = check(null, String(now), now);
+    expect(outcome).toEqual({ ok: false, reason: 'missing_signature' });
+  });
+
+  test('missing timestamp header', () => {
+    const now = Date.now();
+    const outcome = check(buildHeaders(now).signature, null, now);
+    expect(outcome).toEqual({ ok: false, reason: 'missing_timestamp' });
+  });
+
+  test('bad timestamp (non-numeric)', () => {
+    const outcome = check('whatever', 'nope', Date.now());
+    expect(outcome).toEqual({ ok: false, reason: 'bad_timestamp' });
+  });
+
+  test('timestamp_skew past the 30s window', () => {
+    const sentAt = Date.now();
+    const tooLateMs = sentAt + TIMESTAMP_TOLERANCE_MS + 1_000;
+    const headers = buildHeaders(sentAt);
+    const outcome = check(headers.signature, headers.timestamp, tooLateMs);
+    expect(outcome).toEqual({ ok: false, reason: 'timestamp_skew' });
+  });
+
+  test('wrong signature → bad_signature, not replay', () => {
+    const now = Date.now();
+    // Same length (sha256 hex = 64 chars) to exercise timingSafeEqual.
+    const forged = 'a'.repeat(64);
+    const outcome = check(forged, buildHeaders(now).timestamp, now);
+    expect(outcome).toEqual({ ok: false, reason: 'bad_signature' });
+  });
+
+  test('signature with wrong length → bad_signature', () => {
+    const now = Date.now();
+    const outcome = check('too-short', buildHeaders(now).timestamp, now);
+    expect(outcome).toEqual({ ok: false, reason: 'bad_signature' });
+  });
+
+  test('signature bound to method: GET signature does not verify a POST', () => {
+    const now = Date.now();
+    const stamp = String(now);
+    // Signed as GET, presented as POST (METHOD).
+    const outcome = check(sign('GET', PATH, stamp, BODY, TOKEN), stamp, now);
+    expect(outcome.ok).toBe(false);
+    expect(outcome.reason).toBe('bad_signature');
+  });
+
+  test('signature bound to path: /v1/execute signature does not verify /v1/cancel/abc', () => {
+    const now = Date.now();
+    const stamp = String(now);
+    // Signed for /v1/execute, presented to /v1/cancel/abc.
+    const sig = sign(METHOD, '/v1/execute', stamp, '', TOKEN);
+    const res = verify(METHOD, '/v1/cancel/abc', '', sig, stamp, TOKEN, now);
+    expect(res.ok).toBe(false);
+    expect(res.reason).toBe('bad_signature');
+  });
+});
diff --git a/services/sandbox/src/auth.ts b/services/sandbox/src/auth.ts
new file mode 100644
index 000000000..9e87aed88
--- /dev/null
+++ b/services/sandbox/src/auth.ts
@@ -0,0 +1,133 @@
+// HMAC-SHA256 request authentication.
+//
+// Convex (the only legitimate client) signs each request with the shared
+// SANDBOX_TOKEN; spawner verifies before accepting. Reachable only on the
+// internal Docker network anyway; HMAC is defense-in-depth so a
+// misconfigured deployment that exposes :8003 doesn't immediately leak.
+//
+// The signature is bound to method, path, timestamp, AND body hash:
+//
+//   signedString = `${method}\n${path}\n${timestamp}\n${sha256Hex(body)}`
+//   signature    = HMAC-SHA256(SANDBOX_TOKEN, signedString)
+//
+// Binding method+path stops a captured /v1/execute signature from being
+// replayed against /v1/cancel/:id (or vice-versa). Binding the timestamp
+// AND keeping a short-TTL nonce cache of seen signatures bounds the replay
+// window: even within the clock-skew tolerance an attacker can't reuse a
+// captured signature, because the second verify hits the cache and is
+// rejected.
+
+import { timingSafeEqual, createHmac, createHash } from 'node:crypto';
+
+export const SIGNATURE_HEADER = 'x-tale-sandbox-signature';
+export const TIMESTAMP_HEADER = 'x-tale-sandbox-timestamp';
+
+// Tolerance for clock skew + request travel. Convex actions and the
+// spawner share a host clock in our compose deployments; 30s is tight
+// enough to bound the replay window and loose enough to absorb a few
+// seconds of NTP drift on dev laptops.
+export const TIMESTAMP_TOLERANCE_MS = 30_000;
+
+// Nonce cache TTL — slightly longer than the timestamp tolerance so a
+// just-accepted signature stays remembered until its own timestamp ages out
+// of the skew window. After TTL the entry expires and the signature
+// could in principle be accepted again, but by then `timestamp_skew`
+// rejects it first.
+const NONCE_TTL_MS = TIMESTAMP_TOLERANCE_MS + 5_000;
+
+// Sweep cadence — every Nth verify() call that passes the signature check
+// drops expired entries. This is a cadence, not a size cap; entries are
+// tiny (sha256 hex key + expiry timestamp) and only valid requests add one.
+const NONCE_SWEEP_INTERVAL = 100;
+const seenSignatures = new Map<string, number>(); // signature → expiry (ms)
+let verifyCallsSinceSweep = 0;
+
+// Every NONCE_SWEEP_INTERVAL-th call evicts entries expired as of `now`.
+function maybeSweepNonces(now: number): void {
+  if (++verifyCallsSinceSweep < NONCE_SWEEP_INTERVAL) return;
+  verifyCallsSinceSweep = 0;
+  seenSignatures.forEach((expiry, seen) => {
+    if (expiry <= now) seenSignatures.delete(seen);
+  });
+}
+
+/** Test-only: clears nonce state. Never call from production code. */
+export function _resetNonceCacheForTests(): void {
+  verifyCallsSinceSweep = 0;
+  seenSignatures.clear();
+}
+
+function buildSignedString(
+  method: string, // upper-cased before signing
+  path: string,
+  timestamp: string, // signed verbatim as a string, never re-parsed
+  body: string, // only its sha256 hex digest enters the signed string
+): string {
+  const digest = createHash('sha256').update(body).digest('hex');
+  return [method.toUpperCase(), path, timestamp, digest].join('\n');
+}
+
+export function sign(
+  method: string,
+  path: string,
+  timestamp: string,
+  body: string,
+  token: string, // shared secret, used as the HMAC-SHA256 key
+): string {
+  const payload = buildSignedString(method, path, timestamp, body);
+  return createHmac('sha256', token).update(payload).digest('hex');
+}
+
+interface VerifyResult {
+  ok: boolean; // true only when every check in verify() passed
+  reason?: // first failed check; absent on success
+    | 'missing_signature' // signature header null or empty
+    | 'missing_timestamp' // timestamp header null or empty
+    | 'bad_timestamp' // not a finite number greater than zero
+    | 'timestamp_skew' // more than TIMESTAMP_TOLERANCE_MS away from now
+    | 'bad_signature' // HMAC mismatch, including a wrong-length value
+    | 'replay'; // valid signature already held in the nonce cache
+}
+
+// Authenticate one request; checks run cheapest-first and the first failure
+// names the `reason`. `nowMs` is injectable so tests can pin the clock.
+export function verify(
+  method: string,
+  path: string,
+  body: string,
+  signatureHeader: string | null,
+  timestampHeader: string | null,
+  token: string,
+  nowMs: number = Date.now(),
+): VerifyResult {
+  if (!signatureHeader) return { ok: false, reason: 'missing_signature' };
+  if (!timestampHeader) return { ok: false, reason: 'missing_timestamp' };
+  const ts = Number(timestampHeader);
+  if (!Number.isFinite(ts) || ts <= 0) {
+    return { ok: false, reason: 'bad_timestamp' };
+  }
+  if (Math.abs(nowMs - ts) > TIMESTAMP_TOLERANCE_MS) {
+    return { ok: false, reason: 'timestamp_skew' };
+  }
+  // Compare raw bytes in constant time. timingSafeEqual throws on unequal
+  // byte lengths, so the length test must short-circuit first.
+  const expected = sign(method, path, timestampHeader, body, token);
+  const want = Buffer.from(expected, 'utf8');
+  const got = Buffer.from(signatureHeader, 'utf8');
+  if (want.length !== got.length || !timingSafeEqual(want, got)) {
+    return { ok: false, reason: 'bad_signature' };
+  }
+
+  // Replay guard. The entry must outlive the last instant at which this
+  // timestamp still passes the skew check (ts + tolerance). A timestamp
+  // ahead of our clock stays valid for up to 2 × tolerance after first
+  // use, so the expiry is anchored at whichever of `nowMs` / `ts` is later;
+  // anchoring at `nowMs` alone would re-admit the signature after the TTL.
+  maybeSweepNonces(nowMs);
+  const cached = seenSignatures.get(signatureHeader);
+  if (cached !== undefined && cached > nowMs) {
+    return { ok: false, reason: 'replay' };
+  }
+  seenSignatures.set(signatureHeader, Math.max(nowMs, ts) + NONCE_TTL_MS);
+  return { ok: true };
+}
diff --git a/services/sandbox/src/cleanup.ts b/services/sandbox/src/cleanup.ts
new file mode 100644
index 000000000..d6ba60b9b
--- /dev/null
+++ b/services/sandbox/src/cleanup.ts
@@ -0,0 +1,347 @@
+// Three-layer cleanup, audit-cleaned per round-2 findings.
+//
+//   1. Boot sweep: docker rm any tale.sandbox=1 or tale.sandbox-staging=1
+//      container left over from a previous spawner process, AND host-dir
+//      sweep over session dirs whose mtime is older than 2× maxTimeoutMs.
+//      The dead "volume sweep" that the original code shipped is gone —
+//      workspaces are host bind mounts (no volume), and the cache volumes
+//      carry a different label and MUST NOT be reaped.
+//   2. Periodic sweep: every 5 min, remove any tale.sandbox=1 container
+//      whose `tale.started=<ms>` label is older than 2× maxTimeoutMs AND
+//      whose session id isn't in the live in-flight set. Same host-dir
+//      sweep for orphan session dirs.
+//   3. SIGTERM/SIGINT handler (installSignalHandlers; wired in server.ts):
+//      stop accepting requests, cancel + drain in-flight work, then exit.
+
+import {
+  mkdir,
+  readFile,
+  readdir,
+  rm,
+  stat,
+  utimes,
+  writeFile,
+} from 'node:fs/promises';
+import { hostname } from 'node:os';
+import { join } from 'node:path';
+
+import { runDocker, dockerRm } from './spawn-util.ts';
+import { cancelExecution, inFlightIds, isInFlight } from './spawn.ts';
+import type { SpawnerConfig } from './types.ts';
+
+const PERIODIC_INTERVAL_MS = 5 * 60_000; // cadence of startPeriodicSweep
+const SPAWNER_LOCK_FILE = '.spawner.lock'; // created inside hostSessionRoot
+// If an existing lock file is fresher than this, treat the previous spawner
+// as still alive and refuse to start. Otherwise we assume the previous
+// process crashed without cleanup and take over the lock.
+const SPAWNER_LOCK_FRESH_MS = 60_000;
+// Refresh the lock's mtime at 1/3 of the freshness window so a peer
+// looking for a "fresh" lock always sees one as long as we're alive.
+// Without this the lock starts looking stale once we cross the
+// freshness threshold and a second spawner would happily reclaim it,
+// defeating the lock's only purpose (audit follow-up F15).
+const SPAWNER_LOCK_REFRESH_MS = Math.floor(SPAWNER_LOCK_FRESH_MS / 3);
+let lockRefreshHandle: ReturnType<typeof setInterval> | undefined;
+
+interface SpawnerLockPayload {
+  pid: number; // process.pid of the lock holder
+  hostname: string; // os.hostname() of the lock holder
+  bootEpoch: number; // Date.now() at the moment the lock was written
+}
+
+/**
+ * Best-effort cross-process lock on the host session root, so two spawners
+ * pointed at the same `/var/lib/tale-sandbox/sessions/` cannot stomp on
+ * each other — specifically so bootSweep's host-dir sweep cannot delete
+ * another live spawner's in-flight workspace (audit finding R2-B5).
+ *
+ * Contract: a lock whose mtime is within SPAWNER_LOCK_FRESH_MS makes this
+ * call throw; a missing, stale or un-stat-able lock is overwritten. While
+ * held, the lock's mtime is refreshed on a timer. Graceful shutdown
+ * removes the lock; after a crash it simply goes stale and the next start
+ * reclaims it once the freshness window has passed.
+ */
+export async function acquireSpawnerLock(cfg: SpawnerConfig): Promise<void> {
+  await mkdir(cfg.hostSessionRoot, { recursive: true });
+  const lockPath = join(cfg.hostSessionRoot, SPAWNER_LOCK_FILE);
+
+  // Phase 1 — probe. `undefined` afterwards means "no usable lock": either
+  // it does not exist (ENOENT, silent) or stat failed some other way
+  // (logged). Both cases fall through to the overwrite below.
+  let lockMtimeMs: number | undefined;
+  try {
+    lockMtimeMs = (await stat(lockPath)).mtimeMs;
+  } catch (err) {
+    // `code` is a non-standard property only present on Node fs errors.
+    // The runtime guards prove it exists but TS can't narrow to the typed
+    // shape, so it is read through a minimal interface.
+    const code =
+      err instanceof Error && 'code' in err
+        ? // oxlint-disable-next-line typescript-eslint/no-unsafe-type-assertion
+          (err as { code?: string }).code
+        : undefined;
+    if (code !== 'ENOENT') {
+      console.warn(`[sandbox.lock] stat ${lockPath} failed:`, err);
+    }
+  }
+
+  // Phase 2 — decide. Refuse while the existing lock is fresh.
+  if (lockMtimeMs !== undefined) {
+    // Clamp to [0, ∞): after a backward wall-clock step (NTP, VM snapshot
+    // resume) a negative age would otherwise read as "fresh forever"
+    // (audit follow-up F15).
+    const age = Math.max(0, Date.now() - lockMtimeMs);
+    if (age < SPAWNER_LOCK_FRESH_MS) {
+      let existing = '<unreadable>';
+      try {
+        existing = await readFile(lockPath, 'utf8');
+      } catch (err) {
+        console.warn(`[sandbox.lock] reading existing lock failed:`, err);
+      }
+      throw new Error(
+        `Another spawner appears to be running at ${cfg.hostSessionRoot} ` +
+          `(lock fresh, age=${age}ms): ${existing.trim()}`,
+      );
+    }
+    console.warn(
+      `[sandbox.lock] reclaiming stale lock at ${lockPath} (age=${age}ms)`,
+    );
+  }
+
+  // Phase 3 — claim. Write our identity, then keep the mtime moving so a
+  // peer starting later than SPAWNER_LOCK_FRESH_MS after this write still
+  // sees a fresh lock for as long as we are alive.
+  const payload: SpawnerLockPayload = {
+    pid: process.pid,
+    hostname: hostname(),
+    bootEpoch: Date.now(),
+  };
+  await writeFile(lockPath, JSON.stringify(payload));
+  if (lockRefreshHandle !== undefined) clearInterval(lockRefreshHandle);
+  lockRefreshHandle = setInterval(() => {
+    const nowSec = Date.now() / 1000;
+    utimes(lockPath, nowSec, nowSec).catch((err) => {
+      console.warn(`[sandbox.lock] refresh ${lockPath} failed:`, err);
+    });
+  }, SPAWNER_LOCK_REFRESH_MS);
+  // unref: the refresh timer alone must never keep the process alive.
+  lockRefreshHandle.unref?.();
+}
+
+/**
+ * Graceful-shutdown counterpart of acquireSpawnerLock: stop the mtime
+ * refresh timer and delete the lock file, so a fast restart does not have
+ * to wait out the freshness window. Removal failures are only logged.
+ */
+async function releaseSpawnerLock(cfg: SpawnerConfig): Promise<void> {
+  // Detach the timer before removing the file it keeps touching.
+  const timer = lockRefreshHandle;
+  lockRefreshHandle = undefined;
+  if (timer !== undefined) clearInterval(timer);
+
+  const lockPath = join(cfg.hostSessionRoot, SPAWNER_LOCK_FILE);
+  await rm(lockPath, { force: true }).catch((err) => {
+    console.warn(`[sandbox.lock] release ${lockPath} failed:`, err);
+  });
+}
+
+// Ids of every container (any state, via `ps -a`) carrying `label`. A failed
+// `docker ps` yields [] rather than an error.
+async function listLabeledContainers(label: string): Promise<string[]> {
+  const ps = await runDocker(['ps', '-aq', '-f', `label=${label}`]);
+  if (ps.exitCode !== 0) return [];
+  const ids = ps.stdout.split('\n').map((line) => line.trim());
+  return ids.filter((id) => id !== '');
+}
+
+// Remove session dirs under cfg.hostSessionRoot that are not in flight and
+// older (mtime) than `staleThreshold` epoch ms. Returns how many it removed.
+async function sweepHostSessionDirs(
+  cfg: SpawnerConfig,
+  staleThreshold: number,
+): Promise<number> {
+  const root = cfg.hostSessionRoot;
+  let dirNames: string[];
+  try {
+    const entries = await readdir(root, { withFileTypes: true });
+    dirNames = entries.filter((e) => e.isDirectory()).map((e) => e.name);
+  } catch (err) {
+    // ENOENT: root not created yet (first boot) — nothing to sweep or log.
+    if (!(err instanceof Error && 'code' in err && err.code === 'ENOENT')) {
+      console.warn(
+        `[sandbox.cleanup] failed to read host session root ${root}:`,
+        err,
+      );
+    }
+    return 0;
+  }
+  let removed = 0;
+  for (const name of dirNames) {
+    if (isInFlight(name)) continue; // never touch a live workspace
+    const abs = join(root, name);
+    try {
+      if ((await stat(abs)).mtimeMs >= staleThreshold) continue;
+    } catch (err) {
+      console.warn(`[sandbox.cleanup] stat ${abs} failed:`, err);
+      continue;
+    }
+    try {
+      await rm(abs, { recursive: true, force: true });
+      removed += 1;
+    } catch (err) {
+      console.warn(`[sandbox.cleanup] rm ${abs} failed:`, err);
+    }
+  }
+  return removed;
+}
+
+/**
+ * One-shot cleanup at startup: remove every container labeled
+ * `tale.sandbox=1` or `tale.sandbox-staging=1` left behind by a previous
+ * spawner process, then (only when `cfg` is given) sweep stale host
+ * session dirs.
+ *
+ * The summary line counts `dockerRm` calls that completed without
+ * throwing, across both labels. It previously printed the number of
+ * `tale.sandbox=1` containers *listed*, which over-reported failed removals
+ * and stayed silent when only staging containers were cleaned up.
+ */
+export async function bootSweep(cfg?: SpawnerConfig): Promise<void> {
+  const targets = [
+    { label: 'tale.sandbox=1', what: 'dockerRm' },
+    { label: 'tale.sandbox-staging=1', what: 'dockerRm staging' },
+  ];
+  let containersRemoved = 0;
+  // Both label sets go through the same remove-and-count loop, in order.
+  for (const { label, what } of targets) {
+    for (const c of await listLabeledContainers(label)) {
+      try {
+        await dockerRm(c);
+        containersRemoved += 1;
+      } catch (err) {
+        console.warn(`[sandbox.bootSweep] ${what} ${c} failed:`, err);
+      }
+    }
+  }
+  // Same conservative `2 × maxTimeoutMs` cutoff as the periodic sweep, even
+  // though acquireSpawnerLock already rules out a second live spawner on
+  // this root: it keeps the sweep safe should the lock ever be loosened
+  // (audit finding R2-B5).
+  const dirsRemoved = cfg
+    ? await sweepHostSessionDirs(cfg, Date.now() - 2 * cfg.maxTimeoutMs)
+    : 0;
+  if (containersRemoved > 0 || dirsRemoved > 0) {
+    console.log(
+      `[sandbox] boot sweep removed ${containersRemoved} container(s) and ${dirsRemoved} session dir(s)`,
+    );
+  }
+}
+
+/**
+ * Start the recurring sweep (every PERIODIC_INTERVAL_MS) and return a
+ * function that stops it. Each pass removes `tale.sandbox=1` containers
+ * whose `tale.started=<ms>` label is older than `2 × cfg.maxTimeoutMs` and
+ * whose session is not in flight, then sweeps stale host session dirs.
+ * Errors are logged; a failed pass never stops the timer.
+ */
+export function startPeriodicSweep(cfg: SpawnerConfig): () => void {
+  const sweepOnce = async (): Promise<void> => {
+    const ps = await runDocker([
+      'ps',
+      '-a',
+      '--filter',
+      'label=tale.sandbox=1',
+      '--format',
+      '{{.Names}}\t{{.Labels}}',
+    ]);
+    // Listing failed: skip this pass entirely, host-dir sweep included.
+    if (ps.exitCode !== 0) return;
+    const staleThreshold = Date.now() - 2 * cfg.maxTimeoutMs;
+    for (const row of ps.stdout.split('\n')) {
+      const [name, labels] = row.split('\t');
+      const startedLabel = labels?.match(/tale\.started=(\d+)/);
+      if (!name || !startedLabel) continue;
+      const started = Number.parseInt(startedLabel[1] ?? '0', 10);
+      if (Number.isNaN(started) || started >= staleThreshold) continue;
+      // The session id is the container name minus its `tale-sbx-` prefix.
+      if (isInFlight(name.replace(/^tale-sbx-/, ''))) continue;
+      try {
+        await dockerRm(name);
+      } catch (err) {
+        console.warn(`[sandbox.periodic] dockerRm stale ${name} failed:`, err);
+        continue;
+      }
+      console.log(
+        `[sandbox] periodic sweep removed stale container ${name} (started ${new Date(started).toISOString()})`,
+      );
+    }
+    // Orphaned per-execution session dirs: same cutoff, same in-flight guard.
+    await sweepHostSessionDirs(cfg, staleThreshold);
+  };
+  const interval = setInterval(() => {
+    sweepOnce().catch((err) => {
+      console.warn(`[sandbox.periodic] sweep error:`, err);
+    });
+  }, PERIODIC_INTERVAL_MS);
+  return () => clearInterval(interval);
+}
+
+/**
+ * Install SIGTERM/SIGINT handlers that shut the spawner down gracefully.
+ *
+ * Exiting right after cancelling used to race `executeRequest`'s finally
+ * block (which rm -rfs the host session dir) and leaked the workspace when
+ * a signal arrived mid-execution. The handler therefore:
+ *
+ *   1. calls `stopAccepting` so the HTTP layer takes no new work;
+ *   2. issues `cancelExecution` for every in-flight id, letting each
+ *      `executeRequest` reach its finally block;
+ *   3. polls every 200ms, for at most 20s, until nothing is in flight;
+ *   4. releases the spawner lock (when `cfg` is given) and exits 0.
+ *
+ * A second signal during the drain forces an immediate exit(1).
+ */
+export function installSignalHandlers(
+  stopAccepting: () => void,
+  cfg?: SpawnerConfig,
+): void {
+  const DRAIN_CEILING_MS = 20_000;
+  const DRAIN_POLL_MS = 200;
+  let draining = false;
+
+  async function shutdown(sig: string): Promise<void> {
+    if (draining) {
+      console.warn(`[sandbox] received second ${sig}; forcing exit`);
+      process.exit(1);
+    }
+    draining = true;
+    console.log(`[sandbox] received ${sig}; draining in-flight executions`);
+    try {
+      stopAccepting();
+    } catch (err) {
+      console.warn(`[sandbox.shutdown] stopAccepting failed:`, err);
+    }
+    const cancellations = inFlightIds().map((id) =>
+      cancelExecution(id).catch((err) => {
+        console.warn(`[sandbox.shutdown] cancel ${id} failed:`, err);
+      }),
+    );
+    await Promise.allSettled(cancellations);
+    const giveUpAt = Date.now() + DRAIN_CEILING_MS;
+    while (inFlightIds().length > 0 && Date.now() < giveUpAt) {
+      await new Promise<void>((resolve) => setTimeout(resolve, DRAIN_POLL_MS));
+    }
+    const stuck = inFlightIds();
+    if (stuck.length > 0) {
+      console.warn(
+        `[sandbox] shutdown deadline; ${stuck.length} execution(s) still in-flight (${stuck.join(', ')})`,
+      );
+    }
+    if (cfg) await releaseSpawnerLock(cfg);
+    process.exit(0);
+  }
+
+  for (const sig of ['SIGTERM', 'SIGINT'] as const) {
+    process.on(sig, () => void shutdown(sig));
+  }
+}
diff --git a/services/sandbox/src/config.ts b/services/sandbox/src/config.ts
new file mode 100644
index 000000000..2f70b4c6f
--- /dev/null
+++ b/services/sandbox/src/config.ts
@@ -0,0 +1,92 @@
+// Spawner configuration — parsed from env at boot. Defaults match the plan;
+// every knob is overridable so an operator can tune without rebuilding.
+
+import type { SpawnerConfig } from './types.ts';
+
+// Integer env var (ports, counts, ms, bytes — `80.5` is a typo). Unset or
+// blank ⇒ `fallback`; otherwise a finite integer in [min ?? 0, max] or throw.
+function numEnv(
+  name: string,
+  fallback: number,
+  opts?: { min?: number; max?: number },
+): number {
+  const v = process.env[name];
+  if (v === undefined || v.trim() === '') return fallback;
+  const n = Number(v);
+  if (!Number.isFinite(n)) {
+    throw new Error(
+      `Env var ${name} is not a finite number: ${JSON.stringify(v)}`,
+    );
+  }
+  if (!Number.isInteger(n)) {
+    throw new Error(`Env var ${name} must be an integer; got: ${n}`);
+  }
+  const min = opts?.min ?? 0;
+  const max = opts?.max ?? Number.POSITIVE_INFINITY;
+  if (n < min) throw new Error(`Env var ${name} must be >= ${min}; got: ${n}`);
+  if (n > max) throw new Error(`Env var ${name} must be <= ${max}; got: ${n}`);
+  return n;
+}
+
+/**
+ * Build the spawner configuration from process.env. Unset variables fall
+ * back to the defaults below; an unknown SANDBOX_RUNTIME or an invalid
+ * numeric value throws instead of being silently corrected. Numeric knobs
+ * are parsed in declaration order, so the first bad one is the one reported.
+ */
+export function loadConfig(): SpawnerConfig {
+  const env = process.env;
+  const KIB = 1024;
+  const MIB = 1024 * KIB;
+
+  // Checked before any numeric parsing, so a runtime typo is reported first.
+  const runtime = env.SANDBOX_RUNTIME ?? 'runc';
+  if (runtime !== 'runc' && runtime !== 'runsc') {
+    throw new Error(
+      `SANDBOX_RUNTIME must be 'runc' or 'runsc'; got: ${JSON.stringify(runtime)}`,
+    );
+  }
+
+  return {
+    port: numEnv('SANDBOX_PORT', 8003, { min: 1, max: 65535 }),
+    // Token policy: opt-in verification. Unset (or empty-string) = HMAC
+    // disabled; set = enforced. `authorize()` returns null when this is
+    // null, so the wire path simply skips signature checks.
+    sandboxToken: env.SANDBOX_TOKEN || null,
+    runtimeImage: env.SANDBOX_RUNTIME_IMAGE ?? 'tale-sandbox-runtime:latest',
+    runtime,
+    // Per-execution time budget and spawner-wide parallelism.
+    defaultTimeoutMs: numEnv('SANDBOX_DEFAULT_TIMEOUT_MS', 30_000, { min: 1 }),
+    maxTimeoutMs: numEnv('SANDBOX_MAX_TIMEOUT_MS', 300_000, { min: 1 }),
+    maxConcurrent: numEnv('SANDBOX_MAX_CONCURRENT', 4, { min: 1 }),
+    // Session root on the host and package-cache volume name prefixes.
+    hostSessionRoot:
+      env.SANDBOX_HOST_SESSION_ROOT ?? '/var/lib/tale-sandbox/sessions',
+    cacheVolumePrefix: {
+      pip: env.SANDBOX_PIP_CACHE_VOLUME_PREFIX ?? 'tale-sandbox-pip-cache',
+      npm: env.SANDBOX_NPM_CACHE_VOLUME_PREFIX ?? 'tale-sandbox-npm-cache',
+    },
+    // Egress network name and proxy URL.
+    egressNetwork: env.SANDBOX_EGRESS_NETWORK ?? 'tale-sandbox-net',
+    egressProxy: env.SANDBOX_EGRESS_PROXY ?? 'http://sandbox-egress:3128',
+    // Output caps, all in bytes; each has a 1 KiB floor.
+    stdoutMaxBytes: numEnv('SANDBOX_STDOUT_MAX_BYTES', 5 * MIB, { min: KIB }),
+    stderrMaxBytes: numEnv('SANDBOX_STDERR_MAX_BYTES', 5 * MIB, { min: KIB }),
+    outputFileMaxBytes: numEnv('SANDBOX_OUTPUT_FILE_MAX_BYTES', 50 * MIB, {
+      min: KIB,
+    }),
+    outputTotalMaxBytes: numEnv('SANDBOX_OUTPUT_TOTAL_MAX_BYTES', 100 * MIB, {
+      min: KIB,
+    }),
+    // Body cap on /v1/execute. Post-sandbox-wobbly-origami the request
+    // body carries only source files + URL lists (no inline base64
+    // outputs), so 2 MB is plenty: 800 KB MAX_FILES_BYTES + URL arrays
+    // + JSON wrapper overhead leaves room to spare while bounding the
+    // unsigned-mode OOM surface. The legacy 20 MB cap was sized for
+    // inline base64 prior-output round-tripping which no longer exists.
+    // Operators with a niche need can raise via SANDBOX_MAX_REQUEST_BODY_BYTES.
+    maxRequestBodyBytes: numEnv('SANDBOX_MAX_REQUEST_BODY_BYTES', 2 * MIB, {
+      min: 4 * KIB,
+    }),
+  };
+}
diff --git a/services/sandbox/src/docker-args.test.ts b/services/sandbox/src/docker-args.test.ts
new file mode 100644
index 000000000..e3b987520
Binary files /dev/null and b/services/sandbox/src/docker-args.test.ts differ
diff --git a/services/sandbox/src/docker-args.ts b/services/sandbox/src/docker-args.ts
new file mode 100644
index 000000000..a4415796a
--- /dev/null
+++ b/services/sandbox/src/docker-args.ts
@@ -0,0 +1,170 @@
+// Canonical `docker run` argv builder.
+//
+// Pure function so the unit test (R1.22 #1 regression gate) can snapshot the
+// argv without invoking docker. CRITICAL: user code is NEVER passed via argv
+// — it's staged via a host bind-mount that maps /var/lib/tale-sandbox/
+// sessions/<id>/ into /workspace inside the container (see
+// spawn.ts:stageWorkspace). Only typed identifiers (UUID, orgId after
+// validation, language, image) reach argv positions.
+
+import type { Language, SpawnerConfig } from './types.ts';
+
+interface DockerRunInput {
+  executionId: string; // container name suffix and `tale.session` label
+  organizationId: string; // `tale.org` label
+  language: Language; // forwarded as the first entrypoint argument
+  timeoutMs: number; // NOTE(review): not read by buildDockerRunArgs below
+  pipCacheVolume: string; // named volume mounted at /cache/pip
+  npmCacheVolume: string; // named volume mounted at /cache/npm
+  // Host path (1:1 mounted into the spawner) that becomes /workspace inside
+  // the runtime container. Used instead of --tmpfs because docker cp cannot
+  // read from tmpfs mounts and we need to harvest files from /workspace/output
+  // after the container exits.
+  workspaceHostDir: string;
+  startedAtMs: number; // `tale.started` label
+  /**
+   * Path the runtime entrypoint will exec(). Either a relative POSIX path
+   * resolved under /workspace/code/ (single-script mode, points at the
+   * user's file), or an absolute path under /workspace/.tale/ (multi-step
+   * mode, points at the spawner-generated wrapper). The entrypoint
+   * rejects anything outside those two roots.
+   */
+  entryPath: string;
+}
+
+// Allowlists for every string that reaches argv. executionId is a UUID or
+// a Convex doc id; both fit the container-name alphabet, hence one regex.
+// HOST_DIR_RE also refuses `..` path segments in the bind-mount source.
+const UUID_RE = /^[a-zA-Z0-9_-]{1,64}$/;
+const ORG_RE = /^[a-zA-Z0-9_-]{1,128}$/;
+const VOL_RE = /^[a-zA-Z0-9_.-]{1,128}$/;
+const HOST_DIR_RE = /^(?!.*\/\.\.(?:\/|$))\/[a-zA-Z0-9_./-]{1,256}$/;
+// Relative POSIX-safe path (under /workspace/code/) OR an absolute path
+// under one of the two roots the runtime entrypoint accepts. The negative
+// lookahead bans `..` segments — defense-in-depth, the spawner-side
+// validator already strips these.
+const ENTRY_PATH_RE =
+  /^(?:\/workspace\/(?:code|\.tale)\/(?!.*\.\.)[A-Za-z0-9_./-]{1,256}|(?!.*\.\.)[A-Za-z0-9_-][A-Za-z0-9_./-]{0,255})$/;
+
+function assertSafe(name: string, value: string, re: RegExp): void {
+  // Values matching the allowlist pass silently; anything else throws.
+  if (re.test(value)) return;
+  const quoted = JSON.stringify(value);
+  const detail = `${name} value rejected by safety regex: ${quoted}`;
+  throw new Error(`docker-args: ${detail}`);
+}
+
+export function buildDockerRunArgs(
+  cfg: SpawnerConfig,
+  inp: DockerRunInput,
+): string[] {
+  // Returns the argv (without the leading `docker`) for one runtime container.
+  // Defense-in-depth: callers are internal and typed, but every input string
+  // is still validated — an unchecked one in argv is a container-escape risk.
+  assertSafe('executionId', inp.executionId, UUID_RE);
+  assertSafe('organizationId', inp.organizationId, ORG_RE);
+  assertSafe('pipCacheVolume', inp.pipCacheVolume, VOL_RE);
+  assertSafe('npmCacheVolume', inp.npmCacheVolume, VOL_RE);
+  assertSafe('workspaceHostDir', inp.workspaceHostDir, HOST_DIR_RE);
+  assertSafe('entryPath', inp.entryPath, ENTRY_PATH_RE);
+  if (
+    inp.language !== 'python' &&
+    inp.language !== 'node' &&
+    inp.language !== 'polyglot'
+  ) {
+    throw new Error(`docker-args: bad language: ${inp.language as string}`);
+  }
+
+  const containerName = `tale-sbx-${inp.executionId}`;
+  // No `--rm` because spawn.ts removes the container explicitly after
+  // harvesting outputs from the host bind-mounted workspace dir.
+  return [
+    'run',
+    `--runtime=${cfg.runtime}`,
+    '--name',
+    containerName,
+    '--label',
+    'tale.sandbox=1',
+    `--label`,
+    `tale.session=${inp.executionId}`,
+    `--label`,
+    `tale.started=${inp.startedAtMs}`,
+    `--label`,
+    `tale.org=${inp.organizationId}`,
+    `--network`,
+    cfg.egressNetwork,
+    `--env`,
+    `HTTPS_PROXY=${cfg.egressProxy}`,
+    `--env`,
+    `HTTP_PROXY=${cfg.egressProxy}`,
+    `--env`,
+    `NO_PROXY=127.0.0.1,localhost`,
+    `--env`,
+    `PIP_CACHE_DIR=/cache/pip`,
+    `--env`,
+    `UV_CACHE_DIR=/cache/pip`,
+    `--env`,
+    `NPM_CONFIG_CACHE=/cache/npm`,
+    // `--read-only` makes the nobody user's $HOME=/nonexistent un-writable;
+    // every tool that touches $HOME (uv, npm, fontconfig) errors out. Point
+    // HOME at the tmpfs /tmp so transient state goes somewhere writable.
+    `--env`,
+    `HOME=/tmp`,
+    '--cpus=1',
+    '--memory=1500m',
+    '--memory-swap=1500m',
+    '--pids-limit=128',
+    // Cap the host daemon's json-file log so a runtime container that floods
+    // stdout/stderr can't fill the host disk (audit finding R2-B2: spawner's
+    // own log_driver only covered the spawner container, not the sibling
+    // runtime containers it docker-runs). 10 MB × 1 file ≈ matches the
+    // spawner-side stdout/stderr caps after compression.
+    '--log-driver=json-file',
+    '--log-opt',
+    'max-size=10m',
+    '--log-opt',
+    'max-file=1',
+    '--ulimit',
+    'nofile=1024:4096',
+    '--ulimit',
+    'fsize=104857600',
+    '--ulimit',
+    'cpu=600',
+    '--ulimit',
+    'core=0:0',
+    '--oom-score-adj=500',
+    '--read-only',
+    '--tmpfs',
+    '/tmp:exec,nosuid,nodev,size=128m',
+    // Workspace is a host bind mount so the spawner can write the staging
+    // bundle directly from Bun fs (no tar pipe needed) and read output files
+    // back via Bun fs (docker cp cannot read from --tmpfs mounts). Total
+    // disk usage is capped by `--ulimit fsize` (100 MB per file) plus the
+    // post-run cleanup in spawn.ts. Trades the tmpfs ENOSPC cap (R2.2) for
+    // workable harvest semantics; see plan §"Trade-offs explicitly chosen".
+    '--mount',
+    `type=bind,src=${inp.workspaceHostDir},dst=/workspace`,
+    '--cap-drop=ALL',
+    '--security-opt',
+    'no-new-privileges',
+    '--security-opt',
+    'apparmor=docker-default',
+    // NOTE: custom seccomp profile is a v1.x hardening target. For v1 we rely
+    // on Docker's built-in default profile which already blocks unshare/keyctl
+    // /add_key/bpf/mount/pivot_root; see plan §"Security model".
+    '--user',
+    '65534:65534',
+    '--mount',
+    `type=volume,src=${inp.pipCacheVolume},dst=/cache/pip`,
+    '--mount',
+    `type=volume,src=${inp.npmCacheVolume},dst=/cache/npm`,
+    // The runtime image's ENTRYPOINT is already `/entrypoint.sh`, so we only
+    // pass the entrypoint's positional args here. The 4th positional is the
+    // path the entrypoint will exec — see services/sandbox-runtime/entrypoint.sh.
+    cfg.runtimeImage,
+    inp.language,
+    '/workspace/code/packages.json',
+    '/workspace/code/options.json',
+    inp.entryPath,
+  ];
+}
diff --git a/services/sandbox/src/sandbox-callback.ts b/services/sandbox/src/sandbox-callback.ts
new file mode 100644
index 000000000..47897f8b7
--- /dev/null
+++ b/services/sandbox/src/sandbox-callback.ts
@@ -0,0 +1,270 @@
+// Outbound HMAC-signed callbacks from the spawner back to the Convex
+// platform. The platform vends presigned upload URLs (EP1) and accepts
+// per-file storageId reports (EP2) via these endpoints; the spawner
+// reuses the same SANDBOX_TOKEN it accepts inbound requests with (the
+// shared secret is bidirectional — see sandbox-wobbly-origami plan §2).
+//
+// Signature contract (mirrors auth.ts on the inbound side):
+//   signedString = `${METHOD}\n${path}\n${timestamp}\n${sha256Hex(body)}`
+//   signature    = HMAC-SHA256(SANDBOX_TOKEN, signedString)
+
+import { createHash, createHmac } from 'node:crypto';
+
+import type { UploadFailure } from './types.ts';
+
+const SIGNATURE_HEADER = 'x-tale-sandbox-signature'; // hex HMAC of the request
+const TIMESTAMP_HEADER = 'x-tale-sandbox-timestamp'; // Date.now() as a string
+
+function signSandboxRequest(
+  method: string,
+  path: string,
+  timestamp: string,
+  body: string,
+  token: string,
+): string {
+  const hashedBody = createHash('sha256').update(body).digest('hex');
+  const parts = [method.toUpperCase(), path, timestamp, hashedBody]; // order is the contract
+  return createHmac('sha256', token).update(parts.join('\n')).digest('hex');
+}
+
+function pathOf(url: string): string {
+  // Pathname of a parseable URL; any other input is returned unchanged.
+  let result = url;
+  try {
+    result = new URL(url).pathname;
+  } catch { /* not a parseable URL: keep the raw input */ }
+  return result;
+}
+
+interface CallbackOptions {
+  token: string | null; // null: the callback is sent without HMAC headers
+}
+
+/**
+ * Ask the platform (EP1) for `count` additional presigned upload URLs.
+ *
+ * The POST body is `{executionId, count}`. When `opts.token` is non-null the
+ * request carries the HMAC signature and timestamp headers; with a null
+ * token it is sent unsigned.
+ *
+ * HTTP, network, and parse problems are returned, not thrown. Resolves to
+ * `{ok: true, urls}` (only the string entries of the response's `urls`
+ * array are kept) or `{ok: false, code, status, snippet}` where:
+ *   - HTTP 412          → code 'QUOTA_EXCEEDED', empty snippet;
+ *   - fetch() threw     → code 'FAILED', status 0, snippet = error text;
+ *   - other non-2xx     → code 'FAILED', snippet = first 200 body chars;
+ *   - unusable 2xx body → code 'FAILED', snippet names what was wrong.
+ */
+export async function requestUploadUrls(
+  endpoint: string,
+  executionId: string,
+  count: number,
+  opts: CallbackOptions,
+): Promise<
+  | { ok: true; urls: string[] }
+  | {
+      ok: false;
+      code: 'QUOTA_EXCEEDED' | 'FAILED';
+      status: number;
+      snippet: string;
+    }
+> {
+  // All non-quota failures share one shape; build it in a single place.
+  const failed = (status: number, snippet: string) => ({
+    ok: false as const,
+    code: 'FAILED' as const,
+    status,
+    snippet,
+  });
+  const errText = (e: unknown) => (e instanceof Error ? e.message : String(e));
+
+  const payload = JSON.stringify({ executionId, count });
+  const requestHeaders: Record<string, string> = {
+    'content-type': 'application/json',
+  };
+  const { token } = opts;
+  if (token !== null) {
+    // Sign over the endpoint's pathname with a fresh ms-epoch timestamp.
+    const stamp = String(Date.now());
+    const path = pathOf(endpoint);
+    const signature = signSandboxRequest('POST', path, stamp, payload, token);
+    requestHeaders[SIGNATURE_HEADER] = signature;
+    requestHeaders[TIMESTAMP_HEADER] = stamp;
+  }
+
+  let response: Response;
+  try {
+    response = await fetch(endpoint, {
+      method: 'POST',
+      headers: requestHeaders,
+      body: payload,
+    });
+  } catch (err) {
+    return failed(0, errText(err));
+  }
+
+  const { status } = response;
+  if (status === 412) {
+    return { ok: false, code: 'QUOTA_EXCEEDED', status: 412, snippet: '' };
+  }
+  if (!response.ok) {
+    const text = await response.text().catch(() => '');
+    return failed(status, text.slice(0, 200));
+  }
+
+  let decoded: unknown;
+  try {
+    decoded = await response.json();
+  } catch (err) {
+    return failed(status, `EP1 JSON parse: ${errText(err)}`);
+  }
+  const isPlainObject =
+    decoded !== null && typeof decoded === 'object' && !Array.isArray(decoded);
+  if (!isPlainObject) return failed(status, 'EP1 not object');
+  // oxlint-disable-next-line typescript-eslint/no-unsafe-type-assertion
+  const candidate = (decoded as Record<string, unknown>).urls;
+  if (!Array.isArray(candidate)) return failed(status, 'EP1 urls missing');
+  // Non-string entries are dropped, so fewer than `count` URLs may return.
+  const urls = candidate.filter((u): u is string => typeof u === 'string');
+  return { ok: true, urls };
+}
+
+/**
+ * Tell the platform (EP2) that one output file finished uploading.
+ *
+ * POSTs `{executionId, fileName, storageId, size, contentType}` as JSON,
+ * HMAC-signed when `opts.token` is non-null. Resolves to `{ok: true}` on a
+ * 2xx response; otherwise to `{ok: false, status, snippet}`, where status 0
+ * means fetch() itself threw and `snippet` is the error text or the first
+ * 200 characters of the response body. HTTP and network failures are
+ * reported through the return value, never thrown.
+ */
+export async function reportUploaded(
+  endpoint: string,
+  executionId: string,
+  file: {
+    fileName: string;
+    storageId: string;
+    size: number;
+    contentType: string;
+  },
+  opts: CallbackOptions,
+): Promise<{ ok: true } | { ok: false; status: number; snippet: string }> {
+  // Pick the file fields explicitly so the payload has exactly these keys.
+  const { fileName, storageId, size, contentType } = file;
+  const payload = JSON.stringify({
+    executionId,
+    fileName,
+    storageId,
+    size,
+    contentType,
+  });
+  const requestHeaders: Record<string, string> = {
+    'content-type': 'application/json',
+  };
+  const { token } = opts;
+  if (token !== null) {
+    const stamp = String(Date.now());
+    const path = pathOf(endpoint);
+    const signature = signSandboxRequest('POST', path, stamp, payload, token);
+    requestHeaders[SIGNATURE_HEADER] = signature;
+    requestHeaders[TIMESTAMP_HEADER] = stamp;
+  }
+
+  let response: Response;
+  try {
+    const init = { method: 'POST', headers: requestHeaders, body: payload };
+    response = await fetch(endpoint, init);
+  } catch (err) {
+    const snippet = err instanceof Error ? err.message : String(err);
+    return { ok: false, status: 0, snippet };
+  }
+  if (response.ok) return { ok: true };
+  const text = await response.text().catch(() => '');
+  return { ok: false, status: response.status, snippet: text.slice(0, 200) };
+}
+
+/**
+ * POST raw file bytes to one presigned upload URL.
+ *
+ * The request carries `contentType` as its content-type header and `bytes`
+ * as its body; this function adds no HMAC headers. On a 2xx response whose
+ * JSON body is an object with a non-empty string `storageId`, resolves to
+ * `{ok: true, storageId}`. Every other outcome resolves to
+ * `{ok: false, failure}` — an `UploadFailure` tagged with the caller's
+ * `slotIndex` and `fileName`, where `httpStatus` is 0 if fetch() threw and
+ * `errorSnippet` holds the error text, the first 200 characters of the
+ * response body, or a short description of the malformed response.
+ *
+ * NOTE(review): the previous header described the URL as single-use with a
+ * 1h TTL; nothing in this function enforces or checks that.
+ *
+ * @param url          presigned upload URL to POST to
+ * @param bytes        file content, sent as the request body unchanged
+ * @param contentType  value for the request's content-type header
+ * @param slotIndex    copied into any failure record
+ * @param fileName     copied into any failure record
+ */
+export async function postToUploadSlot(
+  url: string,
+  bytes: Uint8Array | Buffer,
+  contentType: string,
+  slotIndex: number,
+  fileName: string,
+): Promise<
+  { ok: true; storageId: string } | { ok: false; failure: UploadFailure }
+> {
+  // Every failure differs only in status and snippet; share the envelope.
+  const fail = (httpStatus: number, errorSnippet: string) => ({
+    ok: false as const,
+    failure: {
+      slotIndex,
+      fileName,
+      httpStatus,
+      errorSnippet,
+    },
+  });
+  const errText = (e: unknown) => (e instanceof Error ? e.message : String(e));
+
+  // 1. Transport: a thrown fetch() is reported with status 0.
+  let response: Response;
+  try {
+    response = await fetch(url, {
+      method: 'POST',
+      headers: { 'content-type': contentType },
+      body: bytes,
+    });
+  } catch (err) {
+    return fail(0, errText(err));
+  }
+
+  // 2. HTTP status: keep at most 200 characters of a non-2xx body; if the
+  //    body itself cannot be read the snippet is empty.
+  const { status } = response;
+  if (!response.ok) {
+    const text = await response.text().catch(() => '');
+    return fail(status, text.slice(0, 200));
+  }
+
+  // 3. Decode: the success body must be JSON...
+  let decoded: unknown;
+  try {
+    decoded = await response.json();
+  } catch (err) {
+    return fail(status, `JSON parse: ${errText(err)}`);
+  }
+
+  // 4. ...and specifically a non-null, non-array object.
+  const isPlainObject =
+    decoded !== null && typeof decoded === 'object' && !Array.isArray(decoded);
+  if (!isPlainObject) {
+    return fail(status, 'upload response not an object');
+  }
+
+  // 5. Extract: `storageId` must be a non-empty string.
+  // oxlint-disable-next-line typescript-eslint/no-unsafe-type-assertion
+  const { storageId } = decoded as Record<string, unknown>;
+  if (typeof storageId !== 'string' || storageId.length === 0) {
+    return fail(status, 'upload response missing storageId');
+  }
+  return { ok: true, storageId };
+}
diff --git a/services/sandbox/src/server.test.ts b/services/sandbox/src/server.test.ts
new file mode 100644
index 000000000..45116e19a
--- /dev/null
+++ b/services/sandbox/src/server.test.ts
@@ -0,0 +1,186 @@
+// Smoke tests for the HTTP entrypoint's contracts.
+//
+// `server.ts` runs `loadConfig()` + `void main()` at module load, so we
+// don't import it directly. Instead we exercise the wire-level guarantees
+// that the router depends on (id alphabet regex, HMAC verifier, fail-closed
+// config defaults) — the same way `docker-args.test.ts` covers the spawn
+// argv builder without ever booting the server.
+
+import { describe, expect, test } from 'bun:test';
+
+import {
+  SIGNATURE_HEADER,
+  TIMESTAMP_HEADER,
+  TIMESTAMP_TOLERANCE_MS,
+  sign,
+  verify,
+} from './auth.ts';
+import { loadConfig } from './config.ts';
+import { ID_ALPHABET_RE } from './wire.ts';
+
+// Local copy of `CANCEL_ROUTE_RE` from server.ts, which cannot be imported
+// here because loading that module starts the server. The router is meant
+// to accept the same id alphabet as wire.ts's ID_ALPHABET_RE; these tests
+// only guard this copy, so update it whenever the server.ts literal changes.
+const CANCEL_ROUTE_RE = /^\/v1\/cancel\/([a-zA-Z0-9_-]{1,64})$/;
+
+describe('cancel route regex', () => {
+  // True when the router regex accepts a cancel URL ending in `segment`.
+  const routes = (segment: string) =>
+    CANCEL_ROUTE_RE.test(`/v1/cancel/${segment}`);
+
+  test('accepts a Convex doc-id (base32-ish, includes letters g-z)', () => {
+    // Convex doc ids use letters beyond a-f; per the original note, an
+    // earlier hex-only alphabet rejected them, which this regex fixes.
+    const docId = 'k74m9zr5b8jcgvx2pqfwsdyhntq3l1a0';
+    expect(routes(docId)).toBe(true);
+    expect(ID_ALPHABET_RE.test(docId)).toBe(true);
+  });
+
+  test('accepts dash + underscore (dev id alphabet)', () => {
+    expect(routes('dev_run-001')).toBe(true);
+  });
+
+  test('rejects path traversal and shell metacharacters', () => {
+    // The empty segment covers the bare `/v1/cancel/` URL.
+    const hostile = ['../escape', 'a;b', '$(whoami)', 'a b', ''];
+    hostile.forEach((segment) => {
+      expect(routes(segment)).toBe(false);
+    });
+  });
+
+  test('caps id length at 64', () => {
+    // 65 characters: one past the {1,64} quantifier.
+    const oversized = 'a'.repeat(65);
+    expect(routes(oversized)).toBe(false);
+  });
+});
+
+describe('loadConfig token defaults', () => {
+  // Run `probe` with SANDBOX_TOKEN forced to `value` (undefined = unset),
+  // then put the previous value back even if an expectation throws.
+  const withToken = (value: string | undefined, probe: () => void) => {
+    const saved = process.env.SANDBOX_TOKEN;
+    if (value === undefined) delete process.env.SANDBOX_TOKEN;
+    else process.env.SANDBOX_TOKEN = value;
+    try {
+      probe();
+    } finally {
+      if (saved === undefined) delete process.env.SANDBOX_TOKEN;
+      else process.env.SANDBOX_TOKEN = saved;
+    }
+  };
+
+  test('returns null token on a fresh env (opt-in verification)', () => {
+    withToken(undefined, () => {
+      expect(loadConfig().sandboxToken).toBeNull();
+    });
+  });
+
+  test('treats empty-string SANDBOX_TOKEN as unset', () => {
+    withToken('', () => {
+      expect(loadConfig().sandboxToken).toBeNull();
+    });
+  });
+});
+
+describe('HMAC verify (method+path+ts+body binding)', () => {
+  // Fixture: one request signed at `now`. Every rejection test below feeds
+  // verify() one deviating input and pins the reason it reports.
+  const token = 'shared-secret';
+  const body = JSON.stringify({ executionId: 'abc', code: 'print(1)' });
+  const method = 'POST';
+  const path = '/v1/execute';
+  const now = 1_700_000_000_000;
+  const ts = String(now);
+
+  // A signature that is valid for the fixture request.
+  const validSig = () => sign(method, path, ts, body, token);
+  // Expected verifier results.
+  const accepted = { ok: true };
+  const denied = (reason: string) => ({ ok: false, reason });
+
+  test('accepts a correctly-signed request', () => {
+    // Baseline: nothing deviates from what was signed.
+    const outcome = verify(method, path, body, validSig(), ts, token, now);
+    expect(outcome).toEqual(accepted);
+  });
+
+  test('rejects a wrong signature', () => {
+    // Signed with a different secret than the one the verifier is given.
+    const forged = sign(method, path, ts, body, 'other-secret');
+    const outcome = verify(method, path, body, forged, ts, token, now);
+    expect(outcome).toEqual(denied('bad_signature'));
+  });
+
+  test('rejects a tampered body', () => {
+    // A single appended space must invalidate the signature.
+    const altered = `${body} `;
+    const outcome = verify(method, path, altered, validSig(), ts, token, now);
+    expect(outcome).toEqual(denied('bad_signature'));
+  });
+
+  test('rejects a captured signature replayed against a different path', () => {
+    // The whole point of binding the path: a leaked /v1/execute signature
+    // must not authenticate /v1/cancel/<id>.
+    const leaked = sign(method, '/v1/execute', ts, body, token);
+    const target = '/v1/cancel/abc';
+    const outcome = verify(method, target, body, leaked, ts, token, now);
+    expect(outcome).toEqual(denied('bad_signature'));
+  });
+
+  test('rejects a captured signature replayed with a different method', () => {
+    // Same path, body, and timestamp — only the verb differs.
+    const leaked = sign('POST', path, ts, body, token);
+    const outcome = verify('GET', path, body, leaked, ts, token, now);
+    expect(outcome).toEqual(denied('bad_signature'));
+  });
+
+  test('rejects a missing signature header', () => {
+    // null stands in for an absent header value.
+    const outcome = verify(method, path, body, null, ts, token, now);
+    expect(outcome).toEqual(denied('missing_signature'));
+  });
+
+  test('rejects a missing timestamp header', () => {
+    // The signature is valid; only the timestamp argument is null.
+    const outcome = verify(method, path, body, validSig(), null, token, now);
+    expect(outcome).toEqual(denied('missing_timestamp'));
+  });
+
+  test('rejects timestamps outside the tolerance window', () => {
+    // Verifier clock one millisecond past the window, late then early.
+    const skewedClocks = [
+      now + TIMESTAMP_TOLERANCE_MS + 1,
+      now - TIMESTAMP_TOLERANCE_MS - 1,
+    ];
+    const sig = validSig();
+    for (const clock of skewedClocks) {
+      const outcome = verify(method, path, body, sig, ts, token, clock);
+      expect(outcome).toEqual(denied('timestamp_skew'));
+    }
+  });
+
+  test('rejects a non-numeric timestamp', () => {
+    // The signature still covers the numeric `ts`; only the header is junk.
+    const junk = 'not-a-number';
+    const outcome = verify(method, path, body, validSig(), junk, token, now);
+    expect(outcome).toEqual(denied('bad_timestamp'));
+  });
+
+  test('rejects a signature of the wrong length (timing-safe length check)', () => {
+    const sig = validSig();
+    // One character short, then two characters too long.
+    const wrongLengths = [sig.slice(0, -1), `${sig}aa`];
+    for (const candidate of wrongLengths) {
+      const outcome = verify(method, path, body, candidate, ts, token, now);
+      expect(outcome).toEqual(denied('bad_signature'));
+    }
+  });
+
+  test('exports stable header names (wire contract)', () => {
+    // Pinned as literals so a rename on either side fails loudly here.
+    expect(SIGNATURE_HEADER).toBe('x-tale-sandbox-signature');
+    expect(TIMESTAMP_HEADER).toBe('x-tale-sandbox-timestamp');
+  });
+});
diff --git a/services/sandbox/src/server.ts b/services/sandbox/src/server.ts
new file mode 100644
index 000000000..18836d7bb
--- /dev/null
+++ b/services/sandbox/src/server.ts
@@ -0,0 +1,430 @@
+// Tale Sandbox Spawner — HTTP entrypoint.
+//
+// Routes:
+//   GET  /health             — 200 if docker daemon reachable.
+//   POST /v1/execute         — HMAC-authenticated, runs one ephemeral container,
+//                              streams SSE phase events + final result.
+//   POST /v1/cancel/:id      — HMAC-authenticated, kills in-flight container.
+//
+// Concurrency: in-process semaphore at SANDBOX_MAX_CONCURRENT. 429 over cap.
+
+import { verify, SIGNATURE_HEADER, TIMESTAMP_HEADER } from './auth.ts';
+import {
+  acquireSpawnerLock,
+  bootSweep,
+  installSignalHandlers,
+  startPeriodicSweep,
+} from './cleanup.ts';
+import { loadConfig } from './config.ts';
+import { ensureImage, runDocker } from './spawn-util.ts';
+import {
+  cancelExecution,
+  executeRequest,
+  inFlightSize,
+  isInFlight,
+  registerInFlight,
+  unregisterInFlight,
+} from './spawn.ts';
+import { validateExecuteRequest } from './validate-request.ts';
+
+const cfg = loadConfig();
+
+async function readBodyCapped(req: Request, maxBytes: number): Promise<string> {
+  // Reads the request body as UTF-8 text, refusing more than `maxBytes`.
+  // A declared Content-Length over the cap is rejected up front; the
+  // streamed byte count is capped regardless of what was declared. Both
+  // rejections throw an Error('payload_too_large') carrying httpStatus 413.
+  const tooLarge = () =>
+    Object.assign(new Error('payload_too_large'), { httpStatus: 413 });
+
+  const declaredHeader = req.headers.get('content-length');
+  if (declaredHeader !== null) {
+    const declared = Number(declaredHeader);
+    if (Number.isFinite(declared) && declared > maxBytes) throw tooLarge();
+  }
+  const reader = req.body?.getReader();
+  if (!reader) return ''; // no body stream at all
+
+  const parts: Uint8Array[] = [];
+  let received = 0;
+  let step = await reader.read();
+  while (!step.done) {
+    const piece = step.value;
+    if (piece) {
+      received += piece.byteLength;
+      if (received > maxBytes) {
+        reader.cancel().catch((err) => {
+          console.warn('[sandbox] reader cancel after body cap failed:', err);
+        });
+        throw tooLarge();
+      }
+      parts.push(piece);
+    }
+    step = await reader.read();
+  }
+  // Single-chunk bodies are decoded in place; otherwise stitch first.
+  const [only] = parts;
+  const bytes = parts.length === 1 && only ? only : concat(parts, received);
+  return new TextDecoder('utf-8').decode(bytes);
+}
+
+function concat(chunks: Uint8Array[], total: number): Uint8Array {
+  // Copy each chunk back-to-back into one `total`-byte buffer.
+  const joined = new Uint8Array(total);
+  chunks.reduce((cursor, chunk) => {
+    joined.set(chunk, cursor);
+    return cursor + chunk.byteLength;
+  }, 0);
+  return joined;
+}
+
+// Serialize `body` as JSON into a Response with the given status. Entries in
+// `extraHeaders` are spread after the default, so they can override it.
+function jsonResponse(
+  body: unknown,
+  status: number,
+  extraHeaders?: Record<string, string>,
+): Response {
+  const headers: Record<string, string> = {
+    'content-type': 'application/json',
+    ...extraHeaders,
+  };
+  return new Response(JSON.stringify(body), { status, headers });
+}
+
+function authorize(body: string, req: Request): Response | null {
+  // Returns null when the request may proceed, or a ready-made 401 Response.
+  // With no configured token every request passes (dev opt-in mode).
+  const token = cfg.sandboxToken;
+  if (token === null) return null;
+
+  // verify() gets the method, URL pathname, raw body text, and both headers.
+  const { pathname } = new URL(req.url);
+  const signature = req.headers.get(SIGNATURE_HEADER);
+  const timestamp = req.headers.get(TIMESTAMP_HEADER);
+  const { method } = req;
+  const outcome = verify(method, pathname, body, signature, timestamp, token);
+  if (outcome.ok) return null;
+
+  // Log the discriminator server-side so operators can diagnose, but DON'T
+  // surface it in the response body — distinguishing "wrong signature" from
+  // "clock skew" lets an attacker calibrate (audit finding R2-B5).
+  console.warn(`[sandbox.auth] unauthorized (${outcome.reason})`);
+  return jsonResponse({ error: 'unauthorized' }, 401);
+}
+
+// Health-probe cache. The compose healthcheck hits /health every 10s (per
+// the original note), so the docker CLI is forked at most once per TTL
+// instead of on every hit. NOTE(review): failures are cached for the same
+// 60s as successes, so a recovered daemon can keep reporting unhealthy
+// until the entry expires — confirm that is intended.
+const DOCKER_PROBE_TTL_MS = 60_000;
+let dockerProbeCache:
+  | { ok: true; version: string; expiresAt: number }
+  | { ok: false; error: string; expiresAt: number }
+  | null = null;
+
+// Reports whether the docker daemon answers, serving a cached result while
+// one is still fresh. Resolves to the server version or the CLI's error
+// text; expiresAt is internal and never returned.
+async function probeDocker(): Promise<
+  { ok: true; version: string } | { ok: false; error: string }
+> {
+  const now = Date.now();
+  const cached = dockerProbeCache;
+  if (cached !== null && cached.expiresAt > now) {
+    // Fresh entry: answer without touching docker.
+    if (cached.ok) return { ok: true, version: cached.version };
+    return { ok: false, error: cached.error };
+  }
+  // The TTL is measured from before the probe ran, not from its completion.
+  const expiresAt = now + DOCKER_PROBE_TTL_MS;
+  // Probe docker daemon reachability. Use `docker version --format` over the
+  // older `docker info --format` because some Debian-packaged CLIs (e.g.
+  // docker.io 20.10 in our base image) panic when templating a newer-API
+  // `info` response. `docker version` is a much smaller surface that has
+  // been compatible across the 20.10 ↔ 29.x gap.
+  const probe = await runDocker(['version', '--format', '{{.Server.Version}}']);
+  if (probe.exitCode === 0) {
+    const version = probe.stdout.trim();
+    dockerProbeCache = { ok: true, version, expiresAt };
+    return { ok: true, version };
+  }
+  // Prefer stderr for the failure text; fall back to stdout when it is empty.
+  const error = probe.stderr.trim() || probe.stdout.trim();
+  dockerProbeCache = { ok: false, error, expiresAt };
+  return { ok: false, error };
+}
+
+// GET /health: 200 with the docker server version, or 503 with the probe error.
+async function handleHealth(): Promise<Response> {
+  const probe = await probeDocker();
+  if (probe.ok) {
+    const healthy = { status: 'ok', dockerServerVersion: probe.version };
+    return jsonResponse(healthy, 200);
+  }
+  const unhealthy = { status: 'unhealthy', error: probe.error };
+  return jsonResponse(unhealthy, 503);
+}
+
+async function handleExecute(req: Request): Promise<Response> {
+  let body: string; // raw request text; authorize() checks the HMAC over it
+  try {
+    body = await readBodyCapped(req, cfg.maxRequestBodyBytes);
+  } catch (err) {
+    const status =
+      err && typeof err === 'object' && 'httpStatus' in err
+        ? Number((err as { httpStatus: unknown }).httpStatus)
+        : 400;
+    return jsonResponse(
+      {
+        error: status === 413 ? 'payload_too_large' : 'bad_request',
+        message: err instanceof Error ? err.message : String(err),
+      },
+      status === 413 ? 413 : 400,
+    );
+  }
+  const authFail = authorize(body, req);
+  if (authFail) return authFail;
+
+  let parsedUnknown: unknown;
+  try {
+    parsedUnknown = JSON.parse(body);
+  } catch (err) {
+    return jsonResponse({ error: 'bad_request', message: String(err) }, 400);
+  }
+  // Full runtime validation of every field — defends downstream spawn /
+  // docker-args code from malformed types that would otherwise crash mid
+  // pipeline. The previous spot-check of executionId was the only gate
+  // (audit finding R2-B3).
+  const validated = validateExecuteRequest(parsedUnknown);
+  if (!validated.ok) {
+    return jsonResponse(
+      { error: 'bad_request', message: validated.error },
+      400,
+    );
+  }
+  const parsed = validated.request;
+
+  // Per-request INFO so docker logs tale-sandbox surfaces what's been
+  // dispatched. The spawner used to only log warn/error which made
+  // every "did the request even get here?" question require code
+  // inspection — see pre-stage debugging session 2026-05-23.
+  console.info(
+    `[sandbox.execute] id=${parsed.executionId} org=${parsed.organizationId} lang=${parsed.language} ${
+      parsed.steps !== undefined
+        ? `steps=${JSON.stringify(parsed.steps)}`
+        : `entry=${parsed.entryPath}`
+    } files=${parsed.files?.length ?? 0} priorDownloads=${parsed.priorOutputDownloads?.length ?? 0} preAllocSlots=${parsed.outputUploadSlots.length}`,
+  );
+
+  // Reject duplicates explicitly: the in-flight registry is keyed by
+  // executionId, and overwriting the entry would silently detach the
+  // original AbortController from cancelExecution. The Convex action
+  // never retries the same executionId in practice, so a duplicate
+  // POST is almost always a misconfigured caller or a malicious replay.
+  if (isInFlight(parsed.executionId)) {
+    return jsonResponse(
+      {
+        error: 'duplicate',
+        message: `executionId ${parsed.executionId} is already in flight`,
+      },
+      409,
+    );
+  }
+
+  // Concurrency check AFTER validation so a malformed request can't
+  // consume a slot.
+  if (inFlightSize() >= cfg.maxConcurrent) {
+    return jsonResponse(
+      {
+        error: 'busy',
+        message: `Spawner at concurrency cap (${cfg.maxConcurrent})`,
+      },
+      429,
+      { 'retry-after': '5' },
+    );
+  }
+
+  // Register AFTER validation; the spawn-side registry is the single source
+  // of truth. A request-signal abort (caller disconnect) is wired to
+  // cancelExecution so a closed SSE stream tears the container down
+  // promptly. NOTE(review): a signal already aborted before this point
+  // would never fire the listener — confirm that cannot happen here.
+  const abortHandler = () => {
+    cancelExecution(parsed.executionId).catch((err) => {
+      console.warn('[sandbox] client-abort cancel failed:', err);
+    });
+  };
+  req.signal.addEventListener('abort', abortHandler, { once: true });
+  registerInFlight(parsed.executionId);
+
+  const stream = new ReadableStream<Uint8Array>({
+    async start(controller) {
+      const enc = new TextEncoder();
+      const send = (event: string, data: unknown) => {
+        try {
+          controller.enqueue(
+            enc.encode(`event: ${event}\ndata: ${JSON.stringify(data)}\n\n`),
+          );
+        } catch (err) {
+          // Stream already closed — common when the caller aborted; we
+          // continue draining the spawn so the cleanup paths run.
+          console.warn('[sandbox] SSE enqueue after close:', err);
+        }
+      };
+      // Bun.serve enforces a per-connection idleTimeout (we raise it to the
+      // 255 s max below, but install + run can still outlast that). An SSE
+      // comment line (`: ...\n\n`) is ignored by the platform-side parser
+      // and resets the idle clock, so a periodic tick keeps the stream live
+      // during silent stretches like `pip install` / `npm install`.
+      const sendKeepalive = () => {
+        try {
+          controller.enqueue(enc.encode(`: keepalive\n\n`));
+        } catch (err) {
+          console.warn('[sandbox] SSE keepalive enqueue after close:', err);
+        }
+      };
+      const keepalive = setInterval(sendKeepalive, 20_000);
+      try {
+        const result = await executeRequest(cfg, parsed, {
+          onPhase: (e) => send('phase', e),
+          // Live stdout/stderr tail. Per-line for stdout (PHASE markers
+          // stripped); per-chunk for stderr. Coalescing is left to the
+          // platform-side action because that's where the cost of "too
+          // many mutations" actually lives — SSE event overhead is small.
+          onStdoutDelta: (text) => send('stdout', { text }),
+          onStderrDelta: (text) => send('stderr', { text }),
+        });
+        send('result', result);
+      } catch (err) {
+        send('error', {
+          message: err instanceof Error ? err.message : String(err),
+        });
+      } finally {
+        clearInterval(keepalive);
+        unregisterInFlight(parsed.executionId);
+        req.signal.removeEventListener('abort', abortHandler);
+        try {
+          controller.close();
+        } catch (err) {
+          console.warn('[sandbox] SSE close failed:', err);
+        }
+      }
+    },
+  });
+  return new Response(stream, {
+    status: 200,
+    headers: {
+      'content-type': 'text/event-stream',
+      'cache-control': 'no-cache, no-transform',
+      'x-accel-buffering': 'no',
+    },
+  });
+}
+
+// POST /v1/cancel/:id — stop an in-flight execution. Responds 400 when
+// the capped body read fails, with authorize()'s response when that
+// check fails, 404 (`killed: false`) when the id is not in flight, and
+// otherwise 200 carrying cancelExecution()'s result.
+async function handleCancel(req: Request, id: string): Promise<Response> {
+  let payload: string;
+  try {
+    payload = await readBodyCapped(req, cfg.maxRequestBodyBytes);
+  } catch (cause) {
+    const message = cause instanceof Error ? cause.message : String(cause);
+    return jsonResponse({ error: 'bad_request', message }, 400);
+  }
+  // authorize() returns a ready-made failure Response, or a falsy value.
+  const denied = authorize(payload, req);
+  if (denied) return denied;
+  if (isInFlight(id)) {
+    const killed = await cancelExecution(id);
+    return jsonResponse({ killed }, 200);
+  }
+  return jsonResponse({ killed: false }, 404);
+}
+
+// Cancel route accepts the same id alphabet as the execute payload so a
+// Convex doc id (contains g-z) is not silently rejected. The canonical id
+// regex lives in wire.ts (spawn.ts, docker-args.ts); keep this in sync.
+const CANCEL_ROUTE_RE = /^\/v1\/cancel\/([a-zA-Z0-9_-]{1,64})$/;
+
+// Top-level request dispatch. Routes:
+//   GET  /health, POST /v1/execute, POST /v1/cancel/<id>
+// Every other method/path combination gets a JSON 404.
+async function router(req: Request): Promise<Response> {
+  const { method } = req;
+  const { pathname } = new URL(req.url);
+  if (method === 'GET' && pathname === '/health') return handleHealth();
+  if (method === 'POST') {
+    if (pathname === '/v1/execute') return handleExecute(req);
+    const cancel = CANCEL_ROUTE_RE.exec(pathname);
+    if (cancel) return handleCancel(req, cancel[1] ?? '');
+  }
+  return jsonResponse({ error: 'not_found' }, 404);
+}
+
+async function main(): Promise<void> {
+  // Token policy: SANDBOX_TOKEN is opt-in verification. Unset = skip HMAC
+  // (mirrors the Convex-side behavior); set = enforce. Production deploys
+  // auto-mint SANDBOX_TOKEN via the CLI's ensure-env helper. Log a single
+  // warn at boot so operators see the state.
+  if (cfg.sandboxToken === null) {
+    console.warn(
+      '[sandbox] SANDBOX_TOKEN is unset — HMAC verification disabled. Set SANDBOX_TOKEN to enable request authentication.',
+    );
+  }
+
+  // Cross-process lock BEFORE bootSweep — refuses to start if another live
+  // spawner is using the same hostSessionRoot. Prevents bootSweep's
+  // host-dir sweep from deleting a peer's in-flight workspace (audit
+  // finding R2-B5). Stale locks (mtime older than ~60s) are reclaimed.
+  try {
+    await acquireSpawnerLock(cfg);
+  } catch (err) {
+    console.error('[sandbox] FATAL: spawner lock acquire failed:', err);
+    process.exit(1);
+  }
+
+  await bootSweep(cfg);
+  // Warm the runtime image so the first /v1/execute call doesn't pay a
+  // cold registry round-trip. Intended to be non-fatal: per the original
+  // note, failures are logged inside ensureImage and the spawner still
+  // starts (its /health probe surfaces the real problem). If the pull
+  // never succeeds, the first execute call may fail with image-not-found.
+  await ensureImage(cfg.runtimeImage);
+
+  const stopPeriodic = startPeriodicSweep(cfg);
+
+  const server = Bun.serve({
+    port: cfg.port,
+    // Bun's default idleTimeout is 10 s, which kills long SSE streams during
+    // silent install phases. 255 is Bun's max — combined with the in-stream
+    // keepalive in /v1/execute, this gives a generous backstop without
+    // disabling the timeout entirely.
+    idleTimeout: 255,
+    fetch: (req) =>
+      router(req).catch((err) => {
+        console.error('[sandbox] handler error:', err);
+        return jsonResponse({ error: 'internal', message: String(err) }, 500);
+      }),
+  });
+
+  installSignalHandlers(() => {
+    try {
+      void server.stop();
+    } catch (err) {
+      console.warn('[sandbox] server.stop() during shutdown failed:', err);
+    }
+  }, cfg);
+
+  console.log(
+    `[sandbox] spawner listening on :${server.port}; runtime=${cfg.runtime}; image=${cfg.runtimeImage}; maxConcurrent=${cfg.maxConcurrent}; tokenAuth=${cfg.sandboxToken !== null ? 'on' : 'OFF (dev opt-in)'}`,
+  );
+
+  // Marks the handle as deliberately unused (no GC effect). NOTE(review): never called on shutdown — confirm.
+  void stopPeriodic;
+}
+
+void main();
diff --git a/services/sandbox/src/spawn-prior-outputs.test.ts b/services/sandbox/src/spawn-prior-outputs.test.ts
new file mode 100644
index 000000000..8755eed20
--- /dev/null
+++ b/services/sandbox/src/spawn-prior-outputs.test.ts
@@ -0,0 +1,337 @@
+// Unit tests for `stagePriorOutputDownloads` — the spawner-side helper
+// that fetches the artifact's previous run outputs (as URLs) and writes
+// them back into `/workspace/output/` before the container starts.
+//
+// We exercise the path-traversal guard end-to-end against a real temp
+// directory and a real ephemeral HTTP server (no mocks). Bad names and
+// failed fetches are logged + skipped, not fatal.
+
+import {
+  afterAll,
+  afterEach,
+  beforeAll,
+  beforeEach,
+  describe,
+  expect,
+  test,
+} from 'bun:test';
+import { createHash } from 'node:crypto';
+import { mkdir, mkdtemp, readFile, readdir, rm } from 'node:fs/promises';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+
+import { stagePriorOutputDownloads } from './spawn.ts';
+
+// Minimal ephemeral file-server backed by an in-memory map. Each test sets
+// the map's `{name: Uint8Array}` entries and computes URLs against the
+// returned base.
+let server: ReturnType<typeof Bun.serve>;
+let baseUrl: string;
+const fileMap = new Map<string, Uint8Array>();
+
+beforeAll(() => {
+  server = Bun.serve({
+    port: 0, // let the OS pick a free port; read back via server.port
+    fetch: (request) => {
+      const lookupKey = new URL(request.url).searchParams.get('k') ?? '';
+      const payload = fileMap.get(lookupKey);
+      return payload
+        ? new Response(payload, { status: 200 })
+        : new Response('not found', { status: 404 });
+    },
+  });
+  baseUrl = `http://localhost:${server.port}`;
+});
+
+afterAll(() => {
+  void server.stop(); // fire-and-forget: the result of stop() is not awaited
+});
+
+function urlFor(key: string, bytes: Uint8Array | string): string {
+  // Side effect: registers the payload in fileMap so the server can serve it.
+  const body =
+    typeof bytes === 'string' ? new TextEncoder().encode(bytes) : bytes;
+  fileMap.set(key, body);
+  return `${baseUrl}/?k=${encodeURIComponent(key)}`;
+}
+
+describe('stagePriorOutputDownloads', () => {
+  let hostDir: string;
+  let outputDir: string;
+
+  beforeEach(async () => {
+    hostDir = await mkdtemp(join(tmpdir(), 'tale-sandbox-prior-'));
+    outputDir = join(hostDir, 'output');
+    await mkdir(outputDir, { recursive: true });
+    fileMap.clear();
+  });
+
+  afterEach(async () => {
+    await rm(hostDir, { recursive: true, force: true });
+  });
+
+  test('writes a flat-name prior output to /output/<name>', async () => {
+    await stagePriorOutputDownloads(outputDir, [
+      { name: 'report.pptx', url: urlFor('report.pptx', 'hello pptx') },
+    ]);
+    const buf = await readFile(join(outputDir, 'report.pptx'));
+    expect(buf.toString('utf8')).toBe('hello pptx');
+  });
+
+  test('creates nested directories as needed for a path-shaped name', async () => {
+    await stagePriorOutputDownloads(outputDir, [
+      { name: 'sub/dir/report.txt', url: urlFor('nested', 'nested') },
+    ]);
+    const buf = await readFile(join(outputDir, 'sub/dir/report.txt'));
+    expect(buf.toString('utf8')).toBe('nested');
+  });
+
+  test('refuses ".." traversal — file is NOT written outside outputDir', async () => {
+    await stagePriorOutputDownloads(outputDir, [
+      { name: '../escape.txt', url: urlFor('nope', 'nope') },
+    ]);
+    // The skipped file must not appear inside outputDir.
+    const inside = await readdir(outputDir);
+    expect(inside).not.toContain('escape.txt');
+    // And it must not have been written one level up either.
+    const oneUp = await readdir(hostDir);
+    expect(oneUp).not.toContain('escape.txt');
+  });
+
+  test('refuses an absolute path that escapes outputDir', async () => {
+    // resolve() ignores its base for absolute names, so the target lies
+    // outside outputDir; readdir(outputDir) alone cannot detect a write there.
+    const result = await stagePriorOutputDownloads(outputDir, [
+      { name: '/tmp/abs-escape.txt', url: urlFor('nope', 'nope') },
+    ]);
+    expect(result.staged).toEqual([]);
+    expect(await readdir(outputDir)).not.toContain('abs-escape.txt');
+  });
+
+  test('writes multiple files in one call', async () => {
+    await stagePriorOutputDownloads(outputDir, [
+      { name: 'a.bin', url: urlFor('a', 'aaa') },
+      { name: 'b.bin', url: urlFor('b', 'bbb') },
+    ]);
+    expect((await readFile(join(outputDir, 'a.bin'))).toString('utf8')).toBe(
+      'aaa',
+    );
+    expect((await readFile(join(outputDir, 'b.bin'))).toString('utf8')).toBe(
+      'bbb',
+    );
+  });
+
+  test('no-ops on an empty list without throwing', async () => {
+    await stagePriorOutputDownloads(outputDir, []);
+    const inside = await readdir(outputDir);
+    expect(inside).toEqual([]);
+  });
+
+  test('preserves binary content faithfully', async () => {
+    const bytes = new Uint8Array([0, 1, 2, 255, 254, 0xff, 0x10, 0x20]);
+    await stagePriorOutputDownloads(outputDir, [
+      { name: 'binary.bin', url: urlFor('binary', bytes) },
+    ]);
+    const buf = await readFile(join(outputDir, 'binary.bin'));
+    expect(Array.from(new Uint8Array(buf))).toEqual(Array.from(bytes));
+  });
+
+  test('skips a fetch that returns 404 without throwing', async () => {
+    // Key was never registered in fileMap → the shared server answers 404.
+    fileMap.clear();
+    await stagePriorOutputDownloads(outputDir, [
+      { name: 'missing.pptx', url: `${baseUrl}/?k=missing-key` },
+    ]);
+    const inside = await readdir(outputDir);
+    expect(inside).not.toContain('missing.pptx');
+  });
+
+  // -------------------------------------------------------------------
+  // Return-shape attestation (crispy-curry plan §3).
+  //
+  // The new signature returns `{staged, skipped}` so the platform can
+  // diff what it asked for against what landed on disk. Skip reasons
+  // are structured so the LLM-facing error payload can guide recovery
+  // (url_expired → re-mint, http_error → check storage, unsafe_path →
+  // never legitimate, etc.).
+  // -------------------------------------------------------------------
+
+  test('returns staged entries with bytes + sha256 of the written file', async () => {
+    const payload = 'hello pptx';
+    const expectedSha = createHash('sha256').update(payload).digest('hex');
+    const result = await stagePriorOutputDownloads(outputDir, [
+      { name: 'report.pptx', url: urlFor('report.pptx', payload) },
+    ]);
+    expect(result.staged).toHaveLength(1);
+    expect(result.staged[0]).toEqual({
+      name: 'report.pptx',
+      bytes: new TextEncoder().encode(payload).byteLength,
+      sha256: expectedSha,
+    });
+    expect(result.skipped).toEqual([]);
+  });
+
+  test('returns sha256 that matches the actual bytes for binary content', async () => {
+    const bytes = new Uint8Array([0, 1, 2, 255, 254, 0xff, 0x10, 0x20]);
+    const expectedSha = createHash('sha256').update(bytes).digest('hex');
+    const result = await stagePriorOutputDownloads(outputDir, [
+      { name: 'binary.bin', url: urlFor('binary', bytes) },
+    ]);
+    expect(result.staged[0]?.sha256).toBe(expectedSha);
+  });
+
+  test('classifies path-traversal as unsafe_path skip', async () => {
+    const result = await stagePriorOutputDownloads(outputDir, [
+      { name: '../escape.txt', url: urlFor('nope', 'nope') },
+    ]);
+    expect(result.staged).toEqual([]);
+    expect(result.skipped).toHaveLength(1);
+    expect(result.skipped[0]).toMatchObject({
+      name: '../escape.txt',
+      reason: 'unsafe_path',
+    });
+  });
+
+  test('classifies non-2xx as http_error skip with status in detail', async () => {
+    fileMap.clear();
+    const result = await stagePriorOutputDownloads(outputDir, [
+      { name: 'missing.pptx', url: `${baseUrl}/?k=missing-key` },
+    ]);
+    expect(result.staged).toEqual([]);
+    expect(result.skipped).toHaveLength(1);
+    expect(result.skipped[0]).toMatchObject({
+      name: 'missing.pptx',
+      reason: 'http_error',
+    });
+    expect(result.skipped[0]?.detail).toContain('404');
+  });
+
+  test('classifies 403 / 410 as url_expired skip (presigned URL TTL hint)', async () => {
+    // Tiny server that answers 410 Gone to everything (403 is not exercised).
+    const goneServer = Bun.serve({
+      port: 0,
+      fetch: () => new Response('gone', { status: 410 }),
+    });
+    try {
+      const result = await stagePriorOutputDownloads(outputDir, [
+        {
+          name: 'stale.pptx',
+          url: `http://localhost:${goneServer.port}/x`,
+        },
+      ]);
+      expect(result.skipped).toHaveLength(1);
+      expect(result.skipped[0]).toMatchObject({
+        name: 'stale.pptx',
+        reason: 'url_expired',
+      });
+    } finally {
+      void goneServer.stop();
+    }
+  });
+
+  test('classifies network-error as fetch_failed skip', async () => {
+    // Malformed URL string makes fetch fail before any HTTP response
+    // exists — distinct from a remote-end http_error.
+    const result = await stagePriorOutputDownloads(outputDir, [
+      { name: 'unreachable.txt', url: 'not-a-real-url' },
+    ]);
+    expect(result.staged).toEqual([]);
+    expect(result.skipped).toHaveLength(1);
+    expect(result.skipped[0]).toMatchObject({
+      name: 'unreachable.txt',
+      reason: 'fetch_failed',
+    });
+  });
+
+  test('mixed staged + skipped surfaces both lists correctly', async () => {
+    const result = await stagePriorOutputDownloads(outputDir, [
+      { name: 'good.txt', url: urlFor('good', 'ok') },
+      { name: '../bad.txt', url: urlFor('bad', 'no') },
+      { name: 'missing.txt', url: `${baseUrl}/?k=does-not-exist` },
+    ]);
+    expect(result.staged.map((s) => s.name)).toEqual(['good.txt']);
+    expect(result.skipped.map((s) => s.reason).sort()).toEqual([
+      'http_error',
+      'unsafe_path',
+    ]);
+  });
+
+  test('classifies stalled fetch as fetch_timeout skip', async () => {
+    // Server that never responds; the timeoutMs override triggers
+    // AbortSignal.timeout before any data comes back.
+    const slowServer = Bun.serve({
+      port: 0,
+      async fetch() {
+        await new Promise<void>(() => {
+          /* never resolves */
+        });
+        return new Response('unreachable');
+      },
+    });
+    try {
+      const result = await stagePriorOutputDownloads(
+        outputDir,
+        [{ name: 'slow.txt', url: `http://localhost:${slowServer.port}/` }],
+        { timeoutMs: 50 },
+      );
+      expect(result.staged).toEqual([]);
+      expect(result.skipped).toHaveLength(1);
+      expect(result.skipped[0]).toMatchObject({
+        name: 'slow.txt',
+        reason: 'fetch_timeout',
+      });
+    } finally {
+      void slowServer.stop();
+    }
+  });
+
+  test('rejects oversize body via Content-Length pre-check', async () => {
+    const bigPayload = new Uint8Array(10_000); // 10x the 1_000-byte cap below
+    const url = urlFor('big', bigPayload);
+    const result = await stagePriorOutputDownloads(
+      outputDir,
+      [{ name: 'big.bin', url }],
+      { maxBytesPerFile: 1_000 },
+    );
+    expect(result.staged).toEqual([]);
+    expect(result.skipped).toHaveLength(1);
+    expect(result.skipped[0]).toMatchObject({
+      name: 'big.bin',
+      reason: 'download_too_large',
+    });
+  });
+
+  test('rejects oversize body via streaming cap when Content-Length is absent', async () => {
+    // Bun.serve with a ReadableStream body usually omits Content-Length,
+    // so the size check has to be enforced by the streaming-read path.
+    const chunkBytes = new Uint8Array(512);
+    const chunks = 8;
+    const streamServer = Bun.serve({
+      port: 0,
+      fetch() {
+        const stream = new ReadableStream<Uint8Array>({
+          start(controller) {
+            for (let i = 0; i < chunks; i++) controller.enqueue(chunkBytes);
+            controller.close();
+          },
+        });
+        return new Response(stream, { status: 200 });
+      },
+    });
+    try {
+      const result = await stagePriorOutputDownloads(
+        outputDir,
+        [{ name: 'stream.bin', url: `http://localhost:${streamServer.port}/` }],
+        { maxBytesPerFile: 1_000 },
+      );
+      expect(result.staged).toEqual([]);
+      expect(result.skipped).toHaveLength(1);
+      expect(result.skipped[0]).toMatchObject({
+        name: 'stream.bin',
+        reason: 'download_too_large',
+      });
+    } finally {
+      void streamServer.stop();
+    }
+  });
+});
diff --git a/services/sandbox/src/spawn-staging.test.ts b/services/sandbox/src/spawn-staging.test.ts
new file mode 100644
index 000000000..898e4affa
--- /dev/null
+++ b/services/sandbox/src/spawn-staging.test.ts
@@ -0,0 +1,204 @@
+// Unit tests for the `stageWorkspace` helper — the part that lays out
+// /workspace/code/<files> and /workspace/.tale/runner.{py,js} on the host
+// bind-mounted dir before the container starts.
+//
+// We do not assert ownership (chownRecursive's lchown(65534) needs root and
+// is irrelevant to the layout contract). The test catches and ignores the
+// EPERM that fires after the writes have completed.
+
+import { afterEach, beforeEach, describe, expect, test } from 'bun:test';
+import { mkdtemp, readFile, readdir, rm, stat } from 'node:fs/promises';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+
+import { stageWorkspace } from './spawn.ts';
+import type { ExecuteRequest } from './types.ts';
+
+async function stageIgnoringChown(
+  hostDir: string,
+  req: ExecuteRequest,
+): Promise<void> {
+  // Non-root test envs can't chown to 65534; by the time chownRecursive
+  // runs the file layout is already on disk, so an EPERM / EINVAL from
+  // staging is tolerated. Any other failure still fails the test.
+  try {
+    await stageWorkspace(hostDir, req);
+  } catch (err) {
+    const chownDenied =
+      err instanceof Error && /EPERM|EINVAL/.test(err.message);
+    if (!chownDenied) throw err;
+  }
+}
+
+function baseReq(overrides: Partial<ExecuteRequest>): ExecuteRequest {
+  const defaults: ExecuteRequest = {
+    executionId: 'abc-123',
+    organizationId: 'org_42',
+    language: 'python',
+    files: [{ path: 'main.py', content: 'print("ok")' }],
+    entryPath: 'main.py',
+    // The upload path is not exercised by the staging tests, so the
+    // callback fields are opaque pass-throughs. An empty slot list is a
+    // valid wire payload (sandbox lazily fetches the first slot on demand).
+    outputUploadSlots: [],
+    outputUrlEndpoint: 'http://test-endpoint/upload-url',
+    reportUploadedEndpoint: 'http://test-endpoint/report-uploaded',
+  };
+  return { ...defaults, ...overrides };
+}
+
+describe('stageWorkspace', () => {
+  let hostDir: string;
+
+  beforeEach(async () => {
+    hostDir = await mkdtemp(join(tmpdir(), 'tale-sandbox-stage-'));
+  });
+
+  afterEach(async () => {
+    await rm(hostDir, { recursive: true, force: true });
+  });
+
+  test('single-script mode stages user files at declared paths and writes NO synthetic main.py mirror', async () => {
+    await stageIgnoringChown(
+      hostDir,
+      baseReq({
+        files: [
+          { path: 'main.py', content: 'print("user main")' },
+          { path: 'helpers.py', content: 'X = 1' },
+        ],
+        entryPath: 'main.py',
+      }),
+    );
+
+    // Files land at /workspace/code/<path>.
+    const main = await readFile(join(hostDir, 'code', 'main.py'), 'utf8');
+    expect(main).toBe('print("user main")');
+    const helpers = await readFile(join(hostDir, 'code', 'helpers.py'), 'utf8');
+    expect(helpers).toBe('X = 1');
+
+    // No /workspace/.tale/ in single-script mode: stat() must reject.
+    let taleExists = true;
+    try {
+      await stat(join(hostDir, '.tale'));
+    } catch {
+      taleExists = false;
+    }
+    expect(taleExists).toBe(false);
+  });
+
+  test('multi-step mode writes the wrapper at /workspace/.tale/runner.py and leaves user files untouched', async () => {
+    await stageIgnoringChown(
+      hostDir,
+      baseReq({
+        files: [
+          // Critically: user file named main.py — the leaky-abstraction
+          // regression gate. The wrapper must NOT overwrite it.
+          { path: 'main.py', content: 'print("user generator")' },
+          { path: 'test.py', content: 'print("user validator")' },
+        ],
+        entryPath: undefined,
+        steps: ['main.py', 'test.py'],
+      }),
+    );
+
+    // User's main.py survives intact.
+    const userMain = await readFile(join(hostDir, 'code', 'main.py'), 'utf8');
+    expect(userMain).toBe('print("user generator")');
+    const userTest = await readFile(join(hostDir, 'code', 'test.py'), 'utf8');
+    expect(userTest).toBe('print("user validator")');
+
+    // Wrapper lands in /workspace/.tale/, NOT /workspace/code/.
+    const wrapper = await readFile(join(hostDir, '.tale', 'runner.py'), 'utf8');
+    expect(wrapper).toContain('Tale multi-step wrapper');
+    expect(wrapper).toContain('"main.py"');
+    expect(wrapper).toContain('"test.py"');
+
+    // /workspace/code/ only contains user files + packages.json + options.json.
+    const codeEntries = await readdir(join(hostDir, 'code'));
+    expect(codeEntries.sort()).toEqual(
+      ['main.py', 'options.json', 'packages.json', 'test.py'].sort(),
+    );
+    // /workspace/.tale/ only contains the wrapper.
+    const taleEntries = await readdir(join(hostDir, '.tale'));
+    expect(taleEntries).toEqual(['runner.py']);
+  });
+
+  test('multi-step mode for node language writes runner.js', async () => {
+    await stageIgnoringChown(
+      hostDir,
+      baseReq({
+        language: 'node',
+        files: [
+          { path: 'main.js', content: 'console.log("gen")' },
+          { path: 'test.js', content: 'console.log("validate")' },
+        ],
+        entryPath: undefined,
+        steps: ['main.js', 'test.js'],
+      }),
+    );
+
+    const wrapper = await readFile(join(hostDir, '.tale', 'runner.js'), 'utf8');
+    expect(wrapper).toContain('Tale multi-step wrapper');
+    expect(wrapper).toContain('"main.js"');
+  });
+
+  test('polyglot mode writes runner.py + packages-{python,node}.json with per-bucket specs', async () => {
+    await stageIgnoringChown(
+      hostDir,
+      baseReq({
+        language: 'polyglot',
+        files: [
+          { path: 'gen.js', content: 'console.log("gen")' },
+          { path: 'qa.py', content: 'print("qa")' },
+        ],
+        entryPath: undefined,
+        steps: ['gen.js', 'qa.py'],
+        packagesByLang: {
+          python: ['markitdown[pptx]==0.0.1a3'],
+          node: ['pptxgenjs@3.12.0'],
+        },
+      }),
+    );
+
+    // Polyglot uses the Python-hosted dispatcher, so the wrapper is runner.py.
+    const wrapper = await readFile(join(hostDir, '.tale', 'runner.py'), 'utf8');
+    expect(wrapper).toContain('Tale polyglot multi-step wrapper');
+    expect(wrapper).toContain('interpreter_for');
+    expect(wrapper).toContain('"gen.js"');
+    expect(wrapper).toContain('"qa.py"');
+
+    const pyPkgs = JSON.parse(
+      await readFile(join(hostDir, 'code', 'packages-python.json'), 'utf8'),
+    );
+    expect(pyPkgs).toEqual(['markitdown[pptx]==0.0.1a3']);
+    const nodePkgs = JSON.parse(
+      await readFile(join(hostDir, 'code', 'packages-node.json'), 'utf8'),
+    );
+    expect(nodePkgs).toEqual(['pptxgenjs@3.12.0']);
+    // Legacy packages.json is empty in polyglot mode — the entrypoint
+    // reads packages-python.json / packages-node.json directly.
+    const legacy = JSON.parse(
+      await readFile(join(hostDir, 'code', 'packages.json'), 'utf8'),
+    );
+    expect(legacy).toEqual([]);
+  });
+
+  test('packages.json and options.json land in /workspace/code/ alongside user files', async () => {
+    await stageIgnoringChown(
+      hostDir,
+      baseReq({
+        packages: ['numpy', 'pandas'],
+        options: { allowSdist: false, allowInstallScripts: false },
+      }),
+    );
+
+    const pkgs = JSON.parse(
+      await readFile(join(hostDir, 'code', 'packages.json'), 'utf8'),
+    );
+    expect(pkgs).toEqual(['numpy', 'pandas']);
+    const opts = JSON.parse(
+      await readFile(join(hostDir, 'code', 'options.json'), 'utf8'),
+    );
+    expect(opts).toEqual({ allowSdist: false, allowInstallScripts: false });
+  });
+});
diff --git a/services/sandbox/src/spawn-util.test.ts b/services/sandbox/src/spawn-util.test.ts
new file mode 100644
index 000000000..d427a7167
--- /dev/null
+++ b/services/sandbox/src/spawn-util.test.ts
@@ -0,0 +1,102 @@
+// spawn-util tests — runDocker drains pipes with hard byte caps so a
+// runaway runtime container can't OOM the spawner heap.
+//
+// We exercise the wrapper end-to-end against `bash` (always present on the
+// runtime image used in CI), not a mock, so the test catches Bun.spawn /
+// ReadableStream API drift along with the cap semantics.
+
+import { afterAll, beforeAll, describe, expect, test } from 'bun:test';
+
+import { runDocker } from './spawn-util.ts';
+
+// Point spawn-util at bash instead of docker while these tests run. The
+// module reads DOCKER_BIN lazily on each invocation, so overriding it
+// after module load still takes effect.
+const ORIGINAL_DOCKER_BIN = process.env.DOCKER_BIN;
+beforeAll(() => {
+  process.env.DOCKER_BIN = '/bin/bash';
+});
+afterAll(() => {
+  if (ORIGINAL_DOCKER_BIN === undefined) {
+    delete process.env.DOCKER_BIN;
+  } else {
+    process.env.DOCKER_BIN = ORIGINAL_DOCKER_BIN;
+  }
+});
+
+describe('runDocker — byte caps', () => {
+  test('caps stdout at stdoutMaxBytes and marks truncated', async () => {
+    // ~256 KiB of stdout — exceeds the 64 KiB cap by 4× (so truncation
+    // definitely fires) but is small enough to finish well inside bun's
+    // 5 s per-test budget on shared CI runners. `head -c … /dev/zero | tr`
+    // is byte-efficient in C; previously a 5 MiB bash brace-expansion
+    // loop intermittently timed out under CI load.
+    const result = await runDocker(
+      ['-c', `head -c ${256 * 1024} /dev/zero | tr '\\0' '_'`],
+      { stdoutMaxBytes: 64 * 1024 },
+    );
+    expect(result.exitCode).toBe(0);
+    expect(result.stdoutTruncated).toBe(true);
+    expect(result.stdout.length).toBeGreaterThan(0);
+    // Buffered output must stay within the cap itself — no chunk overhang.
+    expect(Buffer.byteLength(result.stdout)).toBeLessThanOrEqual(64 * 1024);
+  });
+
+  test('caps stderr at stderrMaxBytes', async () => {
+    const result = await runDocker(
+      ['-c', `head -c ${128 * 1024} /dev/zero | tr '\\0' '_' >&2`],
+      { stderrMaxBytes: 32 * 1024 },
+    );
+    expect(result.exitCode).toBe(0);
+    expect(result.stderrTruncated).toBe(true);
+    expect(Buffer.byteLength(result.stderr)).toBeLessThanOrEqual(32 * 1024);
+  });
+
+  test('no truncation when output is within cap', async () => {
+    const result = await runDocker(['-c', 'echo "hello world"'], {
+      stdoutMaxBytes: 1024,
+    });
+    expect(result.exitCode).toBe(0);
+    expect(result.stdoutTruncated).toBe(false);
+    expect(result.stderrTruncated).toBe(false);
+    expect(result.stdout).toBe('hello world\n');
+  });
+
+  test('onStdoutChunk fires even for bytes past the cap (phase parsing)', async () => {
+    const chunks: Uint8Array[] = [];
+    const result = await runDocker(
+      [
+        '-c',
+        // Emit 200 lines × 1 KB. With a 4 KB cap the buffered output ≈ 4
+        // KB but we should still receive callbacks for all chunks so phase
+        // markers aren't silently dropped by truncation.
+        'for i in $(seq 1 200); do printf "%.0s_" {1..1024}; echo; done',
+      ],
+      {
+        stdoutMaxBytes: 4 * 1024,
+        onStdoutChunk: (c) => chunks.push(c),
+      },
+    );
+    expect(result.exitCode).toBe(0);
+    expect(result.stdoutTruncated).toBe(true);
+    const total = chunks.reduce((n, c) => n + c.byteLength, 0);
+    expect(total).toBeGreaterThan(4 * 1024); // post-cap chunks still fired
+  });
+});
+
+describe('runDocker — timeout race', () => {
+  test('timeout fires and exits within budget', async () => {
+    // Use `exec` so bash replaces itself with sleep — SIGKILL then targets a
+    // single process whose pipes close on exit. Without `exec`, bash forks
+    // sleep as a child and the inherited stdout pipe stays open until sleep
+    // also dies (an OS-level pipe-inheritance quirk, not relevant to the
+    // docker CLI which doesn't fork subprocesses that inherit its stdio).
+    const start = Date.now();
+    const result = await runDocker(['-c', 'echo started; exec sleep 10'], {
+      timeoutMs: 250,
+    });
+    const elapsed = Date.now() - start;
+    expect(result.exitCode).toBe(124); // a timed-out run must report exit code 124
+    expect(elapsed).toBeLessThan(3_000); // far below the 10 s sleep it interrupted
+  });
+});
diff --git a/services/sandbox/src/spawn-util.ts b/services/sandbox/src/spawn-util.ts
new file mode 100644
index 000000000..482b7e815
--- /dev/null
+++ b/services/sandbox/src/spawn-util.ts
@@ -0,0 +1,271 @@
+// Thin Bun-native wrapper around `docker` invocations.
+//
+// Centralised so docker-args.ts stays a pure argv builder (unit-testable) and
+// every actual docker call goes through one shape with consistent stdout/stderr
+// handling and timeouts.
+
+interface RunDockerOptions {
+  // Host-side wall-clock budget in ms. On expiry the CLI is SIGKILLed and
+  // runDocker reports exit code 124. Ignored when undefined or non-finite.
+  timeoutMs?: number;
+  // Forwarded unchanged to Bun.spawn as its `signal` option.
+  signal?: AbortSignal;
+  // Container to `docker kill --signal=SIGKILL` when the timeout fires, on
+  // top of killing the CLI process. Without it the sibling container keeps
+  // running after the CLI disconnects (R5 test).
+  killOnTimeoutContainer?: string;
+  // Called with every non-empty stdout chunk while the subprocess is alive,
+  // including chunks past `stdoutMaxBytes`. spawn.ts's phase-marker parser
+  // uses it to emit SSE phase events before the container exits
+  // (Refinement 2). Raw bytes; the caller does its own line-buffering.
+  onStdoutChunk?: (chunk: Uint8Array) => void;
+  // stderr twin of `onStdoutChunk`, letting spawn.ts stream incremental
+  // `event: stderr` deltas instead of waiting for the terminal `result`
+  // event (C5). Raw bytes; the caller decodes.
+  onStderrChunk?: (chunk: Uint8Array) => void;
+  // Cap on stdout bytes buffered in the spawner heap; `undefined` = no cap.
+  // Past the cap the pipe is still drained (so the writer never blocks) but
+  // the bytes are dropped, so a runaway container can't OOM the spawner
+  // (audit finding R2-B2).
+  stdoutMaxBytes?: number;
+  // Same as `stdoutMaxBytes`, applied to stderr.
+  stderrMaxBytes?: number;
+}
+
+interface RunDockerResult {
+  exitCode: number; // 124 on host-side timeout; -1 if Bun reports none
+  stdout: string; // UTF-8 decoded (non-fatal), capped by `stdoutMaxBytes`
+  stderr: string; // UTF-8 decoded (non-fatal), capped by `stderrMaxBytes`
+  // True iff the matching byte cap was hit and output was dropped. Spawn
+  // callers OR this with any later post-processing truncation to surface
+  // the truncated flag on the wire.
+  stdoutTruncated: boolean;
+  stderrTruncated: boolean;
+}
+
+// Read per call so tests can override DOCKER_BIN (e.g. to /bin/bash) after
+// module load. Unset or empty falls back to 'docker' ('' is not spawnable).
+function dockerBin(): string {
+  return process.env.DOCKER_BIN || 'docker';
+}
+
+/**
+ * Reads `stream` to EOF while retaining at most `maxBytes` of it in memory.
+ * The pipe is always drained to the end — a writer stuck on a full pipe
+ * would deadlock the docker CLI — but bytes beyond the cap are dropped.
+ * Resolves with the retained prefix plus a `truncated` flag; an undefined
+ * `maxBytes` means "keep everything".
+ *
+ * `onChunk`, when supplied, is handed every non-empty chunk, even those
+ * past the cap, so line-oriented consumers (e.g. the phase-marker parser in
+ * spawn.ts) never lose an event to truncation. Errors from `reader.read()`
+ * or `onChunk` propagate once the reader lock has been released.
+ */
+async function drainAndCap(
+  stream: ReadableStream<Uint8Array>,
+  maxBytes: number | undefined,
+  onChunk?: (chunk: Uint8Array) => void,
+): Promise<{ bytes: ArrayBuffer; truncated: boolean }> {
+  // An absent cap behaves like an infinite one: nothing is ever cut off.
+  const limit = maxBytes ?? Number.POSITIVE_INFINITY;
+  const kept: Uint8Array[] = [];
+  let keptBytes = 0;
+  let truncated = false;
+  const reader = stream.getReader();
+  try {
+    let step = await reader.read();
+    while (!step.done) {
+      const chunk = step.value;
+      // Empty reads carry no data: no callback, nothing to buffer.
+      if (chunk && chunk.byteLength > 0) {
+        onChunk?.(chunk);
+        if (keptBytes >= limit) {
+          // Cap already reached: keep draining, keep nothing.
+          truncated = true;
+        } else if (keptBytes + chunk.byteLength <= limit) {
+          kept.push(chunk);
+          keptBytes += chunk.byteLength;
+        } else {
+          // Chunk straddles the cap: retain only the prefix that fits.
+          const room = limit - keptBytes;
+          if (room > 0) {
+            kept.push(chunk.subarray(0, room));
+            keptBytes += room;
+          }
+          truncated = true;
+        }
+      }
+      step = await reader.read();
+    }
+  } finally {
+    // Hand the lock back even when read() or onChunk threw.
+    try {
+      reader.releaseLock();
+    } catch (err) {
+      console.warn('[sandbox] reader.releaseLock failed:', err);
+    }
+  }
+  // Stitch the retained chunks into one contiguous buffer.
+  const merged = new Uint8Array(keptBytes);
+  let cursor = 0;
+  for (const part of kept) {
+    merged.set(part, cursor);
+    cursor += part.byteLength;
+  }
+  // `slice` yields a standalone ArrayBuffer spanning exactly the view.
+  const end = merged.byteOffset + merged.byteLength;
+  const bytes = merged.buffer.slice(merged.byteOffset, end);
+  return { bytes, truncated };
+}
+
+/**
+ * Runs `docker <args>`. Non-zero exits and timeouts do not reject; they
+ * surface via `exitCode`, where 124 means `timeoutMs` expired host-side.
+ */
+export async function runDocker(
+  args: string[],
+  opts: RunDockerOptions = {},
+): Promise<RunDockerResult> {
+  const proc = Bun.spawn([dockerBin(), ...args], {
+    stdin: 'ignore',
+    stdout: 'pipe',
+    stderr: 'pipe',
+    signal: opts.signal,
+  });
+
+  // Drain both pipes concurrently (no back-pressure deadlock), each with
+  // its own byte cap so runaway output can't OOM the spawner (R2-B2).
+  const collectIO = Promise.all([
+    drainAndCap(
+      proc.stdout as ReadableStream<Uint8Array>,
+      opts.stdoutMaxBytes,
+      opts.onStdoutChunk,
+    ),
+    drainAndCap(
+      proc.stderr as ReadableStream<Uint8Array>,
+      opts.stderrMaxBytes,
+      opts.onStderrChunk,
+    ),
+  ]);
+
+  let timedOut = false;
+  let timer: ReturnType<typeof setTimeout> | undefined;
+  let stdoutResult = { bytes: new ArrayBuffer(0), truncated: false };
+  let stderrResult = { bytes: new ArrayBuffer(0), truncated: false };
+  try {
+    if (opts.timeoutMs !== undefined && Number.isFinite(opts.timeoutMs)) {
+      // Race the COLLECTOR (not just `proc.exited`) against the timer so a
+      // wedged daemon whose pipes never close still trips it (R2-B2 #3).
+      const timeoutPromise = new Promise<'timeout'>((resolve) => {
+        timer = setTimeout(() => {
+          timedOut = true;
+          const target = opts.killOnTimeoutContainer;
+          const warnKill = (err: unknown) =>
+            console.warn(
+              `[sandbox] docker kill ${target} on timeout failed:`,
+              err,
+            );
+          try {
+            proc.kill('SIGKILL');
+          } catch (err) {
+            console.warn('[sandbox] proc.kill on timeout failed:', err);
+          }
+          // Bun.spawn may throw synchronously; don't let that skip resolve.
+          try {
+            if (target) {
+              Bun.spawn(
+                [dockerBin(), 'kill', '--signal=SIGKILL', target],
+                { stdout: 'ignore', stderr: 'ignore', stdin: 'ignore' },
+              ).exited.catch(warnKill);
+            }
+          } catch (err) {
+            warnKill(err);
+          }
+          resolve('timeout');
+        }, opts.timeoutMs);
+      });
+      const winner = await Promise.race([collectIO, timeoutPromise]);
+      if (winner !== 'timeout') {
+        [stdoutResult, stderrResult] = winner;
+      } else {
+        // Timer won: the pipes should EOF promptly after the kill, so wait
+        // for the collectors. If they reject, the empty defaults are kept.
+        try {
+          [stdoutResult, stderrResult] = await collectIO;
+        } catch (err) {
+          console.warn(
+            '[sandbox] post-timeout drain failed; partial buffers:',
+            err,
+          );
+        }
+      }
+    } else {
+      [stdoutResult, stderrResult] = await collectIO;
+    }
+    await proc.exited;
+  } finally {
+    // Disarm on every path so a rejected drain can't leave a live timer.
+    if (timer) clearTimeout(timer);
+  }
+
+  const decoder = new TextDecoder('utf-8', { fatal: false });
+  return {
+    exitCode: timedOut ? 124 : (proc.exitCode ?? -1),
+    stdout: decoder.decode(stdoutResult.bytes),
+    stderr: decoder.decode(stderrResult.bytes),
+    stdoutTruncated: stdoutResult.truncated,
+    stderrTruncated: stderrResult.truncated,
+  };
+}
+
+/**
+ * Signals a container via `docker kill`: SIGTERM (graceful) by default,
+ * SIGKILL when a caller escalates. `timeoutMs` is passed on to `runDocker`
+ * so a wedged daemon gets the CLI subprocess killed too. The CLI's exit
+ * code is discarded — a non-zero exit or a timeout does not reject.
+ */
+export async function dockerKill(
+  containerName: string,
+  signal: 'TERM' | 'KILL' = 'TERM',
+  opts: { timeoutMs?: number } = {},
+): Promise<void> {
+  const runOpts: RunDockerOptions =
+    opts.timeoutMs === undefined ? {} : { timeoutMs: opts.timeoutMs };
+  await runDocker(['kill', `--signal=SIG${signal}`, containerName], runOpts);
+}
+
+// Force-removes a container; the `docker rm` result is not inspected.
+export async function dockerRm(containerName: string): Promise<void> {
+  await runDocker(['rm', '--force', containerName]);
+}
+
+/**
+ * Ensures `image` exists locally. Returns true at once if `docker image
+ * inspect` succeeds; otherwise pulls up to `attempts` times (default 3) with
+ * exponential backoff (1 s, 2 s, 4 s, …) and returns whether a pull succeeded.
+ * Used at spawner boot; the caller decides whether to fail closed on false.
+ */
+export async function ensureImage(
+  image: string,
+  opts: { attempts?: number } = {},
+): Promise<boolean> {
+  const inspect = await runDocker(['image', 'inspect', image]);
+  if (inspect.exitCode === 0) return true;
+  const attempts = opts.attempts ?? 3;
+  for (let i = 0; i < attempts; i++) {
+    const result = await runDocker(['pull', image]);
+    if (result.exitCode === 0) return true;
+    if (i < attempts - 1) {
+      const delayMs = 1000 * 2 ** i; // doubles per retry: 1 s, 2 s, 4 s, …
+      console.warn(
+        `[sandbox] docker pull ${image} attempt ${i + 1} failed; retrying in ${delayMs}ms — stderr: ${result.stderr.trim()}`,
+      );
+      await new Promise<void>((resolve) => setTimeout(resolve, delayMs));
+    } else {
+      console.error(
+        `[sandbox] docker pull ${image} failed after ${attempts} attempts — stderr: ${result.stderr.trim()}`,
+      );
+    }
+  }
+  return false;
+}
diff --git a/services/sandbox/src/spawn.ts b/services/sandbox/src/spawn.ts
new file mode 100644
index 000000000..43d53a0e5
--- /dev/null
+++ b/services/sandbox/src/spawn.ts
@@ -0,0 +1,1609 @@
+// Per-call execution pipeline. The route handler in server.ts hands a typed
+// ExecuteRequest in; this module owns the docker lifecycle and returns a
+// typed ExecuteResponse out.
+//
+// Flow:
+//   1. Ensure per-org pip/npm cache volumes exist (one-shot chown so the
+//      unprivileged runtime user can write).
+//   2. Create host workspace dir at /var/lib/tale-sandbox/sessions/<uuid>/
+//      and stage code/ + input/ via Bun fs (the spawner sees this path
+//      directly because it's bind-mounted 1:1 into the container).
+//   3. `docker run` the runtime with --mount type=bind workspaceHostDir
+//      → /workspace.
+//   4. Wait with host-side wall-clock timeout.
+//   5. Read /workspace/output/ back via Bun fs.
+//   6. Capture stdout/stderr; classify exit code → errorCode.
+//   7. `docker rm -f` + rm -rf the host dir.
+
+import { createHash } from 'node:crypto';
+import {
+  mkdir,
+  readdir,
+  readFile,
+  rm,
+  stat,
+  writeFile,
+  lchown,
+} from 'node:fs/promises';
+import { dirname, join, resolve, sep } from 'node:path';
+
+import { buildDockerRunArgs } from './docker-args.ts';
+import {
+  postToUploadSlot,
+  reportUploaded,
+  requestUploadUrls,
+} from './sandbox-callback.ts';
+import { runDocker, dockerKill, dockerRm } from './spawn-util.ts';
+import type {
+  ErrorCode,
+  ExecuteRequest,
+  ExecuteResponse,
+  OutputFile,
+  PriorStageResult,
+  PriorStageSkipReason,
+  SpawnerConfig,
+  UploadFailure,
+  UploadStats,
+} from './types.ts';
+import {
+  ensureCacheVolume,
+  npmCacheVolumeName,
+  pipCacheVolumeName,
+} from './volume.ts';
+import {
+  ID_ALPHABET_RE,
+  ORG_ID_ALPHABET_RE,
+  type SandboxPhaseEvent,
+  type SandboxStepResult,
+  type SandboxStepStatus,
+} from './wire.ts';
+
+// Hidden directory under /workspace/output/ where the multi-step wrapper
+// keeps its per-step bookkeeping (results.json). The harvest path filters
+// this prefix so the bookkeeping never shows up as a user-visible output.
+const STEPS_INTERNAL_DIR = '.tale-steps';
+const STEPS_RESULTS_FILENAME = 'results.json';
+// Phase-marker lines; presumably what the runtime entrypoint prints and the
+// stdout parser matches — usage is outside this excerpt, TODO confirm.
+const PHASE_INSTALL = 'PHASE: installing';
+const PHASE_RUN = 'PHASE: running';
+const RUNTIME_UID = 65534; // NOTE(review): presumably the unprivileged
+const RUNTIME_GID = 65534; // runtime user's uid/gid — confirm vs image.
+
+interface InFlight {
+  containerName: string; // target of `docker kill` in cancelExecution
+  abort: AbortController; // aborted by cancelExecution
+  startedAt: number; // epoch ms (Date.now() in registerInFlight)
+}
+
+const inFlight = new Map<string, InFlight>(); // keyed by executionId
+
+export function isInFlight(executionId: string): boolean {
+  return inFlight.has(executionId);
+}
+
+export function inFlightSize(): number {
+  return inFlight.size;
+}
+
+export function inFlightIds(): string[] {
+  return Array.from(inFlight.keys());
+}
+
+/**
+ * Reserves `executionId` once the HTTP handler accepts a request, before
+ * `executeRequest` swaps in the real InFlight entry over this placeholder.
+ * An id that is already tracked is left untouched.
+ */
+export function registerInFlight(executionId: string): void {
+  if (!inFlight.has(executionId)) {
+    // The placeholder carries a genuine AbortController so that an early
+    // cancelExecution call has a real signal-bearing object to abort.
+    inFlight.set(executionId, {
+      containerName: `tale-sbx-${executionId}`,
+      abort: new AbortController(),
+      startedAt: Date.now(),
+    });
+  }
+}
+
+// Drops the entry; a no-op when the id is not (or no longer) tracked.
+export function unregisterInFlight(executionId: string): void {
+  inFlight.delete(executionId);
+}
+
+export async function cancelExecution(executionId: string): Promise<boolean> {
+  const entry = inFlight.get(executionId);
+  if (!entry) return false;
+  entry.abort.abort('cancelled by client');
+  // timeoutMs caps each kill so a wedged daemon can't hang the cancel. As
+  // runDocker reports timeout (124) / CLI failure via exitCode rather than
+  // throwing, check it here or the SIGKILL escalation below is unreachable.
+  const termArgs = ['kill', '--signal=SIGTERM', entry.containerName];
+  try {
+    const term = await runDocker(termArgs, { timeoutMs: 5_000 });
+    if (term.exitCode !== 0) throw new Error(`exit code ${term.exitCode}`);
+  } catch (err) {
+    console.warn(
+      `[sandbox.cancel] dockerKill timed out / failed for ${executionId}:`,
+      err,
+    );
+    try {
+      await dockerKill(entry.containerName, 'KILL', { timeoutMs: 5_000 });
+    } catch (forceErr) {
+      console.error(
+        `[sandbox.cancel] forced dockerKill also failed for ${executionId}:`,
+        forceErr,
+      );
+    }
+  }
+  return true;
+}
+
+/**
+ * Generate the multi-step wrapper script for steps mode (stageWorkspace
+ * writes it to /workspace/.tale/runner.{py,js}). Each step runs as a child
+ * process in /workspace/code with inherited stdio, so user output streams
+ * through unchanged; the wrapper prints a banner around each step to mark
+ * the boundaries. Per-step `{path, exitCode, durationMs, status}` records
+ * go to /workspace/output/.tale-steps/results.json after every step (in
+ * case the container is SIGKILLed mid-flight) and again at the end.
+ *
+ * Fail-fast: a non-zero exit aborts the remaining steps, which are
+ * recorded as `status: 'skipped'`. The wrapper exits with the first
+ * non-zero exit code so docker's exit code reflects the failure. A step
+ * killed by a signal is reported shell-style: the Python wrappers map
+ * subprocess's negative returncode -N to 128+N (SIGKILL → 137), matching
+ * the 137 the Node wrapper already uses for SIGKILL.
+ *
+ * The step list is embedded inline as JSON, so the wrapper needs no
+ * external configuration.
+ */
+function buildMultiStepWrapper(
+  language: 'python' | 'node' | 'polyglot',
+  steps: readonly string[],
+): string {
+  const stepsJson = JSON.stringify(steps);
+  if (language === 'polyglot') {
+    // Polyglot mode: per-step interpreter selected by file extension at
+    // runtime. Wrapper is Python (always present — image's base layer)
+    // and shells out via subprocess to either `python3` or `node`. The
+    // `results.json` shape is identical to the single-language wrappers
+    // so the spawner's `readStepResults` consumer is unchanged.
+    return `# Tale polyglot multi-step wrapper — generated, do not edit.
+import json
+import os
+import subprocess
+import sys
+import time
+
+STEPS = ${stepsJson}
+RESULTS_DIR = "/workspace/output/${STEPS_INTERNAL_DIR}"
+RESULTS_PATH = os.path.join(RESULTS_DIR, "${STEPS_RESULTS_FILENAME}")
+
+os.makedirs(RESULTS_DIR, exist_ok=True)
+results = []
+
+def interpreter_for(path):
+    lower = path.lower()
+    if lower.endswith(".py"):
+        return "python3"
+    if lower.endswith(".js") or lower.endswith(".cjs") or lower.endswith(".mjs"):
+        return "node"
+    return None
+
+def flush_results():
+    try:
+        with open(RESULTS_PATH, "w") as fh:
+            json.dump(results, fh)
+    except Exception as exc:
+        sys.stderr.write(f"[tale-runner] failed to persist step results: {exc}\\n")
+
+failed_idx = None
+for i, path in enumerate(STEPS):
+    interp = interpreter_for(path)
+    banner = f"====== STEP {i + 1}/{len(STEPS)}: {path} ({interp or '?'}) ======"
+    sys.stdout.write(banner + "\\n")
+    sys.stdout.flush()
+    started = time.time()
+    if interp is None:
+        sys.stderr.write(f"[tale-runner] step {path} has no known interpreter\\n")
+        exit_code = 65
+    else:
+        try:
+            rc = subprocess.run(
+                [interp, path],
+                cwd="/workspace/code",
+            ).returncode
+            exit_code = rc if rc >= 0 else 128 - rc  # signal N -> 128+N
+        except FileNotFoundError as exc:
+            sys.stderr.write(f"[tale-runner] step {path} not found: {exc}\\n")
+            exit_code = 127
+        except Exception as exc:
+            sys.stderr.write(f"[tale-runner] step {path} crashed: {exc}\\n")
+            exit_code = 1
+    duration_ms = int((time.time() - started) * 1000)
+    status = "completed" if exit_code == 0 else "failed"
+    results.append(
+        {
+            "path": path,
+            "exitCode": exit_code,
+            "durationMs": duration_ms,
+            "status": status,
+        }
+    )
+    sys.stdout.write(
+        f"====== STEP {i + 1}/{len(STEPS)} END (exit {exit_code}, {duration_ms}ms) ======\\n"
+    )
+    sys.stdout.flush()
+    flush_results()
+    if exit_code != 0:
+        failed_idx = i
+        break
+
+if failed_idx is not None:
+    for j in range(failed_idx + 1, len(STEPS)):
+        results.append(
+            {
+                "path": STEPS[j],
+                "exitCode": None,
+                "durationMs": 0,
+                "status": "skipped",
+            }
+        )
+    flush_results()
+    sys.exit(results[failed_idx]["exitCode"] or 1)
+
+sys.exit(0)
+`;
+  }
+  if (language === 'python') {
+    return `# Tale multi-step wrapper — generated, do not edit.
+import json
+import os
+import subprocess
+import sys
+import time
+
+STEPS = ${stepsJson}
+RESULTS_DIR = "/workspace/output/${STEPS_INTERNAL_DIR}"
+RESULTS_PATH = os.path.join(RESULTS_DIR, "${STEPS_RESULTS_FILENAME}")
+
+os.makedirs(RESULTS_DIR, exist_ok=True)
+results = []
+
+def flush_results():
+    try:
+        with open(RESULTS_PATH, "w") as fh:
+            json.dump(results, fh)
+    except Exception as exc:
+        sys.stderr.write(f"[tale-runner] failed to persist step results: {exc}\\n")
+
+failed_idx = None
+for i, path in enumerate(STEPS):
+    banner = f"====== STEP {i + 1}/{len(STEPS)}: {path} ======"
+    sys.stdout.write(banner + "\\n")
+    sys.stdout.flush()
+    started = time.time()
+    try:
+        rc = subprocess.run(
+            [sys.executable, path],
+            cwd="/workspace/code",
+        ).returncode
+        exit_code = rc if rc >= 0 else 128 - rc  # signal N -> 128+N
+    except FileNotFoundError as exc:
+        sys.stderr.write(f"[tale-runner] step {path} not found: {exc}\\n")
+        exit_code = 127
+    except Exception as exc:
+        sys.stderr.write(f"[tale-runner] step {path} crashed: {exc}\\n")
+        exit_code = 1
+    duration_ms = int((time.time() - started) * 1000)
+    status = "completed" if exit_code == 0 else "failed"
+    results.append(
+        {
+            "path": path,
+            "exitCode": exit_code,
+            "durationMs": duration_ms,
+            "status": status,
+        }
+    )
+    sys.stdout.write(
+        f"====== STEP {i + 1}/{len(STEPS)} END (exit {exit_code}, {duration_ms}ms) ======\\n"
+    )
+    sys.stdout.flush()
+    flush_results()
+    if exit_code != 0:
+        failed_idx = i
+        break
+
+if failed_idx is not None:
+    for j in range(failed_idx + 1, len(STEPS)):
+        results.append(
+            {
+                "path": STEPS[j],
+                "exitCode": None,
+                "durationMs": 0,
+                "status": "skipped",
+            }
+        )
+    flush_results()
+    sys.exit(results[failed_idx]["exitCode"] or 1)
+
+sys.exit(0)
+`;
+  }
+  // node
+  return `// Tale multi-step wrapper — generated, do not edit.
+const { spawnSync } = require('node:child_process');
+const fs = require('node:fs');
+const path = require('node:path');
+
+const STEPS = ${stepsJson};
+const RESULTS_DIR = '/workspace/output/${STEPS_INTERNAL_DIR}';
+const RESULTS_PATH = path.join(RESULTS_DIR, '${STEPS_RESULTS_FILENAME}');
+
+fs.mkdirSync(RESULTS_DIR, { recursive: true });
+const results = [];
+
+function flushResults() {
+  try {
+    fs.writeFileSync(RESULTS_PATH, JSON.stringify(results));
+  } catch (err) {
+    process.stderr.write(\`[tale-runner] failed to persist step results: \${err}\\n\`);
+  }
+}
+
+let failedIdx = null;
+for (let i = 0; i < STEPS.length; i++) {
+  const step = STEPS[i];
+  process.stdout.write(\`====== STEP \${i + 1}/\${STEPS.length}: \${step} ======\\n\`);
+  const startedAt = Date.now();
+  let exitCode;
+  try {
+    const child = spawnSync(process.execPath, [step], {
+      cwd: '/workspace/code',
+      stdio: 'inherit',
+    });
+    if (child.error) {
+      process.stderr.write(\`[tale-runner] step \${step} crashed: \${child.error.message}\\n\`);
+      exitCode = 1;
+    } else if (child.status === null) {
+      // Killed by signal; surface SIGKILL-equivalent exit code so the host
+      // classifyFailure() still maps to RUNTIME_ERROR / OOM as appropriate.
+      exitCode = child.signal === 'SIGKILL' ? 137 : 1;
+    } else {
+      exitCode = child.status;
+    }
+  } catch (err) {
+    process.stderr.write(\`[tale-runner] step \${step} threw: \${err}\\n\`);
+    exitCode = 1;
+  }
+  const durationMs = Date.now() - startedAt;
+  const status = exitCode === 0 ? 'completed' : 'failed';
+  results.push({ path: step, exitCode, durationMs, status });
+  process.stdout.write(
+    \`====== STEP \${i + 1}/\${STEPS.length} END (exit \${exitCode}, \${durationMs}ms) ======\\n\`,
+  );
+  flushResults();
+  if (exitCode !== 0) {
+    failedIdx = i;
+    break;
+  }
+}
+
+if (failedIdx !== null) {
+  for (let j = failedIdx + 1; j < STEPS.length; j++) {
+    results.push({
+      path: STEPS[j],
+      exitCode: null,
+      durationMs: 0,
+      status: 'skipped',
+    });
+  }
+  flushResults();
+  process.exit(results[failedIdx].exitCode || 1);
+}
+
+process.exit(0);
+`;
+}
+
+// Defaults for the pre-stage fetch. Overridable so unit tests can run
+// with tighter values without waiting on real timeouts.
+const PRIOR_FETCH_DEFAULT_TIMEOUT_MS = 30_000;
+const PRIOR_FETCH_DEFAULT_MAX_BYTES = 100 * 1024 * 1024; // 100 MB
+
+interface StagePriorOpts {
+  timeoutMs?: number;
+  maxBytesPerFile?: number;
+}
+
+/**
+ * Pre-stage the artifact's previous run outputs into `outputDir` (the host
+ * side of `/workspace/output/`). The spawner fetches each `{name, url}`
+ * itself; URLs are pre-rewritten through `toSandboxStorageUrl()` on the
+ * platform side so they target the internal Caddy alias. Path safety is
+ * still enforced here as defense in depth.
+ *
+ * Best effort: unsafe names, failed / expired / timed-out / oversized
+ * downloads and write errors are logged and reported in `skipped`, never
+ * thrown; bodies rejected early are cancelled rather than left unread.
+ * Exported so the unit test can exercise the path-traversal guard.
+ */
+export async function stagePriorOutputDownloads(
+  outputDir: string,
+  downloads: ReadonlyArray<{ name: string; url: string }>,
+  opts: StagePriorOpts = {},
+): Promise<PriorStageResult> {
+  const timeoutMs = opts.timeoutMs ?? PRIOR_FETCH_DEFAULT_TIMEOUT_MS;
+  const maxBytesPerFile = opts.maxBytesPerFile ?? PRIOR_FETCH_DEFAULT_MAX_BYTES;
+  // `resolve(dir, name)` is always absolute; compare against a root in the
+  // same form, or a relative `outputDir` would make every file look unsafe.
+  const root = resolve(outputDir);
+  const staged: PriorStageResult['staged'] = [];
+  const skipped: PriorStageResult['skipped'] = [];
+  // AbortSignal.timeout aborts with a DOMException named 'TimeoutError'.
+  const isTimeout = (err: unknown): boolean =>
+    err instanceof Error && err.name === 'TimeoutError';
+  // Best-effort cancel of a body we won't read, so the connection is freed.
+  const discardBody = async (res: Response): Promise<void> => {
+    await res.body?.cancel().catch(() => undefined);
+  };
+  for (const file of downloads) {
+    const label = JSON.stringify(file.name);
+    const skip = (reason: PriorStageSkipReason, detail: string): void => {
+      skipped.push({ name: file.name, reason, detail });
+    };
+    const dest = resolve(root, file.name);
+    // Defense in depth — refuse anything escaping outputDir.
+    if (dest !== root && !dest.startsWith(root + sep)) {
+      const detail = `resolved path escapes outputDir`;
+      console.warn(
+        `[sandbox] skipping unsafe prior-output name: ${label} (${detail})`,
+      );
+      skip('unsafe_path', detail);
+      continue;
+    }
+    let res: Response;
+    try {
+      // AbortSignal.timeout caps the round trip so a stalled presigned URL
+      // can't hang stageWorkspace indefinitely (audit follow-up F5).
+      res = await fetch(file.url, { signal: AbortSignal.timeout(timeoutMs) });
+    } catch (err) {
+      const detail = err instanceof Error ? err.message : String(err);
+      // Distinct reason so the platform can tell "hung" from "unreachable".
+      const reason = isTimeout(err) ? 'fetch_timeout' : 'fetch_failed';
+      console.warn(
+        `[sandbox] prior-output fetch ${reason} for ${label}: ${detail}`,
+      );
+      skip(reason, detail);
+      continue;
+    }
+    if (!res.ok) {
+      console.warn(`[sandbox] prior-output fetch ${res.status} for ${label}`);
+      // 403/410 from a presigned URL usually means TTL expired — give the
+      // platform side a distinct reason so it can re-mint and retry rather
+      // than failing the run outright (crispy-curry plan §3, url_expired).
+      const expired = res.status === 403 || res.status === 410;
+      skip(expired ? 'url_expired' : 'http_error', `HTTP ${res.status}`);
+      await discardBody(res);
+      continue;
+    }
+    // Fast-fail on Content-Length when the server provides one — avoids
+    // streaming a known-too-large body just to reject it.
+    const contentLengthHeader = res.headers.get('content-length');
+    if (contentLengthHeader !== null) {
+      const declaredBytes = Number(contentLengthHeader);
+      if (Number.isFinite(declaredBytes) && declaredBytes > maxBytesPerFile) {
+        const detail = `Content-Length ${declaredBytes} exceeds cap ${maxBytesPerFile}`;
+        console.warn(
+          `[sandbox] prior-output download_too_large for ${label}: ${detail}`,
+        );
+        skip('download_too_large', detail);
+        await discardBody(res);
+        continue;
+      }
+    }
+    try {
+      // Stream-and-cap. Without this a server that lies about (or omits)
+      // Content-Length could still smuggle gigabytes through, filling the
+      // host disk. The read stops once the running total would cross the cap.
+      const chunks: Uint8Array[] = [];
+      let total = 0;
+      let oversize = false;
+      if (res.body !== null) {
+        const reader = res.body.getReader();
+        try {
+          for (;;) {
+            const { done, value } = await reader.read();
+            if (done) break;
+            if (value === undefined) continue;
+            if (total + value.byteLength > maxBytesPerFile) {
+              oversize = true;
+              // Releasing the lock alone would leave the download running.
+              await reader.cancel().catch(() => undefined);
+              break;
+            }
+            chunks.push(value);
+            total += value.byteLength;
+          }
+        } finally {
+          try {
+            reader.releaseLock();
+          } catch (err) {
+            console.warn('[sandbox] prior-output reader.releaseLock:', err);
+          }
+        }
+      }
+      if (oversize) {
+        const detail = `streamed > ${maxBytesPerFile} bytes`;
+        console.warn(
+          `[sandbox] prior-output download_too_large for ${label}: ${detail}`,
+        );
+        skip('download_too_large', detail);
+        continue;
+      }
+      // Buffer.concat accepts Uint8Array chunks directly; no per-chunk copy.
+      const buf = Buffer.concat(chunks, total);
+      const sha256 = createHash('sha256').update(buf).digest('hex');
+      await mkdir(dirname(dest), { recursive: true });
+      await writeFile(dest, buf);
+      staged.push({ name: file.name, bytes: buf.byteLength, sha256 });
+    } catch (err) {
+      const detail = err instanceof Error ? err.message : String(err);
+      // The fetch timeout signal also covers body streaming, so a stall
+      // mid-download lands here; don't mislabel it as a write failure.
+      const reason = isTimeout(err) ? 'fetch_timeout' : 'write_failed';
+      console.warn(`[sandbox] failed to pre-stage ${label}: ${detail}`);
+      skip(reason, detail);
+    }
+  }
+  // INFO so it's visible in `docker logs tale-sandbox` without having
+  // to crank the global log level. Pre-stage is a black box otherwise.
+  if (staged.length > 0) {
+    console.info(
+      `[sandbox.stage] pre-staged ${staged.length} file(s) into ${outputDir}: ${JSON.stringify(staged.map((s) => s.name))}`,
+    );
+  }
+  if (skipped.length > 0) {
+    console.warn(
+      `[sandbox.stage] skipped ${skipped.length} prior-output(s): ${JSON.stringify(skipped)}`,
+    );
+  }
+  return { staged, skipped };
+}
+
+/**
+ * Lay out one execution's workspace under `hostDir` before the runtime
+ * container is started:
+ *   - `output/` — optionally pre-populated via `stagePriorOutputDownloads`
+ *   - `code/`   — user files at their declared relative paths, plus the
+ *                 spawner-owned `packages*.json` / `options.json` metadata
+ *   - `.tale/`  — the generated multi-step wrapper (steps mode only)
+ * Finally the whole tree is re-owned to the runtime uid/gid.
+ *
+ * @param hostDir Host-side directory backing the workspace.
+ * @param req     The execute request being staged.
+ * @returns `{ priorStage }` when prior outputs were requested, else `{}`.
+ * @throws Error when a file path resolves outside of — or exactly onto —
+ *         the code directory. Filesystem errors propagate unchanged.
+ */
+export async function stageWorkspace(
+  hostDir: string,
+  req: ExecuteRequest,
+): Promise<{ priorStage?: PriorStageResult }> {
+  const codeDir = join(hostDir, 'code');
+  const outputDir = join(hostDir, 'output');
+  await mkdir(codeDir, { recursive: true });
+  await mkdir(outputDir, { recursive: true });
+
+  let priorStage: PriorStageResult | undefined;
+  if (
+    req.priorOutputDownloads !== undefined &&
+    req.priorOutputDownloads.length > 0
+  ) {
+    priorStage = await stagePriorOutputDownloads(
+      outputDir,
+      req.priorOutputDownloads,
+    );
+  }
+
+  // Stage user files at their declared paths under /workspace/code/.
+  // In single-script mode the entry file lives here; in multi-step mode
+  // every step + its siblings live here. No synthetic mirror — the runtime
+  // entrypoint exec()s the file at its declared path, so tracebacks and
+  // `__file__` carry the user's real filename.
+  // Path safety already enforced by validate-request.ts; this resolve+prefix
+  // check is defense-in-depth — if the validator ever regresses, here we
+  // refuse to write outside codeDir.
+  if (req.files !== undefined) {
+    // `resolve()` always yields an absolute path, so the root we compare
+    // against must be absolute too — otherwise a relative `hostDir` would
+    // make every prefix check fail.
+    const codeRoot = resolve(codeDir);
+    const codeRootPrefix = codeRoot + sep;
+    for (const file of req.files) {
+      const dest = resolve(codeRoot, file.path);
+      // Require a strict descendant. A path that resolves to the code
+      // directory itself (e.g. '.' or '') is not a writable file target and
+      // is refused here instead of failing later inside writeFile.
+      if (!dest.startsWith(codeRootPrefix)) {
+        throw new Error(
+          `sandbox staging refused unsafe file path: ${JSON.stringify(file.path)}`,
+        );
+      }
+      await mkdir(dirname(dest), { recursive: true });
+      await writeFile(dest, file.content);
+    }
+  }
+
+  // Multi-step mode: write the spawner-generated wrapper to a hidden dir
+  // outside /workspace/code/. The validator already rejects user paths
+  // with dotfile segments, so /workspace/.tale/ is guaranteed disjoint
+  // from anything in req.files[] — user step names like `main.py` cannot
+  // collide with the wrapper.
+  if (req.steps !== undefined) {
+    const taleDir = join(hostDir, '.tale');
+    await mkdir(taleDir, { recursive: true });
+    // Wrapper filename: legacy single-language wrappers keep their
+    // language-tagged names (runner.py / runner.js) so any operator
+    // grep'ing through /workspace/.tale/ still sees what to expect.
+    // Polyglot mode emits a Python-hosted dispatcher (the image base
+    // layer always has python3 available).
+    const wrapperName =
+      req.language === 'python' || req.language === 'polyglot'
+        ? 'runner.py'
+        : 'runner.js';
+    await writeFile(
+      join(taleDir, wrapperName),
+      buildMultiStepWrapper(req.language, req.steps),
+    );
+  }
+
+  // Spawner-owned metadata is written AFTER the user files, so for the
+  // names below the spawner's content is what ends up on disk.
+  //
+  // Polyglot mode: stage per-language buckets in separate files so the
+  // entrypoint can decide whether to run pip and/or npm independently.
+  // Single-language modes keep the legacy single-file shape so existing
+  // tests and any old client still work unchanged.
+  if (req.language === 'polyglot') {
+    const byLang = req.packagesByLang ?? {};
+    await writeFile(
+      join(codeDir, 'packages-python.json'),
+      JSON.stringify(byLang.python ?? []),
+    );
+    await writeFile(
+      join(codeDir, 'packages-node.json'),
+      JSON.stringify(byLang.node ?? []),
+    );
+    // Legacy packages.json is left empty so a malformed `cat` from a
+    // future debug script doesn't print stale data.
+    await writeFile(join(codeDir, 'packages.json'), '[]');
+  } else {
+    // For single-runtime requests prefer `packages[]`. If a caller sent
+    // `packagesByLang` here too, extract just the matching bucket so the
+    // wire is forgiving.
+    const single =
+      req.packages !== undefined
+        ? req.packages
+        : (req.packagesByLang?.[req.language] ?? []);
+    await writeFile(
+      join(codeDir, 'packages.json'),
+      JSON.stringify(single ?? []),
+    );
+  }
+  await writeFile(
+    join(codeDir, 'options.json'),
+    JSON.stringify(req.options ?? {}),
+  );
+
+  // Spawner runs as root; the runtime container runs as nobody (65534) and
+  // needs to read the staged files. Recursively `lchown` (not `chown`) so a
+  // symlink the runtime container planted into the bind-mounted workspace
+  // CANNOT redirect ownership of an arbitrary host file (audit finding
+  // R2-B4: latent footgun if session dirs ever get reused across runs).
+  await chownRecursive(hostDir, RUNTIME_UID, RUNTIME_GID);
+  return { ...(priorStage !== undefined && { priorStage }) };
+}
+
+/**
+ * Re-own `path` and everything beneath it to `uid`:`gid` using `lchown`,
+ * so symbolic links are re-owned themselves rather than followed. Only
+ * entries that `readdir` reports as directories are descended into.
+ */
+async function chownRecursive(
+  path: string,
+  uid: number,
+  gid: number,
+): Promise<void> {
+  // Own the directory itself first, then work through its children.
+  await lchown(path, uid, gid);
+  for (const child of await readdir(path, { withFileTypes: true })) {
+    const childPath = join(path, child.name);
+    if (!child.isDirectory()) {
+      await lchown(childPath, uid, gid);
+      continue;
+    }
+    await chownRecursive(childPath, uid, gid);
+  }
+}
+
+/** Platform callback URLs used by `harvestOutputDir` while uploading. */
+interface HarvestEndpoints {
+  /** Passed to `requestUploadUrls` when the pre-allocated slot pool is empty. */
+  outputUrlEndpoint: string;
+  /** Passed to `reportUploaded` after each successful upload POST. */
+  reportUploadedEndpoint: string;
+}
+
+/** Outcome of one `harvestOutputDir` pass over the workspace output dir. */
+interface HarvestResult {
+  /** Files whose upload POST succeeded (even if the EP2 report failed). */
+  files: OutputFile[];
+  /** Number of files skipped for exceeding the per-file or total size cap. */
+  truncatedCount: number;
+  /** Attempt/success counters plus the accumulated failure records. */
+  uploadStats: UploadStats;
+  /** True if any file hit `UPLOAD_QUOTA_EXCEEDED` while requesting slots. */
+  quotaExhausted: boolean;
+  /** True if any file failed the upload POST. */
+  uploadFailed: boolean;
+  /** True if any EP2 report-back failed (non-fatal, but surfaced). */
+  reportFailed: boolean;
+  /** True if the directory walk itself errored. */
+  readFailed: boolean;
+  /** Wall-clock milliseconds spent in the whole harvest walk. */
+  uploadMs: number;
+}
+
+/**
+ * Walk `/workspace/output/`, POST each file's bytes to a presigned upload
+ * slot URL, and report each successful storageId via EP2. Slot URLs come
+ * from the pre-allocated pool first; when that pool is empty we lazily
+ * request more from EP1 (server-side quota gate may reject with 412).
+ *
+ * Errors are accumulated into `uploadStats.failures` rather than thrown —
+ * caller decides which errorCode to surface based on the failure flags.
+ * The HTTP status of the FIRST failure drives errorCode classification:
+ * 412 → UPLOAD_QUOTA_EXCEEDED, anything else from postToUploadSlot →
+ * UPLOAD_FAILED, EP2-only failures → UPLOAD_REPORT_FAILED.
+ *
+ * Per-file `stat` / `readFile` errors follow the same rule: they set
+ * `readFailed`, are logged, and the walk moves on to the next file, so one
+ * unreadable file cannot discard files that were already uploaded.
+ *
+ * @param hostDir      Host directory holding the `output/` subdirectory.
+ * @param caps         Per-file and cumulative byte limits; files over either
+ *                     limit are counted in `truncatedCount` and skipped.
+ * @param uploadSlots  Pre-allocated upload slot URLs, consumed in order.
+ * @param endpoints    EP1 (slot request) and EP2 (report-back) URLs.
+ * @param executionId  Forwarded to both endpoints.
+ * @param sandboxToken Forwarded to both endpoints as `{ token }`.
+ * @returns Uploaded files plus counters and failure flags.
+ */
+async function harvestOutputDir(
+  hostDir: string,
+  caps: { perFileMax: number; totalMax: number },
+  uploadSlots: ReadonlyArray<{ url: string }>,
+  endpoints: HarvestEndpoints,
+  executionId: string,
+  sandboxToken: string | null,
+): Promise<HarvestResult> {
+  const outputDir = join(hostDir, 'output');
+  const files: OutputFile[] = [];
+  let truncatedCount = 0;
+  let totalAccepted = 0;
+  const slotPool: string[] = uploadSlots.map((s) => s.url);
+  let slotIndex = 0;
+  const failures: UploadFailure[] = [];
+  let attempted = 0;
+  let succeeded = 0;
+  let quotaExhausted = false;
+  let uploadFailed = false;
+  let reportFailed = false;
+  let readFailed = false;
+  const startUpload = Date.now();
+
+  async function nextSlotUrl(): Promise<string | null> {
+    if (slotPool.length > 0) {
+      // Pop FIFO so the order in audit logs matches the pre-alloc order.
+      const url = slotPool.shift();
+      return url ?? null;
+    }
+    // Quota rejections are latched: once hit, no further EP1 requests.
+    if (quotaExhausted) return null;
+    const result = await requestUploadUrls(
+      endpoints.outputUrlEndpoint,
+      executionId,
+      2,
+      { token: sandboxToken },
+    );
+    if (!result.ok) {
+      if (result.code === 'QUOTA_EXCEEDED') {
+        quotaExhausted = true;
+      } else {
+        uploadFailed = true;
+      }
+      failures.push({
+        slotIndex: -1,
+        fileName: '(slot-request)',
+        httpStatus: result.status,
+        errorSnippet: result.snippet,
+      });
+      return null;
+    }
+    for (const u of result.urls) slotPool.push(u);
+    const url = slotPool.shift();
+    return url ?? null;
+  }
+
+  async function walk(rel: string): Promise<void> {
+    const abs = join(outputDir, rel);
+    let entries;
+    try {
+      entries = await readdir(abs, { withFileTypes: true });
+    } catch (err) {
+      console.warn(`[sandbox.harvest] failed to read output dir ${abs}:`, err);
+      readFailed = true;
+      return;
+    }
+    for (const e of entries) {
+      const childRel = rel ? `${rel}/${e.name}` : e.name;
+      const childAbs = join(outputDir, childRel);
+      // Skip the multi-step wrapper's internal bookkeeping. The runner
+      // writes per-step results to `/workspace/output/.tale-steps/` so the
+      // host side can read structured per-step state — those files must
+      // not appear in the user-visible outputFiles harvest.
+      if (rel === '' && e.name === STEPS_INTERNAL_DIR) continue;
+      if (e.isDirectory()) {
+        await walk(childRel);
+        continue;
+      }
+      // Anything that is not a regular file (symlink, socket, …) is ignored.
+      if (!e.isFile()) continue;
+      let st;
+      try {
+        st = await stat(childAbs);
+      } catch (err) {
+        // Keep the "accumulate, don't throw" contract: one unreadable
+        // entry must not abort the walk and drop files already uploaded.
+        console.warn(
+          `[sandbox.harvest] failed to stat output file ${childAbs}:`,
+          err,
+        );
+        readFailed = true;
+        continue;
+      }
+      if (
+        st.size > caps.perFileMax ||
+        totalAccepted + st.size > caps.totalMax
+      ) {
+        truncatedCount += 1;
+        continue;
+      }
+      const url = await nextSlotUrl();
+      if (url === null) {
+        // Out of slots (quota OR slot-request error). Mark this file
+        // failed and continue with the remaining entries.
+        attempted += 1;
+        failures.push({
+          slotIndex: slotIndex,
+          fileName: childRel,
+          httpStatus: quotaExhausted ? 412 : 0,
+          errorSnippet: quotaExhausted
+            ? 'per-run output quota exceeded'
+            : 'no upload slot available',
+        });
+        continue;
+      }
+      attempted += 1;
+      let bytes;
+      try {
+        bytes = await readFile(childAbs);
+      } catch (err) {
+        console.warn(
+          `[sandbox.harvest] failed to read output file ${childAbs}:`,
+          err,
+        );
+        readFailed = true;
+        // The slot was never used — hand it back (at the front, to keep
+        // FIFO order) so the next file can take it.
+        slotPool.unshift(url);
+        failures.push({
+          slotIndex: slotIndex,
+          fileName: childRel,
+          httpStatus: 0,
+          errorSnippet: 'failed to read output file',
+        });
+        continue;
+      }
+      const contentType = guessContentType(childRel);
+      // sha256 is the per-file digest used by both the cumulative
+      // `artifactOutputs` manifest (crispy-curry plan §1) and the
+      // pre-stage attestation when this same file is later re-injected
+      // into a future run. Computed once during harvest; piggy-backs on
+      // the readFile we already did.
+      const sha256 = createHash('sha256').update(bytes).digest('hex');
+      const postResult = await postToUploadSlot(
+        url,
+        bytes,
+        contentType,
+        slotIndex,
+        childRel,
+      );
+      slotIndex += 1;
+      if (!postResult.ok) {
+        uploadFailed = true;
+        failures.push(postResult.failure);
+        continue;
+      }
+      // POST succeeded; report storageId via EP2 so the platform's
+      // rollback set tracks the live blob before we send back the
+      // final SSE result.
+      const reportResult = await reportUploaded(
+        endpoints.reportUploadedEndpoint,
+        executionId,
+        {
+          fileName: childRel,
+          storageId: postResult.storageId,
+          size: st.size,
+          contentType,
+        },
+        { token: sandboxToken },
+      );
+      if (!reportResult.ok) {
+        reportFailed = true;
+        failures.push({
+          slotIndex: slotIndex - 1,
+          fileName: childRel,
+          httpStatus: reportResult.status,
+          errorSnippet: `EP2: ${reportResult.snippet}`,
+        });
+        // EP2 failure is non-fatal — the bytes are in storage, the
+        // file is usable. Continue and surface via uploadStats.
+      }
+      files.push({
+        name: childRel,
+        storageId: postResult.storageId,
+        size: st.size,
+        contentType,
+        sha256,
+      });
+      totalAccepted += st.size;
+      succeeded += 1;
+    }
+  }
+  await walk('');
+  return {
+    files,
+    truncatedCount,
+    uploadStats: { attempted, succeeded, failures },
+    quotaExhausted,
+    uploadFailed,
+    reportFailed,
+    readFailed,
+    uploadMs: Date.now() - startUpload,
+  };
+}
+
+/**
+ * Read per-step results written by the wrapper into
+ * `/workspace/output/.tale-steps/results.json`. Returns `null` if the
+ * file is missing or malformed — callers should fall back to a synthetic
+ * `[{status:'failed'}]` so the response shape is still valid. Validates
+ * each entry's shape so a wrapper bug can't smuggle arbitrary JSON into
+ * the response.
+ *
+ * `null` is also returned when no entry survives validation, including
+ * the case where every entry names a path that was not requested; an
+ * empty array is never returned, so the caller's `??` fallback always
+ * applies.
+ *
+ * @param hostDir        Host directory holding the `output/` subdirectory.
+ * @param requestedSteps Step paths from the request; other paths are dropped.
+ * @returns The validated entries in file order, or `null`.
+ */
+async function readStepResults(
+  hostDir: string,
+  requestedSteps: readonly string[],
+): Promise<SandboxStepResult[] | null> {
+  const resultsPath = join(
+    hostDir,
+    'output',
+    STEPS_INTERNAL_DIR,
+    STEPS_RESULTS_FILENAME,
+  );
+  let raw: string;
+  try {
+    raw = (await readFile(resultsPath)).toString('utf8');
+  } catch (err) {
+    // ENOENT is the most common — happens when the container was killed
+    // before the wrapper could flush. Not worth a warning.
+    if (
+      err !== null &&
+      typeof err === 'object' &&
+      'code' in err &&
+      err.code === 'ENOENT'
+    ) {
+      return null;
+    }
+    console.warn(`[sandbox.harvest] failed to read step results:`, err);
+    return null;
+  }
+  let parsed: unknown;
+  try {
+    parsed = JSON.parse(raw);
+  } catch (err) {
+    console.warn(`[sandbox.harvest] step results JSON malformed:`, err);
+    return null;
+  }
+  if (!Array.isArray(parsed)) {
+    console.warn(`[sandbox.harvest] step results not an array`);
+    return null;
+  }
+  const out: SandboxStepResult[] = [];
+  // Use a `ReadonlySet<string>` here so the `.has(value)` call accepts the
+  // freshly-narrowed-but-still-`string` field without an extra cast. The
+  // type-guard below keeps `status` typed as `SandboxStepStatus` for the
+  // returned record.
+  const allowedStatuses: ReadonlySet<string> = new Set([
+    'completed',
+    'failed',
+    'skipped',
+  ] satisfies readonly SandboxStepStatus[]);
+  const isStepStatus = (v: string): v is SandboxStepStatus =>
+    allowedStatuses.has(v);
+  // Defense: only paths that reference real requested steps are kept. A
+  // wrapper bug shouldn't surface an unrelated entry to the agent.
+  const requested = new Set(requestedSteps);
+  for (const entry of parsed) {
+    if (entry === null || typeof entry !== 'object' || Array.isArray(entry)) {
+      continue;
+    }
+    // After the guard `entry` is `object`; this is the canonical wire-shape
+    // narrowing pattern in the repo (see spawn.ts header docs on validation).
+    // oxlint-disable-next-line typescript-eslint/no-unsafe-type-assertion
+    const e = entry as Record<string, unknown>;
+    if (typeof e.path !== 'string') continue;
+    if (!requested.has(e.path)) continue;
+    if (typeof e.status !== 'string' || !isStepStatus(e.status)) {
+      continue;
+    }
+    // Integer check mirrors the finiteness check on durationMs: JSON such
+    // as `1e999` parses to Infinity, which is not a usable exit code.
+    // Anything that is neither an integer nor null falls back to 1.
+    const exitCode =
+      typeof e.exitCode === 'number' && Number.isInteger(e.exitCode)
+        ? e.exitCode
+        : e.exitCode === null
+          ? null
+          : 1;
+    const durationMs =
+      typeof e.durationMs === 'number' && Number.isFinite(e.durationMs)
+        ? e.durationMs
+        : 0;
+    out.push({
+      path: e.path,
+      status: e.status,
+      exitCode,
+      durationMs,
+    });
+  }
+  // Checked AFTER the requested-path filter so a file holding only
+  // unrelated entries yields `null` rather than an empty array.
+  return out.length > 0 ? out : null;
+}
+
+/**
+ * Map a file name to a MIME type by its (case-insensitive) suffix. The
+ * table is consulted top to bottom and the first matching suffix wins;
+ * names with no known suffix get `application/octet-stream`.
+ */
+function guessContentType(name: string): string {
+  const suffixTable: ReadonlyArray<readonly [string, string]> = [
+    [
+      '.pptx',
+      'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+    ],
+    ['.pdf', 'application/pdf'],
+    [
+      '.xlsx',
+      'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+    ],
+    [
+      '.docx',
+      'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+    ],
+    ['.png', 'image/png'],
+    ['.jpg', 'image/jpeg'],
+    ['.jpeg', 'image/jpeg'],
+    ['.svg', 'image/svg+xml'],
+    ['.json', 'application/json'],
+    ['.csv', 'text/csv; charset=utf-8'],
+    ['.txt', 'text/plain; charset=utf-8'],
+    ['.log', 'text/plain; charset=utf-8'],
+    ['.html', 'text/html; charset=utf-8'],
+  ];
+  const normalized = name.toLowerCase();
+  const hit = suffixTable.find(([suffix]) => normalized.endsWith(suffix));
+  return hit === undefined ? 'application/octet-stream' : hit[1];
+}
+
+/**
+ * Phase events emitted while the runtime container is running. The server's
+ * SSE handler relays these to the convex action; the action then writes the
+ * artifact row's `runStatus` + `runProgress` so the canvas shows live
+ * progress instead of a frozen spinner.
+ *
+ * In this file `executeRequest` delivers them through `opts.onPhase`, with
+ * `phase` set to `'installing'` or `'running'`.
+ *
+ * Shape mirrors `services/platform/convex/sandbox/wire.ts:sandboxPhaseEventLiterals`.
+ */
+type PhaseEvent = { phase: SandboxPhaseEvent };
+
+/** Optional streaming callbacks accepted by `executeRequest`. */
+interface ExecuteRequestOptions {
+  /**
+   * Fires when a stdout line is exactly one of the PHASE markers
+   * (`PHASE_INSTALL` → `'installing'`, `PHASE_RUN` → `'running'`).
+   */
+  onPhase?: (event: PhaseEvent) => void;
+  /**
+   * Fires for each non-PHASE-marker line on stdout while the container is
+   * alive, after the line has been decoded. The trailing newline IS
+   * included so consumers can append directly to a tail buffer without
+   * re-inserting separators. On stream EOF a final residual non-empty line
+   * (no newline) is also delivered. PHASE markers are stripped from this
+   * stream — they only fire `onPhase`. Used by server.ts to emit incremental
+   * `event: stdout` SSE deltas; the final `result` event still carries the
+   * canonical base64'd buffer.
+   */
+  onStdoutDelta?: (text: string) => void;
+  /**
+   * Fires for each decoded stderr chunk while the container is alive. Unlike
+   * stdout, stderr is emitted CHUNK-by-chunk (no line buffering) because
+   * (a) it carries no PHASE protocol, and (b) Python/Node tend to emit
+   * stderr without trailing newlines (progress bars, tracebacks). The
+   * platform-side coalescer rate-limits the mutations these deltas trigger.
+   */
+  onStderrDelta?: (text: string) => void;
+}
+
+/**
+ * Run one sandbox execution end to end: validate identifiers, stage the
+ * workspace, `docker run` the runtime container under a two-tier timeout,
+ * stream phase/stdout/stderr deltas to the optional callbacks, read
+ * per-step results, harvest + upload output files, and build the response.
+ *
+ * Never rejects: anything thrown inside the main `try` is converted into a
+ * `SPAWNER_UNAVAILABLE` response. The container and the host workspace
+ * directory are removed in `finally` on every path that reaches the `try`.
+ *
+ * @param cfg  Spawner configuration (timeouts, byte caps, host paths, token).
+ * @param req  The execute request.
+ * @param opts Optional live-streaming callbacks.
+ * @returns The execution response (`completed`, `failed` or `cancelled`).
+ */
+export async function executeRequest(
+  cfg: SpawnerConfig,
+  req: ExecuteRequest,
+  opts: ExecuteRequestOptions = {},
+): Promise<ExecuteResponse> {
+  // NOTE(review): these early returns happen before the try/finally below,
+  // so an `inFlight` placeholder pre-registered by the server is not removed
+  // here — confirm the caller cleans it up on this path.
+  if (!ID_ALPHABET_RE.test(req.executionId)) {
+    return makeError('SPAWNER_UNAVAILABLE', 'invalid executionId', 0);
+  }
+  if (!ORG_ID_ALPHABET_RE.test(req.organizationId)) {
+    return makeError('SPAWNER_UNAVAILABLE', 'invalid organizationId', 0);
+  }
+  if (
+    req.language !== 'python' &&
+    req.language !== 'node' &&
+    req.language !== 'polyglot'
+  ) {
+    return makeError('SPAWNER_UNAVAILABLE', 'invalid language', 0);
+  }
+
+  // Clamp the requested timeout into [1s, cfg.maxTimeoutMs].
+  const timeoutMs = Math.min(
+    Math.max(req.timeoutMs ?? cfg.defaultTimeoutMs, 1_000),
+    cfg.maxTimeoutMs,
+  );
+  const startedAtMs = Date.now();
+  const containerName = `tale-sbx-${req.executionId}`;
+  const pipVolume = pipCacheVolumeName(cfg, req.organizationId);
+  const npmVolume = npmCacheVolumeName(cfg, req.organizationId);
+  const workspaceHostDir = join(cfg.hostSessionRoot, req.executionId);
+
+  // Reuse the placeholder AbortController if the server pre-registered one
+  // when the request landed. A `cancelExecution` call between registerInFlight
+  // and this line targets the placeholder's signal — discarding it here and
+  // building a fresh controller would leak that early abort, leaving the
+  // child docker process running until the watchdog timeout. Reusing the
+  // entry preserves the (already-aborted, if cancelled) signal.
+  const placeholder = inFlight.get(req.executionId);
+  const abort = placeholder?.abort ?? new AbortController();
+  inFlight.set(req.executionId, {
+    containerName,
+    abort,
+    startedAt: startedAtMs,
+  });
+
+  try {
+    await ensureCacheVolume(pipVolume);
+    await ensureCacheVolume(npmVolume);
+    const stageStartedAt = Date.now();
+    const stageResult = await stageWorkspace(workspaceHostDir, req);
+    const stageMs = Date.now() - stageStartedAt;
+    // Captured here for inclusion in ExecuteResponse.priorStage. Undefined
+    // when the request had no priorOutputDownloads (nothing to attest).
+    const priorStage = stageResult.priorStage;
+
+    // Resolve the path the runtime entrypoint will exec().
+    //   - steps[] → the spawner-generated wrapper under /workspace/.tale/
+    //     (polyglot also routes through runner.py — Python is the image's
+    //     base layer and always available as the dispatcher host).
+    //   - single-script → the user file at its declared relative path
+    // The validator guarantees `entryPath` is defined whenever `steps` is
+    // not (and that polyglot always uses steps mode). The entrypoint
+    // reattaches /workspace/code/ for relative paths.
+    const entryPath =
+      req.steps !== undefined
+        ? `/workspace/.tale/${
+            req.language === 'python' || req.language === 'polyglot'
+              ? 'runner.py'
+              : 'runner.js'
+          }`
+        : // oxlint-disable-next-line typescript/no-non-null-assertion -- validator enforces mutex (entryPath xor steps)
+          req.entryPath!;
+
+    const argv = buildDockerRunArgs(cfg, {
+      executionId: req.executionId,
+      organizationId: req.organizationId,
+      language: req.language,
+      timeoutMs,
+      pipCacheVolume: pipVolume,
+      npmCacheVolume: npmVolume,
+      workspaceHostDir,
+      startedAtMs,
+      entryPath,
+    });
+
+    // Two-tier timeout:
+    //   - Inner: at `timeoutMs`, SIGKILL the container so user code cannot
+    //     exceed the cap. The runtime is untrusted; there's no graceful
+    //     shutdown contract to honor with SIGTERM, and SIGTERM-then-wait
+    //     would just let a misbehaving process burn additional wall-clock
+    //     before we force the kill anyway.
+    //   - Outer (in runDocker): at `timeoutMs + 30_000`, kill the docker
+    //     CLI process too — covers the case where `docker kill` itself
+    //     hangs (rare; would mean the daemon is in trouble).
+    const killTimer = setTimeout(() => {
+      // Bounded so a wedged docker daemon doesn't leak the Bun subprocess
+      // (audit follow-up F4). Same 5s ceiling as cancelExecution.
+      void dockerKill(containerName, 'KILL', { timeoutMs: 5_000 }).catch(
+        (err) => {
+          console.warn(
+            `[sandbox] timeout-triggered dockerKill failed for ${containerName}:`,
+            err,
+          );
+        },
+      );
+    }, timeoutMs);
+    let result: Awaited<ReturnType<typeof runDocker>>;
+    try {
+      // Line-buffered phase parser. The runtime image's entrypoint emits
+      // "PHASE: installing\n" then later "PHASE: running\n" on stdout. We
+      // accumulate bytes until we see a newline, then scan each line for
+      // those markers and fire the onPhase callback. Other lines (user's
+      // own prints) go to onStdoutDelta — the full stdout is still captured
+      // in result.stdout for the final response.
+      //
+      // On stream EOF without a trailing newline, the residual `lineBuf` is
+      // drained once after runDocker resolves so the last marker still
+      // produces an event (audit finding R2-3 C3 partial).
+      // `stripPhaseMarkers` below also handles the unterminated case via
+      // `split('\n')`.
+      let lineBuf = '';
+      // Hard cap on lineBuf so a runtime that emits no newlines (a single
+      // multi-GB "log line") cannot grow the spawner heap. On overflow we
+      // flush the buffered prefix as a synthetic line and reset — the
+      // PHASE markers are short, so they're never inside such a blast.
+      const MAX_LINE_BUF_BYTES = 64 * 1024;
+      // Live-tail delta byte caps mirror `stdoutMaxBytes`/`stderrMaxBytes`
+      // (which only bound the spawner's buffered output). Without these
+      // caps `onStdoutDelta`/`onStderrDelta` would forward unbounded
+      // bytes to the SSE consumer even after truncation kicks in.
+      let stdoutDeltaBytes = 0;
+      let stderrDeltaBytes = 0;
+      const decoder = new TextDecoder('utf-8', { fatal: false });
+      const stderrDecoder = new TextDecoder('utf-8', { fatal: false });
+      // PHASE-marker lines are stripped from the live tail (`onStdoutDelta`)
+      // so the user doesn't briefly see `PHASE: installing` in the canvas.
+      // Non-marker lines are forwarded WITH their trailing newline so the
+      // platform-side append produces a faithful tail.
+      const handleStdoutLine = (line: string) => {
+        if (line === PHASE_INSTALL) {
+          opts.onPhase?.({ phase: 'installing' });
+        } else if (line === PHASE_RUN) {
+          opts.onPhase?.({ phase: 'running' });
+        } else if (
+          opts.onStdoutDelta &&
+          stdoutDeltaBytes < cfg.stdoutMaxBytes
+        ) {
+          const payload = `${line}\n`;
+          stdoutDeltaBytes += payload.length;
+          opts.onStdoutDelta(payload);
+        }
+      };
+      const wantStdoutScan = Boolean(opts.onPhase || opts.onStdoutDelta);
+      const onStdoutChunk = wantStdoutScan
+        ? (chunk: Uint8Array) => {
+            lineBuf += decoder.decode(chunk, { stream: true });
+            // Flush any newline-delimited prefixes first so partial markers
+            // at the seam don't get clipped.
+            let nl: number;
+            while ((nl = lineBuf.indexOf('\n')) !== -1) {
+              const line = lineBuf.slice(0, nl);
+              lineBuf = lineBuf.slice(nl + 1);
+              handleStdoutLine(line);
+            }
+            // No-newline blast guard: keep flushing cap-sized prefixes as
+            // synthetic lines until the remainder fits. A single `if` would
+            // only shave one slice per chunk, letting the buffer keep
+            // growing whenever chunks are larger than the cap.
+            while (lineBuf.length > MAX_LINE_BUF_BYTES) {
+              const synthetic = lineBuf.slice(0, MAX_LINE_BUF_BYTES);
+              lineBuf = lineBuf.slice(MAX_LINE_BUF_BYTES);
+              handleStdoutLine(synthetic);
+            }
+          }
+        : undefined;
+      const onStderrChunk = opts.onStderrDelta
+        ? (chunk: Uint8Array) => {
+            if (stderrDeltaBytes >= cfg.stderrMaxBytes) return;
+            const text = stderrDecoder.decode(chunk, { stream: true });
+            if (text.length === 0) return;
+            stderrDeltaBytes += text.length;
+            opts.onStderrDelta?.(text);
+          }
+        : undefined;
+      result = await runDocker(argv, {
+        timeoutMs: timeoutMs + 30_000,
+        signal: abort.signal,
+        killOnTimeoutContainer: containerName,
+        // In-band byte caps prevent a runaway runtime container from OOM'ing
+        // the spawner heap; runDocker continues draining the pipe but
+        // discards bytes past the cap (audit finding R2-B2).
+        stdoutMaxBytes: cfg.stdoutMaxBytes,
+        stderrMaxBytes: cfg.stderrMaxBytes,
+        ...(onStdoutChunk && { onStdoutChunk }),
+        ...(onStderrChunk && { onStderrChunk }),
+      });
+      // EOF drain — the line loop above only fires on newlines; a final
+      // unterminated line (PHASE marker OR user output) lives in lineBuf.
+      if (wantStdoutScan) {
+        lineBuf += decoder.decode();
+        if (lineBuf.length > 0) {
+          if (lineBuf === PHASE_INSTALL) {
+            opts.onPhase?.({ phase: 'installing' });
+          } else if (lineBuf === PHASE_RUN) {
+            opts.onPhase?.({ phase: 'running' });
+          } else if (
+            opts.onStdoutDelta &&
+            stdoutDeltaBytes < cfg.stdoutMaxBytes
+          ) {
+            // Trailing chunk WITHOUT newline — forward verbatim, subject to
+            // the same live-tail cap the per-line path enforces.
+            stdoutDeltaBytes += lineBuf.length;
+            opts.onStdoutDelta(lineBuf);
+          }
+        }
+      }
+      if (opts.onStderrDelta) {
+        const tail = stderrDecoder.decode();
+        // Same cap as onStderrChunk: nothing more once the budget is spent.
+        if (tail.length > 0 && stderrDeltaBytes < cfg.stderrMaxBytes) {
+          stderrDeltaBytes += tail.length;
+          opts.onStderrDelta(tail);
+        }
+      }
+    } finally {
+      clearTimeout(killTimer);
+    }
+
+    const durationMs = Date.now() - startedAtMs;
+    const exitCode = result.exitCode;
+
+    const stdoutWithoutPhases = stripPhaseMarkers(result.stdout);
+    const stdoutClean = stripControlChars(stdoutWithoutPhases);
+    const stderrClean = stripControlChars(result.stderr);
+    // runDocker now caps reads in-band, but keep capText as a defensive
+    // safety net (no-op when within bounds) and OR truncation flags so
+    // either signal surfaces on the wire.
+    const { text: stdoutCapped, truncated: stdoutCapPostTrunc } = capText(
+      stdoutClean,
+      cfg.stdoutMaxBytes,
+    );
+    const { text: stderrCapped, truncated: stderrCapPostTrunc } = capText(
+      stderrClean,
+      cfg.stderrMaxBytes,
+    );
+    const stdoutTrunc = result.stdoutTruncated || stdoutCapPostTrunc;
+    const stderrTrunc = result.stderrTruncated || stderrCapPostTrunc;
+
+    // Always attempt to load per-step results when the request was multi-
+    // step. The wrapper flushes after every step (and again on fail-fast),
+    // so even cancelled / failed runs usually have a partial results.json
+    // worth surfacing. `null` means the wrapper never got far enough — we
+    // fall back to `synthesizeStepResults` so the caller doesn't have to
+    // special-case the missing-file path.
+    const stepResults =
+      req.steps !== undefined
+        ? ((await readStepResults(workspaceHostDir, req.steps)) ??
+          synthesizeStepResults(req.steps))
+        : undefined;
+
+    // Harvest `/workspace/output/` unconditionally — even on failure or
+    // cancellation, any partial files the user script managed to write
+    // before crashing are worth surfacing (resolves D5 in plan
+    // llm-majestic-hamming.md). The presigned-URL upload happens inside
+    // harvestOutputDir; failures are accumulated rather than thrown so a
+    // network blip on one file doesn't lose the others.
+    let harvestedFiles: OutputFile[] = [];
+    let harvestTruncatedCount = 0;
+    let harvestUploadStats: UploadStats = {
+      attempted: 0,
+      succeeded: 0,
+      failures: [],
+    };
+    let harvestQuotaExhausted = false;
+    let harvestUploadFailed = false;
+    let harvestReportFailed = false;
+    let harvestReadFailed = false;
+    let uploadMs = 0;
+    const harvestStartedAt = Date.now();
+    try {
+      const harvested = await harvestOutputDir(
+        workspaceHostDir,
+        {
+          perFileMax: cfg.outputFileMaxBytes,
+          totalMax: cfg.outputTotalMaxBytes,
+        },
+        req.outputUploadSlots,
+        {
+          outputUrlEndpoint: req.outputUrlEndpoint,
+          reportUploadedEndpoint: req.reportUploadedEndpoint,
+        },
+        req.executionId,
+        cfg.sandboxToken,
+      );
+      harvestedFiles = harvested.files;
+      harvestTruncatedCount = harvested.truncatedCount;
+      harvestUploadStats = harvested.uploadStats;
+      harvestQuotaExhausted = harvested.quotaExhausted;
+      harvestUploadFailed = harvested.uploadFailed;
+      harvestReportFailed = harvested.reportFailed;
+      harvestReadFailed = harvested.readFailed;
+      uploadMs = harvested.uploadMs;
+    } catch (err) {
+      console.warn(`[sandbox.harvest] best-effort harvest failed:`, err);
+      harvestReadFailed = true;
+    }
+    const harvestMs = Date.now() - harvestStartedAt;
+
+    // Classify any harvest-side failure into a wire errorCode. Order
+    // matters: quota > upload > report > read. The first matching code
+    // becomes the response's errorCode IF the user code itself exited 0
+    // — we don't want to mask a legitimate runtime crash. For non-zero
+    // exits, classifyFailure() picks the runtime errorCode and the upload
+    // failure shows up in `uploadStats.failures` instead.
+    let harvestErrorCode: ErrorCode | undefined;
+    let harvestErrorMessage: string | undefined;
+    if (harvestQuotaExhausted) {
+      harvestErrorCode = 'UPLOAD_QUOTA_EXCEEDED';
+      harvestErrorMessage =
+        'Per-run output-file quota exceeded; some files were not uploaded';
+    } else if (harvestUploadFailed) {
+      harvestErrorCode = 'UPLOAD_FAILED';
+      harvestErrorMessage = 'One or more output uploads failed';
+    } else if (harvestReportFailed) {
+      harvestErrorCode = 'UPLOAD_REPORT_FAILED';
+      harvestErrorMessage =
+        'Upload succeeded but report-back to platform failed';
+    } else if (harvestReadFailed) {
+      harvestErrorCode = 'HARVEST_READ_FAILED';
+      harvestErrorMessage = "Couldn't read /workspace/output";
+    }
+
+    const timing = {
+      stageMs,
+      executeMs: Math.max(0, durationMs),
+      harvestMs,
+      uploadMs,
+    };
+
+    // Cancellation takes precedence over whatever exit code docker reported.
+    if (abort.signal.aborted) {
+      return {
+        status: 'cancelled',
+        exitCode: null,
+        errorCode: 'CANCELLED',
+        errorMessage: 'Execution cancelled by client',
+        stdoutBase64: Buffer.from(stdoutCapped).toString('base64'),
+        stderrBase64: Buffer.from(stderrCapped).toString('base64'),
+        durationMs,
+        truncated: {
+          stdout: stdoutTrunc,
+          stderr: stderrTrunc,
+          files: harvestTruncatedCount,
+        },
+        outputFiles: harvestedFiles,
+        ...(stepResults !== undefined && { steps: stepResults }),
+        uploadStats: harvestUploadStats,
+        timing,
+        ...(priorStage !== undefined && { priorStage }),
+      };
+    }
+
+    if (exitCode === 0) {
+      return {
+        // A clean exit is still reported as failed when harvesting broke.
+        status: harvestErrorCode !== undefined ? 'failed' : 'completed',
+        exitCode: 0,
+        ...(harvestErrorCode !== undefined && {
+          errorCode: harvestErrorCode,
+          ...(harvestErrorMessage !== undefined && {
+            errorMessage: harvestErrorMessage,
+          }),
+        }),
+        stdoutBase64: Buffer.from(stdoutCapped).toString('base64'),
+        stderrBase64: Buffer.from(stderrCapped).toString('base64'),
+        durationMs,
+        truncated: {
+          stdout: stdoutTrunc,
+          stderr: stderrTrunc,
+          files: harvestTruncatedCount,
+        },
+        outputFiles: harvestedFiles,
+        ...(stepResults !== undefined && { steps: stepResults }),
+        uploadStats: harvestUploadStats,
+        timing,
+        ...(priorStage !== undefined && { priorStage }),
+      };
+    }
+
+    const { code: ec, message } = classifyFailure(exitCode, stderrCapped);
+    return {
+      status: ec === 'CANCELLED' ? 'cancelled' : 'failed',
+      exitCode,
+      errorCode: ec,
+      errorMessage: message,
+      stdoutBase64: Buffer.from(stdoutCapped).toString('base64'),
+      stderrBase64: Buffer.from(stderrCapped).toString('base64'),
+      durationMs,
+      truncated: {
+        stdout: stdoutTrunc,
+        stderr: stderrTrunc,
+        files: harvestTruncatedCount,
+      },
+      outputFiles: harvestedFiles,
+      ...(stepResults !== undefined && { steps: stepResults }),
+      uploadStats: harvestUploadStats,
+      timing,
+      ...(priorStage !== undefined && { priorStage }),
+    };
+  } catch (err) {
+    const message = err instanceof Error ? err.message : String(err);
+    return makeError(
+      'SPAWNER_UNAVAILABLE',
+      `spawner internal error: ${message}`,
+      Date.now() - startedAtMs,
+    );
+  } finally {
+    inFlight.delete(req.executionId);
+    try {
+      await dockerRm(containerName);
+    } catch (err) {
+      console.warn(
+        `[sandbox.cleanup] dockerRm failed for ${containerName}:`,
+        err,
+      );
+    }
+    try {
+      await rm(workspaceHostDir, { recursive: true, force: true });
+    } catch (err) {
+      // Loud: silent rm failures = host disk leak. Audit finding.
+      console.warn(
+        `[sandbox.cleanup] failed to rm host workspace ${workspaceHostDir}:`,
+        err,
+      );
+    }
+  }
+}
+
+/**
+ * Fallback `steps[]` builder for runs where the wrapper left no
+ * results.json behind (e.g. the container was killed during dependency
+ * install, or the spawner crashed before docker run). Produces one
+ * placeholder per requested step, in request order, each marked `skipped`
+ * with a null exit code and zero duration. The caller may overwrite the
+ * first entry with a `failed` result when the run carries a runtime error
+ * code.
+ *
+ * @param steps Step paths as they appeared in the request.
+ * @returns A freshly allocated array; the input is not modified.
+ */
+function synthesizeStepResults(steps: readonly string[]): SandboxStepResult[] {
+  const placeholders: SandboxStepResult[] = [];
+  for (const stepPath of steps) {
+    placeholders.push({
+      path: stepPath,
+      status: 'skipped',
+      exitCode: null,
+      durationMs: 0,
+    });
+  }
+  return placeholders;
+}
+
+/**
+ * Build the `failed` response used when there is no process output to
+ * report: the exit code is null, both streams are empty, nothing is marked
+ * truncated and no output files are attached.
+ *
+ * @param errorCode Wire error code surfaced to the caller.
+ * @param msg Human-readable detail, returned as `errorMessage`.
+ * @param durationMs Elapsed time to report for the attempt.
+ * @returns A response without `steps`, `uploadStats`, `timing` or
+ *   `priorStage`.
+ */
+function makeError(
+  errorCode: ErrorCode,
+  msg: string,
+  durationMs: number,
+): ExecuteResponse {
+  const nothingTruncated = { stdout: false, stderr: false, files: 0 };
+  const failure: ExecuteResponse = {
+    status: 'failed',
+    exitCode: null,
+    errorCode,
+    errorMessage: msg,
+    stdoutBase64: '',
+    stderrBase64: '',
+    durationMs,
+    truncated: nothingTruncated,
+    outputFiles: [],
+  };
+  return failure;
+}
+
+/**
+ * Drop every line of `stdout` that is exactly `PHASE_INSTALL` or
+ * `PHASE_RUN`; all other lines are kept in order and re-joined with `\n`.
+ * Only whole-line matches are removed — a marker embedded in a longer line
+ * is left alone.
+ *
+ * NOTE(review): both constants are defined outside this block; presumably
+ * they are the phase sentinel lines printed by the runtime entrypoint —
+ * confirm.
+ */
+function stripPhaseMarkers(stdout: string): string {
+  const kept: string[] = [];
+  for (const line of stdout.split('\n')) {
+    if (line === PHASE_INSTALL || line === PHASE_RUN) continue;
+    kept.push(line);
+  }
+  return kept.join('\n');
+}
+
+// Strip ANSI CSI / OSC sequences and bare control characters that user
+// code (or pip/npm progress bars) emits. Without this, the chat-canvas
+// pre-renders raw escape codes as garbage glyphs, and `\r` overwrites
+// drag stdout lines into each other in the UI. Done once on the spawner
+// side so both the preview and the overflow-storage blob are clean.
+//
+// Pattern coverage:
+//   \x1b\[ ... <final>   — CSI sequences (color, cursor, erase, ...)
+//   \x1b\] ... \x07      — OSC sequences (terminator: BEL)
+//   \x1b\] ... \x1b\\    — OSC sequences (terminator: ST)
+//   \x07                 — bare BEL
+//   \r (not \r\n)        — lone carriage return → newline (progress bars)
+// Tabs (\t) are deliberately kept; they render fine in the UI.
+//
+// The BEL-terminated OSC payload class excludes ESC as well as BEL. If it
+// did not, an ST-terminated OSC followed by an unrelated BEL later in the
+// stream would match from the OSC opener all the way to that BEL and
+// silently delete the real output in between.
+//
+// NOTE(review): despite its name, ESC_AND_CONTROL_RE does not contain \x1b,
+// so an ESC that is not part of a matched CSI/OSC sequence stays in the
+// text — confirm whether that is intended.
+const ANSI_CSI_RE = /\x1b\[[0-9;?]*[ -/]*[@-~]/g;
+const ANSI_OSC_BEL_RE = /\x1b\][^\x07\x1b]*\x07/g;
+const ANSI_OSC_ST_RE = /\x1b\][^\x1b]*\x1b\\/g;
+const ESC_AND_CONTROL_RE = /[\x07\x08\x0b\x0c\x0e-\x1a\x1c-\x1f]/g;
+
+/**
+ * Return `text` with terminal escape sequences and stray control
+ * characters removed and line endings normalised.
+ *
+ * Order matters: OSC sequences go first (their payload may contain bytes
+ * the later passes would otherwise eat piecemeal), then CSI, then the
+ * remaining single control characters. Finally `\r\n` collapses to `\n`
+ * and any lone `\r` becomes `\n`.
+ *
+ * @param text Raw captured output.
+ * @returns The cleaned string; `\t` and `\n` are preserved.
+ */
+function stripControlChars(text: string): string {
+  return text
+    .replace(ANSI_OSC_BEL_RE, '')
+    .replace(ANSI_OSC_ST_RE, '')
+    .replace(ANSI_CSI_RE, '')
+    .replace(ESC_AND_CONTROL_RE, '')
+    .replace(/\r\n/g, '\n')
+    .replace(/\r/g, '\n');
+}
+
+/**
+ * Cap `text` to at most `maxBytes` bytes of UTF-8.
+ *
+ * The cut is moved back to the nearest character boundary so a multi-byte
+ * character is never split. Slicing at a raw byte offset would decode the
+ * dangling lead bytes as U+FFFD, which both corrupts the tail of the
+ * output and (at 3 bytes) can push the re-encoded result past `maxBytes`.
+ *
+ * @param text Input string.
+ * @param maxBytes Maximum encoded size of the returned text, in bytes.
+ * @returns The original string with `truncated: false` when it already
+ *   fits; otherwise the longest whole-character prefix that fits, with
+ *   `truncated: true`.
+ */
+function capText(
+  text: string,
+  maxBytes: number,
+): { text: string; truncated: boolean } {
+  const buf = Buffer.from(text);
+  if (buf.byteLength <= maxBytes) return { text, truncated: false };
+  // UTF-8 continuation bytes look like 0b10xxxxxx. If the first dropped
+  // byte is one, the cut sits inside a character: back up to its lead byte
+  // so that whole character is dropped instead.
+  let end = maxBytes;
+  while (end > 0 && ((buf[end] ?? 0) & 0xc0) === 0x80) end--;
+  return { text: buf.subarray(0, end).toString('utf8'), truncated: true };
+}
+
+// Stderr fingerprints used to refine a non-zero exit into a specific error
+// code. Both are case-insensitive and stateless (no `g` flag), so `.test()`
+// can be called repeatedly.
+const EGRESS_DENIED_RE =
+  /403 Filtered|Tunnel connection failed|ProxyError|connection refused/i;
+const PACKAGE_NOT_FOUND_RE =
+  /no matching distribution|could not find a version|unsatisfiable|404 Not Found|E404|No matching distribution found/i;
+
+/**
+ * Translate a non-zero exit code plus captured stderr into a wire error
+ * code and a short message.
+ *
+ * Mapping, as implemented below:
+ *   124   → TIMEOUT
+ *   137   → OOM when stderr mentions "killed", otherwise TIMEOUT
+ *   64    → PACKAGE_NOT_FOUND, else EGRESS_DENIED, else INSTALL_FAILED
+ *           (package-not-found is checked before egress)
+ *   65    → SPAWNER_UNAVAILABLE
+ *   other → EGRESS_DENIED when stderr shows a proxy block, otherwise
+ *           RUNTIME_ERROR
+ *
+ * @param exitCode Exit status of the run.
+ * @param stderr Captured stderr text to scan for known fingerprints.
+ */
+function classifyFailure(
+  exitCode: number,
+  stderr: string,
+): { code: ErrorCode; message: string } {
+  const egressDenied = (): { code: ErrorCode; message: string } => ({
+    code: 'EGRESS_DENIED',
+    message: 'Egress proxy denied the request',
+  });
+
+  switch (exitCode) {
+    case 124:
+      return { code: 'TIMEOUT', message: 'Wall-clock timeout exceeded' };
+
+    case 137:
+      return /killed/i.test(stderr)
+        ? { code: 'OOM', message: 'Container killed (likely OOM)' }
+        : { code: 'TIMEOUT', message: 'Container killed (SIGKILL)' };
+
+    case 64:
+      if (PACKAGE_NOT_FOUND_RE.test(stderr)) {
+        return {
+          code: 'PACKAGE_NOT_FOUND',
+          message: 'Requested package could not be resolved',
+        };
+      }
+      return EGRESS_DENIED_RE.test(stderr)
+        ? egressDenied()
+        : { code: 'INSTALL_FAILED', message: 'Package install failed' };
+
+    case 65:
+      return {
+        code: 'SPAWNER_UNAVAILABLE',
+        message: 'Sandbox runtime rejected the invocation',
+      };
+
+    default:
+      // Non-zero from user code or a runtime crash. When stderr clearly
+      // shows the egress proxy blocked the call, report EGRESS_DENIED rather
+      // than a generic RUNTIME_ERROR so the LLM knows it hit a network
+      // policy, not a code bug.
+      if (EGRESS_DENIED_RE.test(stderr)) return egressDenied();
+      return {
+        code: 'RUNTIME_ERROR',
+        message: `User code exited with status ${exitCode}`,
+      };
+  }
+}
diff --git a/services/sandbox/src/types.ts b/services/sandbox/src/types.ts
new file mode 100644
index 000000000..ba6d05991
--- /dev/null
+++ b/services/sandbox/src/types.ts
@@ -0,0 +1,259 @@
+// HTTP request / response shapes for the sandbox spawner.
+// Mirrors the Convex action's `executeCode` and the agent's `artifact_run`.
+//
+// Wire-protocol enums live in `./wire.ts` (single source of truth); this
+// file imports them as type aliases so existing call sites in spawn.ts,
+// server.ts, docker-args.ts, etc. keep working unchanged.
+
+import type {
+  SandboxErrorCode,
+  SandboxLanguage,
+  SandboxStepResult,
+} from './wire.ts';
+
+export type Language = SandboxLanguage;
+export type ErrorCode = SandboxErrorCode;
+
+/** One request-supplied file, as carried in `ExecuteRequest.files[]`. */
+export interface SandboxFile {
+  /**
+   * POSIX-style relative path within /workspace/code/. Validated against
+   * the path-safety rules in validate-request.ts (no traversal, no NUL,
+   * no backslash, etc). Nested directories allowed; spawner mkdirs the
+   * parent on write.
+   */
+  path: string;
+  // File body as a string. NOTE(review): the encoding used when writing it
+  // to disk is not stated here — presumably UTF-8 text; confirm.
+  content: string;
+}
+
+/**
+ * Typed form of a `POST /v1/execute` request body. Exactly one of
+ * `entryPath` (single-script mode) or `steps` (multi-script mode) must be
+ * set; see the field docs below.
+ */
+export interface ExecuteRequest {
+  // Stable id from the Convex action; used for container name + label and
+  // for /v1/cancel/:id. Caller must supply this so cancellation has
+  // something to address before the spawner has finished spinning up.
+  executionId: string;
+  organizationId: string;
+  language: Language;
+  /**
+   * Files to stage under /workspace/code/<path>. Optional in this type but
+   * required in practice: both execution modes need it (in single-script
+   * mode the entry file lives here; in multi-script mode all steps + their
+   * siblings live here). Aggregate size capped at MAX_FILES_BYTES; per-file
+   * path validated against MAX_PATH_LENGTH + POSIX-traversal rules. Path
+   * segments starting with `.` are rejected, so user files can never land
+   * inside `/workspace/.tale/` where the multi-step wrapper goes.
+   */
+  files?: SandboxFile[];
+  /**
+   * Single-script mode: relative path inside `files[]` to exec. The
+   * runtime image's entrypoint receives this as a positional arg and
+   * exec()s `/workspace/code/<entryPath>` directly — no synthetic mirror,
+   * so user filenames (including `main.py`) flow through unchanged and
+   * appear verbatim in tracebacks. Must reference an existing entry in
+   * `files[]` with non-empty content. Mutually exclusive with `steps`:
+   * requests must set exactly one of `entryPath` or `steps`.
+   */
+  entryPath?: string;
+  /**
+   * Multi-script mode: paths inside `files[]` to execute in sequence
+   * within the same container, sharing /workspace/. Spawner writes a
+   * generated wrapper to `/workspace/.tale/runner.{py,js}` (a dir
+   * unreachable from user paths) and the entrypoint exec()s that wrapper,
+   * which subprocess-invokes each step path. Fail-fast on first non-zero
+   * exit. Per-step results (exit code, duration, status) come back in
+   * `ExecuteResponse.steps[]`. Mutually exclusive with `entryPath`.
+   */
+  steps?: string[];
+  /**
+   * Prior-run output downloads. Spawner fetches each URL during
+   * `stageWorkspace` and writes the bytes to `/workspace/output/<name>`.
+   * Replaces the legacy inline-base64 `priorOutputFiles[]` field
+   * (sandbox-wobbly-origami plan §1). Names are validated against the
+   * same POSIX-traversal rules; entries that fail validation are skipped
+   * (logged, not fatal).
+   */
+  priorOutputDownloads?: Array<{
+    name: string;
+    url: string;
+  }>;
+  /**
+   * Legacy single-bucket package list. Sent for `python` / `node`
+   * single-runtime requests and routed to either `uv pip install` or
+   * `npm install` based on `language`. Polyglot requests should use
+   * {@link packagesByLang} instead.
+   */
+  packages?: string[];
+  /**
+   * Per-runtime package buckets. When `language === 'polyglot'` the
+   * entrypoint runs `uv pip install` for `python` and `npm install` for
+   * `node` (skipping whichever bucket is absent / empty). Also accepted
+   * for `python` / `node` single-runtime requests; the matching bucket
+   * is used and the other is ignored.
+   */
+  packagesByLang?: {
+    python?: string[];
+    node?: string[];
+  };
+  // Requested timeout in milliseconds. NOTE(review): presumably falls back
+  // to SpawnerConfig.defaultTimeoutMs and is bounded by maxTimeoutMs —
+  // confirm against the spawn path.
+  timeoutMs?: number;
+  // Install-time opt-ins. NOTE(review): their effect is not visible in this
+  // file — presumably `allowSdist` permits source distributions and
+  // `allowInstallScripts` permits package install scripts; confirm.
+  options?: {
+    allowSdist?: boolean;
+    allowInstallScripts?: boolean;
+  };
+  /**
+   * Pre-allocated upload-slot URLs the spawner POSTs harvested output
+   * files to. Length = platform's pre-alloc N (defaults to 2). When the
+   * spawner exhausts this pool it lazily requests more via
+   * {@link outputUrlEndpoint}.
+   */
+  outputUploadSlots: Array<{ url: string }>;
+  /**
+   * HMAC-signed callback URL for requesting additional upload slots when
+   * the pre-allocated pool is empty (EP1; sandbox-wobbly-origami plan §2).
+   */
+  outputUrlEndpoint: string;
+  /**
+   * HMAC-signed callback URL the spawner POSTs to AFTER each successful
+   * upload, so the platform tracks `{fileName, storageId, ...}` against
+   * the audit row's rollback set (EP2; sandbox-wobbly-origami plan §2).
+   */
+  reportUploadedEndpoint: string;
+}
+
+/**
+ * Per-file harvest outcome. `storageId` is the Convex storage id allocated
+ * when the spawner POSTed the bytes to the pre-signed upload URL; the
+ * platform side just inserts the matching `fileMetadata` row.
+ *
+ * `sha256` (hex) is the digest of the raw bytes computed during harvest.
+ * Used for the cumulative `artifactOutputs` manifest (crispy-curry plan §1)
+ * and for pre-stage attestation when the same file is later re-injected
+ * into another run's `/workspace/output/`.
+ */
+export interface OutputFile {
+  // NOTE(review): presumably the path relative to /workspace/output/ —
+  // confirm against the harvest code.
+  name: string;
+  storageId: string;
+  // NOTE(review): presumably the byte length of the harvested file.
+  size: number;
+  // NOTE(review): presumably a MIME type chosen by the spawner — confirm.
+  contentType: string;
+  // Hex-encoded SHA-256 of the raw bytes (see above).
+  sha256: string;
+}
+
+/**
+ * Pre-stage skip reasons reported back to the platform via
+ * `ExecuteResponse.priorStage.skipped`. The platform diffs the spawner's
+ * `staged[]` against the manifest it sent; any name in the manifest that's
+ * missing from `staged[]` triggers a fatal `PRE_STAGE_FAILED` BEFORE user
+ * code runs (crispy-curry plan §3).
+ *
+ * NOTE(review): `priorStage` travels on `ExecuteResponse`; confirm how the
+ * platform-side abort is sequenced relative to user-code execution.
+ */
+export type PriorStageSkipReason =
+  | 'unsafe_path'
+  | 'fetch_failed'
+  | 'fetch_timeout'
+  | 'http_error'
+  | 'url_expired'
+  | 'write_failed'
+  | 'download_too_large';
+
+/**
+ * Pre-stage outcome for one request, split into two per-file lists.
+ * `bytes` and `sha256` are populated only for successfully staged files;
+ * skipped entries carry a structured reason + short detail string the
+ * platform can surface in the failure payload.
+ */
+export interface PriorStageResult {
+  // Files that were fetched and written successfully.
+  staged: Array<{ name: string; bytes: number; sha256: string }>;
+  // Files that were not staged, each with the reason it was skipped.
+  skipped: Array<{
+    name: string;
+    reason: PriorStageSkipReason;
+    detail: string;
+  }>;
+}
+
+/**
+ * Per-file upload failure (for `ExecuteResponse.uploadStats`). Surfaces
+ * the HTTP failure code + a short error snippet so the audit row /
+ * artifact_run_tool can show useful context without dumping kB of body.
+ */
+export interface UploadFailure {
+  // NOTE(review): presumably the index of the upload slot that was used —
+  // confirm against the upload code.
+  slotIndex: number;
+  fileName: string;
+  httpStatus: number;
+  errorSnippet: string;
+}
+
+/**
+ * Upload telemetry carried in `ExecuteResponse.uploadStats`: how many file
+ * uploads were attempted, how many succeeded, and one `UploadFailure`
+ * entry per failed upload.
+ */
+export interface UploadStats {
+  attempted: number;
+  succeeded: number;
+  failures: UploadFailure[];
+}
+
+/**
+ * Result of one execute call, as returned by the spawner.
+ *
+ * `exitCode` is null when no process exit status is reported (cancelled
+ * runs and the error responses built by `makeError`). `errorCode` /
+ * `errorMessage` are set on failed and cancelled responses.
+ */
+export interface ExecuteResponse {
+  status: 'completed' | 'failed' | 'cancelled';
+  exitCode: number | null;
+  errorCode?: ErrorCode;
+  errorMessage?: string;
+  // Base64 of the captured streams after capping; empty string when
+  // nothing was captured.
+  stdoutBase64: string;
+  stderrBase64: string;
+  durationMs: number;
+  // Whether each stream was cut short, plus a count for output files.
+  // NOTE(review): `files` is presumably the number of output files dropped
+  // or truncated during harvest — confirm.
+  truncated: {
+    stdout: boolean;
+    stderr: boolean;
+    files: number;
+  };
+  outputFiles: OutputFile[];
+  /**
+   * Populated only for multi-step (`ExecuteRequest.steps`) requests; one
+   * entry per requested step. Omitted entirely in single-script mode so
+   * existing callers don't have to thread the field through.
+   */
+  steps?: SandboxStepResult[];
+  /**
+   * Upload telemetry — per-file attempted / succeeded counts plus per-
+   * failure detail. Present on responses from runs that reached the
+   * result-building stage; error responses built by `makeError` omit it,
+   * and the platform-side validator also allows omission for old-image
+   * back-compat.
+   */
+  uploadStats?: UploadStats;
+  /**
+   * Per-phase timing breakdown (ms): `stageMs` (prior-output fetch +
+   * file writes), `executeMs` (inner docker run), `harvestMs` (output
+   * walk), `uploadMs` (presigned-URL POSTs + EP2 round-trips). Omitted on
+   * error responses built by `makeError`.
+   */
+  timing?: {
+    stageMs: number;
+    executeMs: number;
+    harvestMs: number;
+    uploadMs: number;
+  };
+  /**
+   * Pre-stage attestation (crispy-curry plan §3). For every entry in
+   * `ExecuteRequest.priorOutputDownloads` the spawner reports back whether
+   * it landed on `/workspace/output/` (`staged[]`, with bytes + sha256) or
+   * was skipped (`skipped[]`, with a structured reason).
+   *
+   * The platform diffs `staged[]` against the manifest it sent and aborts
+   * the run with `PRE_STAGE_FAILED` if any expected file is missing —
+   * BEFORE user code runs, so the script never sees a partially-corrupted
+   * workspace. Omitted from the response only when the request had no
+   * `priorOutputDownloads` (nothing to attest).
+   */
+  priorStage?: PriorStageResult;
+}
+
+/**
+ * Spawner-wide settings. NOTE(review): presumably the whole object is
+ * produced once by `loadConfig()` at boot (the `sandboxToken` note below
+ * says so for that field) — confirm.
+ */
+export interface SpawnerConfig {
+  port: number;
+  // Token policy: opt-in verification. When null, the spawner skips HMAC
+  // checks on every route (a single warn at boot logs the state). When
+  // set, the wire path enforces signatures. Set by `loadConfig()` once
+  // at boot from `SANDBOX_TOKEN`; empty-string is treated as null.
+  sandboxToken: string | null;
+  runtimeImage: string;
+  runtime: 'runc' | 'runsc';
+  defaultTimeoutMs: number;
+  maxTimeoutMs: number;
+  maxConcurrent: number;
+  hostSessionRoot: string;
+  cacheVolumePrefix: { pip: string; npm: string };
+  egressNetwork: string;
+  egressProxy: string;
+  // NOTE(review): presumably the byte caps applied to captured stdout /
+  // stderr before they are base64-encoded into the response — confirm.
+  stdoutMaxBytes: number;
+  stderrMaxBytes: number;
+  // NOTE(review): presumably per-file and aggregate byte caps for harvested
+  // output files — confirm against the harvest code.
+  outputFileMaxBytes: number;
+  outputTotalMaxBytes: number;
+  // Maximum request body size (bytes) for /v1/execute. Defaults to 256 KB
+  // to bound the unsigned-mode OOM surface (audit finding).
+  maxRequestBodyBytes: number;
+}
diff --git a/services/sandbox/src/validate-request.test.ts b/services/sandbox/src/validate-request.test.ts
new file mode 100644
index 000000000..58cc69568
--- /dev/null
+++ b/services/sandbox/src/validate-request.test.ts
@@ -0,0 +1,435 @@
+// Runtime validation covers every field downstream code trusts. The
+// spawner side previously did `as ExecuteRequest` and would crash deep
+// inside `spawn.ts` / `docker-args.ts` on a malformed input.
+
+import { describe, expect, test } from 'bun:test';
+
+import { validateExecuteRequest } from './validate-request.ts';
+
+// Minimal valid request shape. Post-sandbox-wobbly-origami the spawner
+// requires the platform to pre-allocate upload-slot URLs + supply the
+// EP1/EP2 callback endpoints; tests still pin to a tiny fixture but the
+// new fields are present so we exercise the success path on every call.
+//
+// NOTE(review): several rejection cases below build a body from scratch
+// WITHOUT the upload fields instead of spreading `good`. In those cases the
+// regex on `r.error` is the only thing tying the rejection to the rule under
+// test, so they depend on the validator reporting that rule before the
+// missing upload fields.
+const good = {
+  executionId: 'abc-123',
+  organizationId: 'org_42',
+  language: 'python',
+  files: [{ path: 'main.py', content: 'print("hi")' }],
+  entryPath: 'main.py',
+  outputUploadSlots: [{ url: 'http://proxy/api/storage/upload?token=test' }],
+  outputUrlEndpoint: 'http://proxy/api/sandbox/output_upload_url',
+  reportUploadedEndpoint: 'http://proxy/api/sandbox/record_uploaded',
+};
+
+// Suite layout: generic field checks, then the entryPath/steps mutex,
+// single-script mode, multi-step mode (including polyglot), and finally
+// the download / upload plumbing fields.
+describe('validateExecuteRequest', () => {
+  test('accepts a minimal valid body', () => {
+    const r = validateExecuteRequest(good);
+    expect(r.ok).toBe(true);
+    if (r.ok) {
+      expect(r.request.executionId).toBe('abc-123');
+      expect(r.request.language).toBe('python');
+      expect(r.request.entryPath).toBe('main.py');
+      expect(r.request.files).toEqual([
+        { path: 'main.py', content: 'print("hi")' },
+      ]);
+    }
+  });
+
+  test('rejects null / non-object', () => {
+    expect(validateExecuteRequest(null).ok).toBe(false);
+    expect(validateExecuteRequest('hello').ok).toBe(false);
+    expect(validateExecuteRequest([1, 2, 3]).ok).toBe(false);
+  });
+
+  test('rejects bad executionId alphabet', () => {
+    const r = validateExecuteRequest({ ...good, executionId: 'abc;rm -rf' });
+    expect(r.ok).toBe(false);
+    if (!r.ok) expect(r.error).toMatch(/executionId/);
+  });
+
+  test('rejects bad organizationId alphabet', () => {
+    const r = validateExecuteRequest({ ...good, organizationId: 'a b' });
+    expect(r.ok).toBe(false);
+    if (!r.ok) expect(r.error).toMatch(/organizationId/);
+  });
+
+  test('rejects unknown language', () => {
+    const r = validateExecuteRequest({ ...good, language: 'ruby' });
+    expect(r.ok).toBe(false);
+    if (!r.ok) expect(r.error).toMatch(/language/);
+  });
+
+  test('rejects non-array packages', () => {
+    const r = validateExecuteRequest({ ...good, packages: 'numpy' });
+    expect(r.ok).toBe(false);
+  });
+
+  test('rejects packages with > 20 entries', () => {
+    // 21 entries: one past MAX_PACKAGES (20) in validate-request.ts.
+    const r = validateExecuteRequest({
+      ...good,
+      packages: Array.from({ length: 21 }, (_, i) => `pkg-${i}`),
+    });
+    expect(r.ok).toBe(false);
+  });
+
+  test('rejects oversized package spec', () => {
+    // 500 chars: well past MAX_PACKAGE_SPEC (200) in validate-request.ts.
+    const r = validateExecuteRequest({
+      ...good,
+      packages: ['x'.repeat(500)],
+    });
+    expect(r.ok).toBe(false);
+  });
+
+  test('rejects negative timeoutMs', () => {
+    const r = validateExecuteRequest({ ...good, timeoutMs: -1 });
+    expect(r.ok).toBe(false);
+  });
+
+  test('rejects out-of-range timeoutMs', () => {
+    // 1e9 ms is far above MAX_TIMEOUT_MS (600_000) in validate-request.ts.
+    const r = validateExecuteRequest({ ...good, timeoutMs: 1_000_000_000 });
+    expect(r.ok).toBe(false);
+  });
+
+  test('rejects non-numeric timeoutMs (regression: previous "as" cast let strings through)', () => {
+    const r = validateExecuteRequest({ ...good, timeoutMs: '30000' });
+    expect(r.ok).toBe(false);
+  });
+
+  test('rejects non-boolean options.allowSdist', () => {
+    const r = validateExecuteRequest({
+      ...good,
+      options: { allowSdist: 'yes' },
+    });
+    expect(r.ok).toBe(false);
+  });
+
+  test('accepts options shape with both flags', () => {
+    const r = validateExecuteRequest({
+      ...good,
+      options: { allowSdist: true, allowInstallScripts: false },
+    });
+    expect(r.ok).toBe(true);
+  });
+
+  test('preserves only known fields (drops unrecognized keys)', () => {
+    const r = validateExecuteRequest({
+      ...good,
+      unknownField: 'should-not-survive',
+    });
+    expect(r.ok).toBe(true);
+    if (r.ok) {
+      expect(r.request).not.toHaveProperty('unknownField');
+    }
+  });
+
+  // ----- mutex (entryPath xor steps) -----
+
+  test('rejects request with both entryPath and steps (mutex)', () => {
+    // `good` already carries entryPath, so adding steps sets both.
+    const r = validateExecuteRequest({
+      ...good,
+      steps: ['main.py'],
+    });
+    expect(r.ok).toBe(false);
+    if (!r.ok) expect(r.error).toMatch(/exactly one/);
+  });
+
+  test('rejects request with neither entryPath nor steps', () => {
+    const r = validateExecuteRequest({
+      executionId: 'abc-123',
+      organizationId: 'org_42',
+      language: 'python',
+      files: [{ path: 'main.py', content: 'x' }],
+    });
+    expect(r.ok).toBe(false);
+    if (!r.ok) expect(r.error).toMatch(/exactly one/);
+  });
+
+  // ----- single-script (`entryPath`) mode -----
+
+  test('rejects single-script mode without files[]', () => {
+    const r = validateExecuteRequest({
+      executionId: 'abc-123',
+      organizationId: 'org_42',
+      language: 'python',
+      entryPath: 'main.py',
+    });
+    expect(r.ok).toBe(false);
+    if (!r.ok) expect(r.error).toMatch(/files\[\]/);
+  });
+
+  test('rejects entryPath that has no matching files[] entry', () => {
+    const r = validateExecuteRequest({
+      executionId: 'abc-123',
+      organizationId: 'org_42',
+      language: 'python',
+      entryPath: 'missing.py',
+      files: [{ path: 'main.py', content: 'print(1)' }],
+    });
+    expect(r.ok).toBe(false);
+    if (!r.ok) expect(r.error).toMatch(/must reference a path in files/);
+  });
+
+  test('rejects entryPath whose file is empty', () => {
+    const r = validateExecuteRequest({
+      executionId: 'abc-123',
+      organizationId: 'org_42',
+      language: 'python',
+      entryPath: 'main.py',
+      files: [{ path: 'main.py', content: '' }],
+    });
+    expect(r.ok).toBe(false);
+    if (!r.ok) expect(r.error).toMatch(/empty/);
+  });
+
+  test('rejects non-string entryPath', () => {
+    const r = validateExecuteRequest({
+      ...good,
+      entryPath: 42,
+    });
+    expect(r.ok).toBe(false);
+  });
+
+  // ----- multi-step (`steps`) mode -----
+
+  test('accepts a valid multi-step request', () => {
+    // An empty outputUploadSlots pool is accepted here.
+    const r = validateExecuteRequest({
+      executionId: 'abc-123',
+      organizationId: 'org_42',
+      language: 'python',
+      steps: ['gen.py', 'validate.py'],
+      files: [
+        { path: 'gen.py', content: 'print("gen")' },
+        { path: 'validate.py', content: 'print("validate")' },
+      ],
+      outputUploadSlots: [],
+      outputUrlEndpoint: 'http://proxy/api/sandbox/output_upload_url',
+      reportUploadedEndpoint: 'http://proxy/api/sandbox/record_uploaded',
+    });
+    expect(r.ok).toBe(true);
+    if (r.ok) {
+      expect(r.request.steps).toEqual(['gen.py', 'validate.py']);
+      expect(r.request.entryPath).toBeUndefined();
+    }
+  });
+
+  test('rejects empty steps array', () => {
+    const r = validateExecuteRequest({
+      executionId: 'abc-123',
+      organizationId: 'org_42',
+      language: 'python',
+      steps: [],
+      files: [{ path: 'gen.py', content: 'x' }],
+    });
+    expect(r.ok).toBe(false);
+    if (!r.ok) expect(r.error).toMatch(/at least one/);
+  });
+
+  test('rejects steps without files[]', () => {
+    const r = validateExecuteRequest({
+      executionId: 'abc-123',
+      organizationId: 'org_42',
+      language: 'python',
+      steps: ['gen.py'],
+    });
+    expect(r.ok).toBe(false);
+    if (!r.ok) expect(r.error).toMatch(/files\[\]/);
+  });
+
+  test('rejects step path not present in files[]', () => {
+    const r = validateExecuteRequest({
+      executionId: 'abc-123',
+      organizationId: 'org_42',
+      language: 'python',
+      steps: ['missing.py'],
+      files: [{ path: 'gen.py', content: 'x' }],
+    });
+    expect(r.ok).toBe(false);
+    if (!r.ok) expect(r.error).toMatch(/must reference a path in files/);
+  });
+
+  test('accepts steps including main.py — the leaky-abstraction regression gate', () => {
+    // The user's literal trigger workflow: generator named main.py, validator
+    // named test.py, both run in sequence. Before the reservation removal this
+    // case errored out at the validator with "reserved entrypoint filename".
+    const r = validateExecuteRequest({
+      executionId: 'abc-123',
+      organizationId: 'org_42',
+      language: 'python',
+      steps: ['main.py', 'test.py'],
+      files: [
+        { path: 'main.py', content: 'print("gen")' },
+        { path: 'test.py', content: 'print("validate")' },
+      ],
+      outputUploadSlots: [],
+      outputUrlEndpoint: 'http://proxy/api/sandbox/output_upload_url',
+      reportUploadedEndpoint: 'http://proxy/api/sandbox/record_uploaded',
+    });
+    expect(r.ok).toBe(true);
+    if (r.ok) {
+      expect(r.request.steps).toEqual(['main.py', 'test.py']);
+    }
+  });
+
+  test('accepts a node multi-step request with main.js', () => {
+    const r = validateExecuteRequest({
+      executionId: 'abc-123',
+      organizationId: 'org_42',
+      language: 'node',
+      steps: ['main.js'],
+      files: [{ path: 'main.js', content: 'console.log(1)' }],
+      outputUploadSlots: [],
+      outputUrlEndpoint: 'http://proxy/api/sandbox/output_upload_url',
+      reportUploadedEndpoint: 'http://proxy/api/sandbox/record_uploaded',
+    });
+    expect(r.ok).toBe(true);
+  });
+
+  test('rejects steps with > MAX_STEPS_PER_REQUEST entries', () => {
+    // 11 steps — assumes MAX_STEPS_PER_REQUEST in wire.ts is 10; TODO
+    // confirm, or derive the length from the imported constant.
+    const files = Array.from({ length: 11 }, (_, i) => ({
+      path: `s${i}.py`,
+      content: 'x',
+    }));
+    const r = validateExecuteRequest({
+      executionId: 'abc-123',
+      organizationId: 'org_42',
+      language: 'python',
+      steps: files.map((f) => f.path),
+      files,
+    });
+    expect(r.ok).toBe(false);
+    if (!r.ok) expect(r.error).toMatch(/exceeds .* limit/);
+  });
+
+  test('accepts polyglot multi-step with mixed .py + .js extensions', () => {
+    const r = validateExecuteRequest({
+      executionId: 'poly-1',
+      organizationId: 'org_42',
+      language: 'polyglot',
+      steps: ['gen.js', 'qa.py'],
+      files: [
+        { path: 'gen.js', content: 'console.log("gen")' },
+        { path: 'qa.py', content: 'print("qa")' },
+      ],
+      packagesByLang: {
+        python: ['markitdown[pptx]==0.0.1a3'],
+        node: ['pptxgenjs@3.12.0'],
+      },
+      outputUploadSlots: [],
+      outputUrlEndpoint: 'http://proxy/api/sandbox/output_upload_url',
+      reportUploadedEndpoint: 'http://proxy/api/sandbox/record_uploaded',
+    });
+    expect(r.ok).toBe(true);
+    if (r.ok) {
+      expect(r.request.language).toBe('polyglot');
+      expect(r.request.steps).toEqual(['gen.js', 'qa.py']);
+      expect(r.request.packagesByLang).toEqual({
+        python: ['markitdown[pptx]==0.0.1a3'],
+        node: ['pptxgenjs@3.12.0'],
+      });
+    }
+  });
+
+  test('rejects polyglot with a step using an unsupported extension', () => {
+    const r = validateExecuteRequest({
+      executionId: 'poly-2',
+      organizationId: 'org_42',
+      language: 'polyglot',
+      steps: ['main.py', 'helper.rb'],
+      files: [
+        { path: 'main.py', content: 'print(1)' },
+        { path: 'helper.rb', content: 'puts 1' },
+      ],
+    });
+    expect(r.ok).toBe(false);
+    if (!r.ok) expect(r.error).toMatch(/unsupported polyglot extension/);
+  });
+
+  test('rejects polyglot without steps (single-script mode is not allowed)', () => {
+    const r = validateExecuteRequest({
+      executionId: 'poly-3',
+      organizationId: 'org_42',
+      language: 'polyglot',
+      entryPath: 'main.py',
+      files: [{ path: 'main.py', content: 'print(1)' }],
+      outputUploadSlots: [],
+      outputUrlEndpoint: 'http://proxy/api/sandbox/output_upload_url',
+      reportUploadedEndpoint: 'http://proxy/api/sandbox/record_uploaded',
+    });
+    expect(r.ok).toBe(false);
+    if (!r.ok) expect(r.error).toMatch(/polyglot requires/);
+  });
+
+  test('passes through priorOutputDownloads when valid', () => {
+    // Regression guard: the validator's request-output allowlist used to
+    // silently drop `priorOutputFiles` (legacy field). Post-sandbox-
+    // wobbly-origami this is `priorOutputDownloads` (URL list, no base64).
+    const r = validateExecuteRequest({
+      ...good,
+      priorOutputDownloads: [
+        { name: 'deck.pptx', url: 'http://proxy/api/storage/abc' },
+        { name: 'nested/report.txt', url: 'http://proxy/api/storage/def' },
+      ],
+    });
+    expect(r.ok).toBe(true);
+    if (r.ok) {
+      expect(r.request.priorOutputDownloads).toEqual([
+        { name: 'deck.pptx', url: 'http://proxy/api/storage/abc' },
+        { name: 'nested/report.txt', url: 'http://proxy/api/storage/def' },
+      ]);
+    }
+  });
+
+  test('rejects non-array priorOutputDownloads', () => {
+    const r = validateExecuteRequest({
+      ...good,
+      priorOutputDownloads: 'oops',
+    });
+    expect(r.ok).toBe(false);
+    if (!r.ok) expect(r.error).toMatch(/priorOutputDownloads/);
+  });
+
+  test('rejects priorOutputDownloads entry with non-string fields', () => {
+    const r = validateExecuteRequest({
+      ...good,
+      priorOutputDownloads: [{ name: 'x', url: 123 }],
+    });
+    expect(r.ok).toBe(false);
+    if (!r.ok) expect(r.error).toMatch(/url/);
+  });
+
+  test('rejects body missing outputUploadSlots', () => {
+    const { outputUploadSlots: _, ...withoutSlots } = good;
+    const r = validateExecuteRequest(withoutSlots);
+    expect(r.ok).toBe(false);
+    if (!r.ok) expect(r.error).toMatch(/outputUploadSlots/);
+  });
+
+  test('rejects body missing outputUrlEndpoint', () => {
+    const { outputUrlEndpoint: _, ...withoutEndpoint } = good;
+    const r = validateExecuteRequest(withoutEndpoint);
+    expect(r.ok).toBe(false);
+    if (!r.ok) expect(r.error).toMatch(/outputUrlEndpoint/);
+  });
+
+  // NOTE(review): there is no companion case for a body missing
+  // `reportUploadedEndpoint`, the third required upload field — consider
+  // adding one once the validator's behaviour for it is confirmed.
+
+  test('rejects packagesByLang exceeding combined 20-spec cap', () => {
+    // 15 + 10 = 25 specs: each bucket is under 20 on its own, so only a
+    // combined-count check can reject this body.
+    const r = validateExecuteRequest({
+      executionId: 'poly-4',
+      organizationId: 'org_42',
+      language: 'polyglot',
+      steps: ['gen.js', 'qa.py'],
+      files: [
+        { path: 'gen.js', content: 'console.log(1)' },
+        { path: 'qa.py', content: 'print(1)' },
+      ],
+      packagesByLang: {
+        python: Array.from({ length: 15 }, (_, i) => `pkg${i}`),
+        node: Array.from({ length: 10 }, (_, i) => `npm${i}`),
+      },
+      outputUploadSlots: [],
+      outputUrlEndpoint: 'http://proxy/api/sandbox/output_upload_url',
+      reportUploadedEndpoint: 'http://proxy/api/sandbox/record_uploaded',
+    });
+    expect(r.ok).toBe(false);
+    if (!r.ok) expect(r.error).toMatch(/combined.*limit/i);
+  });
+});
diff --git a/services/sandbox/src/validate-request.ts b/services/sandbox/src/validate-request.ts
new file mode 100644
index 000000000..12d294f2b
--- /dev/null
+++ b/services/sandbox/src/validate-request.ts
@@ -0,0 +1,570 @@
+// Hand-rolled runtime validator for `POST /v1/execute` request bodies.
+//
+// The spawner ships ZERO runtime dependencies by design (server.ts is
+// Bun-native + node:crypto only), so we can't reach for zod/valibot here.
+// This file is the boundary between "an unknown object that came off the
+// wire" and the typed `ExecuteRequest` the rest of the pipeline accepts.
+//
+// Every field is checked against:
+//   1. type (string/number/array/object)
+//   2. shape constraints (length, alphabet, range)
+//
+// Audit finding R2-B3: server.ts previously did `parsedUnknown as
+// ExecuteRequest` and only spot-checked `executionId`. Each remaining
+// field was forwarded into deeper logic (spawn.ts, docker-args.ts) where
+// a malformed input would crash with a less useful diagnostic.
+
+import type { ExecuteRequest, Language, SandboxFile } from './types.ts';
+import {
+  FILE_PATH_SEGMENT_RE,
+  ID_ALPHABET_RE,
+  MAX_FILES_BYTES,
+  MAX_FILES_PER_REQUEST,
+  MAX_FILE_PATH_LENGTH,
+  MAX_STEPS_PER_REQUEST,
+  ORG_ID_ALPHABET_RE,
+  POLYGLOT_NODE_EXT_RE,
+  POLYGLOT_PYTHON_EXT_RE,
+  sandboxLanguageLiterals,
+} from './wire.ts';
+
+type ValidateResult =
+  | { ok: true; request: ExecuteRequest }
+  | { ok: false; error: string };
+
+// Caps mirror what downstream argv builders + the runtime image accept.
+// The spawner-side body cap (cfg.maxRequestBodyBytes, default 20 MB)
+// is the hard upper bound on string sizes; per-field caps below stay
+// inside that and surface as readable error strings instead of cryptic
+// downstream throws.
+const MAX_PACKAGES = 20;
+const MAX_PACKAGE_SPEC = 200;
+const MAX_PURPOSE = 200;
+const MAX_TIMEOUT_MS = 600_000; // 10 minutes — well above the runtime watchdog
+
+function isString(v: unknown): v is string {
+  return typeof v === 'string'; // primitives only — `new String()` objects fail
+}
+
+function isLanguage(v: unknown): v is Language {
+  // Membership test against the wire-level language literals; widening the
+  // tuple to `readonly string[]` lets `.includes` take any string candidate.
+  const allowed: readonly string[] = sandboxLanguageLiterals;
+  return typeof v === 'string' && allowed.includes(v);
+}
+
+export function validateExecuteRequest(raw: unknown): ValidateResult {
+  if (raw === null || typeof raw !== 'object' || Array.isArray(raw)) {
+    return { ok: false, error: 'request body must be a JSON object' };
+  }
+  // After the guard above `raw` is `object`; reading string-indexed properties
+  // through a typed Record is the canonical wire-shape narrowing pattern used
+  // throughout this file (see also validateFiles).
+  // oxlint-disable-next-line typescript-eslint/no-unsafe-type-assertion
+  const r = raw as Record<string, unknown>;
+
+  if (!isString(r.executionId) || !ID_ALPHABET_RE.test(r.executionId)) {
+    return { ok: false, error: 'executionId is missing or malformed' };
+  }
+  if (
+    !isString(r.organizationId) ||
+    !ORG_ID_ALPHABET_RE.test(r.organizationId)
+  ) {
+    return { ok: false, error: 'organizationId is missing or malformed' };
+  }
+  if (!isLanguage(r.language)) {
+    return {
+      ok: false,
+      error: `language must be one of ${sandboxLanguageLiterals.join(', ')}`,
+    };
+  }
+
+  // `entryPath` (single-script) and `steps` (multi-script) are mutually
+  // exclusive — exactly one must be present. Single-script mode exec()s
+  // the file at `entryPath` directly; multi-script mode generates a
+  // wrapper at /workspace/.tale/runner.{py,js} that subprocess-invokes
+  // each step. Allowing both would let a caller shadow the wrapper's
+  // entry semantics; rejecting neither prevents a no-op container spawn.
+  const entryProvided = r.entryPath !== undefined;
+  const stepsProvided = r.steps !== undefined;
+  if (entryProvided === stepsProvided) {
+    return {
+      ok: false,
+      error: 'request must set exactly one of `entryPath` or `steps`',
+    };
+  }
+
+  // packages: optional string[] with length + per-element-length caps.
+  let packages: string[] | undefined;
+  if (r.packages !== undefined) {
+    if (!Array.isArray(r.packages)) {
+      return { ok: false, error: 'packages must be an array of strings' };
+    }
+    if (r.packages.length > MAX_PACKAGES) {
+      return {
+        ok: false,
+        error: `packages exceeds ${MAX_PACKAGES}-item limit`,
+      };
+    }
+    const validated: string[] = [];
+    for (const p of r.packages) {
+      if (!isString(p)) {
+        return { ok: false, error: 'every package entry must be a string' };
+      }
+      if (p.length > MAX_PACKAGE_SPEC) {
+        return {
+          ok: false,
+          error: `package spec exceeds ${MAX_PACKAGE_SPEC}-char limit`,
+        };
+      }
+      validated.push(p);
+    }
+    packages = validated;
+  }
+
+  // packagesByLang: optional grouped form. Either bucket may be omitted;
+  // the entrypoint skips a bucket whose list is empty. The MAX_PACKAGES
+  // cap applies to the combined length so a polyglot caller cannot
+  // smuggle 40 specs by splitting them across buckets.
+  let packagesByLang: ExecuteRequest['packagesByLang'];
+  if (r.packagesByLang !== undefined) {
+    if (
+      r.packagesByLang === null ||
+      typeof r.packagesByLang !== 'object' ||
+      Array.isArray(r.packagesByLang)
+    ) {
+      return {
+        ok: false,
+        error: 'packagesByLang must be an object',
+      };
+    }
+    // oxlint-disable-next-line typescript-eslint/no-unsafe-type-assertion
+    const grouped = r.packagesByLang as Record<string, unknown>;
+    const buckets: Array<['python' | 'node', unknown]> = [
+      ['python', grouped.python],
+      ['node', grouped.node],
+    ];
+    const validatedByLang: { python?: string[]; node?: string[] } = {};
+    let total = 0;
+    for (const [lang, rawBucket] of buckets) {
+      if (rawBucket === undefined) continue;
+      if (!Array.isArray(rawBucket)) {
+        return {
+          ok: false,
+          error: `packagesByLang.${lang} must be an array of strings`,
+        };
+      }
+      const list: string[] = [];
+      for (const p of rawBucket) {
+        if (!isString(p)) {
+          return {
+            ok: false,
+            error: `every packagesByLang.${lang} entry must be a string`,
+          };
+        }
+        if (p.length > MAX_PACKAGE_SPEC) {
+          return {
+            ok: false,
+            error: `packagesByLang.${lang} spec exceeds ${MAX_PACKAGE_SPEC}-char limit`,
+          };
+        }
+        list.push(p);
+      }
+      total += list.length;
+      if (list.length > 0) validatedByLang[lang] = list;
+    }
+    if (total > MAX_PACKAGES) {
+      return {
+        ok: false,
+        error: `packagesByLang exceeds combined ${MAX_PACKAGES}-item limit`,
+      };
+    }
+    packagesByLang =
+      Object.keys(validatedByLang).length > 0 ? validatedByLang : undefined;
+  }
+
+  // timeoutMs: optional positive number, bounded.
+  let timeoutMs: number | undefined;
+  if (r.timeoutMs !== undefined) {
+    if (
+      typeof r.timeoutMs !== 'number' ||
+      !Number.isFinite(r.timeoutMs) ||
+      r.timeoutMs <= 0 ||
+      r.timeoutMs > MAX_TIMEOUT_MS
+    ) {
+      return {
+        ok: false,
+        error: `timeoutMs must be a positive number ≤ ${MAX_TIMEOUT_MS}`,
+      };
+    }
+    timeoutMs = r.timeoutMs;
+  }
+
+  // options: optional object with two optional booleans. Only keys that
+  // were actually sent are copied (unknown keys drop); `{}` stays `{}`.
+  let options: ExecuteRequest['options'];
+  if (r.options !== undefined) {
+    if (
+      r.options === null ||
+      typeof r.options !== 'object' ||
+      Array.isArray(r.options)
+    ) {
+      return { ok: false, error: 'options must be an object' };
+    }
+    // Same wire-shape narrowing as `r` at the top of validateExecuteRequest.
+    // oxlint-disable-next-line typescript-eslint/no-unsafe-type-assertion
+    const opts = r.options as Record<string, unknown>;
+    if (opts.allowSdist !== undefined && typeof opts.allowSdist !== 'boolean') {
+      return { ok: false, error: 'options.allowSdist must be a boolean' };
+    }
+    if (
+      opts.allowInstallScripts !== undefined &&
+      typeof opts.allowInstallScripts !== 'boolean'
+    ) {
+      return {
+        ok: false,
+        error: 'options.allowInstallScripts must be a boolean',
+      };
+    }
+    options = {
+      ...(opts.allowSdist !== undefined && {
+        allowSdist: opts.allowSdist,
+      }),
+      ...(opts.allowInstallScripts !== undefined && {
+        allowInstallScripts: opts.allowInstallScripts,
+      }),
+    };
+  }
+
+  // files: required for both single-script and multi-script modes —
+  // single-script needs the entry file, multi-script needs every step's
+  // file. Per-path safety mirrors the platform's `validatePath` rules;
+  // spawner-side check is defense-in-depth — never trust the upstream
+  // typecheck.
+  let files: SandboxFile[] | undefined;
+  if (r.files !== undefined) {
+    const validated = validateFiles(r.files);
+    if (!validated.ok) return { ok: false, error: validated.error };
+    files = validated.files;
+  }
+  if (files === undefined) {
+    return {
+      ok: false,
+      error: 'request must include `files[]` carrying the script contents',
+    };
+  }
+
+  // entryPath: single-script mode. Must name a non-empty file in `files[]`.
+  let entryPath: string | undefined;
+  if (entryProvided) {
+    if (!isString(r.entryPath)) {
+      return { ok: false, error: 'entryPath must be a string' };
+    }
+    const safe = isSafeRelativePath(r.entryPath);
+    if (!safe.ok) {
+      return { ok: false, error: `entryPath: ${safe.error}` };
+    }
+    const match = files.find((f) => f.path === r.entryPath);
+    if (match === undefined) {
+      return {
+        ok: false,
+        error: `entryPath "${r.entryPath}" must reference a path in files`,
+      };
+    }
+    if (match.content.length === 0) {
+      return {
+        ok: false,
+        error: `entryPath "${r.entryPath}" references an empty file`,
+      };
+    }
+    entryPath = r.entryPath;
+  }
+
+  // steps: multi-script execution list. Each step path must reference an
+  // entry in `files[]` and be safe-relative. The wrapper lives at
+  // /workspace/.tale/runner.{py,js} (a dir unreachable from user paths),
+  // so step names like "main.py" do not collide with anything.
+  let steps: string[] | undefined;
+  if (stepsProvided) {
+    if (!Array.isArray(r.steps)) {
+      return { ok: false, error: 'steps must be an array of strings' };
+    }
+    if (r.steps.length === 0) {
+      return { ok: false, error: 'steps must contain at least one entry' };
+    }
+    if (r.steps.length > MAX_STEPS_PER_REQUEST) {
+      return {
+        ok: false,
+        error: `steps exceeds ${MAX_STEPS_PER_REQUEST}-item limit`,
+      };
+    }
+    const validatedSteps: string[] = [];
+    for (let i = 0; i < r.steps.length; i += 1) {
+      const sp: unknown = r.steps[i];
+      if (!isString(sp)) {
+        return { ok: false, error: `steps[${i}] must be a string` };
+      }
+      const safe = isSafeRelativePath(sp);
+      if (!safe.ok) {
+        return { ok: false, error: `steps[${i}]: ${safe.error}` };
+      }
+      if (!files.some((f) => f.path === sp)) {
+        return {
+          ok: false,
+          error: `steps[${i}] "${sp}" must reference a path in files`,
+        };
+      }
+      validatedSteps.push(sp);
+    }
+    steps = validatedSteps;
+  }
+
+  // Polyglot mode: per-step interpreter is chosen by file extension at
+  // runtime. Validate up-front so a `.rb` step doesn't reach the wrapper
+  // and confuse it. Steps mode is required because polyglot's whole
+  // raison d'être is "different files run with different interpreters" —
+  // single-script polyglot would just be language=python or =node.
+  if (r.language === 'polyglot') {
+    if (steps === undefined) {
+      return {
+        ok: false,
+        error:
+          'language=polyglot requires `steps[]` — use language=python or =node for single-script execution',
+      };
+    }
+    for (let i = 0; i < steps.length; i += 1) {
+      const path = steps[i];
+      if (
+        path !== undefined &&
+        !POLYGLOT_PYTHON_EXT_RE.test(path) &&
+        !POLYGLOT_NODE_EXT_RE.test(path)
+      ) {
+        return {
+          ok: false,
+          error: `steps[${i}] "${path}" has an unsupported polyglot extension — must end in .py, .js, .cjs, or .mjs`,
+        };
+      }
+    }
+  }
+
+  // purpose: optional human-readable label, length-capped to defend the
+  // audit-row preview from a megabyte-sized "purpose" string.
+  // (purpose isn't in ExecuteRequest, but if a future caller ships it the
+  // spawn pipeline ignores it; bound here for defense-in-depth.)
+  if (r.purpose !== undefined && isString(r.purpose)) {
+    if (r.purpose.length > MAX_PURPOSE) {
+      return {
+        ok: false,
+        error: `purpose exceeds ${MAX_PURPOSE}-char limit`,
+      };
+    }
+  }
+
+  // priorOutputDownloads: list of {name, url} the spawner fetches during
+  // stageWorkspace. Replaces the legacy base64 priorOutputFiles —
+  // sandbox-wobbly-origami plan §1. Wire-shape validation only; URL
+  // safety (scheme/host) is left to the spawner's own fetch.
+  let priorOutputDownloads: ExecuteRequest['priorOutputDownloads'];
+  if (r.priorOutputDownloads !== undefined) {
+    if (!Array.isArray(r.priorOutputDownloads)) {
+      return { ok: false, error: 'priorOutputDownloads must be an array' };
+    }
+    const validated: { name: string; url: string }[] = [];
+    for (let i = 0; i < r.priorOutputDownloads.length; i += 1) {
+      const entry: unknown = r.priorOutputDownloads[i];
+      if (entry === null || typeof entry !== 'object' || Array.isArray(entry)) {
+        return {
+          ok: false,
+          error: `priorOutputDownloads[${i}] must be an object`,
+        };
+      }
+      // oxlint-disable-next-line typescript-eslint/no-unsafe-type-assertion
+      const e = entry as Record<string, unknown>;
+      if (!isString(e.name)) {
+        return {
+          ok: false,
+          error: `priorOutputDownloads[${i}].name must be a string`,
+        };
+      }
+      if (!isString(e.url)) {
+        return {
+          ok: false,
+          error: `priorOutputDownloads[${i}].url must be a string`,
+        };
+      }
+      validated.push({ name: e.name, url: e.url });
+    }
+    priorOutputDownloads = validated;
+  }
+
+  // outputUploadSlots: pre-allocated upload-slot URLs (required field).
+  // Empty array is acceptable — spawner will lazily request slots via EP1.
+  if (!Array.isArray(r.outputUploadSlots)) {
+    return {
+      ok: false,
+      error: 'outputUploadSlots is required and must be an array',
+    };
+  }
+  const outputUploadSlots: Array<{ url: string }> = [];
+  for (let i = 0; i < r.outputUploadSlots.length; i += 1) {
+    const entry: unknown = r.outputUploadSlots[i];
+    if (entry === null || typeof entry !== 'object' || Array.isArray(entry)) {
+      return {
+        ok: false,
+        error: `outputUploadSlots[${i}] must be an object`,
+      };
+    }
+    // oxlint-disable-next-line typescript-eslint/no-unsafe-type-assertion
+    const e = entry as Record<string, unknown>;
+    if (!isString(e.url)) {
+      return {
+        ok: false,
+        error: `outputUploadSlots[${i}].url must be a string`,
+      };
+    }
+    outputUploadSlots.push({ url: e.url });
+  }
+  if (!isString(r.outputUrlEndpoint)) {
+    return {
+      ok: false,
+      error: 'outputUrlEndpoint is required and must be a string',
+    };
+  }
+  if (!isString(r.reportUploadedEndpoint)) {
+    return {
+      ok: false,
+      error: 'reportUploadedEndpoint is required and must be a string',
+    };
+  }
+
+  return {
+    ok: true,
+    request: {
+      executionId: r.executionId,
+      organizationId: r.organizationId,
+      language: r.language,
+      ...(packages !== undefined && { packages }),
+      ...(packagesByLang !== undefined && { packagesByLang }),
+      ...(timeoutMs !== undefined && { timeoutMs }),
+      ...(options !== undefined && { options }),
+      files,
+      ...(entryPath !== undefined && { entryPath }),
+      ...(steps !== undefined && { steps }),
+      ...(priorOutputDownloads !== undefined && { priorOutputDownloads }),
+      outputUploadSlots,
+      outputUrlEndpoint: r.outputUrlEndpoint,
+      reportUploadedEndpoint: r.reportUploadedEndpoint,
+    },
+  };
+}
+
+/**
+ * Reject relative paths that could escape `/workspace/code/` or step on
+ * runtime conventions. Mirrors the subset of platform-side validatePath
+ * that matters at the spawner boundary; the platform's full 16-rule
+ * pipeline (NFC, BiDi, zero-width, Windows-reserved) runs server-side
+ * before any request reaches this code.
+ */
+function isSafeRelativePath(
+  p: string,
+): { ok: true } | { ok: false; error: string } {
+  // Every rejection below is built through this one helper, so the failure
+  // variant has a single shape.
+  const reject = (error: string) => ({ ok: false as const, error });
+
+  // Whole-string checks first, in the order their messages are reported.
+  if (p === '') return reject('path is empty');
+  if (p.length > MAX_FILE_PATH_LENGTH) {
+    return reject(`path exceeds ${MAX_FILE_PATH_LENGTH} chars`);
+  }
+  // Absolute means a leading "/" or a Windows drive prefix ("C:\", "C:/").
+  const isAbsolute = p.startsWith('/') || /^[A-Za-z]:[\\/]/.test(p);
+  if (isAbsolute) return reject('path must be relative');
+  if (p.includes('\\')) return reject('path must use forward slashes');
+  if (p.startsWith('./')) return reject('path must not start with "./"');
+  if (p.endsWith('/')) return reject('path must not end with "/"');
+  if (p.includes('//')) return reject('path must not contain "//"');
+
+  // Refuse C0 control characters (NUL included) and DEL. Anything else
+  // outside the safe alphabet is caught by the per-segment regex below.
+  const hasControlChar = [...p].some((ch) => {
+    const code = ch.charCodeAt(0);
+    return code < 0x20 || code === 0x7f;
+  });
+  if (hasControlChar) {
+    return reject('path contains control characters');
+  }
+
+  // Per-segment checks: no empty / "." / ".." parts, no dotfiles, and only
+  // characters accepted by FILE_PATH_SEGMENT_RE.
+  for (const segment of p.split('/')) {
+    if (segment === '' || segment === '.' || segment === '..') {
+      return reject(`path has bad segment "${segment}"`);
+    }
+    if (segment.startsWith('.')) {
+      return reject(`hidden dotfile segment "${segment}" rejected`);
+    }
+    if (!FILE_PATH_SEGMENT_RE.test(segment)) {
+      return reject(
+        `path segment "${segment}" has chars outside [A-Za-z0-9._-]`,
+      );
+    }
+  }
+  return { ok: true };
+}
+
+function validateFiles(
+  raw: unknown,
+): { ok: true; files: SandboxFile[] } | { ok: false; error: string } {
+  const fail = (error: string) => ({ ok: false as const, error });
+
+  if (!Array.isArray(raw)) return fail('files must be an array');
+  if (raw.length > MAX_FILES_PER_REQUEST) {
+    return fail(`files exceeds ${MAX_FILES_PER_REQUEST}-item limit`);
+  }
+
+  // Accumulators: accepted files, lower-cased paths already seen (for the
+  // case-insensitive duplicate check), and the running UTF-8 content size.
+  const accepted: SandboxFile[] = [];
+  const lowerCasedPaths = new Set<string>();
+  let totalBytes = 0;
+  const items: unknown[] = raw;
+  for (const [index, item] of items.entries()) {
+    if (item === null || typeof item !== 'object' || Array.isArray(item)) {
+      return fail(`files[${index}] must be an object`);
+    }
+    // `item` is a non-null, non-array object here, so it is read through a
+    // string-keyed record — the same narrowing validateExecuteRequest uses.
+    // oxlint-disable-next-line typescript-eslint/no-unsafe-type-assertion
+    const { path, content } = item as Record<string, unknown>;
+    if (!isString(path)) {
+      return fail(`files[${index}].path must be a string`);
+    }
+    if (!isString(content)) {
+      return fail(`files[${index}].content must be a string`);
+    }
+
+    const pathCheck = isSafeRelativePath(path);
+    if (!pathCheck.ok) {
+      return fail(`files[${index}].path: ${pathCheck.error}`);
+    }
+
+    const folded = path.toLowerCase();
+    if (lowerCasedPaths.has(folded)) {
+      return fail(
+        `files[${index}].path "${path}" duplicates an earlier entry (case-insensitive)`,
+      );
+    }
+    lowerCasedPaths.add(folded);
+
+    // The byte budget is cumulative across all files, not per file.
+    totalBytes += Buffer.byteLength(content, 'utf8');
+    if (totalBytes > MAX_FILES_BYTES) {
+      return fail(
+        `files aggregate content exceeds ${MAX_FILES_BYTES}-byte limit`,
+      );
+    }
+    accepted.push({ path, content });
+  }
+  return { ok: true, files: accepted };
+}
diff --git a/services/sandbox/src/volume.ts b/services/sandbox/src/volume.ts
new file mode 100644
index 000000000..4a073af5b
--- /dev/null
+++ b/services/sandbox/src/volume.ts
@@ -0,0 +1,104 @@
+// Per-org cache volume helpers: volume naming and lazy, idempotent creation.
+//
+// Per-org pip/npm caches are persistent named volumes scoped to organizationId
+// (R2.3 — closes the cross-tenant wheel-cache poison vector). The runtime
+// container itself uses a `--tmpfs /workspace` for the workspace, so there is
+// no per-call workspace volume to manage.
+
+import { runDocker } from './spawn-util.ts';
+import type { SpawnerConfig } from './types.ts';
+
+const ORG_SLUG_RE = /^[a-zA-Z0-9_-]{1,128}$/;
+
+function orgSlug(organizationId: string): string {
+  // Safe ids pass through unchanged and become part of a Docker volume name.
+  if (ORG_SLUG_RE.test(organizationId)) return organizationId;
+  // JSON.stringify quotes the id so odd characters show up in the message.
+  throw new Error(
+    `volume: refusing unsafe organizationId for volume name: ${JSON.stringify(organizationId)}`,
+  );
+}
+
+export function pipCacheVolumeName(
+  cfg: SpawnerConfig,
+  organizationId: string, // orgSlug() throws if this fails ORG_SLUG_RE
+): string {
+  return `${cfg.cacheVolumePrefix.pip}-${orgSlug(organizationId)}`; // <pip prefix>-<org id>
+}
+
+export function npmCacheVolumeName(
+  cfg: SpawnerConfig,
+  organizationId: string, // orgSlug() throws if this fails ORG_SLUG_RE
+): string {
+  return `${cfg.cacheVolumePrefix.npm}-${orgSlug(organizationId)}`; // <npm prefix>-<org id>
+}
+
+// Coalesce concurrent ensureCacheVolume calls for the same volume name.
+// Two parallel /v1/execute requests from the same org trigger this twice
+// in quick succession; without a mutex, both race past the `volume inspect`
+// gate, both run `volume create`, and the second wastes a chown + race.
+// Storing the in-flight promise here lets the second caller await the
+// first's settle instead of repeating the work.
+const ensureInFlight = new Map<string, Promise<void>>();
+
+/**
+ * Lazy idempotent create. New volumes are root-owned by default and the
+ * runtime container runs as nobody (65534), so on first creation we also
+ * spin up a transient busybox to chown the volume's root to 65534:65534.
+ * Subsequent calls are no-ops (we detect via `docker volume inspect`).
+ */
+export async function ensureCacheVolume(name: string): Promise<void> {
+  let pending = ensureInFlight.get(name); // shared by concurrent callers
+  if (pending === undefined) {
+    const forget = (): void => void ensureInFlight.delete(name);
+    pending = ensureCacheVolumeUnlocked(name).finally(forget);
+    ensureInFlight.set(name, pending);
+  }
+  return pending;
+}
+
+async function ensureCacheVolumeUnlocked(name: string): Promise<void> {
+  // Existing volumes are fully set up: a failed chown is rolled back below.
+  const inspect = await runDocker(['volume', 'inspect', name]);
+  if (inspect.exitCode === 0) return;
+
+  const create = await runDocker([
+    'volume',
+    'create',
+    '--label',
+    'tale.sandbox-cache=1',
+    name,
+  ]);
+  if (create.exitCode !== 0) {
+    // Lost a create race to another process or a prior boot: the volume now
+    // exists, which is what we wanted, and its creator runs the chown.
+    const stderr = create.stderr.trim();
+    if (/already exists/i.test(stderr)) return;
+    throw new Error(
+      `volume: failed to create cache volume ${name}: ${stderr || create.stdout.trim()}`,
+    );
+  }
+
+  // One-shot chown so the unprivileged runtime user can write to the cache.
+  const chown = await runDocker([
+    'run',
+    '--rm',
+    '--user',
+    '0:0',
+    '--label',
+    'tale.sandbox-staging=1',
+    '--mount',
+    `type=volume,src=${name},dst=/cache`,
+    'busybox:1.36',
+    'chown',
+    '65534:65534',
+    '/cache',
+  ]);
+  if (chown.exitCode !== 0) {
+    // Remove the root-owned volume so the next call retries create + chown.
+    await runDocker(['volume', 'rm', '--force', name]).catch(() => undefined);
+    throw new Error(
+      `volume: failed to chown cache volume ${name}: ${chown.stderr.trim()}`,
+    );
+  }
+}
diff --git a/services/sandbox/src/wire.ts b/services/sandbox/src/wire.ts
new file mode 100644
index 000000000..1c0710567
--- /dev/null
+++ b/services/sandbox/src/wire.ts
@@ -0,0 +1,155 @@
+// Wire-protocol enums + literals shared between server.ts, spawn.ts, and
+// the response builder. Mirrors `services/platform/convex/sandbox/wire.ts`
+// on the Convex side — the spawner cannot import from Convex (different
+// runtime, different package), so this is a parallel file. Both ends must
+// stay in sync; the platform side carries a compile-time `satisfies`
+// assertion (see `convex/node_only/sandbox/helpers/spawner_client.ts`)
+// that asserts these literals are a subset of the Convex `sandboxRunStatusLiterals`
+// / `sandboxErrorCodeLiterals` / `sandboxPhaseEventLiterals` arrays, so a
+// drift on either side fails the CI typecheck.
+
+// `sandboxRunStatusLiterals` lives only on the Convex side
+// (`services/platform/convex/sandbox/wire.ts`) — the spawner never emits a
+// run-status string, only phase events + a final result with one of three
+// terminal `status` values (`completed | failed | cancelled`). Kept off
+// this file deliberately so unused-export sweeps stay clean.
+
+export const sandboxErrorCodeLiterals = [
+  'TIMEOUT',
+  'OOM',
+  'EGRESS_DENIED',
+  'INSTALL_FAILED',
+  'PACKAGE_NOT_FOUND',
+  'QUOTA_EXCEEDED',
+  'RUNTIME_ERROR',
+  'SPAWNER_UNAVAILABLE',
+  'CANCELLED',
+  'INPUT_REJECTED',
+  // Output-pipeline error codes (sandbox-wobbly-origami plan §5). Split out
+  // of the legacy catch-all so the LLM-side recovery hint can be specific:
+  // a HARVEST_READ_FAILED means "check stderr / file write didn't happen",
+  // an UPLOAD_FAILED means "transient, one retry is fine", an
+  // UPLOAD_QUOTA_EXCEEDED means "consolidate or split into multi-step", and
+  // an UPLOAD_REPORT_FAILED means "the storageId was uploaded but the
+  // report-back mutation failed — audit row may need manual reconciliation".
+  'HARVEST_READ_FAILED',
+  'UPLOAD_FAILED',
+  'UPLOAD_QUOTA_EXCEEDED',
+  'UPLOAD_REPORT_FAILED',
+  // Pre-stage attestation failure raised by the platform when
+  // `ExecuteResponse.priorStage.skipped` shows files the platform expected
+  // to inject didn't actually make it onto `/workspace/output/`. The
+  // spawner never emits this code itself — it's an action-side gate — but
+  // the literal lives here so the parity guard on the Convex side stays
+  // satisfied.
+  'PRE_STAGE_FAILED',
+  // Output-pipeline completeness gate: the action treats any non-empty
+  // `uploadStats.failures` as fatal so a partially-harvested workspace
+  // doesn't get reported as `success:true`. Same as PRE_STAGE_FAILED:
+  // this is an action-side decision, not a spawner-emitted code.
+  'UPLOAD_INCOMPLETE',
+] as const;
+
+export type SandboxErrorCode = (typeof sandboxErrorCodeLiterals)[number];
+
+export const sandboxPhaseEventLiterals = [
+  'preparing',
+  'installing',
+  'running',
+  'completed',
+] as const;
+
+export type SandboxPhaseEvent = (typeof sandboxPhaseEventLiterals)[number];
+
+/**
+ * SSE event types emitted by `POST /v1/execute`. The spawner emits:
+ *  - `phase` — zero or more transitions (preparing → installing → running)
+ *  - `stdout` / `stderr` — incremental output deltas while the container
+ *    is alive (added so the canvas can tail output instead of waiting for
+ *    the terminal `result` event with the whole base64'd buffer).
+ *  - `result` — exactly one terminal event with the canonical
+ *    ExecuteResponse shape.
+ *  - `error` — zero or one SSE-side transport error (e.g. spawn aborted
+ *    before a result was produced).
+ *
+ * The convex side has a compile-time parity guard
+ * (services/platform/convex/sandbox/wire.ts) that fails CI typecheck if
+ * either side drifts.
+ */
+export const sandboxSseEventLiterals = [
+  'phase',
+  'stdout',
+  'stderr',
+  'result',
+  'error',
+] as const;
+
+export const sandboxLanguageLiterals = ['python', 'node', 'polyglot'] as const;
+export type SandboxLanguage = (typeof sandboxLanguageLiterals)[number];
+
+// Stable id alphabet for executionId (Convex doc id + base32-ish dev ids).
+// Used by both the server route regex and the spawn-time argv assertions.
+// Centralized so widening one side doesn't drift from the other (commit
+// e9211127d widened spawn.ts + docker-args.ts but missed the cancel route).
+export const ID_ALPHABET_RE = /^[a-zA-Z0-9_-]{1,64}$/;
+export const ORG_ID_ALPHABET_RE = /^[a-zA-Z0-9_-]{1,128}$/;
+
+/**
+ * Per-segment allowlist for sandbox-staged file paths. Mirrors the strict
+ * ASCII allowlist enforced by the platform's `validatePath` (see
+ * `services/platform/convex/agent_tools/artifacts/shared.ts`). The platform
+ * runs the full 16-rule NFC + traversal + BiDi pipeline; this spawner-side
+ * regex is defense-in-depth — even if the platform side regresses, the
+ * spawner refuses to stage anything outside the safe alphabet.
+ */
+export const FILE_PATH_SEGMENT_RE = /^[A-Za-z0-9._-]+$/;
+
+/**
+ * Caps for sandbox-staged `files[]`: entry count, per-path length, and an
+ * aggregate UTF-8 byte budget summed across every file's `content`. The
+ * aggregate is enforced separately from the existing `code` cap.
+ */
+export const MAX_FILES_PER_REQUEST = 50;
+export const MAX_FILE_PATH_LENGTH = 200;
+export const MAX_FILES_BYTES = 800_000;
+
+/**
+ * Maximum number of `steps[]` per multi-step `/v1/execute` request. Each
+ * step launches one subprocess inside the same container so the cap
+ * doubles as a guard against pathological `steps.length === 1000`
+ * payloads. The spawner-generated wrapper script's size scales with this.
+ */
+export const MAX_STEPS_PER_REQUEST = 10;
+
+/**
+ * Polyglot file-extension dispatch. The spawner's multi-step wrapper
+ * looks at each step path's extension and runs the matching interpreter
+ * — `.py` → python3, `.js`/`.cjs`/`.mjs` → node. Both runtimes already
+ * live in the runtime image (Dockerfile layers Node 24 onto
+ * python:3.12-slim), so polyglot mode is purely a wrapper / install
+ * dispatch change, not an image change. Mirrored on the platform side
+ * by `inferStepLanguage()` in agent_tools/artifacts/shared.ts.
+ */
+export const POLYGLOT_PYTHON_EXT_RE = /\.py$/i; // case-insensitive: `.PY` matches too
+export const POLYGLOT_NODE_EXT_RE = /\.(?:c?js|mjs)$/i; // .js, .cjs, .mjs (any case)
+
+/**
+ * Per-step outcome reported back inside `ExecuteResponse.steps[]` when
+ * the request used multi-step mode. `path` mirrors the requested step
+ * path; `status` is `'completed'` (exit 0), `'failed'` (exit ≠ 0), or
+ * `'skipped'` (a prior step failed and fail-fast aborted the rest).
+ */
+export const sandboxStepStatusLiterals = [
+  'completed',
+  'failed',
+  'skipped',
+] as const;
+
+export type SandboxStepStatus = (typeof sandboxStepStatusLiterals)[number];
+
+export interface SandboxStepResult {
+  path: string;
+  status: SandboxStepStatus;
+  exitCode: number | null;
+  durationMs: number;
+}
diff --git a/services/sandbox/tsconfig.json b/services/sandbox/tsconfig.json
new file mode 100644
index 000000000..dd7a6dd86
--- /dev/null
+++ b/services/sandbox/tsconfig.json
@@ -0,0 +1,21 @@
+{
+  "compilerOptions": {
+    "target": "ES2022",
+    "module": "ESNext",
+    "moduleResolution": "Bundler",
+    "lib": ["ES2023"],
+    "types": ["bun"],
+    "strict": true,
+    "esModuleInterop": true,
+    "skipLibCheck": true,
+    "resolveJsonModule": true,
+    "isolatedModules": true,
+    "noEmit": true,
+    "allowImportingTsExtensions": true,
+    "verbatimModuleSyntax": false,
+    "forceConsistentCasingInFileNames": true,
+    "exactOptionalPropertyTypes": false,
+    "noUncheckedIndexedAccess": true
+  },
+  "include": ["src/**/*.ts"]
+}
diff --git a/services/web/Dockerfile b/services/web/Dockerfile
index e808ab744..e6ea1aba4 100644
--- a/services/web/Dockerfile
+++ b/services/web/Dockerfile
@@ -24,6 +24,7 @@ COPY services/crawler/package.json ./services/crawler/
 COPY services/rag/package.json ./services/rag/
 COPY services/db/package.json ./services/db/
 COPY services/proxy/package.json ./services/proxy/
+COPY services/sandbox/package.json ./services/sandbox/
 COPY services/web/package.json ./services/web/
 COPY services/docs/package.json ./services/docs/
 COPY tools/cli/package.json ./tools/cli/
diff --git a/services/web/Dockerfile.dockerignore b/services/web/Dockerfile.dockerignore
index 390fe8c35..ccf5e55f9 100644
--- a/services/web/Dockerfile.dockerignore
+++ b/services/web/Dockerfile.dockerignore
@@ -121,6 +121,7 @@ services/crawler/
 services/rag/
 services/db/
 services/proxy/
+services/sandbox/
 packages/tale_knowledge/
 packages/tale_shared/
 packages/tale_telemetry/
@@ -135,6 +136,7 @@ examples/
 !services/rag/package.json
 !services/db/package.json
 !services/proxy/package.json
+!services/sandbox/package.json
 !packages/tale_knowledge/package.json
 !packages/tale_shared/package.json
 !packages/tale_telemetry/package.json
diff --git a/services/web/app/components/blocks/hardware-tiers.tsx b/services/web/app/components/blocks/hardware-tiers.tsx
index 1cfd2b237..080874e88 100644
--- a/services/web/app/components/blocks/hardware-tiers.tsx
+++ b/services/web/app/components/blocks/hardware-tiers.tsx
@@ -23,6 +23,7 @@ import type {
   HardwareMode,
 } from '@/app/pages/hardware-pricing-page';
 import { useT } from '@/lib/i18n/client';
+import { useCurrentLocale } from '@/lib/i18n/use-current-locale';
 
 /**
  * Pricing-card grid + toggles — the upper half of the hardware pricing
@@ -30,8 +31,16 @@ import { useT } from '@/lib/i18n/client';
  * on demand from `(buy, term)` so the rate-table lives in one place.
  */
 
-const HARDWARE_LOCALE = 'en-US';
+// Swiss-only product → currency is fixed at CHF, but the number-formatting
+// locale follows the page locale so a /de/ visitor sees `CHF 14'990` while
+// a /fr/ visitor sees `CHF 14 990` (audit finding R2-B12: previously
+// hardcoded to en-US which renders `CHF 14,990` for every locale).
 const HARDWARE_CURRENCY = 'CHF';
+const HARDWARE_NUMBER_LOCALE: Record<string, string> = {
+  en: 'en-CH',
+  de: 'de-CH',
+  fr: 'fr-CH',
+};
 
 const STANDARD_TIER_KEYS = ['quality', 'hybrid', 'speed'] as const;
 type StandardTierKey = (typeof STANDARD_TIER_KEYS)[number];
@@ -96,6 +105,8 @@ export function HardwareTiers({
   onTermChange,
 }: HardwareTiersProps) {
   const { t } = useT('hardwarePricing');
+  const locale = useCurrentLocale();
+  const numberLocale = HARDWARE_NUMBER_LOCALE[locale] ?? 'en-CH';
 
   const tiers = TIERS_BY_MODE[mode];
   const isRack = mode === 'rack';
@@ -152,7 +163,7 @@ export function HardwareTiers({
             billing === 'leasing' ? leasingMonthly(buy, term) : buy,
             {
               currency: HARDWARE_CURRENCY,
-              locale: HARDWARE_LOCALE,
+              locale: numberLocale,
               approximate: true,
             },
           );
diff --git a/services/web/app/components/blocks/pricing-tiers.tsx b/services/web/app/components/blocks/pricing-tiers.tsx
index 06da07c5a..ab59293e4 100644
--- a/services/web/app/components/blocks/pricing-tiers.tsx
+++ b/services/web/app/components/blocks/pricing-tiers.tsx
@@ -87,8 +87,12 @@ export function PricingTiers({
 }: PricingTiersProps) {
   const { t } = useT('pricing');
 
+  // Pass `billing` through so the displayed per-month figure reflects
+  // the yearly discount that the `billingNote.yearly` footnote promises
+  // ("2 months free" → 10/12 of the monthly rate). Audit finding
+  // R2-B12: previously the toggle moved the footnote but not the price.
   const enterprisePrice = formatMoney(
-    enterpriseMonthlyTotal(region, users),
+    enterpriseMonthlyTotal(region, users, billing),
     region,
   );
   const perUserPrice = formatMoney(PER_USER_MONTHLY[region], region);
diff --git a/services/web/app/components/blocks/segmented-radio.tsx b/services/web/app/components/blocks/segmented-radio.tsx
index 254b6fb3a..ed64aaff9 100644
--- a/services/web/app/components/blocks/segmented-radio.tsx
+++ b/services/web/app/components/blocks/segmented-radio.tsx
@@ -1,3 +1,5 @@
+import { useRef, type KeyboardEvent } from 'react';
+
 interface SegmentedRadioProps<T extends string | number> {
   ariaLabel: string;
   options: readonly T[];
@@ -10,6 +12,16 @@ interface SegmentedRadioProps<T extends string | number> {
  * Pill-style radio group for billing / region / mode / leasing-term
  * toggles. Accepts string or numeric values so the term selector can
  * pass `12 | 24 | …` directly.
+ *
+ * Keyboard contract (WAI-ARIA APG radio pattern, round-2 R2-B12):
+ *  - Only the currently-checked option is in the tab sequence
+ *    (`tabIndex=0`); the rest are `tabIndex=-1`.
+ *  - ArrowLeft / ArrowUp move selection back; ArrowRight / ArrowDown move
+ *    selection forward; selection wraps at both ends. Home / End jump to
+ *    the extremes. Each arrow press both selects and focuses the new
+ *    option, matching the canonical radio-group keyboard model.
+ *  - Space / Enter activation is handled natively by the underlying
+ *    `<button>` elements.
  */
 export function SegmentedRadio<T extends string | number>({
   ariaLabel,
@@ -18,22 +30,59 @@ export function SegmentedRadio<T extends string | number>({
   onChange,
   renderLabel,
 }: SegmentedRadioProps<T>) {
+  const buttonRefs = useRef<Array<HTMLButtonElement | null>>([]);
+
+  // Keyboard navigation for the radio group: resolves the pressed key to
+  // the index that should become selected, then selects that option and
+  // moves focus onto its button.
+  const handleKeyDown = (
+    e: KeyboardEvent<HTMLButtonElement>,
+    currentIndex: number,
+  ) => {
+    const count = options.length;
+    let targetIndex: number;
+    if (e.key === 'ArrowRight' || e.key === 'ArrowDown') {
+      targetIndex = (currentIndex + 1) % count; // wrap past the last option
+    } else if (e.key === 'ArrowLeft' || e.key === 'ArrowUp') {
+      targetIndex = (currentIndex - 1 + count) % count; // wrap before the first
+    } else if (e.key === 'Home') {
+      targetIndex = 0;
+    } else if (e.key === 'End') {
+      targetIndex = count - 1;
+    } else {
+      // Any other key keeps its default behaviour.
+      return;
+    }
+    // A navigation key was handled: cancel its default action first.
+    e.preventDefault();
+    const target = options[targetIndex];
+    // Out-of-range lookup (e.g. empty `options`) — nothing to select.
+    if (target === undefined) return;
+    onChange(target);
+    buttonRefs.current[targetIndex]?.focus();
+  };
+
   return (
     <div
       role="radiogroup"
       aria-label={ariaLabel}
       className="bg-bg-muted flex w-fit items-center gap-1 rounded-md p-0.5"
     >
-      {options.map((option) => {
+      {options.map((option, index) => {
         const isActive = value === option;
         return (
           <button
             key={String(option)}
+            ref={(el) => {
+              buttonRefs.current[index] = el;
+            }}
             type="button"
             role="radio"
             aria-checked={isActive}
+            tabIndex={isActive ? 0 : -1}
             onClick={() => onChange(option)}
-            className={`rounded-md px-3.5 py-1.5 text-sm font-medium transition-colors ${
+            onKeyDown={(e) => handleKeyDown(e, index)}
+            className={`focus-visible:ring-fg-base focus-visible:ring-offset-bg-elevated rounded-md px-3.5 py-1.5 text-sm font-medium transition-colors focus:outline-none focus-visible:ring-2 focus-visible:ring-offset-2 ${
               isActive
                 ? 'bg-bg-base text-fg-base shadow-sm dark:bg-[#404045]'
                 : 'text-fg-muted hover:text-fg-base cursor-pointer'
diff --git a/services/web/lib/pricing/tiers.ts b/services/web/lib/pricing/tiers.ts
index 7361744af..11a44d9c0 100644
--- a/services/web/lib/pricing/tiers.ts
+++ b/services/web/lib/pricing/tiers.ts
@@ -16,6 +16,31 @@ export const STORAGE_PER_TB_MONTHLY: Record<Region, number> = {
 
 export const DEFAULT_USERS = 25;
 
-export function enterpriseMonthlyTotal(region: Region, users: number): number {
-  return PER_USER_MONTHLY[region] * users;
+type Billing = 'monthly' | 'yearly';
+
+/**
+ * Discount applied to the yearly billing toggle. Mirrors the "2 months
+ * free" footnote on the pricing card — yearly customers pay 10 months
+ * of monthly rate, then divide back to a per-month displayed figure.
+ */
+const YEARLY_DISCOUNT_FACTOR = 10 / 12;
+
+/**
+ * Per-seat monthly rate for the chosen billing cadence: the regional base
+ * rate, multiplied by `YEARLY_DISCOUNT_FACTOR` when billing is yearly so
+ * the displayed price honors the "2 months free" footnote (audit finding
+ * R2-B12: both toggles previously showed the same monthly price).
+ */
+function effectivePerUserMonthly(region: Region, billing: Billing): number {
+  if (billing !== 'yearly') return PER_USER_MONTHLY[region];
+  return PER_USER_MONTHLY[region] * YEARLY_DISCOUNT_FACTOR;
+}
+
+/** Monthly total for `users` seats; `billing` defaults to `'monthly'`. */
+export function enterpriseMonthlyTotal(
+  region: Region,
+  users: number,
+  billing: Billing = 'monthly',
+): number {
+  return users * effectivePerUserMonthly(region, billing);
 }
diff --git a/tests/container-image-test.sh b/tests/container-image-test.sh
index 7fc1fe63d..3f35a0506 100755
--- a/tests/container-image-test.sh
+++ b/tests/container-image-test.sh
@@ -36,6 +36,9 @@ declare -A SIZE_BUDGETS=(
     [db]=1200
     [proxy]=100
     [convex]=2500
+    [sandbox]=300
+    [sandbox-egress]=80
+    [sandbox-runtime]=900
 )
 
 header() {
@@ -67,7 +70,29 @@ warn() {
 get_image() {
     local service=$1
     cd "${PROJECT_ROOT}"
-    ${COMPOSE_CMD} config --images 2>/dev/null | grep "${service}" | head -1
+    # Match `/tale-${service}:` exactly so a service whose name merely
+    # contains `${service}` is not picked up (plain `db` would otherwise hit
+    # `tale-san**db**ox-egress`). `|| echo ""` turns a failed pipeline into "".
+    local img
+    img=$(${COMPOSE_CMD} config --images 2>/dev/null | grep "/tale-${service}:" | head -1 || echo "")
+    if [ -n "$img" ]; then
+        echo "$img"
+        return
+    fi
+    # sandbox-runtime is not a compose service — it's pulled at boot by the
+    # spawner. Fall back to the locally-tagged image used by docker_args.ts.
+    if [ "$service" = "sandbox-runtime" ]; then
+        if docker image inspect "tale-sandbox-runtime:latest" >/dev/null 2>&1; then
+            echo "tale-sandbox-runtime:latest"
+            return
+        fi
+        # CI smoke pre-tags the GHCR image under the tale-project namespace.
+        if docker image inspect "ghcr.io/tale-project/tale/tale-sandbox-runtime:latest" >/dev/null 2>&1; then
+            echo "ghcr.io/tale-project/tale/tale-sandbox-runtime:latest"
+            return
+        fi
+    fi
+    echo ""
 }
 
 # =============================================================================
@@ -76,7 +101,7 @@ get_image() {
 cd "${PROJECT_ROOT}"
 header "Building all images locally"
 
-SERVICES=(crawler rag platform db proxy convex)
+SERVICES=(crawler rag platform db proxy convex sandbox sandbox-egress sandbox-runtime)
 declare -A IMAGES
 
 echo -e "  ${YELLOW}Building images using compose...${NC}"
@@ -84,12 +109,30 @@ if [ "${SKIP_BUILD:-false}" = "true" ]; then
     echo -e "  ${YELLOW}⚠ SKIP_BUILD=true — using pre-built images${NC}"
 else
     ${COMPOSE_CMD} build --parallel 2>&1 || { echo -e "${RED}Build failed!${NC}"; exit 1; }
+    # sandbox-runtime is not a compose service — build it separately so the
+    # image is available for inspection. Tag matches the spawner default
+    # (SANDBOX_RUNTIME_IMAGE=tale-sandbox-runtime:latest). Build context is
+    # the repo root so the Dockerfile's `services/sandbox-runtime/...` COPY
+    # paths resolve the same way as CI build-push-action (context: .).
+    if ! docker image inspect tale-sandbox-runtime:latest >/dev/null 2>&1; then
+        echo -e "  ${YELLOW}Building tale-sandbox-runtime:latest...${NC}"
+        docker build \
+            -t tale-sandbox-runtime:latest \
+            -f services/sandbox-runtime/Dockerfile \
+            . \
+            2>&1 \
+            || { echo -e "${RED}sandbox-runtime build failed!${NC}"; exit 1; }
+    fi
 fi
 
 for svc in "${SERVICES[@]}"; do
     img=$(get_image "$svc")
     IMAGES[$svc]="$img"
-    echo -e "  ${GREEN}✓${NC} ${svc}: ${img}"
+    if [ -n "$img" ]; then
+        echo -e "  ${GREEN}✓${NC} ${svc}: ${img}"
+    else
+        echo -e "  ${YELLOW}⚠${NC} ${svc}: image not found (skipping checks)"
+    fi
 done
 
 # =============================================================================
@@ -143,6 +186,21 @@ for svc in "${SERVICES[@]}"; do
                 warn "${svc}: runs as root (consider adding non-root user in future)"
             fi
             ;;
+        sandbox|sandbox-egress)
+            # Sandbox spawner needs root to read /var/run/docker.sock;
+            # sandbox-egress runs tinyproxy which manages its own user. Both
+            # are expected to start as root and drop privileges at runtime.
+            pass "${svc}: root (expected — privilege drops to docker.sock owner / tinyproxy user)"
+            ;;
+        sandbox-runtime)
+            # Runtime is pinned to uid 65534 (nobody) via USER in the
+            # Dockerfile; spawner re-asserts --user 65534:65534 at run time.
+            if [ -n "$user" ] && [ "$user" != "root" ] && [ "$user" != "0" ]; then
+                pass "${svc}: runs as user '${user}' (non-root)"
+            else
+                fail "${svc}: runtime image must not run as root"
+            fi
+            ;;
     esac
 done
 
@@ -201,6 +259,15 @@ for svc in "${SERVICES[@]}"; do
     img="${IMAGES[$svc]:-}"
     [ -z "$img" ] && continue
 
+    # sandbox-runtime is an exec'd ephemeral container (lifecycle = one
+    # `artifact_run` call). Docker HEALTHCHECK would never run because the
+    # image is invoked with `docker run --rm` and exits when the user
+    # program returns. Skip the assertion.
+    if [ "$svc" = "sandbox-runtime" ]; then
+        pass "${svc}: HEALTHCHECK skipped (ephemeral exec container)"
+        continue
+    fi
+
     healthcheck=$(docker inspect --format='{{.Config.Healthcheck}}' "$img" 2>/dev/null || echo "")
 
     if [ -n "$healthcheck" ] && [ "$healthcheck" != "<nil>" ]; then
diff --git a/tests/container-smoke-test.sh b/tests/container-smoke-test.sh
index dda160648..829563135 100755
--- a/tests/container-smoke-test.sh
+++ b/tests/container-smoke-test.sh
@@ -66,6 +66,14 @@ cleanup() {
     fi
     header "Tearing down test containers"
     ${COMPOSE_CMD} down -v --remove-orphans 2>/dev/null || true
+    # The sandbox network is declared `external:` in compose.yml — `compose
+    # down` won't remove it. Drop it manually so the next run starts clean.
+    docker network rm tale-sandbox-net >/dev/null 2>&1 || true
+    # Only remove .env if we created it (CREATED_ENV=1). Otherwise we'd
+    # clobber a developer's real .env when the smoke test exits.
+    if [ "${CREATED_ENV:-0}" = "1" ]; then
+        rm -f "${PROJECT_ROOT}/.env"
+    fi
 }
 
 trap cleanup EXIT
@@ -76,10 +84,26 @@ trap cleanup EXIT
 cd "${PROJECT_ROOT}"
 ${COMPOSE_CMD} down -v --remove-orphans 2>/dev/null || true
 
-# Ensure dummy .env exists to satisfy compose.yml env_file declarations
+# Pre-create the sandbox bridge. It's declared `external:` in compose.yml
+# because the CLI (`tale start` / `tale deploy`) owns its lifecycle —
+# `--internal --ipv6=false` can't be expressed atomically in a compose
+# `networks:` block. Smoke tests don't go through the CLI, so we create it
+# here with the same shape ensureSandboxNetwork() uses.
+docker network rm tale-sandbox-net >/dev/null 2>&1 || true
+docker network create \
+    --internal \
+    --ipv6=false \
+    --driver=bridge \
+    tale-sandbox-net >/dev/null
+
+# Ensure dummy .env exists to satisfy compose.yml env_file declarations.
+# Track whether we created it so the cleanup trap doesn't delete a real
+# .env if one already existed on a developer's box.
+CREATED_ENV=0
 if [ ! -f "${PROJECT_ROOT}/.env" ]; then
     echo -e "  ${YELLOW}⚠ No .env file found — creating placeholder with defaults${NC}"
     cp "${PROJECT_ROOT}/.env.test" "${PROJECT_ROOT}/.env"
+    CREATED_ENV=1
 fi
 
 # =============================================================================
@@ -112,12 +136,14 @@ if [ "${SKIP_BUILD:-false}" != "true" ]; then
     printf "  ${BOLD}%-15s %-45s %10s${NC}\n" "SERVICE" "IMAGE" "SIZE"
     echo "  ─────────────────────────────────────────────────────────────────────"
     TOTAL_SIZE_MB=0
-    for svc in db convex crawler rag platform proxy; do
-        # Get the image name from compose config
-        img=$(cd "${PROJECT_ROOT}" && ${COMPOSE_CMD} config --images 2>/dev/null | grep "${svc}" | head -1)
+    for svc in db convex crawler rag platform proxy sandbox sandbox-egress; do
+        # Get the image name from compose config. Use anchored grep so we
+        # don't match service names that *contain* the target (e.g. "db"
+        # would otherwise match "tale-san**db**ox-egress").
+        img=$(cd "${PROJECT_ROOT}" && ${COMPOSE_CMD} config --images 2>/dev/null | grep "/tale-${svc}:" | head -1)
         if [ -z "$img" ]; then
             # Fallback: look for tale images in docker images list
-            img=$(docker images --format '{{.Repository}}:{{.Tag}}' 2>/dev/null | grep "tale-${svc}" | head -1)
+            img=$(docker images --format '{{.Repository}}:{{.Tag}}' 2>/dev/null | grep "tale-${svc}:" | head -1)
         fi
         if [ -n "$img" ]; then
             size=$(docker images --format '{{.Size}}' "$img" 2>/dev/null | head -1)
@@ -222,7 +248,7 @@ wait_for_healthy() {
     done
 }
 
-SERVICES=(db convex crawler rag platform proxy)
+SERVICES=(db convex crawler rag platform proxy sandbox sandbox-egress)
 HEALTH_FAILED=0
 
 for svc in "${SERVICES[@]}"; do
@@ -354,6 +380,110 @@ else
     fi
 fi
 
+# =============================================================================
+# 6. Sandbox /v1/execute end-to-end probe
+# =============================================================================
+# Submits a 1-line python program signed with the test SANDBOX_TOKEN and
+# asserts the SSE stream emits an `event: result` payload with status
+# "completed". The spawner pulls tale-sandbox-runtime at boot; we don't
+# probe the runtime image directly here — if the spawner is healthy and
+# the boot pull succeeded, /v1/execute will exercise it.
+header "Sandbox /v1/execute end-to-end"
+
+# Pull SANDBOX_TOKEN from .env.test rather than re-defining it, so any local
+# rotation only has to happen in one place.
+SANDBOX_TOKEN_VAL=$(grep -E '^SANDBOX_TOKEN=' "${PROJECT_ROOT}/.env.test" | head -1 | cut -d= -f2-)
+if [ -z "${SANDBOX_TOKEN_VAL}" ]; then
+    fail "Sandbox e2e: SANDBOX_TOKEN missing from .env.test"
+else
+    # Unique per-run executionId so re-running the test (or a stale entry
+    # left in the spawner's in-flight registry from a previous run) doesn't
+    # return 409 Duplicate.
+    SMOKE_EXEC_ID="smoke-$$-$(date +%s)$(date +%N | head -c 6)"
+    # New contract (post-wobbly-origami): source ships in `files[]`,
+    # `entryPath` names the file to exec, and `outputUploadSlots` + the
+    # upload-URL endpoints are required even when no outputs are produced.
+    # `print(1)` writes nothing under /workspace/output/, so the endpoint
+    # URLs are never actually called — placeholders satisfy the validator.
+    SANDBOX_BODY="{\"executionId\":\"${SMOKE_EXEC_ID}\",\"organizationId\":\"smoke\",\"language\":\"python\",\"files\":[{\"path\":\"main.py\",\"content\":\"print(1)\"}],\"entryPath\":\"main.py\",\"timeoutMs\":30000,\"outputUploadSlots\":[],\"outputUrlEndpoint\":\"http://platform:3000/api/sandbox/output_upload_url\",\"reportUploadedEndpoint\":\"http://platform:3000/api/sandbox/record_uploaded\"}"
+    SANDBOX_TS=$(($(date +%s%N) / 1000000))
+    SANDBOX_PATH="/v1/execute"
+    # New signing contract (auth.ts): METHOD\npath\ntimestamp\nsha256Hex(body)
+    SANDBOX_BODY_HASH=$(printf '%s' "${SANDBOX_BODY}" \
+        | openssl dgst -sha256 -r 2>/dev/null \
+        | awk '{print $1}')
+    SANDBOX_SIGNED_STRING=$(printf 'POST\n%s\n%s\n%s' "${SANDBOX_PATH}" "${SANDBOX_TS}" "${SANDBOX_BODY_HASH}")
+    SANDBOX_SIG=$(printf '%s' "${SANDBOX_SIGNED_STRING}" \
+        | openssl dgst -sha256 -hmac "${SANDBOX_TOKEN_VAL}" -r 2>/dev/null \
+        | awk '{print $1}')
+    if [ -z "${SANDBOX_SIG}" ]; then
+        fail "Sandbox e2e: failed to compute HMAC signature"
+    else
+        SANDBOX_OUT=$(mktemp)
+        # SSE stream; --max-time 60 bounds the probe and absorbs cold-image
+        # pulls on a fresh runner. curl -w prints the status code even when
+        # curl errors (000 if no response), so `|| true` only masks the exit.
+        SANDBOX_HTTP=$(curl -sS \
+            -o "${SANDBOX_OUT}" \
+            -w "%{http_code}" \
+            --max-time 60 \
+            -X POST \
+            -H "content-type: application/json" \
+            -H "x-tale-sandbox-signature: ${SANDBOX_SIG}" \
+            -H "x-tale-sandbox-timestamp: ${SANDBOX_TS}" \
+            --data-binary "${SANDBOX_BODY}" \
+            "http://localhost:8003${SANDBOX_PATH}" 2>/dev/null || true)
+
+        if [ "${SANDBOX_HTTP}" = "200" ] \
+           && grep -q '^event: result' "${SANDBOX_OUT}" \
+           && grep -q '"status":"completed"' "${SANDBOX_OUT}"; then
+            pass "Sandbox /v1/execute: completed result"
+        else
+            echo -e "  ${YELLOW}sandbox response (HTTP ${SANDBOX_HTTP:-000}):${NC}"
+            if [ -s "${SANDBOX_OUT}" ]; then head -c 4000 "${SANDBOX_OUT}" | sed 's/^/    /'; else echo "    (empty body)"; fi
+            echo ""
+            fail "Sandbox /v1/execute: expected HTTP 200 + completed result"
+        fi
+        rm -f "${SANDBOX_OUT}"
+    fi
+
+    # ---- Negative cases ----
+    # Missing signature header → 401. Defense-in-depth that the spawner
+    # actually enforces HMAC under .env.test (which DOES define a token).
+    NEG_HTTP=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 \
+        -X POST \
+        -H "content-type: application/json" \
+        --data-binary '{"executionId":"unauth","organizationId":"smoke","language":"python","code":"print(1)"}' \
+        "http://localhost:8003/v1/execute" 2>/dev/null || true)  # -w already printed the code (000 if none)
+    if [ "${NEG_HTTP}" = "401" ]; then
+        pass "Sandbox /v1/execute: 401 without signature"
+    else
+        fail "Sandbox /v1/execute: expected 401 without signature, got ${NEG_HTTP:-000}"
+    fi
+
+    # 2 MiB + 1 byte body → 413. Tests the streaming body cap before HMAC
+    # check; we don't bother signing because the byte cap fires first.
+    # Cap default (cfg.maxRequestBodyBytes) is 2 MiB, see services/sandbox/src/config.ts.
+    #
+    # The body has to come from a file rather than be passed inline: the
+    # Linux kernel caps a single argv string at MAX_ARG_STRLEN (128 KiB),
+    # independent of ARG_MAX, so `--data-binary "${TOO_BIG}"` with multi-MB
+    # of payload fails the execve before curl ever runs.
+    TOO_BIG_FILE="$(mktemp)"
+    head -c 2097153 /dev/zero | tr '\0' 'x' > "${TOO_BIG_FILE}"
+    NEG_HTTP=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 \
+        -X POST \
+        -H "content-type: application/json" \
+        --data-binary "@${TOO_BIG_FILE}" \
+        "http://localhost:8003/v1/execute" 2>/dev/null || true)  # curl may exit non-zero if the upload is cut short; -w still prints the code
+    rm -f "${TOO_BIG_FILE}"
+    if [ "${NEG_HTTP}" = "413" ]; then
+        pass "Sandbox /v1/execute: 413 on oversized body"
+    else
+        fail "Sandbox /v1/execute: expected 413 on oversized body, got ${NEG_HTTP:-000}"
+    fi
+fi
+
 # =============================================================================
 # SUMMARY
 # =============================================================================
diff --git a/tools/cli/src/commands/deploy/index.ts b/tools/cli/src/commands/deploy/index.ts
index 7545352e5..4c36bd788 100644
--- a/tools/cli/src/commands/deploy/index.ts
+++ b/tools/cli/src/commands/deploy/index.ts
@@ -47,12 +47,20 @@ export function createDeployCommand(): Command {
       try {
         const projectDir = requireProject();
         await resolveOrAssignProjectContext(projectDir);
-        const { success: envSetupSuccess } = await ensureEnv({
-          deployDir: projectDir,
-        });
+        const { success: envSetupSuccess, regeneratedAutoSecrets } =
+          await ensureEnv({
+            deployDir: projectDir,
+          });
         if (!envSetupSuccess) {
           process.exit(1);
         }
+        // If ensureEnv had to mint missing auto-gen secrets headlessly
+        // (typical: a new `SANDBOX_TOKEN` for an existing deployment),
+        // force-recreate the running services so their in-memory env
+        // refreshes to the new value rather than keeping the stale null.
+        const forceRecreate =
+          regeneratedAutoSecrets !== undefined &&
+          regeneratedAutoSecrets.length > 0;
         const env = loadEnv(projectDir);
 
         const version = pkg.version.includes('-dev') ? 'latest' : pkg.version;
@@ -92,6 +100,7 @@ export function createDeployCommand(): Command {
           fresh: options.fresh,
           quiet: options.quiet,
           assumeYes: options.yes || options.migrateVolumes,
+          forceRecreate,
         });
       } catch (err) {
         logger.error(err instanceof Error ? err.message : String(err));
diff --git a/tools/cli/src/commands/doctor.ts b/tools/cli/src/commands/doctor.ts
new file mode 100644
index 000000000..5ee77cf09
--- /dev/null
+++ b/tools/cli/src/commands/doctor.ts
@@ -0,0 +1,216 @@
+import { execSync } from 'node:child_process';
+import { existsSync } from 'node:fs';
+
+import { Command } from 'commander';
+
+import { findProject } from '../lib/project/find-project';
+import { loadEnv } from '../utils/load-env';
+import * as logger from '../utils/logger';
+
+/**
+ * `tale doctor` — preflight checks for the host environment.
+ *
+ * Initial scope: sandbox-relevant items only (R1.17 surfaced that the
+ * CLI never had a doctor command). Future checks (Postgres / Docker
+ * versions, disk headroom, etc.) belong here too but are out of scope
+ * for the sandbox-foundation rollout.
+ */
+
+interface Check {
+  name: string; // label printed in the first column of the report
+  status: 'ok' | 'warn' | 'fail'; // any 'fail' makes the command exit 1
+  detail: string; // text printed after the label
+  fix?: string; // remediation hint, printed only when status is not 'ok'
+}
+
+// Runs `cmd`; trimmed stdout on success, `undefined` if execSync throws.
+function tryRun(cmd: string): string | undefined {
+  try {
+    const stdout = execSync(cmd, { stdio: ['ignore', 'pipe', 'ignore'] });
+    return stdout.toString().trim();
+  } catch {
+    return undefined;
+  }
+}
+
+// Reports the installed docker CLI version; fails when `docker --version`
+// yields no output (tryRun returned nothing).
+function checkDocker(): Check {
+  const cliVersion = tryRun('docker --version');
+  if (cliVersion) return { name: 'docker', status: 'ok', detail: cliVersion };
+  return {
+    name: 'docker',
+    status: 'fail',
+    detail: 'docker CLI not on PATH',
+    fix: 'Install Docker Engine 24+ or Docker Desktop',
+  };
+}
+
+// Confirms the Docker daemon socket exists at /var/run/docker.sock. Only
+// existence is tested, not whether a daemon answers on it.
+function checkSocket(): Check {
+  const name = 'docker socket';
+  const sock = '/var/run/docker.sock';
+  if (existsSync(sock)) {
+    return { name, status: 'ok', detail: `${sock} present` };
+  }
+  return {
+    name,
+    status: 'fail',
+    detail: `${sock} not present`,
+    fix: 'Start the Docker daemon (systemctl start docker) or open Docker Desktop',
+  };
+}
+
+// Warns (never fails) when `runsc` is absent from dockerd's runtime list.
+function checkRunsc(): Check {
+  const name = 'gVisor runtime (runsc)';
+  const reported =
+    tryRun("docker info --format '{{json .Runtimes}}' 2>/dev/null") ?? '';
+  if (/\brunsc\b/.test(reported)) {
+    return {
+      name,
+      status: 'ok',
+      detail: 'registered with dockerd; set SANDBOX_RUNTIME=runsc to opt in',
+    };
+  }
+  return {
+    name,
+    status: 'warn',
+    detail:
+      'not registered with dockerd — sandbox will use plain runc (recommended for demo stage; install runsc before exposing to untrusted external workloads)',
+    fix: 'https://gvisor.dev/docs/user_guide/install/ then `sudo runsc install && sudo systemctl restart docker`',
+  };
+}
+
+// Warns when dockerd's SecurityOptions output does not contain
+// `name=userns`, which this check treats as userns-remap being off.
+function checkUserns(): Check {
+  const name = 'dockerd userns-remap';
+  const sec = tryRun("docker info --format '{{.SecurityOptions}}' 2>/dev/null");
+  if (sec !== undefined && sec.includes('name=userns')) {
+    const detail = 'enabled — container root ≠ host root';
+    return { name, status: 'ok', detail };
+  }
+  return {
+    name,
+    status: 'warn',
+    detail:
+      'not enabled — sandbox container UID 65534 maps to host UID 65534; combined with a kernel LPE this is a path to host root',
+    fix: 'Set "userns-remap": "default" in /etc/docker/daemon.json and restart docker',
+  };
+}
+
+// AppArmor check. /sys/kernel/security/apparmor/profiles is typically
+// readable by root only, so also accept dockerd reporting `name=apparmor`.
+function checkApparmor(): Check {
+  const name = 'AppArmor docker-default';
+  const sec = tryRun("docker info --format '{{.SecurityOptions}}'") ?? '';
+  const aa = tryRun('cat /sys/kernel/security/apparmor/profiles') ?? '';
+  if (sec.includes('name=apparmor') || /docker-default/.test(aa)) {
+    return { name, status: 'ok', detail: 'profile loaded' };
+  }
+  return {
+    name,
+    status: 'warn',
+    detail:
+      'not loaded — sandbox containers rely on Docker built-in seccomp only; consider enabling AppArmor on production hosts',
+  };
+}
+
+function checkSandboxToken(env: NodeJS.ProcessEnv): Check {
+  // SANDBOX_TOKEN is optional: when unset or empty, HMAC auth is reported
+  // as disabled and the check only warns. A value that is present but
+  // shorter than 32 characters is treated as a likely truncation and
+  // fails the check; 32 characters or more passes.
+  const name = 'SANDBOX_TOKEN';
+  const token = env.SANDBOX_TOKEN ?? '';
+  if (token === '') {
+    return {
+      name,
+      status: 'warn',
+      detail:
+        'unset — HMAC auth between Convex and the sandbox spawner is disabled',
+      fix: 'Set a 64-char hex value (or re-run `tale init`) to enable signature verification',
+    };
+  }
+  if (token.length < 32) {
+    return {
+      name,
+      status: 'fail',
+      detail: `set but suspiciously short (${token.length} chars) — looks truncated`,
+      fix: 'Set a 64-char hex value (or re-run `tale init`)',
+    };
+  }
+  // Only the length is reported, never the secret itself.
+  const detail = `enabled (${token.length} chars)`;
+  return { name, status: 'ok', detail };
+}
+
+function statusIcon(s: Check['status']): string {
+  return { ok: '✓', warn: '!', fail: '✗' }[s]; // one glyph per status
+}
+
+export function createDoctorCommand(): Command {
+  return new Command('doctor')
+    .description(
+      'Preflight checks for sandbox / artifact_run host requirements (docker, runsc, userns-remap, secrets).',
+    )
+    .action(async () => {
+      // Evaluate SANDBOX_TOKEN as the project configures it. Overlay the
+      // map loadEnv() returns (deploy/index.ts consumes that return value)
+      // on process.env instead of assuming loadEnv mutates process.env.
+      // Without a project, or if loading throws, process.env is used as-is.
+      const projectDir = findProject();
+      let env: NodeJS.ProcessEnv = process.env;
+      if (projectDir !== null) {
+        try {
+          const loaded = loadEnv(projectDir);
+          env = { ...process.env, ...loaded };
+        } catch (err) {
+          logger.warn(
+            `Failed to load .env from ${projectDir}: ${err instanceof Error ? err.message : String(err)}`,
+          );
+        }
+      }
+      const checks: Check[] = [
+        checkDocker(),
+        checkSocket(),
+        checkRunsc(),
+        checkUserns(),
+        checkApparmor(),
+        checkSandboxToken(env),
+      ];
+
+      let failed = 0;
+      let warned = 0;
+      for (const c of checks) {
+        const icon = statusIcon(c.status);
+        const line = `${icon} ${c.name.padEnd(28)} ${c.detail}`;
+        if (c.status === 'ok') logger.info(line);
+        else if (c.status === 'warn') {
+          logger.warn(line);
+          warned += 1;
+        } else {
+          logger.error(line);
+          failed += 1;
+        }
+        if (c.status !== 'ok' && c.fix) {
+          logger.info(`  fix: ${c.fix}`);
+        }
+      }
+
+      logger.blank();
+      if (failed > 0) {
+        logger.error(`${failed} check(s) failed; sandbox will not work.`);
+        process.exit(1);
+      }
+      if (warned > 0) {
+        logger.warn(
+          `${warned} recommendation(s); sandbox will function but is using weaker defaults.`,
+        );
+        process.exit(0);
+      }
+      logger.success('All sandbox preflight checks passed.');
+    });
+}
diff --git a/tools/cli/src/index.ts b/tools/cli/src/index.ts
index f2aa60dbb..c5d1d16c8 100644
--- a/tools/cli/src/index.ts
+++ b/tools/cli/src/index.ts
@@ -7,6 +7,7 @@ import { createCleanupCommand } from './commands/cleanup';
 import { createConfigCommand } from './commands/config';
 import { createConvexCommand } from './commands/convex';
 import { createDeployCommand } from './commands/deploy';
+import { createDoctorCommand } from './commands/doctor';
 import { createInitCommand } from './commands/init';
 import { createLogsCommand } from './commands/logs';
 import { createResetCommand } from './commands/reset';
@@ -45,5 +46,6 @@ program.addCommand(createLogsCommand());
 program.addCommand(createRollbackCommand());
 program.addCommand(createResetCommand());
 program.addCommand(createCleanupCommand());
+program.addCommand(createDoctorCommand());
 
 await program.parseAsync();
diff --git a/tools/cli/src/lib/actions/deploy.ts b/tools/cli/src/lib/actions/deploy.ts
index 344a3dd96..667bceae4 100644
--- a/tools/cli/src/lib/actions/deploy.ts
+++ b/tools/cli/src/lib/actions/deploy.ts
@@ -12,13 +12,15 @@ import {
   type RotatableService,
   type ServiceName,
   type StatefulService,
+  LOCKSTEP_SERVICES,
   ROTATABLE_SERVICES,
   STATEFUL_SERVICES,
+  isLockstepService,
   isRotatableService,
   isStatefulService,
 } from '../compose/types';
 import { dockerCompose } from '../docker/docker-compose';
-import { ensureNetwork } from '../docker/ensure-network';
+import { ensureNetwork, ensureSandboxNetwork } from '../docker/ensure-network';
 import { ensureVolumes } from '../docker/ensure-volumes';
 import { exec } from '../docker/exec';
 import { getContainerVersion } from '../docker/get-container-version';
@@ -56,6 +58,11 @@ async function ensureInfrastructure(
   if (!networkCreated) {
     throw new Error('Failed to create required network');
   }
+  // Sandbox bridge: fixed name `tale-sandbox-net`, internal-only, IPv6 off.
+  const sandboxNetworkCreated = await ensureSandboxNetwork();
+  if (!sandboxNetworkCreated) {
+    throw new Error('Failed to create sandbox network');
+  }
 }
 
 interface DeployOptions {
@@ -71,6 +78,16 @@ interface DeployOptions {
   assumeYes?: boolean;
   /** @deprecated use assumeYes. Kept for one release of CLI back-compat. */
   migrateVolumes?: boolean;
+  /**
+   * Set by the caller when `ensureEnv` filled in auto-gen secrets headlessly
+   * (e.g. an upgrade silently materialized `SANDBOX_TOKEN`). All subsequent
+   * `docker compose up -d` invocations gain `--force-recreate` so containers
+   * that were already running on an unchanged image pick up the new value
+   * — without this, the spawner could keep its pre-rotation null token in
+   * memory while Convex picks up the new one, breaking the HMAC handshake
+   * until the next manual restart.
+   */
+  forceRecreate?: boolean;
 }
 
 export async function deploy(options: DeployOptions): Promise<void> {
@@ -188,7 +205,15 @@ export async function deploy(options: DeployOptions): Promise<void> {
         rotatableToUpdate = services.filter(isRotatableService);
         statefulToUpdate = services.filter(isStatefulService);
       } else {
-        // Default: all rotatable services
+        // Default: all rotatable services PLUS lockstep services.
+        //
+        // Lockstep services (sandbox, sandbox-egress) version in step with
+        // the platform image — shipping an old sandbox against new
+        // platform code would break the SSE wire contract. Including
+        // them on every default deploy matches the build matrix's
+        // single-version policy and avoids the "platform upgraded but
+        // sandbox stayed on yesterday's image" failure mode that drove
+        // the sandbox-wobbly-origami plan §5 rollout decision.
         rotatableToUpdate = [...ROTATABLE_SERVICES];
 
         if (isFirstDeploy || updateStateful) {
@@ -199,9 +224,12 @@ export async function deploy(options: DeployOptions): Promise<void> {
             );
           }
         } else {
-          // Check if any required stateful services are not running
+          // Check if any required stateful services are not running, and
+          // ALWAYS include lockstep services so they roll forward with
+          // the platform image.
           const missingStateful: StatefulService[] = [];
           for (const service of STATEFUL_SERVICES) {
+            if (isLockstepService(service)) continue; // handled below
             const containerName = `${getProjectId()}-${service}`;
             const running = await isContainerRunning(containerName);
             if (!running) {
@@ -209,14 +237,19 @@ export async function deploy(options: DeployOptions): Promise<void> {
             }
           }
 
+          const lockstepToUpdate: StatefulService[] = [...LOCKSTEP_SERVICES];
+
           if (missingStateful.length > 0) {
             logger.notice(
               `Infrastructure services not running: ${missingStateful.join(', ')} - including automatically`,
             );
-            statefulToUpdate = missingStateful;
-          } else {
-            statefulToUpdate = [];
           }
+          if (lockstepToUpdate.length > 0) {
+            logger.info(
+              `Lockstep services: ${lockstepToUpdate.join(', ')} - included on every default deploy`,
+            );
+          }
+          statefulToUpdate = [...missingStateful, ...lockstepToUpdate];
         }
       }
 
@@ -255,10 +288,32 @@ export async function deploy(options: DeployOptions): Promise<void> {
         ),
       ];
 
+      // The spawner's runtime image (consumed by `docker run` of user code,
+      // not a compose service) must also be pulled and re-tagged to match the
+      // spawner's `SANDBOX_RUNTIME_IMAGE` default (`tale-sandbox-runtime:latest`).
+      // Without this, a fresh deploy host has no local runtime image and the
+      // first /v1/execute fails with image-not-found. Mirrors build.yml's
+      // re-tag step. Pulled whenever sandbox or sandbox-egress is being
+      // updated, since the runtime image versions in lockstep with the spawner.
+      const needsRuntimeImage =
+        statefulToUpdate.includes('sandbox') ||
+        statefulToUpdate.includes('sandbox-egress');
+      const runtimeImageRemote = needsRuntimeImage
+        ? `${env.GHCR_REGISTRY}/tale-sandbox-runtime:${version}`
+        : null;
+      if (runtimeImageRemote) {
+        imagesToPull.push(runtimeImageRemote);
+      }
+
       if (dryRun) {
         for (const image of imagesToPull) {
           logger.info(`${prefix}Would pull: ${image}`);
         }
+        if (runtimeImageRemote) {
+          logger.info(
+            `${prefix}Would tag: ${runtimeImageRemote} -> tale-sandbox-runtime:latest`,
+          );
+        }
       } else {
         const failedImages: string[] = [];
         for (const image of imagesToPull) {
@@ -274,6 +329,18 @@ export async function deploy(options: DeployOptions): Promise<void> {
               'Please wait a few minutes and try again.',
           );
         }
+        if (runtimeImageRemote) {
+          const tagResult = await exec('docker', [
+            'tag',
+            runtimeImageRemote,
+            'tale-sandbox-runtime:latest',
+          ]);
+          if (!tagResult.success) {
+            throw new Error(
+              `Failed to re-tag sandbox runtime image: ${tagResult.stderr.trim()}`,
+            );
+          }
+        }
       }
 
       // Must run AFTER migrations (which may `docker compose down`, removing
@@ -297,7 +364,12 @@ export async function deploy(options: DeployOptions): Promise<void> {
         } else {
           const result = await dockerCompose(
             statefulCompose,
-            ['up', '-d', ...statefulToUpdate],
+            [
+              'up',
+              '-d',
+              ...(options.forceRecreate ? ['--force-recreate'] : []),
+              ...statefulToUpdate,
+            ],
             { projectName: getProjectId(), cwd: env.DEPLOY_DIR },
           );
 
@@ -369,7 +441,12 @@ export async function deploy(options: DeployOptions): Promise<void> {
             );
             const deployResult = await dockerCompose(
               colorCompose,
-              ['up', '-d', ...coloredServices],
+              [
+                'up',
+                '-d',
+                ...(options.forceRecreate ? ['--force-recreate'] : []),
+                ...coloredServices,
+              ],
               {
                 projectName: `${getProjectId()}-${currentColor}`,
                 cwd: env.DEPLOY_DIR,
@@ -463,7 +540,12 @@ export async function deploy(options: DeployOptions): Promise<void> {
             );
             const deployResult = await dockerCompose(
               colorCompose,
-              ['up', '-d', ...coloredServices],
+              [
+                'up',
+                '-d',
+                ...(options.forceRecreate ? ['--force-recreate'] : []),
+                ...coloredServices,
+              ],
               {
                 projectName: `${getProjectId()}-${nextColor}`,
                 cwd: env.DEPLOY_DIR,
diff --git a/tools/cli/src/lib/actions/start.ts b/tools/cli/src/lib/actions/start.ts
index d9d10d6db..01be0f381 100644
--- a/tools/cli/src/lib/actions/start.ts
+++ b/tools/cli/src/lib/actions/start.ts
@@ -10,7 +10,7 @@ import { findComposeOverride } from '../compose/find-compose-override';
 import { DEV_VOLUME_NAMES } from '../compose/generators/constants';
 import { generateDevCompose } from '../compose/generators/generate-dev-compose';
 import { dockerCompose } from '../docker/docker-compose';
-import { ensureNetwork } from '../docker/ensure-network';
+import { ensureNetwork, ensureSandboxNetwork } from '../docker/ensure-network';
 import { ensureVolumes } from '../docker/ensure-volumes';
 import { exec } from '../docker/exec';
 import { findProject } from '../project/find-project';
@@ -227,6 +227,14 @@ export async function start(options: StartOptions): Promise<void> {
     if (!networkOk) {
       throw new Error('Failed to create dev network');
     }
+    // Sandbox bridge has a fixed Docker name (tale-sandbox-net) and lives
+    // outside the project-prefixed naming scheme so the spawner can target
+    // it directly from `docker run --network`. Internal-only (no internet)
+    // and IPv6-disabled (R1.3 v4-allowlist-bypass mitigation).
+    const sandboxNetworkOk = await ensureSandboxNetwork();
+    if (!sandboxNetworkOk) {
+      throw new Error('Failed to create sandbox network');
+    }
   });
 
   const env = loadEnv(projectDir);
diff --git a/tools/cli/src/lib/compose/generators/generate-dev-compose.ts b/tools/cli/src/lib/compose/generators/generate-dev-compose.ts
index 96cd3bcdd..8a5b4231a 100644
--- a/tools/cli/src/lib/compose/generators/generate-dev-compose.ts
+++ b/tools/cli/src/lib/compose/generators/generate-dev-compose.ts
@@ -11,6 +11,8 @@ import { createDbService } from '../services/create-db-service';
 import { createPlatformService } from '../services/create-platform-service';
 import { createProxyService } from '../services/create-proxy-service';
 import { createRagService } from '../services/create-rag-service';
+import { createSandboxEgressService } from '../services/create-sandbox-egress-service';
+import { createSandboxService } from '../services/create-sandbox-service';
 import type { ComposeConfig, ServiceConfig } from '../types';
 import { DEV_VOLUME_NAMES } from './constants';
 
@@ -134,6 +136,13 @@ export function generateDevCompose(
   const proxy = createProxyService(config, hostAlias);
   proxy.ports = [`${port}:443`];
 
+  // Dev-only: publish the sandbox spawner on host loopback so `bun dev`
+  // running Convex on the host can reach it at http://127.0.0.1:8003. The
+  // stateful compose generator never publishes this port — production Convex
+  // is in-container and uses the `internal` Docker network alias.
+  const sandbox = createSandboxService(config);
+  sandbox.ports = ['127.0.0.1:8003:8003'];
+
   // Scope dev volumes/networks explicitly via `external: true` + `name:`.
   // Dev volumes live under the `${projectId}-dev_` prefix (matching the
   // `-p ${projectId}-dev` passed to docker compose). They are pre-created by
@@ -153,6 +162,8 @@ export function generateDevCompose(
       platform,
       rag,
       crawler,
+      'sandbox-egress': createSandboxEgressService(config),
+      sandbox,
     },
     volumes,
     networks: {
@@ -160,6 +171,10 @@ export function generateDevCompose(
         external: true,
         name: `${devPrefix}internal`,
       },
+      // Sandbox bridge — internal-only, IPv6 disabled (created in
+      // start.ts via ensureSandboxNetwork; here referenced as external so
+      // the generator emits the right ref).
+      sandbox: { external: true, name: 'tale-sandbox-net' },
     },
   };
 
diff --git a/tools/cli/src/lib/compose/generators/generate-stateful-compose.ts b/tools/cli/src/lib/compose/generators/generate-stateful-compose.ts
index ce25bfef2..3c62e8ab5 100644
--- a/tools/cli/src/lib/compose/generators/generate-stateful-compose.ts
+++ b/tools/cli/src/lib/compose/generators/generate-stateful-compose.ts
@@ -4,6 +4,8 @@ import { getProjectId } from '../../../utils/load-env';
 import { createConvexService } from '../services/create-convex-service';
 import { createDbService } from '../services/create-db-service';
 import { createProxyService } from '../services/create-proxy-service';
+import { createSandboxEgressService } from '../services/create-sandbox-egress-service';
+import { createSandboxService } from '../services/create-sandbox-service';
 import type { ComposeConfig, ServiceConfig } from '../types';
 
 interface StatefulComposeOptions {
@@ -26,6 +28,8 @@ export function generateStatefulCompose(
       db: createDbService(config),
       proxy: createProxyService(config, hostAlias),
       convex,
+      'sandbox-egress': createSandboxEgressService(config),
+      sandbox: createSandboxService(config),
     },
     volumes: {
       'db-data': { external: true, name: `${prefix}db-data` },
@@ -36,6 +40,11 @@ export function generateStatefulCompose(
     },
     networks: {
       internal: { external: true, name: `${prefix}internal` },
+      // Sandbox bridge is pre-created by deploy.ts via ensureSandboxNetwork
+      // (internal-only, IPv6 disabled) and referenced here as external. The
+      // Docker name is pinned to tale-sandbox-net so the spawner can `docker
+      // run --network tale-sandbox-net` without a compose-prefixed name.
+      sandbox: { external: true, name: 'tale-sandbox-net' },
     },
   };
 
diff --git a/tools/cli/src/lib/compose/services/create-convex-service.ts b/tools/cli/src/lib/compose/services/create-convex-service.ts
index 6023dede8..03e3cd325 100644
--- a/tools/cli/src/lib/compose/services/create-convex-service.ts
+++ b/tools/cli/src/lib/compose/services/create-convex-service.ts
@@ -15,6 +15,25 @@ export function createConvexService(config: ServiceConfig): ComposeService {
   return {
     image: `${config.registry}/tale-convex:${config.version}`,
     container_name: `${getProjectId()}-convex`,
+    // NET_ADMIN: required for the entrypoint's SSRF egress firewall
+    // (iptables REJECT rules for IMDS + link-local + RFC1918). Without
+    // this cap, services/convex/docker-entrypoint.sh:79 logs a warning
+    // and skips the firewall — yt-dlp's own DNS resolution then becomes
+    // a DNS-rebinding SSRF vector against the host's cloud metadata
+    // service. The compose.yml had this all along; the CLI generator
+    // was silently dropping it (R1.17). Bonus fix surfaced by the
+    // sandbox review.
+    cap_add: ['NET_ADMIN'],
+    // Per-container resource caps. yt-dlp + ffmpeg subprocesses peak
+    // ~300-500 MB each; APPLICATION_MAX_CONCURRENT_NODE_ACTIONS=32 means
+    // the worst case is 32 parallel ingest jobs. mem_limit caps blast
+    // radius; pids_limit defends against fork-bomb regressions; nofile
+    // gives breathing room for concurrent yt-dlp + ffmpeg + Convex.
+    mem_limit: '12g',
+    pids_limit: 4096,
+    ulimits: {
+      nofile: { soft: 65536, hard: 65536 },
+    },
     volumes: ['convex-data:/app/data', 'caddy-data:/caddy-data:ro'],
     env_file: ['.env'],
     restart: 'unless-stopped',
diff --git a/tools/cli/src/lib/compose/services/create-sandbox-egress-service.ts b/tools/cli/src/lib/compose/services/create-sandbox-egress-service.ts
new file mode 100644
index 000000000..1d376e2c9
--- /dev/null
+++ b/tools/cli/src/lib/compose/services/create-sandbox-egress-service.ts
@@ -0,0 +1,64 @@
+import { getProjectId } from '../../../utils/load-env';
+import type { ComposeService, ServiceConfig } from '../types';
+import { DEFAULT_LOGGING } from '../types';
+
+/**
+ * Sandbox egress proxy — tinyproxy on `sandbox` (faces the runtime
+ * containers) + `internal` (the only Docker network in this stack with
+ * outbound NAT to pypi/npmjs/etc; `tale-sandbox-net` is created with
+ * `--internal` so runtime containers cannot bypass the proxy).
+ *
+ * Filters CONNECT host requests against a configurable allow-list
+ * (default: pypi.org, files.pythonhosted.org, registry.npmjs.org,
+ * github package endpoints). Replaces the originally-planned iptables IP
+ * allow-list which R1.3/R2.1 showed was unsafe due to shared Fastly /
+ * Cloudflare CDN IPs.
+ *
+ * NET_ADMIN is granted so the container's entrypoint installs iptables
+ * REJECT rules for IMDS (169.254.169.254) and RFC1918 ranges; this is
+ * defense-in-depth against a DNS-rebind attack flipping an allowlisted
+ * hostname to a private IP between tinyproxy's lookup and the kernel
+ * connect(). Mirrors services/convex/docker-entrypoint.sh.
+ *
+ * Egress IS reachable from `internal` peers (rag, crawler, platform,
+ * web) — but only as a hostname-filtered proxy that can already reach
+ * the same registries those peers can reach directly via their own NAT.
+ * The proxy is not a meaningful new attack surface for those peers; the
+ * isolation it provides is for the `--internal` sandbox network, where
+ * it is the only outbound path.
+ */
+export function createSandboxEgressService(
+  config: ServiceConfig,
+): ComposeService {
+  return {
+    image: `${config.registry}/tale-sandbox-egress:${config.version}`,
+    container_name: `${getProjectId()}-sandbox-egress`,
+    env_file: ['.env'],
+    restart: 'unless-stopped',
+    cap_add: ['NET_ADMIN'],
+    // tinyproxy + tail = trivial footprint; the cap is here to bound a
+    // misbehaving allowlist-regex DoS that pegs CPU or floods the log.
+    mem_limit: '512m',
+    pids_limit: 512,
+    ulimits: {
+      nofile: { soft: 4096, hard: 8192 },
+    },
+    healthcheck: {
+      // Local readiness probe: a TCP `nc -z 127.0.0.1 3128` confirms
+      // tinyproxy is bound and accepting connections. We deliberately do
+      // NOT probe an external host (pypi) on every interval: at 30s that
+      // is 2,880 pypi.org/simple/ hits per day per host, which is wasteful
+      // and makes the proxy's healthiness depend on a third party's uptime
+      // (a pypi blip would flap the container and trigger restarts).
+      // Allow-list regressions are caught by the smoke test, not by the
+      // health probe.
+      test: ['CMD-SHELL', 'nc -z 127.0.0.1 3128 || exit 1'],
+      interval: '30s',
+      timeout: '3s',
+      retries: 3,
+      start_period: '10s',
+    },
+    logging: DEFAULT_LOGGING,
+    networks: ['sandbox', 'internal'],
+  };
+}
diff --git a/tools/cli/src/lib/compose/services/create-sandbox-service.ts b/tools/cli/src/lib/compose/services/create-sandbox-service.ts
new file mode 100644
index 000000000..235f2ee34
--- /dev/null
+++ b/tools/cli/src/lib/compose/services/create-sandbox-service.ts
@@ -0,0 +1,76 @@
+import { getProjectId } from '../../../utils/load-env';
+import type { ComposeService, ServiceConfig } from '../types';
+import { DEFAULT_LOGGING } from '../types';
+
+/**
+ * Sandbox spawner — thin stateless docker-run service.
+ *
+ * SECURITY: mounts /var/run/docker.sock so it can spawn sibling containers.
+ * docker.sock = host root; this is the explicit security boundary the
+ * sandbox plan accepts. The spawner accepts only HMAC-signed typed JSON
+ * over HTTP (no host port published here; attached to the `internal`
+ * and `sandbox` networks). The docker argv builder validates every
+ * identifier with strict regexes so a malformed input never reaches
+ * `docker run` (see services/sandbox/src/docker-args.ts).
+ *
+ * Joined to BOTH networks:
+ *   - `internal` — so the platform container can reach it on
+ *     http://sandbox:8003.
+ *   - `sandbox` — so the per-call runtime containers it spawns can be
+ *     attached to the internal-only egress bridge.
+ *
+ * Operators wanting stronger isolation set SANDBOX_RUNTIME=runsc and
+ * install gVisor on the host; the spawner picks the runtime via env.
+ */
+export function createSandboxService(config: ServiceConfig): ComposeService {
+  return {
+    image: `${config.registry}/tale-sandbox:${config.version}`,
+    container_name: `${getProjectId()}-sandbox`,
+    // NOTE: no published `ports` here. Convex (in-container, stateful
+    // compose) reaches the spawner via the `internal` Docker network at
+    // http://sandbox:8003 — publishing a host-side port is unnecessary
+    // attack surface in production (the spawner mounts /var/run/docker.sock,
+    // so any reachable peer is effectively host-root). The dev compose
+    // generator overlays `127.0.0.1:8003:8003` so that `bun dev` with Convex
+    // running on the host can reach the spawner.
+    // Per-container resource caps. The spawner is a thin Bun HTTP server
+    // that issues `docker` subprocess calls; 512 MB is generous for the
+    // server itself but excludes the runtime containers it spawns (those
+    // get their own caps via `--memory=1g` in docker-args.ts). pids_limit
+    // bounds the docker-CLI fanout under a fork-bomb regression; the
+    // nofile bump leaves room for many in-flight SSE streams.
+    mem_limit: '512m',
+    pids_limit: 512,
+    ulimits: {
+      nofile: { soft: 4096, hard: 8192 },
+    },
+    env_file: ['.env'],
+    environment: {
+      SANDBOX_RUNTIME: '${SANDBOX_RUNTIME:-runc}',
+      SANDBOX_RUNTIME_IMAGE:
+        '${SANDBOX_RUNTIME_IMAGE:-tale-sandbox-runtime:latest}',
+      SANDBOX_EGRESS_NETWORK: 'tale-sandbox-net',
+      SANDBOX_EGRESS_PROXY: 'http://sandbox-egress:3128',
+    },
+    volumes: [
+      '/var/run/docker.sock:/var/run/docker.sock',
+      // 1:1 bind so per-call workspace dirs created by the spawner are
+      // visible to the docker daemon at the same host path when it mounts
+      // them into the runtime container.
+      '/var/lib/tale-sandbox:/var/lib/tale-sandbox',
+    ],
+    restart: 'unless-stopped',
+    healthcheck: {
+      test: ['CMD', 'curl', '-fsS', 'http://127.0.0.1:8003/health'],
+      interval: '10s',
+      timeout: '5s',
+      retries: 3,
+      start_period: '15s',
+    },
+    depends_on: {
+      'sandbox-egress': { condition: 'service_healthy' },
+    },
+    logging: DEFAULT_LOGGING,
+    networks: ['internal', 'sandbox'],
+  };
+}
diff --git a/tools/cli/src/lib/compose/types.ts b/tools/cli/src/lib/compose/types.ts
index 1bc8642cc..d534beeb2 100644
--- a/tools/cli/src/lib/compose/types.ts
+++ b/tools/cli/src/lib/compose/types.ts
@@ -32,6 +32,15 @@ export interface ComposeService {
   logging?: LoggingConfig;
   networks?: string[] | Record<string, { aliases?: string[] }>;
   extra_hosts?: string[];
+  // Linux capability + resource flags. Previously absent from the generator,
+  // which silently dropped them on the convex service (R1.17 latent bug)
+  // and made sandbox impossible. All optional; emit only when set.
+  cap_add?: string[];
+  mem_limit?: string;
+  pids_limit?: number;
+  ulimits?: Record<string, number | { soft: number; hard: number }>;
+  security_opt?: string[];
+  runtime?: string;
 }
 
 export interface ComposeConfig {
@@ -54,7 +63,27 @@ export interface ServiceConfig {
 }
 
 export const ROTATABLE_SERVICES = ['platform', 'rag', 'crawler'] as const;
-export const STATEFUL_SERVICES = ['db', 'proxy', 'convex'] as const;
+/**
+ * Lockstep services — always re-deployed on default `tale deploy`, even
+ * though they're stateful (no blue/green rotation). Sandbox-side wire
+ * protocol versions in lockstep with platform: shipping an old sandbox
+ * image against new platform code would fail with HARVEST_FAILED on the
+ * first run. Distinct from STATEFUL_SERVICES so the policy is explicit;
+ * see deploy.ts default-services logic.
+ *
+ * Plan: sandbox-wobbly-origami §5 "Rollout".
+ */
+export const LOCKSTEP_SERVICES = ['sandbox', 'sandbox-egress'] as const;
+export const STATEFUL_SERVICES = [
+  'db',
+  'proxy',
+  'convex',
+  // Lockstep entries are part of STATEFUL_SERVICES for legacy
+  // back-compat (existing isStatefulService callers depend on this).
+  // The deploy.ts default path treats LOCKSTEP_SERVICES specially —
+  // see the LOCKSTEP_SERVICES doc comment above.
+  ...LOCKSTEP_SERVICES,
+] as const;
 export const ALL_SERVICES = [
   ...ROTATABLE_SERVICES,
   ...STATEFUL_SERVICES,
@@ -62,6 +91,7 @@ export const ALL_SERVICES = [
 
 export type RotatableService = (typeof ROTATABLE_SERVICES)[number];
 export type StatefulService = (typeof STATEFUL_SERVICES)[number];
+type LockstepService = (typeof LOCKSTEP_SERVICES)[number];
 export type ServiceName = RotatableService | StatefulService;
 
 export function isValidService(name: string): name is ServiceName {
@@ -75,3 +105,7 @@ export function isRotatableService(name: string): name is RotatableService {
 export function isStatefulService(name: string): name is StatefulService {
   return (STATEFUL_SERVICES as readonly string[]).includes(name);
 }
+
+export function isLockstepService(name: string): name is LockstepService {
+  return LOCKSTEP_SERVICES.some((lockstep) => lockstep === name);
+}
diff --git a/tools/cli/src/lib/config/ensure-env.ts b/tools/cli/src/lib/config/ensure-env.ts
index 996a34f1b..a2adf9049 100644
--- a/tools/cli/src/lib/config/ensure-env.ts
+++ b/tools/cli/src/lib/config/ensure-env.ts
@@ -81,6 +81,15 @@ interface EnvSetupResult {
   success: boolean;
   agePublicKey?: string;
   openrouterKey?: string;
+  /**
+   * Set when `ensureEnv` filled in missing auto-gen secrets (most relevant:
+   * `SANDBOX_TOKEN`) — so the deploy action can force-recreate the
+   * containers that depend on those secrets. Without forced recreate, a
+   * container that's already running on an unchanged image keeps its
+   * pre-rotation env in memory while peers pick up the new one, breaking
+   * the HMAC handshake until the next manual restart.
+   */
+  regeneratedAutoSecrets?: readonly string[];
 }
 
 export async function ensureEnv(
@@ -93,34 +102,53 @@ export async function ensureEnv(
     const content = await readFile(envPath, 'utf-8');
     const existing = parseEnvFile(content);
 
-    const requiredVars = [
-      'HOST',
-      'SITE_URL',
-      'TLS_MODE',
+    // Split required vars by who can satisfy them:
+    //   - User-supplied: needs human input (HOST, TLS choice). Non-TTY
+    //     upgrade can't fill these in; refuse and prompt for interactive.
+    //   - Auto-generatable: secret of a known shape (HMAC keys, DB password,
+    //     age key). Non-TTY upgrade silently fills these so headless
+    //     CI/CD deploys keep working when the schema gains a new secret
+    //     (history: `SANDBOX_TOKEN` was added to required mid-stream and
+    //     started failing every existing headless deploy).
+    const requiredUserVars = ['HOST', 'SITE_URL', 'TLS_MODE'];
+    const requiredAutoVars = [
       'BETTER_AUTH_SECRET',
       'ENCRYPTION_SECRET_HEX',
       'INSTANCE_SECRET',
       'DB_PASSWORD',
       'SOPS_AGE_KEY',
+      // Shared HMAC secret for Convex → sandbox spawner. Generated as
+      // 32 random bytes (hex); see services/sandbox/src/auth.ts.
+      'SANDBOX_TOKEN',
     ];
-    const missing = requiredVars.filter((v) => !existing[v]);
+    const missingUser = requiredUserVars.filter((v) => !existing[v]);
+    const missingAuto = requiredAutoVars.filter((v) => !existing[v]);
 
-    if (missing.length === 0) {
+    if (missingUser.length === 0 && missingAuto.length === 0) {
       // All required vars present — derive public key for caller
       const agePublicKey = deriveAgePublicKey(existing.SOPS_AGE_KEY);
       return { success: true, agePublicKey };
     }
 
     if (!isTTY) {
-      logger.warn(
-        `Existing .env is missing required variables: ${missing.join(', ')}`,
-      );
-      logger.info('Run the CLI interactively to complete environment setup.');
-      return { success: false };
+      // Headless: refuse only when user-supplied vars are missing (we
+      // can't synthesize a domain or TLS choice). Otherwise auto-generate
+      // the missing secrets and continue so CI/CD upgrades stay green.
+      if (missingUser.length > 0) {
+        logger.warn(
+          `Existing .env is missing required user-supplied variables: ${missingUser.join(', ')}`,
+        );
+        logger.info('Run the CLI interactively to complete environment setup.');
+        return { success: false };
+      }
+      return await runHeadlessAutoSecretFill(envPath, existing, missingAuto);
     }
 
     // Fill in only the missing variables
-    return await runPartialEnvSetup(envPath, existing, missing);
+    return await runPartialEnvSetup(envPath, existing, [
+      ...missingUser,
+      ...missingAuto,
+    ]);
   }
 
   if (!isTTY) {
@@ -136,6 +164,76 @@ export async function ensureEnv(
   return await runEnvSetup(envPath);
 }
 
+/**
+ * Headless (non-TTY) auto-gen path for known-shape secrets. Used when a
+ * deploy adds a new required secret (e.g. `SANDBOX_TOKEN`) and existing
+ * CI/CD deploys would otherwise fail because the secret isn't in their
+ * `.env`. Only invoked when every missing var is in the auto-gen set; a
+ * missing user-supplied var (HOST, SITE_URL, TLS_MODE) still refuses.
+ *
+ * Appends the generated `KEY=value` lines to `envPath` and returns the
+ * filled keys as `regeneratedAutoSecrets`, so the deploy action can
+ * force-recreate containers that read them at boot (otherwise an
+ * already-running container keeps its stale value and HMAC breaks).
+ */
+async function runHeadlessAutoSecretFill(
+  envPath: string,
+  existing: Record<string, string>,
+  missingAuto: string[],
+): Promise<EnvSetupResult> {
+  const secretDefaults: Record<string, () => string> = {
+    BETTER_AUTH_SECRET: generateBase64Secret,
+    ENCRYPTION_SECRET_HEX: generateHexSecret,
+    INSTANCE_SECRET: generateHexSecret,
+    DB_PASSWORD: generatePassword,
+    SANDBOX_TOKEN: generateHexSecret,
+  };
+
+  const updates: Record<string, string> = {};
+  let sopsAgeKey = existing.SOPS_AGE_KEY;
+
+  for (const key of missingAuto) {
+    if (key === 'SOPS_AGE_KEY') {
+      const keypair = generateAgeKeypair();
+      updates.SOPS_AGE_KEY = keypair.secretKey;
+      sopsAgeKey = keypair.secretKey;
+      continue;
+    }
+    const generator = secretDefaults[key];
+    if (generator === undefined) {
+      // Defensive: a key reached this function without an entry in
+      // secretDefaults. Bail out before anything is written to disk.
+      logger.error(
+        `[ensureEnv] Missing auto-secret generator for "${key}". Add one in runHeadlessAutoSecretFill.`,
+      );
+      return { success: false };
+    }
+    updates[key] = generator();
+  }
+
+  // Append to the current file text so existing lines + comments survive.
+  const existingContent = await readFile(envPath, 'utf-8');
+  const appendLines = Object.entries(updates).map(([k, v]) => `${k}=${v}`);
+  if (appendLines.length > 0) {
+    const separator = existingContent.endsWith('\n') ? '' : '\n';
+    await writeFile(
+      envPath,
+      existingContent + separator + appendLines.join('\n') + '\n',
+      'utf-8',
+    );
+    logger.info(
+      `[ensureEnv] Generated ${missingAuto.length} missing secret(s) headlessly: ${missingAuto.join(', ')}.`,
+    );
+  }
+
+  const agePublicKey = deriveAgePublicKey(sopsAgeKey);
+  return {
+    success: true,
+    agePublicKey,
+    regeneratedAutoSecrets: missingAuto,
+  };
+}
+
 /**
  * Fill in missing variables in an existing .env file.
  */
@@ -212,6 +310,7 @@ async function runPartialEnvSetup(
     ENCRYPTION_SECRET_HEX: generateHexSecret,
     INSTANCE_SECRET: generateHexSecret,
     DB_PASSWORD: generatePassword,
+    SANDBOX_TOKEN: generateHexSecret,
   };
 
   let generatedCount = 0;
@@ -408,6 +507,7 @@ async function runEnvSetup(envPath: string): Promise<EnvSetupResult> {
     instanceSecret: generateHexSecret(),
     dbPassword,
     sopsAgeKey: ageKeypair.secretKey,
+    sandboxToken: generateHexSecret(),
   };
 
   const envContent = generateEnvContent({
@@ -441,6 +541,7 @@ interface EnvConfig {
   instanceSecret: string;
   dbPassword: string;
   sopsAgeKey: string;
+  sandboxToken: string;
 }
 
 function generateEnvContent(config: EnvConfig): string {
@@ -508,6 +609,24 @@ function generateEnvContent(config: EnvConfig): string {
     `SOPS_AGE_KEY=${config.sopsAgeKey}`,
     '# SOPS_AGE_KEY_FILE=',
     '',
+    '# ============================================================================',
+    '# Sandbox (artifact_run) Configuration',
+    '# ============================================================================',
+    '# Shared HMAC secret. Convex signs every request to the sandbox spawner',
+    '# with this; the spawner rejects unsigned/wrong-signed requests. Rotate',
+    '# by setting a new value and restarting both `platform` and `sandbox`.',
+    `SANDBOX_TOKEN=${config.sandboxToken}`,
+    '# Container runtime for spawned sandbox containers. `runc` (default) is',
+    '# plain Docker; `runsc` is gVisor (requires `runsc` installed on the',
+    '# host and registered with dockerd — see `tale doctor`). gVisor provides',
+    '# a userspace kernel that mitigates runc-class escape CVEs at the cost',
+    '# of ~6x pip-install latency for native-extension packages.',
+    '# SANDBOX_RUNTIME=runc',
+    '# Pipe-separated regex allow-list of egress hostnames for the sandbox',
+    '# proxy. Default covers pypi/npm/github package endpoints; extend if',
+    '# your agents need other registries (e.g. private wheel mirrors).',
+    '# SANDBOX_EGRESS_ALLOWLIST=^pypi\\.org$|^files\\.pythonhosted\\.org$|^registry\\.npmjs\\.org$',
+    '',
   );
 
   return lines.join('\n');
diff --git a/tools/cli/src/lib/docker/ensure-network.ts b/tools/cli/src/lib/docker/ensure-network.ts
index cc8e9d953..eb662e8a3 100644
--- a/tools/cli/src/lib/docker/ensure-network.ts
+++ b/tools/cli/src/lib/docker/ensure-network.ts
@@ -7,7 +7,10 @@ async function networkExists(networkName: string): Promise<boolean> {
   return result.success;
 }
 
-async function createNetwork(networkName: string): Promise<boolean> {
+async function createNetwork(
+  networkName: string,
+  extraArgs: string[] = [],
+): Promise<boolean> {
   const exists = await networkExists(networkName);
   if (exists) {
     logger.debug(`Network ${networkName} already exists`);
@@ -20,6 +23,7 @@ async function createNetwork(networkName: string): Promise<boolean> {
     'create',
     '--label',
     `project=${getProjectId()}`,
+    ...extraArgs,
     networkName,
   );
   if (!result.success) {
@@ -37,3 +41,56 @@ export async function ensureNetwork(
   const fullName = `${prefix}${networkName}`;
   return createNetwork(fullName);
 }
+
+/**
+ * Ensure the fixed-name sandbox network `tale-sandbox-net` is present.
+ *
+ * The name carries no compose-project prefix, so blue/green and dev/prod
+ * share it and the spawner can `docker run --network tale-sandbox-net`
+ * directly. New networks are created `--internal`, so runtime containers
+ * can only reach pypi/npm via the egress proxy sidecar.
+ *
+ * An existing network is inspected: a non-internal one is rejected
+ * (`false`), an IPv6-enabled one only logs a warning.
+ */
+export async function ensureSandboxNetwork(): Promise<boolean> {
+  const name = 'tale-sandbox-net';
+  if (!(await networkExists(name))) {
+    const flags = ['--internal', '--ipv6=false', '--driver=bridge'];
+    return createNetwork(name, flags);
+  }
+
+  const inspect = await docker(
+    'network',
+    'inspect',
+    '--format',
+    '{{.Internal}}|{{.EnableIPv6}}',
+    name,
+  );
+  if (!inspect.success) {
+    // NOTE(review): fails open — the --internal check is skipped here.
+    logger.warn(
+      `Could not inspect existing sandbox network ${name}: ${inspect.stderr.trim()}`,
+    );
+    return true;
+  }
+
+  const [internalStr, ipv6Str] = inspect.stdout.trim().split('|');
+  if (internalStr !== 'true') {
+    logger.error(
+      `Sandbox network ${name} exists but is NOT internal (Internal=${internalStr}). ` +
+        `Runtime containers would have direct internet access, defeating egress filtering. ` +
+        `Remove the existing network ("docker network rm ${name}") and retry, or recreate with --internal.`,
+    );
+    return false;
+  }
+  if (ipv6Str === 'true') {
+    // The entrypoint's iptables rules are v4-only, so an IPv6-enabled
+    // network would give traffic a path around them.
+    logger.warn(
+      `Sandbox network ${name} has IPv6 enabled (EnableIPv6=true). ` +
+        `Recommended: recreate with --ipv6=false so iptables (v4-only) covers the full egress surface.`,
+    );
+  }
+  return true;
+}