From 1c82c0ed464ecb572812112872db56d5e1e64498 Mon Sep 17 00:00:00 2001 From: larryro <371767072@qq.com> Date: Tue, 19 May 2026 18:28:13 +0800 Subject: [PATCH 001/108] =?UTF-8?q?feat(sandbox):=20add=20code-runner=20fo?= =?UTF-8?q?undation=20(M1=20=E2=80=94=20runtime=20image,=20egress=20proxy,?= =?UTF-8?q?=20spawner)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Container-side foundation for the `code_run` agent tool: an ephemeral Python/Node sandbox the LLM can invoke to run code with arbitrary packages and surface generated files (e.g. .pptx via python-pptx) as chat attachments. Components: - services/sandbox-runtime: lean Python 3.12 + Node 24 + uv image. Entrypoint installs requested packages on demand (`--only-binary=:all:` for pip and `--ignore-scripts` for npm by default — closes setup.py / postinstall ACE vectors per R2.7), emits PHASE markers for the chat UI, then execs user code at /workspace/code/main.{py,js}. - services/sandbox-egress: tinyproxy sidecar on tale-sandbox-net (an internal-only Docker bridge). Filters CONNECT host requests against an allow-list (pypi.org, files.pythonhosted.org, registry.npmjs.org, github package endpoints). Replaces the originally-planned iptables IP allow-list which R1.3/R2.1 showed was unsafe due to shared Fastly/Cloudflare CDN IPs. - services/sandbox: ~250 LOC Bun HTTP service. POST /v1/execute with HMAC-signed body spawns one ephemeral container; POST /v1/cancel/:id propagates AbortSignal as docker kill. Workspace is a per-call tmpfs Docker volume (size=256m, hard ENOSPC cap per R2.2); pip/npm caches are per-org named volumes (closes the R2.3 cross-tenant wheel-cache poison vector). docker_args.ts is a pure builder with strict regex validation; the #1 regression gate per R1.22 has 9 passing unit tests asserting the argv shape and that user code never reaches argv. - compose.yml: registers both services and the internal `sandbox` network pinned to `tale-sandbox-net`. 
IPv6 disabled on the bridge to prevent v4-allowlist bypass via v6 routes (R1.3). - .commitlintrc.json: add `sandbox` scope. Convex schema, executeCode action, code_run tool, CLI compose generator work, and tests follow in M2 and M3. Plan: /home/larry/.claude/plans/presentation-generation-from-prompts-delightful-aho.md --- .commitlintrc.json | 1 + compose.yml | 99 +++++ services/sandbox-egress/Dockerfile | 25 ++ services/sandbox-egress/entrypoint.sh | 34 ++ .../sandbox-egress/tinyproxy.conf.template | 31 ++ services/sandbox-runtime/Dockerfile | 48 +++ services/sandbox-runtime/entrypoint.sh | 97 +++++ services/sandbox/Dockerfile | 28 ++ services/sandbox/Dockerfile.dockerignore | 7 + services/sandbox/bun.lock | 24 ++ services/sandbox/package.json | 18 + services/sandbox/seccomp.json | 3 + services/sandbox/src/auth.ts | 31 ++ services/sandbox/src/cleanup.ts | 124 ++++++ services/sandbox/src/config.ts | 62 +++ services/sandbox/src/docker_args.test.ts | Bin 0 -> 4884 bytes services/sandbox/src/docker_args.ts | 114 +++++ services/sandbox/src/server.ts | 153 +++++++ services/sandbox/src/spawn.ts | 295 +++++++++++++ services/sandbox/src/spawn_util.ts | 86 ++++ services/sandbox/src/types.ts | 84 ++++ services/sandbox/src/volume.ts | 395 ++++++++++++++++++ services/sandbox/tsconfig.json | 21 + 23 files changed, 1780 insertions(+) create mode 100644 services/sandbox-egress/Dockerfile create mode 100644 services/sandbox-egress/entrypoint.sh create mode 100644 services/sandbox-egress/tinyproxy.conf.template create mode 100644 services/sandbox-runtime/Dockerfile create mode 100644 services/sandbox-runtime/entrypoint.sh create mode 100644 services/sandbox/Dockerfile create mode 100644 services/sandbox/Dockerfile.dockerignore create mode 100644 services/sandbox/bun.lock create mode 100644 services/sandbox/package.json create mode 100644 services/sandbox/seccomp.json create mode 100644 services/sandbox/src/auth.ts create mode 100644 services/sandbox/src/cleanup.ts create mode 100644 
services/sandbox/src/config.ts create mode 100644 services/sandbox/src/docker_args.test.ts create mode 100644 services/sandbox/src/docker_args.ts create mode 100644 services/sandbox/src/server.ts create mode 100644 services/sandbox/src/spawn.ts create mode 100644 services/sandbox/src/spawn_util.ts create mode 100644 services/sandbox/src/types.ts create mode 100644 services/sandbox/src/volume.ts create mode 100644 services/sandbox/tsconfig.json diff --git a/.commitlintrc.json b/.commitlintrc.json index 011834eef..f15c14437 100644 --- a/.commitlintrc.json +++ b/.commitlintrc.json @@ -17,6 +17,7 @@ "pii", "proxy", "rag", + "sandbox", "storybook", "ui", "web", diff --git a/compose.yml b/compose.yml index 7efb0e19a..436282886 100644 --- a/compose.yml +++ b/compose.yml @@ -535,6 +535,91 @@ services: aliases: - ${HOST:-tale.local} + # ============================================================================ + # Tale Sandbox Egress (tinyproxy) — HTTPS forward proxy + # ---------------------------------------------------------------------------- + # Filters CONNECT host requests against an allow-list of package registries + # (pypi.org, files.pythonhosted.org, registry.npmjs.org, github package + # endpoints). Sandbox runtime containers reach pypi/npm via this proxy; all + # other internet is unreachable because the sandbox bridge is `internal:true`. + # See plan §2. 
+ # ============================================================================ + sandbox-egress: + image: ghcr.io/tale-project/tale/tale-sandbox-egress:${VERSION:-latest} + pull_policy: ${PULL_POLICY:-build} + build: + context: services/sandbox-egress + dockerfile: Dockerfile + container_name: tale-sandbox-egress + env_file: + - .env + restart: unless-stopped + healthcheck: + test: ['CMD', 'nc', '-z', '127.0.0.1', '3128'] + interval: 10s + timeout: 3s + retries: 2 + start_period: 5s + logging: + driver: 'json-file' + options: + max-size: '10m' + max-file: '3' + networks: + - sandbox + + # ============================================================================ + # Tale Sandbox Spawner — thin stateless docker-run service for `code_run` + # ---------------------------------------------------------------------------- + # Mounts /var/run/docker.sock to spawn ephemeral sibling containers per call. + # Reachable only on the `internal` bridge by the platform/convex service; + # joined to `sandbox` only to issue `docker run` (the runtime containers + # themselves attach to `sandbox` for egress via tinyproxy). + # + # SECURITY: docker.sock = host root. Explicit threat acceptance per plan + # "Security model". Spawner accepts only HMAC-signed typed JSON over HTTP; + # `services/sandbox/src/docker_args.ts` validates every argv field with + # regexes so a malformed input never reaches `docker run`. Future hardening: + # SANDBOX_RUNTIME=runsc opt-in (gVisor), `opa-docker-authz` daemon plugin + # for HostConfig body filtering, dockerd userns-remap. 
+ # ============================================================================ + sandbox: + image: ghcr.io/tale-project/tale/tale-sandbox:${VERSION:-latest} + pull_policy: ${PULL_POLICY:-build} + build: + context: services/sandbox + dockerfile: Dockerfile + container_name: tale-sandbox + env_file: + - .env + environment: + SANDBOX_RUNTIME: ${SANDBOX_RUNTIME:-runc} + SANDBOX_RUNTIME_IMAGE: ${SANDBOX_RUNTIME_IMAGE:-tale-sandbox-runtime:latest} + SANDBOX_EGRESS_NETWORK: tale-sandbox-net + SANDBOX_EGRESS_PROXY: http://sandbox-egress:3128 + volumes: + # The spawner needs the host docker socket to spawn sibling containers. + # This is the security boundary — see header comment. + - /var/run/docker.sock:/var/run/docker.sock + restart: unless-stopped + healthcheck: + test: ['CMD', 'curl', '-fsS', 'http://127.0.0.1:8003/health'] + interval: 10s + timeout: 5s + retries: 3 + start_period: 15s + depends_on: + sandbox-egress: + condition: service_healthy + logging: + driver: 'json-file' + options: + max-size: '10m' + max-file: '3' + networks: + - internal + - sandbox + # ============================================================================ # Volumes # ============================================================================ @@ -588,3 +673,17 @@ networks: # Internal network for Tale services internal: driver: bridge + + # Sandbox network — internal-only bridge for code_run runtime containers + the + # tinyproxy egress sidecar. `internal: true` blocks all internet from this + # network; the only outbound is through sandbox-egress (host allow-list). + # IPv6 disabled to prevent v4 allow-list bypass via v6 routes (R1.3). + # + # `name:` pins the Docker-level network name so the spawner (which calls + # `docker run --network tale-sandbox-net` on sibling containers) doesn't + # have to discover the compose-project-prefixed default. 
+ sandbox: + name: tale-sandbox-net + driver: bridge + internal: true + enable_ipv6: false diff --git a/services/sandbox-egress/Dockerfile b/services/sandbox-egress/Dockerfile new file mode 100644 index 000000000..ba75467fd --- /dev/null +++ b/services/sandbox-egress/Dockerfile @@ -0,0 +1,25 @@ +# Tale Sandbox Egress Proxy +# +# HTTPS forward proxy filtering by CONNECT host. Sits on `tale-sandbox-net` +# (an internal-only Docker bridge); sandbox runtime containers reach pypi/npm +# via this proxy, all other internet is unreachable. +# +# See plan §2. Verified by R2.1: pip / npm / uv all honor HTTPS_PROXY and +# fail loud when the proxy denies a host or is unreachable. + +FROM alpine:3.20 + +RUN apk add --no-cache tinyproxy gettext ca-certificates && \ + mkdir -p /etc/tinyproxy /var/log/tinyproxy && \ + chown -R nobody:nobody /var/log/tinyproxy + +COPY tinyproxy.conf.template /etc/tinyproxy/tinyproxy.conf.template +COPY entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh + +EXPOSE 3128 + +HEALTHCHECK --interval=10s --timeout=3s --retries=2 \ + CMD nc -z 127.0.0.1 3128 || exit 1 + +ENTRYPOINT ["/entrypoint.sh"] diff --git a/services/sandbox-egress/entrypoint.sh b/services/sandbox-egress/entrypoint.sh new file mode 100644 index 000000000..8f8cc9ab0 --- /dev/null +++ b/services/sandbox-egress/entrypoint.sh @@ -0,0 +1,34 @@ +#!/bin/sh +# services/sandbox-egress/entrypoint.sh +# Render allow-list + config, log them, exec tinyproxy. + +set -e + +DEFAULT_ALLOWLIST='^pypi\.org$ +^files\.pythonhosted\.org$ +^registry\.npmjs\.org$ +^objects\.githubusercontent\.com$ +^codeload\.github\.com$' + +# Operator override: one regex per line, or `|`-separated for compose-friendly +# single-line env values. 
+if [ -n "$SANDBOX_EGRESS_ALLOWLIST" ]; then + echo "$SANDBOX_EGRESS_ALLOWLIST" | tr '|' '\n' > /etc/tinyproxy/allowlist +else + printf '%s\n' "$DEFAULT_ALLOWLIST" > /etc/tinyproxy/allowlist +fi + +envsubst < /etc/tinyproxy/tinyproxy.conf.template > /etc/tinyproxy/tinyproxy.conf + +echo "[sandbox-egress] starting tinyproxy on :3128" +echo "[sandbox-egress] CONNECT allow-list:" +sed 's/^/ /' /etc/tinyproxy/allowlist +echo "[sandbox-egress] config:" +sed 's/^/ /' /etc/tinyproxy/tinyproxy.conf + +# tinyproxy logs to file by default; tail to stdout in background so docker +# logs surfaces them. +touch /var/log/tinyproxy/tinyproxy.log +tail -n0 -F /var/log/tinyproxy/tinyproxy.log & + +exec tinyproxy -d -c /etc/tinyproxy/tinyproxy.conf diff --git a/services/sandbox-egress/tinyproxy.conf.template b/services/sandbox-egress/tinyproxy.conf.template new file mode 100644 index 000000000..1012e4cd8 --- /dev/null +++ b/services/sandbox-egress/tinyproxy.conf.template @@ -0,0 +1,31 @@ +# Tale Sandbox Egress — tinyproxy config +# Rendered at startup by entrypoint.sh (no template vars currently in use, +# but keep envsubst-ready so we can introduce them without re-tooling). + +User nobody +Group nobody + +Port 3128 +Listen 0.0.0.0 +Timeout 600 +DefaultErrorFile "/usr/share/tinyproxy/default.html" +LogLevel Info +LogFile "/var/log/tinyproxy/tinyproxy.log" +PidFile "/tmp/tinyproxy.pid" +MaxClients 100 +ViaProxyName "tale-sandbox-egress" + +# CONNECT method (HTTPS tunneling) — required for pip/npm/uv installs. +# Only the standard TLS port; nothing else. +ConnectPort 443 + +# Host-name allow-list (default-deny). Allowlist contents are rewritten +# by entrypoint.sh from SANDBOX_EGRESS_ALLOWLIST or the default registry set. +FilterDefaultDeny Yes +FilterCaseSensitive No +FilterExtended Yes +FilterURLs Off +Filter "/etc/tinyproxy/allowlist" + +# Via header stays enabled (DisableViaHeader No), named by ViaProxyName above; no Upstream/XTinyproxy directives are set.
+DisableViaHeader No diff --git a/services/sandbox-runtime/Dockerfile b/services/sandbox-runtime/Dockerfile new file mode 100644 index 000000000..7a27abc38 --- /dev/null +++ b/services/sandbox-runtime/Dockerfile @@ -0,0 +1,48 @@ +# Tale Sandbox Runtime +# +# Executed inside an ephemeral container per `code_run` tool call. +# See /home/larry/.claude/plans/presentation-generation-from-prompts-delightful-aho.md §3 +# +# Layers: python:3.12-slim-bookworm + uv + Node 24 + fontconfig (for Pillow). +# Runs as uid 65534 under --read-only with all caps dropped; spawner forces +# these via `docker run` flags but the image baseline matches. +# +# TODO: pin all FROM lines to @sha256 once a Renovate/Dependabot rule is in +# place. Plan calls for digest pinning; we ship tag pins to unblock bootstrap. + +ARG VERSION=dev + +FROM python:3.12-slim-bookworm + +# Runtime additions only — fontconfig + DejaVu so Pillow/matplotlib render +# text correctly, jq so the entrypoint can read packages.json/options.json, +# ca-certificates for HTTPS to pypi/npm via the egress proxy. +RUN apt-get update && apt-get install -y --no-install-recommends \ + fonts-dejavu-core \ + fontconfig \ + ca-certificates \ + jq \ + && rm -rf /var/lib/apt/lists/* \ + && fc-cache -f + +# uv — fast Python package installer/resolver. See https://github.com/astral-sh/uv +COPY --from=ghcr.io/astral-sh/uv:0.5 /uv /usr/local/bin/uv + +# Node 24 LTS. Copy /usr/local from node:24-bookworm-slim into /opt/node. +COPY --from=node:24-bookworm-slim /usr/local /opt/node + +ENV PATH=/opt/node/bin:/usr/local/bin:/usr/bin:/bin +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV NPM_CONFIG_UPDATE_NOTIFIER=false + +COPY entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh + +# Default user is nobody; spawner pins --user 65534:65534 to make this +# explicit at the runtime call site. 
+USER 65534:65534 + +WORKDIR /workspace + +ENTRYPOINT ["/entrypoint.sh"] diff --git a/services/sandbox-runtime/entrypoint.sh b/services/sandbox-runtime/entrypoint.sh new file mode 100644 index 000000000..0af2a459a --- /dev/null +++ b/services/sandbox-runtime/entrypoint.sh @@ -0,0 +1,97 @@ +#!/bin/sh +# services/sandbox-runtime/entrypoint.sh +# +# Per-call entrypoint inside an ephemeral sandbox container. +# +# Args (from spawner's docker run): +# $1 = language ('python' | 'node') +# $2 = path to packages.json (JSON array of pip/npm specs) +# $3 = path to options.json ({ allowSdist?: bool, allowInstallScripts?: bool }) +# +# Env (set by spawner via --env): +# HTTPS_PROXY / HTTP_PROXY -> http://sandbox-egress:3128 +# PIP_CACHE_DIR -> /cache/pip (per-org named volume) +# NPM_CONFIG_CACHE -> /cache/npm +# +# Conventions: +# - User code at /workspace/code/main.{py,js} +# - Output files in /workspace/output/ +# - install-report.json at /workspace/install-report.json (audit) +# - PHASE markers on stdout so the spawner can split install vs run timing. +# +# Exit codes: +# 0 = user code completed successfully +# 64 = install failed (spawner classifies as INSTALL_FAILED / PACKAGE_NOT_FOUND) +# 65 = bad invocation (unknown language / missing args) +# any other non-zero = user code exit code (RUNTIME_ERROR); user code that itself exits 64 or 65 is indistinguishable from the above + +set -e + +LANG_NAME="$1" +PACKAGES_FILE="${2:-/workspace/code/packages.json}" +OPTIONS_FILE="${3:-/workspace/code/options.json}" + +echo "PHASE: installing" + +ALLOW_SDIST="false" +ALLOW_INSTALL_SCRIPTS="false" +if [ -f "$OPTIONS_FILE" ]; then + ALLOW_SDIST=$(jq -r '.allowSdist // false' "$OPTIONS_FILE" 2>/dev/null || echo false) + ALLOW_INSTALL_SCRIPTS=$(jq -r '.allowInstallScripts // false' "$OPTIONS_FILE" 2>/dev/null || echo false) +fi + +PACKAGES_ARGV="" +if [ -f "$PACKAGES_FILE" ]; then + # jq @sh escapes each package spec safely for shell expansion. The PACKAGES_FILE + # was written by the spawner (a trusted, typed pipeline) — not user shell input. + PACKAGES_ARGV=$(jq -r '.
| map(@sh) | join(" ")' "$PACKAGES_FILE" 2>/dev/null || echo "") +fi + +mkdir -p /workspace/output + +run_python() { + PIP_ARGS="--target /workspace/.deps/python --no-progress" + if [ "$ALLOW_SDIST" != "true" ]; then + # Block sdist installs by default — closes setup.py ACE vector (R2.7). + PIP_ARGS="$PIP_ARGS --only-binary=:all:" + fi + if [ -n "$PACKAGES_ARGV" ]; then + eval "uv pip install $PIP_ARGS $PACKAGES_ARGV" \ + > /workspace/install-stdout.log 2> /workspace/install-stderr.log \ + || { tail -c 64000 /workspace/install-stderr.log >&2; exit 64; } + uv pip list --format=json --python /workspace/.deps/python 2>/dev/null \ + > /workspace/install-report.json || true + fi + export PYTHONPATH=/workspace/.deps/python + echo "PHASE: running" + exec python3 /workspace/code/main.py +} + +run_node() { + NPM_ARGS="--prefix /workspace/.deps/node --no-audit --no-fund --no-progress --loglevel=error" + if [ "$ALLOW_INSTALL_SCRIPTS" != "true" ]; then + # Block lifecycle scripts by default — closes Shai-Hulud-class postinstall ACE (R2.7). 
+ NPM_ARGS="$NPM_ARGS --ignore-scripts" + fi + if [ -n "$PACKAGES_ARGV" ]; then + mkdir -p /workspace/.deps/node + (cd /workspace/.deps/node && npm init -y > /dev/null 2>&1) || true + eval "npm install $NPM_ARGS $PACKAGES_ARGV" \ + > /workspace/install-stdout.log 2> /workspace/install-stderr.log \ + || { tail -c 64000 /workspace/install-stderr.log >&2; exit 64; } + npm ls --prefix /workspace/.deps/node --json --depth=0 2>/dev/null \ + > /workspace/install-report.json || true + fi + export NODE_PATH=/workspace/.deps/node/node_modules + echo "PHASE: running" + exec node /workspace/code/main.js +} + +case "$LANG_NAME" in + python) run_python ;; + node) run_node ;; + *) + echo "sandbox-runtime: unknown language: $LANG_NAME" >&2 + exit 65 + ;; +esac diff --git a/services/sandbox/Dockerfile b/services/sandbox/Dockerfile new file mode 100644 index 000000000..01175e266 --- /dev/null +++ b/services/sandbox/Dockerfile @@ -0,0 +1,28 @@ +# Tale Sandbox Spawner +# +# Thin stateless HTTP service. Mounts /var/run/docker.sock (host root — +# see plan "Security model" for the explicit threat acceptance), accepts +# HMAC-signed /v1/execute calls, builds one ephemeral container per call. + +FROM oven/bun:1.1-debian + +WORKDIR /app + +# docker CLI for spawning sibling containers via mounted socket. 
+RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates \ + curl \ + docker.io \ + && rm -rf /var/lib/apt/lists/* + +COPY package.json bun.lockb* tsconfig.json /app/ +RUN bun install --frozen-lockfile || bun install + +COPY src/ /app/src/ + +EXPOSE 8003 + +HEALTHCHECK --interval=10s --timeout=5s --retries=3 --start-period=5s \ + CMD curl -fsS http://127.0.0.1:8003/health || exit 1 + +CMD ["bun", "src/server.ts"] diff --git a/services/sandbox/Dockerfile.dockerignore b/services/sandbox/Dockerfile.dockerignore new file mode 100644 index 000000000..6fc4b7664 --- /dev/null +++ b/services/sandbox/Dockerfile.dockerignore @@ -0,0 +1,7 @@ +node_modules +.git +.env +.env.* +*.log +tests +*.test.ts diff --git a/services/sandbox/bun.lock b/services/sandbox/bun.lock new file mode 100644 index 000000000..20785eecf --- /dev/null +++ b/services/sandbox/bun.lock @@ -0,0 +1,24 @@ +{ + "lockfileVersion": 1, + "configVersion": 1, + "workspaces": { + "": { + "name": "@tale/sandbox-spawner", + "devDependencies": { + "@types/bun": "^1.1.0", + "typescript": "^5.6.0", + }, + }, + }, + "packages": { + "@types/bun": ["@types/bun@1.3.14", "", { "dependencies": { "bun-types": "1.3.14" } }, "sha512-h1hFqFVcvAvD9j9K7ZW7vd82aSA+rTdznZa+5bwvCwqSB1jmmfLcbIWhOLx1/+boy/xmjgCs/OMUL8hRJSmnPw=="], + + "@types/node": ["@types/node@25.9.0", "", { "dependencies": { "undici-types": ">=7.24.0 <7.24.7" } }, "sha512-AOQwYUNolgy3VosiRqXrACUXTN8nJUtPl7FJXMqZVyxiiCLhQuG3jXKvCS1ALr+Y2OmZhzzLVlYPEqJaiqkaJQ=="], + + "bun-types": ["bun-types@1.3.14", "", { "dependencies": { "@types/node": "*" } }, "sha512-4N0ig0fEomHt5R0KCFWjovxow98rIoRwKolrYdCcknNwMekCXRnWEUvgu5soYV8QXtVsrUD8B95MBOZGPvr6KQ=="], + + "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="], + + "undici-types": ["undici-types@7.24.6", "", {}, 
"sha512-WRNW+sJgj5OBN4/0JpHFqtqzhpbnV0GuB+OozA9gCL7a993SmU+1JBZCzLNxYsbMfIeDL+lTsphD5jN5N+n0zg=="], + } +} diff --git a/services/sandbox/package.json b/services/sandbox/package.json new file mode 100644 index 000000000..4b344c5b4 --- /dev/null +++ b/services/sandbox/package.json @@ -0,0 +1,18 @@ +{ + "name": "@tale/sandbox-spawner", + "version": "0.1.0", + "private": true, + "description": "Tale sandbox spawner — thin stateless docker-run service for code_run", + "type": "module", + "scripts": { + "dev": "bun --hot src/server.ts", + "start": "bun src/server.ts", + "typecheck": "tsc --noEmit", + "test": "bun test" + }, + "dependencies": {}, + "devDependencies": { + "@types/bun": "^1.1.0", + "typescript": "^5.6.0" + } +} diff --git a/services/sandbox/seccomp.json b/services/sandbox/seccomp.json new file mode 100644 index 000000000..531400697 --- /dev/null +++ b/services/sandbox/seccomp.json @@ -0,0 +1,3 @@ +{ + "__comment_": "Tale Sandbox Runtime — custom seccomp profile (v1.x hardening target). v1 relies on Docker's built-in default profile which already blocks unshare/keyctl/add_key/bpf/mount/pivot_root. This file is a placeholder; when wired in via --security-opt=seccomp=/etc/sandbox-seccomp.json it should be a copy of Docker's default profile (https://github.com/moby/moby/blob/master/profiles/seccomp/default.json) with the following additional syscalls moved to defaultAction=SCMP_ACT_ERRNO: ptrace, userfaultfd, io_uring_setup, io_uring_register, io_uring_enter, perf_event_open. See plan §'Security model'." +} diff --git a/services/sandbox/src/auth.ts b/services/sandbox/src/auth.ts new file mode 100644 index 000000000..6f08d3c89 --- /dev/null +++ b/services/sandbox/src/auth.ts @@ -0,0 +1,31 @@ +// HMAC-SHA256 body authentication. +// +// Convex (the only legitimate client) signs the raw request body with the +// shared SANDBOX_TOKEN; spawner verifies before accepting. 
Reachable only +// on the internal Docker network anyway; HMAC is defense-in-depth so a +// misconfigured deployment that exposes :8003 doesn't immediately leak. + +import { timingSafeEqual, createHmac } from 'node:crypto'; + +export const SIGNATURE_HEADER = 'x-tale-sandbox-signature'; + +export function sign(body: string, token: string): string { + return createHmac('sha256', token).update(body).digest('hex'); +} + +export function verify( + body: string, + signatureHeader: string | null, + token: string, +): boolean { + if (!signatureHeader) return false; + const expected = sign(body, token); + if (expected.length !== signatureHeader.length) return false; + const a = Buffer.from(expected, 'utf8'); + const b = Buffer.from(signatureHeader, 'utf8'); + try { + return timingSafeEqual(a, b); + } catch { + return false; + } +} diff --git a/services/sandbox/src/cleanup.ts b/services/sandbox/src/cleanup.ts new file mode 100644 index 000000000..756cb6683 --- /dev/null +++ b/services/sandbox/src/cleanup.ts @@ -0,0 +1,124 @@ +// Three-layer cleanup, per plan §1. +// +// 1. Boot sweep: kill any tale.sandbox=1 container/volume left behind. +// 2. Periodic sweep: every 5 min, kill anything older than 2× max_timeout +// that isn't in the in-memory in-flight set. +// 3. SIGTERM handler: kill in-flight before exit. + +import { isInFlight } from './spawn.ts'; +import { runDocker, dockerKill, dockerRm } from './spawn_util.ts'; +import type { SpawnerConfig } from './types.ts'; + +const PERIODIC_INTERVAL_MS = 5 * 60_000; + +async function listLabeled( + scope: 'container' | 'volume', + label: string, +): Promise { + const args = + scope === 'container' + ? 
['ps', '-aq', '-f', `label=${label}`] + : ['volume', 'ls', '-q', '-f', `label=${label}`]; + const result = await runDocker(args); + if (result.exitCode !== 0) return []; + return result.stdout + .split('\n') + .map((s) => s.trim()) + .filter((s) => s.length > 0); +} + +export async function bootSweep(): Promise { + // Containers first; volumes after (volume rm fails on attached volumes). + const containers = await listLabeled('container', 'tale.sandbox=1'); + for (const c of containers) { + await dockerRm(c); + } + const stagingContainers = await listLabeled( + 'container', + 'tale.sandbox-staging=1', + ); + for (const c of stagingContainers) { + await dockerRm(c); + } + const volumes = await listLabeled('volume', 'tale.sandbox=1'); + for (const v of volumes) { + await runDocker(['volume', 'rm', '--force', v]); + } + if (containers.length > 0 || volumes.length > 0) { + console.log( + `[sandbox] boot sweep removed ${containers.length} container(s) and ${volumes.length} volume(s)`, + ); + } +} + +export function startPeriodicSweep(cfg: SpawnerConfig): () => void { + const interval = setInterval(async () => { + try { + // List containers with full label data so we can compare started time. + const result = await runDocker([ + 'ps', + '-a', + '--filter', + 'label=tale.sandbox=1', + '--format', + '{{.Names}}\t{{.Labels}}', + ]); + if (result.exitCode !== 0) return; + const now = Date.now(); + const staleThreshold = now - 2 * cfg.maxTimeoutMs; + for (const line of result.stdout.split('\n')) { + const [name, labels] = line.split('\t'); + if (!name) continue; + const m = labels?.match(/tale\.started=(\d+)/); + if (!m) continue; + const started = Number.parseInt(m[1] ?? '0', 10); + if (Number.isNaN(started) || started >= staleThreshold) continue; + // session id is the container name with the tale-sbx- prefix stripped.
+ const sessionId = name.replace(/^tale-sbx-/, ''); + if (isInFlight(sessionId)) continue; + await dockerKill(name); + await dockerRm(name); + console.log( + `[sandbox] periodic sweep killed stale container ${name} (started ${new Date(started).toISOString()})`, + ); + } + // Also reap orphan session volumes (named tale-sbx-SESSIONID); no age check here. + // NOTE(review): untyped `docker inspect` also matches a same-named volume — verify. + const vols = await runDocker([ + 'volume', + 'ls', + '--filter', + 'label=tale.sandbox=1', + '--format', + '{{.Name}}', + ]); + for (const v of vols.stdout.split('\n')) { + const n = v.trim(); + if (!n) continue; + const sessionId = n.replace(/^tale-sbx-/, ''); + if (isInFlight(sessionId)) continue; + // If the named container is gone but the volume remains, drop it. + const exists = await runDocker(['inspect', `tale-sbx-${sessionId}`]); + if (exists.exitCode === 0) continue; + await runDocker(['volume', 'rm', '--force', n]); + } + } catch (err) { + console.warn(`[sandbox] periodic sweep error: ${String(err)}`); + } + }, PERIODIC_INTERVAL_MS); + return () => clearInterval(interval); +} + +export function installSignalHandlers(getInFlight: () => string[]): void { + const onTerm = async (sig: string) => { + console.log(`[sandbox] received ${sig}; killing in-flight containers`); + const ids = getInFlight(); + for (const id of ids) { + await dockerKill(`tale-sbx-${id}`); + await runDocker(['volume', 'rm', '--force', `tale-sbx-${id}`]); + } + process.exit(0); + }; + process.on('SIGTERM', () => void onTerm('SIGTERM')); + process.on('SIGINT', () => void onTerm('SIGINT')); +} diff --git a/services/sandbox/src/config.ts b/services/sandbox/src/config.ts new file mode 100644 index 000000000..e0afa3c14 --- /dev/null +++ b/services/sandbox/src/config.ts @@ -0,0 +1,62 @@ +// Spawner configuration — parsed from env at boot. Defaults match the plan; +// every knob is overridable so an operator can tune without rebuilding.
+ +import type { SpawnerConfig } from './types.ts'; + +function requireEnv(name: string): string { + const v = process.env[name]; + if (!v || v.length === 0) { + throw new Error(`Missing required env var: ${name}`); + } + return v; +} + +function numEnv(name: string, fallback: number): number { + const v = process.env[name]; + if (v === undefined || v === '') return fallback; + const n = Number(v); + if (!Number.isFinite(n)) { + throw new Error(`Env var ${name} is not a finite number: ${v}`); + } + return n; +} + +export function loadConfig(): SpawnerConfig { + const runtime = (process.env.SANDBOX_RUNTIME ?? 'runc') as 'runc' | 'runsc'; + if (runtime !== 'runc' && runtime !== 'runsc') { + throw new Error( + `SANDBOX_RUNTIME must be 'runc' or 'runsc'; got: ${runtime}`, + ); + } + return { + port: numEnv('SANDBOX_PORT', 8003), + sandboxToken: requireEnv('SANDBOX_TOKEN'), + runtimeImage: + process.env.SANDBOX_RUNTIME_IMAGE ?? 'tale-sandbox-runtime:latest', + runtime, + defaultTimeoutMs: numEnv('SANDBOX_DEFAULT_TIMEOUT_MS', 30_000), + maxTimeoutMs: numEnv('SANDBOX_MAX_TIMEOUT_MS', 300_000), + maxConcurrent: numEnv('SANDBOX_MAX_CONCURRENT', 4), + hostSessionRoot: + process.env.SANDBOX_HOST_SESSION_ROOT ?? '/var/lib/tale-sandbox/sessions', + cacheVolumePrefix: { + pip: + process.env.SANDBOX_PIP_CACHE_VOLUME_PREFIX ?? 'tale-sandbox-pip-cache', + npm: + process.env.SANDBOX_NPM_CACHE_VOLUME_PREFIX ?? 'tale-sandbox-npm-cache', + }, + egressNetwork: process.env.SANDBOX_EGRESS_NETWORK ?? 'tale-sandbox-net', + egressProxy: + process.env.SANDBOX_EGRESS_PROXY ?? 
'http://sandbox-egress:3128', + stdoutMaxBytes: numEnv('SANDBOX_STDOUT_MAX_BYTES', 5 * 1024 * 1024), + stderrMaxBytes: numEnv('SANDBOX_STDERR_MAX_BYTES', 5 * 1024 * 1024), + outputFileMaxBytes: numEnv( + 'SANDBOX_OUTPUT_FILE_MAX_BYTES', + 50 * 1024 * 1024, + ), + outputTotalMaxBytes: numEnv( + 'SANDBOX_OUTPUT_TOTAL_MAX_BYTES', + 100 * 1024 * 1024, + ), + }; +} diff --git a/services/sandbox/src/docker_args.test.ts b/services/sandbox/src/docker_args.test.ts new file mode 100644 index 0000000000000000000000000000000000000000..66a579519e03c78346cf7edcd35a1a74af91b5ce GIT binary patch literal 4884 zcmcgw{chVf5Z~WA=rzkaR4mQcQ3d6HU9Iax&#q0+HiO z7#1u|@FOdfh9QN7YAQ3vnbIt7+UbobLLr}MtfAnFj3}^#a?M7JO4S6y6DSmlh($u- z)tmD^oE*J9Is=mDlt^IQY6>?tSe6T^;R@(1M`=yaRNN|HQelGZkjVs&Ca=Bq6792eg zkH+0w@E%-k02s@=u-9sBHStT~p@(AD7Z;Rw!L!Did0C)eGJ2emF*@4QB&9)#4azll zvI-b+xDDRhq#Q~6I?dlsy8Vh&DbhjVml7j_1n?ap9t z&w2nCsg(CD;`glc~u#8V*AN`1#Xc@&fyBeVucpo%j-1mY&X5Ly% zO!6R+BJUl%e7PPiDMl+`75@BjK^t#8DiXQDuCC3r%!FhM~>+#mQ{NKtagD_uT|5uVy@6 z1y`L)EAF1+riwFF*6^};gFh<1d}-XW9Q%T@%jm#AfBg-3`*SMwJWzU`QtO%!-Yi=K zPA^TDAcexJgKdEpmKF>V0-&rRjR~3n^9}PD1EU&|njZ zg(wt(MT~2logfGaS4$G>(5^d%|4M>!n}&pQq^%yj_hbfv9HB=%th_1EOO~3CC;MRj zZ2D&WXog(kQT-!fe}OzI^YeMGB6D1mEE}r3t!IjKHaHSi-Ha zo)gpvxzH8vbKA=Nzs8f-GuAeAYSGbE>Kod8L+eXswNuu8a4#v_L|rm-6G^#NH1~&? zuniY(lbQXA6jMK{prX>+lFh$g=cYfM2$HeJ|E@Ue4o^z)41Lw-w)!Q+GSE`#g0E`2$>ar_CD)u{k7i zWm?t254!7uiz$aif4V-Yke|gYMJ5XprpOTAHJr{gwjLab-aMo5376}Li@?1R12Y*A zqDBeHNx zhBO65HL`9>2`_TK(E&WY^p5S zgWoLpJ%eAM{Qj7od)qP{-eXY*JE$(); + +function inFlightIds(): string[] { + return Array.from(inFlightSet); +} + +async function handleHealth(): Promise { + // /health pings docker daemon — caches not used for v1. 
+ const info = await runDocker(['info', '--format', '{{.ServerVersion}}']); + if (info.exitCode !== 0) { + return new Response( + JSON.stringify({ status: 'unhealthy', error: info.stderr.trim() }), + { status: 503, headers: { 'content-type': 'application/json' } }, + ); + } + return new Response( + JSON.stringify({ status: 'ok', dockerServerVersion: info.stdout.trim() }), + { status: 200, headers: { 'content-type': 'application/json' } }, + ); +} + +async function handleExecute(req: Request): Promise { + const body = await req.text(); + if (!verify(body, req.headers.get(SIGNATURE_HEADER), cfg.sandboxToken)) { + return new Response(JSON.stringify({ error: 'unauthorized' }), { + status: 401, + headers: { 'content-type': 'application/json' }, + }); + } + if (inFlightSet.size >= cfg.maxConcurrent) { + return new Response( + JSON.stringify({ + error: 'busy', + message: `Spawner at concurrency cap (${cfg.maxConcurrent})`, + }), + { + status: 429, + headers: { + 'content-type': 'application/json', + 'retry-after': '5', + }, + }, + ); + } + let parsed: ExecuteRequest; + try { + parsed = JSON.parse(body) as ExecuteRequest; + } catch (err) { + return new Response( + JSON.stringify({ error: 'bad_request', message: String(err) }), + { status: 400, headers: { 'content-type': 'application/json' } }, + ); + } + inFlightSet.add(parsed.executionId); + try { + const result = await executeRequest(cfg, parsed); + return new Response(JSON.stringify(result), { + status: 200, + headers: { 'content-type': 'application/json' }, + }); + } finally { + inFlightSet.delete(parsed.executionId); + } +} + +async function handleCancel(req: Request, id: string): Promise { + const body = await req.text(); + if (!verify(body, req.headers.get(SIGNATURE_HEADER), cfg.sandboxToken)) { + return new Response(JSON.stringify({ error: 'unauthorized' }), { + status: 401, + headers: { 'content-type': 'application/json' }, + }); + } + if (!isInFlight(id)) { + return new Response(JSON.stringify({ killed: false }), 
{ + status: 404, + headers: { 'content-type': 'application/json' }, + }); + } + const killed = await cancelExecution(id); + return new Response(JSON.stringify({ killed }), { + status: 200, + headers: { 'content-type': 'application/json' }, + }); +} + +async function router(req: Request): Promise { + const url = new URL(req.url); + if (req.method === 'GET' && url.pathname === '/health') { + return handleHealth(); + } + if (req.method === 'POST' && url.pathname === '/v1/execute') { + return handleExecute(req); + } + const cancelMatch = url.pathname.match(/^\/v1\/cancel\/([a-f0-9-]{1,64})$/i); + if (req.method === 'POST' && cancelMatch) { + return handleCancel(req, cancelMatch[1] ?? ''); + } + return new Response(JSON.stringify({ error: 'not_found' }), { + status: 404, + headers: { 'content-type': 'application/json' }, + }); +} + +async function main(): Promise { + await bootSweep(); + const stopPeriodic = startPeriodicSweep(cfg); + installSignalHandlers(inFlightIds); + + const server = Bun.serve({ + port: cfg.port, + fetch: (req) => + router(req).catch((err) => { + console.error('[sandbox] handler error:', err); + return new Response( + JSON.stringify({ error: 'internal', message: String(err) }), + { status: 500, headers: { 'content-type': 'application/json' } }, + ); + }), + }); + + console.log( + `[sandbox] spawner listening on :${server.port}; runtime=${cfg.runtime}; image=${cfg.runtimeImage}; maxConcurrent=${cfg.maxConcurrent}`, + ); + + // Keep the periodic sweep handle so it isn't GC'd. + void stopPeriodic; +} + +void main(); diff --git a/services/sandbox/src/spawn.ts b/services/sandbox/src/spawn.ts new file mode 100644 index 000000000..8d6e6ed8e --- /dev/null +++ b/services/sandbox/src/spawn.ts @@ -0,0 +1,295 @@ +// Per-call execution pipeline. The route handler in server.ts hands a typed +// ExecuteRequest in; this module owns the docker lifecycle and returns a typed +// ExecuteResponse out. 

import { buildDockerRunArgs } from './docker_args.ts';
import { runDocker, dockerKill } from './spawn_util.ts';
import type {
  ErrorCode,
  ExecuteRequest,
  ExecuteResponse,
  SpawnerConfig,
} from './types.ts';
import {
  createWorkspaceVolume,
  ensureCacheVolume,
  harvestOutput,
  npmCacheVolumeName,
  pipCacheVolumeName,
  removeVolume,
  stageCodeIntoVolume,
  workspaceVolumeName,
} from './volume.ts';

const PHASE_INSTALL = 'PHASE: installing';
const PHASE_RUN = 'PHASE: running';

// Bookkeeping for one running execution, keyed by executionId in `inFlight`.
interface InFlight {
  containerName: string;
  workspaceVolume: string;
  abort: AbortController;
}

const inFlight = new Map<string, InFlight>();

/** True while `executeRequest` is running for this executionId. */
export function isInFlight(executionId: string): boolean {
  return inFlight.has(executionId);
}

/**
 * Cancel an in-flight execution. Best-effort: docker kill + (cleanup will
 * happen in the originating execute() finally block).
 *
 * @returns false when the id is not in flight, true once the abort signal
 *          has been raised and `docker kill` has been issued.
 */
export async function cancelExecution(executionId: string): Promise<boolean> {
  const entry = inFlight.get(executionId);
  if (!entry) return false;
  entry.abort.abort('cancelled by client');
  await dockerKill(entry.containerName);
  return true;
}

/**
 * Run one execution end to end: validate ids, create the workspace and cache
 * volumes, stage code/packages/inputs, `docker run` the runtime image, then
 * map the exit code to an ExecuteResponse and harvest output files on
 * success.
 *
 * Never throws: validation failures and internal errors are returned as a
 * `failed` response with errorCode SPAWNER_UNAVAILABLE.
 *
 * @param cfg Spawner configuration (timeouts, byte caps, cache prefixes).
 * @param req Request as received by the route handler.
 */
export async function executeRequest(
  cfg: SpawnerConfig,
  req: ExecuteRequest,
): Promise<ExecuteResponse> {
  if (!/^[a-f0-9-]{1,64}$/i.test(req.executionId)) {
    return makeError('SPAWNER_UNAVAILABLE', 'invalid executionId', 0);
  }
  if (!/^[a-zA-Z0-9_-]{1,128}$/.test(req.organizationId)) {
    return makeError('SPAWNER_UNAVAILABLE', 'invalid organizationId', 0);
  }
  if (req.language !== 'python' && req.language !== 'node') {
    return makeError('SPAWNER_UNAVAILABLE', 'invalid language', 0);
  }
  // Container and volume names derive from executionId. Letting a duplicate
  // through would overwrite the live map entry, and this call's `finally`
  // would then delete the first call's entry and workspace volume.
  if (inFlight.has(req.executionId)) {
    return makeError('SPAWNER_UNAVAILABLE', 'executionId already in flight', 0);
  }

  // A non-finite timeout (NaN from a bad caller) would survive Math.min/max
  // and silently disable the wrapper timeout in runDocker.
  const requestedTimeoutMs =
    typeof req.timeoutMs === 'number' && Number.isFinite(req.timeoutMs)
      ? req.timeoutMs
      : cfg.defaultTimeoutMs;
  const timeoutMs = Math.min(
    Math.max(requestedTimeoutMs, 1_000),
    cfg.maxTimeoutMs,
  );
  const startedAtMs = Date.now();
  const containerName = `tale-sbx-${req.executionId}`;
  const workspaceVolume = workspaceVolumeName(req.executionId);
  const pipVolume = pipCacheVolumeName(cfg, req.organizationId);
  const npmVolume = npmCacheVolumeName(cfg, req.organizationId);

  const abort = new AbortController();
  inFlight.set(req.executionId, {
    containerName,
    workspaceVolume,
    abort,
  });

  // Tracked so `finally` knows whether a container may have been left behind.
  let containerLaunched = false;
  let wrapperExitCode: number | null = null;

  try {
    await createWorkspaceVolume(req.executionId);
    await ensureCacheVolume(pipVolume);
    await ensureCacheVolume(npmVolume);

    await stageCodeIntoVolume({
      volumeName: workspaceVolume,
      language: req.language,
      code: req.code,
      packages: req.packages ?? [],
      options: req.options ?? {},
      inputFiles: req.inputFiles ?? [],
    });

    const argv = buildDockerRunArgs(cfg, {
      executionId: req.executionId,
      organizationId: req.organizationId,
      language: req.language,
      timeoutMs,
      workspaceVolume,
      pipCacheVolume: pipVolume,
      npmCacheVolume: npmVolume,
      startedAtMs,
    });

    containerLaunched = true;
    // The wrapper gets 30 s of grace on top of the in-container budget.
    const result = await runDocker(argv, {
      timeoutMs: timeoutMs + 30_000,
      signal: abort.signal,
    });
    wrapperExitCode = result.exitCode;

    const durationMs = Date.now() - startedAtMs;
    const phases = classifyPhases(result.stdout);
    const exitCode = result.exitCode;

    // Cap stdout/stderr per config.
    const { text: stdoutCapped, truncated: stdoutTrunc } = capText(
      stripPhaseMarkers(result.stdout),
      cfg.stdoutMaxBytes,
    );
    const { text: stderrCapped, truncated: stderrTrunc } = capText(
      result.stderr,
      cfg.stderrMaxBytes,
    );

    // Fields shared by every response shape below.
    const common = {
      stdoutBase64: Buffer.from(stdoutCapped).toString('base64'),
      stderrBase64: Buffer.from(stderrCapped).toString('base64'),
      durationMs,
      installMs: phases.installMs,
      runMs: phases.runMs,
    };

    // Cancellation took precedence (we set abort and killed): if signal is
    // aborted, surface as 'cancelled' regardless of exit code.
    if (abort.signal.aborted) {
      return {
        status: 'cancelled',
        exitCode: null,
        errorCode: 'CANCELLED',
        errorMessage: 'Execution cancelled by client',
        ...common,
        truncated: { stdout: stdoutTrunc, stderr: stderrTrunc, files: 0 },
        outputFiles: [],
      };
    }

    // Map exit codes (per runtime-image entrypoint convention):
    //   0   = success
    //   64  = install failed (INSTALL_FAILED or PACKAGE_NOT_FOUND)
    //   65  = bad invocation (SPAWNER_UNAVAILABLE)
    //   124 = docker wrapper timeout (TIMEOUT)
    //   137 = SIGKILL (could be OOM kill OR our explicit timeout kill)
    //   139 = segfault
    //   else = user code RUNTIME_ERROR
    if (exitCode === 0) {
      const harvested = await harvestOutput(workspaceVolume, {
        perFileMax: cfg.outputFileMaxBytes,
        totalMax: cfg.outputTotalMaxBytes,
      });
      return {
        status: 'completed',
        exitCode: 0,
        ...common,
        truncated: {
          stdout: stdoutTrunc,
          stderr: stderrTrunc,
          files: harvested.truncatedCount,
        },
        outputFiles: harvested.files,
      };
    }

    const { code: ec, message } = classifyFailure(exitCode, stderrCapped);
    return {
      status: ec === 'CANCELLED' ? 'cancelled' : 'failed',
      exitCode,
      errorCode: ec,
      errorMessage: message,
      ...common,
      truncated: { stdout: stdoutTrunc, stderr: stderrTrunc, files: 0 },
      outputFiles: [],
    };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    return makeError(
      'SPAWNER_UNAVAILABLE',
      `spawner internal error: ${message}`,
      Date.now() - startedAtMs,
    );
  } finally {
    inFlight.delete(req.executionId);
    // A wrapper timeout or abort only kills the local `docker run` client;
    // the container itself can keep running and keep the volume busy. Kill it
    // by name (a no-op failure when it is already gone) before the volume rm.
    if (
      containerLaunched &&
      (wrapperExitCode === null ||
        wrapperExitCode === 124 ||
        abort.signal.aborted)
    ) {
      await dockerKill(containerName).catch(() => {});
    }
    // Best-effort cleanup; named `--rm` should have removed the container,
    // and we tear down the workspace volume.
    await removeVolume(workspaceVolume).catch(() => {});
  }
}

/** Build a `failed` response carrying no output, for errors raised by the spawner itself. */
function makeError(
  errorCode: ErrorCode,
  msg: string,
  durationMs: number,
): ExecuteResponse {
  return {
    status: 'failed',
    exitCode: null,
    errorCode,
    errorMessage: msg,
    stdoutBase64: '',
    stderrBase64: '',
    durationMs,
    installMs: null,
    runMs: null,
    truncated: { stdout: false, stderr: false, files: 0 },
    outputFiles: [],
  };
}

/** Drop lines that are exactly one of the PHASE markers; all other lines are kept verbatim. */
function stripPhaseMarkers(stdout: string): string {
  return stdout
    .split('\n')
    .filter((line) => line !== PHASE_INSTALL && line !== PHASE_RUN)
    .join('\n');
}

interface Phases {
  installMs: number | null;
  runMs: number | null;
}

function classifyPhases(_stdout: string): Phases {
  // Phase timing is approximate — the markers tell us the order, but the
  // spawner doesn't have inside-container timestamps. v2 can pipe wall-clock
  // hints in the marker; for v1 we return null timings and report only that
  // markers were observed. Callers should not depend on install/run split.
  return { installMs: null, runMs: null };
}

/**
 * Limit `text` to at most `maxBytes` bytes of UTF-8.
 *
 * The cut is made on the encoded bytes, so a multi-byte character straddling
 * the limit decodes to U+FFFD at the end of the returned text.
 */
function capText(
  text: string,
  maxBytes: number,
): { text: string; truncated: boolean } {
  const buf = Buffer.from(text);
  if (buf.byteLength <= maxBytes) return { text, truncated: false };
  return { text: buf.subarray(0, maxBytes).toString('utf8'), truncated: true };
}

/**
 * Translate a non-zero container exit code (plus stderr heuristics for 137
 * and 64) into the ErrorCode taxonomy and a human-readable message.
 */
function classifyFailure(
  exitCode: number,
  stderr: string,
): { code: ErrorCode; message: string } {
  if (exitCode === 124) {
    return { code: 'TIMEOUT', message: 'Wall-clock timeout exceeded' };
  }
  if (exitCode === 137) {
    // OOM vs explicit kill — Linux doesn't tell us cleanly. If the message
    // mentions "Killed" we lean OOM; otherwise it's likely an explicit timeout.
    if (/killed/i.test(stderr)) {
      return { code: 'OOM', message: 'Container killed (likely OOM)' };
    }
    return { code: 'TIMEOUT', message: 'Container killed (SIGKILL)' };
  }
  if (exitCode === 64) {
    if (/no matching distribution|could not find a version/i.test(stderr)) {
      return {
        code: 'PACKAGE_NOT_FOUND',
        message: 'Requested package could not be resolved',
      };
    }
    if (/proxy|forbidden|filter|403|connection refused/i.test(stderr)) {
      return {
        code: 'EGRESS_DENIED',
        message: 'Egress proxy denied the request',
      };
    }
    return {
      code: 'INSTALL_FAILED',
      message: 'Package install failed',
    };
  }
  if (exitCode === 65) {
    return {
      code: 'SPAWNER_UNAVAILABLE',
      message: 'Sandbox runtime rejected the invocation',
    };
  }
  return {
    code: 'RUNTIME_ERROR',
    message: `User code exited with status ${exitCode}`,
  };
}

// diff --git a/services/sandbox/src/spawn_util.ts b/services/sandbox/src/spawn_util.ts
// new file mode 100644
// index 000000000..9d125b2fc
// --- /dev/null
// +++ b/services/sandbox/src/spawn_util.ts
// @@ -0,0 +1,86 @@

// Thin Bun-native wrapper around `docker` invocations.
//
// Centralised so docker_args.ts stays a pure argv builder (unit-testable) and
// every actual docker call goes through one shape with consistent stdout/stderr
// handling, stdin piping, and timeouts.

export interface RunDockerOptions {
  stdin?: string;
  // Set true when we expect a binary blob (tar stream) on stdout.
  captureBinaryStdout?: boolean;
  timeoutMs?: number;
  signal?: AbortSignal;
}

export interface RunDockerResult {
  exitCode: number;
  stdout: string;
  stderr: string;
  stdoutBytes?: Uint8Array;
}

const DOCKER_BIN = process.env.DOCKER_BIN ?? 'docker';

/**
 * Spawn `docker <args>` and collect its exit code, stdout and stderr.
 *
 * @param args Arguments passed to the docker binary (no shell involved).
 * @param opts.stdin               Text written to the child's stdin, then closed.
 * @param opts.captureBinaryStdout Return stdout as raw bytes in `stdoutBytes`
 *                                 (and `stdout` as '') instead of decoding it.
 * @param opts.timeoutMs           Wall-clock limit; on expiry the child is
 *                                 SIGKILLed and `exitCode` is reported as 124.
 * @param opts.signal              Abort signal forwarded to Bun.spawn.
 * @returns exitCode (-1 when Bun reports none), decoded stdout/stderr, and
 *          optionally the raw stdout bytes.
 */
export async function runDocker(
  args: string[],
  opts: RunDockerOptions = {},
): Promise<RunDockerResult> {
  const proc = Bun.spawn([DOCKER_BIN, ...args], {
    stdin: opts.stdin !== undefined ? 'pipe' : 'ignore',
    stdout: 'pipe',
    stderr: 'pipe',
    signal: opts.signal,
  });

  // Arm the timeout BEFORE draining the pipes. The pipes only reach EOF once
  // the child has exited, so a timer started after the reads could never fire
  // for a hung child.
  let timedOut = false;
  let timer: ReturnType<typeof setTimeout> | undefined;
  if (opts.timeoutMs && Number.isFinite(opts.timeoutMs)) {
    timer = setTimeout(() => {
      timedOut = true;
      proc.kill('SIGKILL');
    }, opts.timeoutMs);
  }

  try {
    if (opts.stdin !== undefined && proc.stdin) {
      proc.stdin.write(opts.stdin);
      await proc.stdin.end();
    }

    // Concurrent reads to avoid pipe-back-pressure deadlock; wait for the
    // exit status alongside them.
    const [stdoutBytes, stderrBytes] = await Promise.all([
      new Response(proc.stdout).arrayBuffer(),
      new Response(proc.stderr).arrayBuffer(),
      proc.exited,
    ]);

    const exitCode = timedOut ? 124 : (proc.exitCode ?? -1);

    return {
      exitCode,
      stdout: opts.captureBinaryStdout
        ? ''
        : new TextDecoder('utf-8', { fatal: false }).decode(stdoutBytes),
      stderr: new TextDecoder('utf-8', { fatal: false }).decode(stderrBytes),
      stdoutBytes: opts.captureBinaryStdout
        ? new Uint8Array(stdoutBytes)
        : undefined,
    };
  } finally {
    if (timer) clearTimeout(timer);
  }
}

/** `docker kill --signal=SIGKILL <name>`; the command's exit status is ignored. */
export async function dockerKill(containerName: string): Promise<void> {
  await runDocker(['kill', '--signal=SIGKILL', containerName]);
}

/** `docker rm --force <name>`; the command's exit status is ignored. */
export async function dockerRm(containerName: string): Promise<void> {
  await runDocker(['rm', '--force', containerName]);
}

// diff --git a/services/sandbox/src/types.ts b/services/sandbox/src/types.ts
// new file mode 100644
// index 000000000..803d0f753
// --- /dev/null
// +++ b/services/sandbox/src/types.ts
// @@ -0,0 +1,84 @@

// HTTP request / response shapes for the sandbox spawner.
// Mirrors the Convex action's `executeCode` and the `code_run` tool output.

// Runtimes the spawner can execute.
export type Language = 'python' | 'node';

// One input file supplied inline: a bare file name plus base64 content.
export interface InputFileBase64 {
  name: string;
  contentBase64: string;
}

// Body of POST /v1/execute.
export interface ExecuteRequest {
  // Stable id from the Convex action; used for container name + label and
  // for /v1/cancel/:uuid. Caller must supply this so cancellation has
  // something to address before the spawner has finished spinning up.
  executionId: string;
  organizationId: string;
  language: Language;
  code: string;
  packages?: string[];
  inputFiles?: InputFileBase64[];
  timeoutMs?: number;
  options?: {
    allowSdist?: boolean;
    allowInstallScripts?: boolean;
  };
}

// Structured failure taxonomy reported in ExecuteResponse.errorCode.
export type ErrorCode =
  | 'TIMEOUT'
  | 'OOM'
  | 'EGRESS_DENIED'
  | 'INSTALL_FAILED'
  | 'PACKAGE_NOT_FOUND'
  | 'QUOTA_EXCEEDED'
  | 'RUNTIME_ERROR'
  | 'SPAWNER_UNAVAILABLE'
  | 'CANCELLED';

// One harvested output file, content base64-encoded; `size` is in bytes.
export interface OutputFile {
  name: string;
  contentBase64: string;
  size: number;
  contentType: string;
}

// Body of the /v1/execute response.
export interface ExecuteResponse {
  status: 'completed' | 'failed' | 'cancelled';
  exitCode: number | null;
  errorCode?: ErrorCode;
  errorMessage?: string;
  stdoutBase64: string;
  stderrBase64: string;
  durationMs: number;
  installMs: number | null;
  runMs: number | null;
  // stdout/stderr: whether the text was cut at its byte cap.
  // files: number of output files dropped for exceeding a size cap.
  truncated: {
    stdout: boolean;
    stderr: boolean;
    files: number;
  };
  outputFiles: OutputFile[];
}

// Body of the /v1/cancel/:id response.
export interface CancelResponse {
  killed: boolean;
}

// Spawner configuration (timeouts, concurrency, image, byte caps).
export interface SpawnerConfig {
  port: number;
  sandboxToken: string;
  runtimeImage: string;
  runtime: 'runc' | 'runsc';
  defaultTimeoutMs: number;
  maxTimeoutMs: number;
  maxConcurrent: number;
  hostSessionRoot: string;
  cacheVolumePrefix: { pip: string; npm: string };
  egressNetwork: string;
  egressProxy: string;
  stdoutMaxBytes: number;
  stderrMaxBytes: number;
  outputFileMaxBytes: number;
  outputTotalMaxBytes: number;
}

// diff --git a/services/sandbox/src/volume.ts b/services/sandbox/src/volume.ts
// new file mode 100644
// index 000000000..44b02c5d5
// --- /dev/null
// +++
// b/services/sandbox/src/volume.ts
// @@ -0,0 +1,395 @@

// Workspace + per-org cache volume helpers.
//
// Workspace = ephemeral tmpfs Docker volume, 256 MB hard ENOSPC cap (R2.2).
// Per-org pip/npm cache = persistent named volumes scoped to organizationId
// (R2.3 — closes the cross-tenant wheel-cache poison vector).

import { runDocker } from './spawn_util.ts';
import type { SpawnerConfig } from './types.ts';

const ORG_SLUG_RE = /^[a-zA-Z0-9_-]{1,128}$/;

// Image used for every short-lived staging / harvesting helper container.
const HELPER_IMAGE = 'busybox:1.36';

/** Return `organizationId` unchanged if it is safe inside a volume name; throw otherwise. */
function orgSlug(organizationId: string): string {
  if (!ORG_SLUG_RE.test(organizationId)) {
    throw new Error(
      `volume: refusing unsafe organizationId for volume name: ${JSON.stringify(organizationId)}`,
    );
  }
  return organizationId;
}

/** Name of the per-execution workspace volume. */
export function workspaceVolumeName(executionId: string): string {
  return `tale-sbx-${executionId}`;
}

/** Name of the organization's pip cache volume. Throws on an unsafe organizationId. */
export function pipCacheVolumeName(
  cfg: SpawnerConfig,
  organizationId: string,
): string {
  return `${cfg.cacheVolumePrefix.pip}-${orgSlug(organizationId)}`;
}

/** Name of the organization's npm cache volume. Throws on an unsafe organizationId. */
export function npmCacheVolumeName(
  cfg: SpawnerConfig,
  organizationId: string,
): string {
  return `${cfg.cacheVolumePrefix.npm}-${orgSlug(organizationId)}`;
}

/**
 * Create a sized tmpfs Docker volume (RAM-backed, hard ENOSPC at sizeMb).
 *
 * @param executionId Used for the volume name and the `tale.session` label.
 * @param sizeMb      tmpfs size in MiB; must be a positive integer because it
 *                    is interpolated into the mount options.
 * @returns The volume name.
 * @throws If `sizeMb` is invalid or `docker volume create` fails.
 */
export async function createWorkspaceVolume(
  executionId: string,
  sizeMb = 256,
): Promise<string> {
  if (!Number.isInteger(sizeMb) || sizeMb <= 0) {
    throw new Error(`volume: invalid workspace size: ${String(sizeMb)}`);
  }
  const name = workspaceVolumeName(executionId);
  const result = await runDocker([
    'volume',
    'create',
    '--driver=local',
    '--label',
    'tale.sandbox=1',
    '--label',
    `tale.session=${executionId}`,
    '--opt',
    'type=tmpfs',
    '--opt',
    'device=tmpfs',
    '--opt',
    `o=size=${sizeMb}m,nosuid,nodev`,
    name,
  ]);
  if (result.exitCode !== 0) {
    throw new Error(
      `volume: failed to create workspace volume ${name}: ${result.stderr.trim() || result.stdout.trim()}`,
    );
  }
  return name;
}

/**
 * Create per-org cache volume lazily (idempotent: docker volume create
 * succeeds on an existing volume).
 *
 * @throws If `docker volume create` exits non-zero.
 */
export async function ensureCacheVolume(name: string): Promise<void> {
  const result = await runDocker([
    'volume',
    'create',
    '--label',
    'tale.sandbox-cache=1',
    name,
  ]);
  if (result.exitCode !== 0) {
    throw new Error(
      `volume: failed to ensure cache volume ${name}: ${result.stderr.trim() || result.stdout.trim()}`,
    );
  }
}

export async function removeVolume(name: string): Promise<void> {
  // Best-effort; don't throw on missing volume so retries are safe.
  await runDocker(['volume', 'rm', '--force', name]);
}

/**
 * argv for a one-shot root busybox helper with `volumeName` mounted at
 * /workspace, running `script` through `sh -c`. `withStdin` adds `-i` so the
 * helper can read a payload from stdin.
 */
function helperRunArgs(
  volumeName: string,
  script: string,
  withStdin: boolean,
): string[] {
  return [
    'run',
    '--rm',
    ...(withStdin ? ['-i'] : []),
    '--label',
    'tale.sandbox-staging=1',
    '--user',
    '0:0',
    '--mount',
    `type=volume,src=${volumeName},dst=/workspace`,
    '--entrypoint',
    'sh',
    HELPER_IMAGE,
    '-c',
    script,
  ];
}

/**
 * Run `script` in a helper container with `stdin` piped in; throw
 * `volume: <failure>: <stderr>` when the helper exits non-zero.
 */
async function pipeIntoVolume(
  volumeName: string,
  script: string,
  stdin: string,
  failure: string,
): Promise<void> {
  const result = await runDocker(helperRunArgs(volumeName, script, true), {
    stdin,
  });
  if (result.exitCode !== 0) {
    throw new Error(`volume: ${failure}: ${result.stderr.trim()}`);
  }
}

/**
 * Stage a code + packages + options bundle into the workspace volume via
 * transient busybox containers. We DO NOT pass the user code through argv;
 * every payload is piped to the helper on stdin.
 *
 * Resulting layout inside the volume:
 *   /workspace/code/main.{py,js}, packages.json, options.json
 *   /workspace/input/<name> for each input file (base64-decoded)
 *   /workspace/output/ (empty)
 * Finally the whole tree is chowned to 65534:65534.
 *
 * @throws If an input file name is unsafe or any helper exits non-zero.
 */
export async function stageCodeIntoVolume(args: {
  volumeName: string;
  language: 'python' | 'node';
  code: string;
  packages: string[];
  options: { allowSdist?: boolean; allowInstallScripts?: boolean };
  inputFiles: { name: string; contentBase64: string }[];
}): Promise<void> {
  const mainName = args.language === 'python' ? 'main.py' : 'main.js';

  // Check every input name before any container is started. Names are
  // interpolated into a shell redirect, so only a bare file name is allowed;
  // the character class alone would still admit `.` and `..`.
  for (const f of args.inputFiles) {
    if (!/^[a-zA-Z0-9._-]+$/.test(f.name) || f.name === '.' || f.name === '..') {
      throw new Error(`volume: rejected unsafe input file name: ${f.name}`);
    }
  }

  // The helper creates the directory skeleton, then copies its stdin (the
  // user code) into the entry-point file.
  const stageScript = `#!/bin/sh
set -e
mkdir -p /workspace/code /workspace/input /workspace/output
cat > /workspace/code/${mainName}
`;
  await pipeIntoVolume(
    args.volumeName,
    stageScript,
    args.code,
    'failed to stage code',
  );

  // Stage packages.json + options.json
  await pipeIntoVolume(
    args.volumeName,
    'cat > /workspace/code/packages.json',
    JSON.stringify(args.packages),
    'failed to write packages.json',
  );
  await pipeIntoVolume(
    args.volumeName,
    'cat > /workspace/code/options.json',
    JSON.stringify(args.options),
    'failed to write options.json',
  );

  // Input files (base64). Each is decoded and dropped under /workspace/input/.
  for (const f of args.inputFiles) {
    await pipeIntoVolume(
      args.volumeName,
      `base64 -d > /workspace/input/${f.name}`,
      f.contentBase64,
      `failed to write input file ${f.name}`,
    );
  }

  // Ensure ownership so the unprivileged sandbox user can read the staged files.
  const chown = await runDocker(
    helperRunArgs(args.volumeName, 'chown -R 65534:65534 /workspace', false),
  );
  if (chown.exitCode !== 0) {
    throw new Error(
      `volume: failed to chown workspace: ${chown.stderr.trim()}`,
    );
  }
}

/**
 * Read the contents of /workspace/output/ as base64-encoded files.
 *
 * A helper container streams a tar of the directory to stdout and the
 * archive is parsed in-process. Files above `caps.perFileMax`, or that would
 * push the running total above `caps.totalMax`, are dropped and counted in
 * `truncatedCount`. If the helper itself fails the result is empty.
 */
export async function harvestOutput(
  volumeName: string,
  caps: { perFileMax: number; totalMax: number },
): Promise<{
  files: {
    name: string;
    contentBase64: string;
    size: number;
    contentType: string;
  }[];
  truncatedCount: number;
}> {
  const tarResult = await runDocker(
    helperRunArgs(
      volumeName,
      // No -h: symlinks are archived as links, and the parser only accepts
      // regular-file entries. `|| true` makes a missing or unreadable output
      // directory yield an empty archive instead of a failure.
      'cd /workspace/output 2>/dev/null && tar -cf - . 2>/dev/null || true',
      false,
    ),
    { captureBinaryStdout: true },
  );

  if (tarResult.exitCode !== 0) {
    return { files: [], truncatedCount: 0 };
  }

  return parseTarStream(tarResult.stdoutBytes ?? new Uint8Array(0), caps);
}

/**
 * Minimal tar reader (512-byte blocks) that extracts non-empty regular files.
 *
 * Path resolution, in priority order: a preceding GNU long-name record
 * (typeflag 'L'), then the POSIX ustar `prefix` field joined to `name`, then
 * the bare 100-byte `name`. A leading "./" is stripped. Parsing stops at the
 * first all-zero header, an unparsable size, or a body that runs past the end
 * of the buffer.
 */
function parseTarStream(
  buf: Uint8Array,
  caps: { perFileMax: number; totalMax: number },
): {
  files: {
    name: string;
    contentBase64: string;
    size: number;
    contentType: string;
  }[];
  truncatedCount: number;
} {
  const files: {
    name: string;
    contentBase64: string;
    size: number;
    contentType: string;
  }[] = [];
  let truncatedCount = 0;
  let totalAccepted = 0;
  let offset = 0;
  // Path carried over from a GNU 'L' record; applies to the next entry only.
  let pendingLongName: string | null = null;
  const td = new TextDecoder('utf-8');

  while (offset + 512 <= buf.length) {
    const header = buf.subarray(offset, offset + 512);
    // An all-zero header marks the end of the archive.
    if (header.every((byte) => byte === 0)) break;

    const shortName = td.decode(header.subarray(0, 100)).replace(/\0+$/, '');
    const sizeOctal = td
      .decode(header.subarray(124, 124 + 12))
      .replace(/[ \0]+$/, '');
    const size = parseInt(sizeOctal, 8);
    const typeflag = header[156];
    offset += 512;
    if (Number.isNaN(size)) break;

    const bodyEnd = offset + size;
    if (bodyEnd > buf.length) break;

    if (typeflag === 0x4c) {
      // GNU long name ('L'): the body is the NUL-terminated path of the
      // entry that follows.
      pendingLongName = td
        .decode(buf.subarray(offset, bodyEnd))
        .replace(/\0+$/, '');
    } else {
      let name = pendingLongName;
      pendingLongName = null;
      if (name === null) {
        // Only POSIX ustar ("ustar\0") defines bytes 345..499 as a path
        // prefix; GNU-format headers reuse that area for other fields.
        const isPosixUstar =
          td.decode(header.subarray(257, 262)) === 'ustar' &&
          header[262] === 0;
        const prefix = isPosixUstar
          ? td.decode(header.subarray(345, 500)).replace(/\0+$/, '')
          : '';
        name = prefix ? `${prefix}/${shortName}` : shortName;
      }

      // Regular file: typeflag '0' (0x30) or '\0'
      if ((typeflag === 0x30 || typeflag === 0) && size > 0) {
        // Strip leading ./
        const cleanName = name.replace(/^\.\//, '');
        if (cleanName && !cleanName.endsWith('/')) {
          if (size > caps.perFileMax || totalAccepted + size > caps.totalMax) {
            truncatedCount += 1;
          } else {
            const body = buf.subarray(offset, bodyEnd);
            files.push({
              name: cleanName,
              contentBase64: Buffer.from(body).toString('base64'),
              size,
              contentType: guessContentType(cleanName),
            });
            totalAccepted += size;
          }
        }
      }
    }
    // Advance to next 512-aligned boundary.
    offset = bodyEnd + ((512 - (size % 512)) % 512);
  }
  return { files, truncatedCount };
}

/** Map a file name to a MIME type by its (case-insensitive) extension. */
function guessContentType(name: string): string {
  const lower = name.toLowerCase();
  if (lower.endsWith('.pptx'))
    return 'application/vnd.openxmlformats-officedocument.presentationml.presentation';
  if (lower.endsWith('.pdf')) return 'application/pdf';
  if (lower.endsWith('.xlsx'))
    return 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet';
  if (lower.endsWith('.docx'))
    return 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
  if (lower.endsWith('.png')) return 'image/png';
  if (lower.endsWith('.jpg') || lower.endsWith('.jpeg')) return 'image/jpeg';
  if (lower.endsWith('.svg')) return 'image/svg+xml';
  if (lower.endsWith('.json')) return 'application/json';
  if (lower.endsWith('.csv')) return 'text/csv; charset=utf-8';
  if (lower.endsWith('.txt') || lower.endsWith('.log'))
    return 'text/plain; charset=utf-8';
  if (lower.endsWith('.html')) return 'text/html; charset=utf-8';
  return 'application/octet-stream';
}

// diff --git a/services/sandbox/tsconfig.json b/services/sandbox/tsconfig.json
// new file mode 100644
// index 000000000..dd7a6dd86
// --- /dev/null
// +++ b/services/sandbox/tsconfig.json
// @@ -0,0 +1,21 @@
// +{
// +  "compilerOptions": {
// +    "target": "ES2022",
// +    "module": "ESNext",
// +    "moduleResolution": "Bundler",
// +    "lib": ["ES2023"],
// +    "types": ["bun"],
// +    "strict": true,
// +    "esModuleInterop": true,
// +    "skipLibCheck": true,
// +    "resolveJsonModule": true,
// +    "isolatedModules": true,
// +    "noEmit": true,
// +    "allowImportingTsExtensions": true,
// +    "verbatimModuleSyntax": false,
// +    "forceConsistentCasingInFileNames": true,
// +    "exactOptionalPropertyTypes": false,
// +    "noUncheckedIndexedAccess": true
// +  },
// +  "include": ["src/**/*.ts"]
// +}
//
// From ded283e0c7b32c481ef8c5d556c28766f7719d81 Mon Sep 17 00:00:00 2001
// From: larryro <371767072@qq.com>
// Date: Tue, 19 May 2026 18:49:31 +0800
// Subject: [PATCH 002/108] feat(convex): add
code_run agent tool + sandboxExecutions audit pipeline (M2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convex-side of the sandbox feature: a new code_run agent tool, the sandboxExecutions audit table, and the executeCode internal action that owns the spawner HTTP round-trip + transactional storage uploads. Schema (services/platform/convex/sandbox/schema.ts): - sandboxExecutions table — uploadedBy, agentSlug, lifecycleStatus, statusChangedAt, heartbeatAt, estimatedSeconds/actualSeconds, full output-files validator, structured errorCode taxonomy (R1.12 + R2.8). - Indexes: by_organizationId_and_status (quota counting), by_org_user (GDPR cascade), by_status (watchdog), by_threadId, by_organizationId. Mutations (services/platform/convex/sandbox/internal_mutations.ts): - reserveSlotAndInsert — atomic concurrency-cap + daily-CPU-budget + audit-row insert in one mutation, closes the TOCTOU race R1.8/R1.10 flagged. Uses the same withIndex pattern as video_links/mutations.ts. - setRunning / heartbeat / finalize. - recoverStuckSandboxes — watchdog cron flips rows older than 2×max-timeout to failed/SPAWNER_UNAVAILABLE (Convex 30-min hard-kill skips action try/finally; mirrors recoverStuckTranscriptions pattern). Node action (services/platform/convex/node_only/sandbox/internal_actions.ts): - executeCode — reserves slot → resolves+validates inputFiles via internal query (IDOR check) → setRunning + 60s heartbeat loop → POSTs to spawner with HMAC-signed body → all-or-nothing storage upload (rolls back blobs on partial failure) → batched fileMetadata insert → finalize. Per feedback_no_empty_catch: infra failures throw, user-code failures return a structured {success:false, errorCode, ...}. 
Tool (services/platform/convex/agent_tools/code/code_run_tool.ts): - code_run — Python 3.12 + Node 24, packages on demand, inputFiles ref-by-fileMetadataId, allowSdist/allowInstallScripts opt-in overrides, full errorCode-recovery table in description, tool-selection precedence vs excel/pdf/docx/image (per R1.15). Wiring: - schema.ts registers sandboxExecutions. - crons.ts adds 'recover stuck sandbox executions (every 5 min)'. - soft_delete_validators registers 'sandboxExecution'; soft_delete_helpers maps it to the table + uploadedBy author field. - tool_registry / tool_names register codeRunTool. Tests + CLI integration (M3) follow. NOTE: services/platform/convex/_generated/api.d.ts will regenerate on the next `bunx convex deploy` / `tale start`; the typescript errors against internal.sandbox.* in this commit are the documented stale-codegen state from feedback_api_dts_autogenerated, not bugs. --- .../convex/agent_tools/code/code_run_tool.ts | 272 +++++++++++ .../platform/convex/agent_tools/tool_names.ts | 1 + .../convex/agent_tools/tool_registry.ts | 2 + services/platform/convex/crons.ts | 13 + .../convex/governance/soft_delete_helpers.ts | 8 + .../governance/soft_delete_validators.ts | 5 + .../sandbox/helpers/spawner_client.ts | 132 +++++ .../node_only/sandbox/internal_actions.ts | 450 ++++++++++++++++++ .../convex/sandbox/internal_mutations.ts | 280 +++++++++++ .../convex/sandbox/internal_queries.ts | 79 +++ .../convex/sandbox/output_mutations.ts | 68 +++ services/platform/convex/sandbox/schema.ts | 132 +++++ services/platform/convex/schema.ts | 2 + 13 files changed, 1444 insertions(+) create mode 100644 services/platform/convex/agent_tools/code/code_run_tool.ts create mode 100644 services/platform/convex/node_only/sandbox/helpers/spawner_client.ts create mode 100644 services/platform/convex/node_only/sandbox/internal_actions.ts create mode 100644 services/platform/convex/sandbox/internal_mutations.ts create mode 100644 
services/platform/convex/sandbox/internal_queries.ts
 create mode 100644 services/platform/convex/sandbox/output_mutations.ts
 create mode 100644 services/platform/convex/sandbox/schema.ts

diff --git a/services/platform/convex/agent_tools/code/code_run_tool.ts b/services/platform/convex/agent_tools/code/code_run_tool.ts
new file mode 100644
index 000000000..a3cd72a3f
--- /dev/null
+++ b/services/platform/convex/agent_tools/code/code_run_tool.ts
@@ -0,0 +1,283 @@
+/**
+ * Convex Tool: code_run
+ *
+ * Runs Python or Node.js code in an ephemeral sandbox container (one
+ * container per call, ENOSPC-capped tmpfs workspace, default-deny egress
+ * except to package registries). Generated files become chat attachments
+ * via `fileMetadata`. The motivating use case is `.pptx` via python-pptx.
+ *
+ * See plan §5 + tool description below.
+ */
+
+import type { ToolCtx } from '@convex-dev/agent';
+import { createTool } from '@convex-dev/agent';
+import { z } from 'zod/v4';
+
+import { internal } from '../../_generated/api';
+import type { ToolDefinition } from '../types';
+
+/** Zod schema for the arguments the LLM passes to `code_run`. */
+const codeRunArgs = z.object({
+  language: z
+    .enum(['python', 'node'])
+    .describe(
+      'Runtime to execute the code in. `python` = Python 3.12 + uv. `node` = Node.js 24 + npm.',
+    ),
+  code: z
+    .string()
+    .min(1)
+    .max(64_000)
+    .describe(
+      'Source for the program. For python it is written to /workspace/code/main.py; for node, /workspace/code/main.js. Write generated files to /workspace/output/ — only that directory is harvested as deliverables.',
+    ),
+  packages: z
+    .array(z.string().max(120))
+    .max(20)
+    .optional()
+    .describe(
+      'Pip or npm package specs to install before running. Examples: ["python-pptx==1.0.2", "pillow"]. Pinned versions strongly preferred. Default install flags: `pip install --only-binary=:all:` (no sdist) and `npm install --ignore-scripts` (no lifecycle scripts). Use allowSdist / allowInstallScripts to override.',
+    ),
+  inputFiles: z
+    .array(
+      z.object({
+        name: z
+          .string()
+          .min(1)
+          .max(255)
+          .regex(/^[a-zA-Z0-9._-]+$/)
+          .describe(
+            'File name inside the sandbox at /workspace/input/. Alphanumeric + dot/underscore/hyphen only.',
+          ),
+        fileId: z
+          .string()
+          .describe(
+            'fileMetadataId of a prior chat upload OR a prior code_run output. Org-scope and thread-scope are verified before mount.',
+          ),
+      }),
+    )
+    .max(10)
+    .optional()
+    .describe(
+      'Existing files to mount read-only into the sandbox at /workspace/input/. Useful for: brand templates, source documents, prior code_run outputs you want to iterate on.',
+    ),
+  timeoutMs: z
+    .number()
+    .int()
+    .min(1_000)
+    .max(300_000)
+    .optional()
+    .describe(
+      'Wall-clock cap including package install. Default 30000. Max 300000 (5 min). Going over → status=failed, errorCode=TIMEOUT.',
+    ),
+  allowSdist: z
+    .boolean()
+    .optional()
+    .describe(
+      'Python only. Defaults to false — sdist installs are blocked because they run arbitrary setup.py code. Set true only when a needed package has no wheel.',
+    ),
+  allowInstallScripts: z
+    .boolean()
+    .optional()
+    .describe(
+      'Node only. Defaults to false — preinstall/postinstall scripts are skipped. Set true if a package needs them (e.g. canvas, cypress). Audit-logged.',
+    ),
+  purpose: z
+    .string()
+    .min(1)
+    .max(200)
+    .describe(
+      'One sentence explaining WHY you are running this code. Surfaces in the chat tool-call card and the audit row.',
+    ),
+});
+
+/** Arguments as inferred from the `codeRunArgs` schema. */
+type CodeRunInput = z.infer<typeof codeRunArgs>;
+
+/**
+ * Tool result, discriminated on `success`: the success arm always has
+ * status 'completed' and a numeric exit code; the failure arm carries an
+ * errorCode / errorMessage and never any files.
+ */
+type CodeRunResult =
+  | {
+      success: true;
+      executionId: string;
+      status: 'completed';
+      exitCode: number;
+      stdoutPreview: string;
+      stderrPreview: string;
+      durationMs: number;
+      truncated: { stdout: boolean; stderr: boolean; files: number };
+      files: {
+        name: string;
+        fileMetadataId: string;
+        size: number;
+        contentType: string;
+      }[];
+    }
+  | {
+      success: false;
+      executionId: string;
+      status: 'failed' | 'cancelled';
+      exitCode: number | null;
+      errorCode:
+        | 'TIMEOUT'
+        | 'OOM'
+        | 'EGRESS_DENIED'
+        | 'INSTALL_FAILED'
+        | 'PACKAGE_NOT_FOUND'
+        | 'QUOTA_EXCEEDED'
+        | 'RUNTIME_ERROR'
+        | 'SPAWNER_UNAVAILABLE'
+        | 'CANCELLED';
+      errorMessage: string;
+      stdoutPreview: string;
+      stderrPreview: string;
+      durationMs: number;
+      truncated: { stdout: boolean; stderr: boolean; files: number };
+      files: never[];
+    };
+
+/**
+ * `code_run` tool definition. `execute` forwards the arguments to the
+ * internal `executeCode` action and reshapes its result into CodeRunResult.
+ */
+export const codeRunTool = {
+  name: 'code_run' as const,
+  tool: createTool({
+    description: `**code_run** — run Python or Node.js code in an ephemeral sandbox and deliver any generated files as chat attachments.
+
+**WHEN TO USE:**
+- Generating \`.pptx\` slide decks (e.g. with python-pptx — pre-warmed in the cache).
+- Custom data processing, format conversions, computations no specialised tool covers.
+- Iterating on a prior generated file (pass its fileMetadataId via inputFiles).
+
+**WHEN NOT TO USE — prefer the purpose-built tool first:**
+- \`.xlsx\` → use \`excel\` (one-shot, no install cost).
+- \`.pdf\` → use \`pdf\`.
+- \`.docx\` → use \`docx\`.
+- Reading or analysing an image → use \`image\`.
+- Fetching web pages or APIs → use \`web\` (the sandbox has no internet beyond package registries).
+
+**RUNTIMES:** Python 3.12 + uv; Node 24 + npm. No bash, no other languages.
+
+**PACKAGES:** pass with \`packages\`. By default \`pip\` blocks sdist (\`--only-binary=:all:\`) and \`npm\` skips install scripts (\`--ignore-scripts\`). Override per call with \`allowSdist: true\` / \`allowInstallScripts: true\` — these are audit-logged. Pinned versions like \`python-pptx==1.0.2\` are strongly preferred over floating versions.
+
+**FILE LAYOUT INSIDE THE SANDBOX:**
+- User code: \`/workspace/code/main.py\` (or \`.js\`).
+- Read inputs from \`/workspace/input/\` — they appear there only if you passed \`inputFiles\`.
+- Write outputs to \`/workspace/output/\`. ONLY this directory is harvested. Anything written elsewhere (\`/tmp\`, \`/workspace\`) is discarded.
+
+**EGRESS:** outbound HTTPS is allowed ONLY to \`pypi.org\`, \`files.pythonhosted.org\`, \`registry.npmjs.org\`, \`objects.githubusercontent.com\`, \`codeload.github.com\`. Do not call external APIs — they will fail with \`EGRESS_DENIED\`. Use the \`web\` tool for HTTP fetches.
+
+**LIMITS:**
+- Wall clock ≤ 300s (\`timeoutMs\`).
+- Memory ≤ 1 GB.
+- Output total ≤ 100 MB; per file ≤ 50 MB.
+- Stdout / stderr previews are 16 KB each; over-cap text is stored as a file the user can open.
+
+**NO CROSS-CALL STATE:** every call gets a fresh container. Anything you write to \`/workspace\` outside \`output/\` is gone after the call. To iterate on a previous result, pass that result's \`fileMetadataId\` as an \`inputFiles\` entry — the file mounts read-only at \`/workspace/input/\`.
+
+**ERROR HANDLING:** results carry \`status\` + \`errorCode\`. Map to recovery:
+- \`TIMEOUT\` — raise \`timeoutMs\` or split work.
+- \`OOM\` — reduce memory footprint, stream rather than buffer.
+- \`EGRESS_DENIED\` — don't retry; redesign without the call.
+- \`INSTALL_FAILED\` — read \`stderrPreview\`, fix the package spec.
+- \`PACKAGE_NOT_FOUND\` — your package name is wrong; try the actual name.
+- \`QUOTA_EXCEEDED\` — org concurrency or daily CPU budget hit; wait and retry.
+- \`RUNTIME_ERROR\` — exception in your code; fix it.
+- \`SPAWNER_UNAVAILABLE\` — infra issue; retry once.
+
+**EXAMPLE — 3-slide pptx:**
+\`\`\`
+language: 'python'
+packages: ['python-pptx==1.0.2']
+purpose: 'Generate a 3-slide intro deck for Tale'
+code: |
+  from pptx import Presentation
+  from pptx.util import Inches
+  p = Presentation()
+  for i, title in enumerate(['Tale', 'Self-hosted', 'AI agents on your data']):
+      slide = p.slides.add_slide(p.slide_layouts[0])
+      slide.shapes.title.text = title
+  p.save('/workspace/output/intro.pptx')
+\`\`\`
+
+The returned \`files[0].fileMetadataId\` can be passed to \`document_write\` to save the deck to the documents hub, or passed back as \`inputFiles\` on a subsequent \`code_run\` call to edit it.`,
+    inputSchema: codeRunArgs,
+
+    execute: async (
+      ctx: ToolCtx,
+      args: CodeRunInput,
+    ): Promise<CodeRunResult> => {
+      const { organizationId, threadId, messageId, userId } = ctx;
+      if (!organizationId) {
+        throw new Error(
+          'code_run requires organizationId in the tool context.',
+        );
+      }
+      if (!userId) {
+        throw new Error('code_run requires userId in the tool context.');
+      }
+      const accessibleThreadIds = threadId ? [threadId] : [];
+      const result = await ctx.runAction(
+        internal.node_only.sandbox.internal_actions.executeCode,
+        {
+          organizationId,
+          uploadedBy: userId,
+          ...(threadId !== undefined && { threadId }),
+          accessibleThreadIds,
+          ...(messageId !== undefined && { messageId }),
+          language: args.language,
+          code: args.code,
+          ...(args.packages !== undefined && { packages: args.packages }),
+          ...(args.inputFiles !== undefined && {
+            inputFiles: args.inputFiles,
+          }),
+          ...(args.timeoutMs !== undefined && { timeoutMs: args.timeoutMs }),
+          ...(args.allowSdist !== undefined && {
+            allowSdist: args.allowSdist,
+          }),
+          ...(args.allowInstallScripts !== undefined && {
+            allowInstallScripts: args.allowInstallScripts,
+          }),
+          purpose: args.purpose,
+        },
+      );
+
+      if (result.success) {
+        return {
+          success: true,
+          executionId: String(result.executionId),
+          status: 'completed',
+          // result.exitCode is number for completed; preserve narrowing.
+          exitCode: result.exitCode ?? 0,
+          stdoutPreview: result.stdoutPreview,
+          stderrPreview: result.stderrPreview,
+          durationMs: result.durationMs,
+          truncated: result.truncated,
+          files: result.files.map((f) => ({
+            name: f.name,
+            fileMetadataId: String(f.fileMetadataId),
+            size: f.size,
+            contentType: f.contentType,
+          })),
+        };
+      }
+
+      return {
+        success: false,
+        executionId: String(result.executionId),
+        status: result.status,
+        exitCode: result.exitCode,
+        errorCode: result.errorCode ?? 'RUNTIME_ERROR',
+        errorMessage: result.errorMessage ?? 'Unknown error',
+        stdoutPreview: result.stdoutPreview,
+        stderrPreview: result.stderrPreview,
+        durationMs: result.durationMs,
+        truncated: result.truncated,
+        files: [],
+      };
+    },
+  }),
+} as const satisfies ToolDefinition;
diff --git a/services/platform/convex/agent_tools/tool_names.ts b/services/platform/convex/agent_tools/tool_names.ts
index 2c8d66afa..1ae1a62c9 100644
--- a/services/platform/convex/agent_tools/tool_names.ts
+++ b/services/platform/convex/agent_tools/tool_names.ts
@@ -39,6 +39,7 @@ export const TOOL_NAMES = [
   'conversation_read',
   'update_todos',
   'propose_memory',
+  'code_run',
 ] as const;
 
 export type ToolName = (typeof TOOL_NAMES)[number];
diff --git a/services/platform/convex/agent_tools/tool_registry.ts b/services/platform/convex/agent_tools/tool_registry.ts
index 7ac0b9c82..30be51cb2 100644
--- a/services/platform/convex/agent_tools/tool_registry.ts
+++ b/services/platform/convex/agent_tools/tool_registry.ts
@@ -7,6 +7,7 @@
 
 import { artifactCreateTool } from './artifacts/artifact_create_tool';
 import { artifactEditTool } from './artifacts/artifact_edit_tool';
+import { codeRunTool } from './code/code_run_tool';
 import { conversationReadTool } from './conversations/conversation_read_tool';
 import { customerReadTool } from './customers/customer_read_tool';
 import { databaseSchemaTool } from './database/database_schema_tool';
@@ -46,6 +47,7 @@ export { TOOL_NAMES, type ToolName } from './tool_names';
 export const TOOL_REGISTRY = [
   artifactCreateTool,
   artifactEditTool,
+  codeRunTool,
   customerReadTool,
   productReadTool,
   ragSearchTool,
diff --git a/services/platform/convex/crons.ts b/services/platform/convex/crons.ts
index 620ade8e9..28685e973 100644
--- a/services/platform/convex/crons.ts
+++ b/services/platform/convex/crons.ts
@@ -91,6 +91,19 @@ crons.cron(
   {},
 );
 
+// Sandbox watchdog — same shape as the transcription / video-link sweeps.
+// Convex hard-kills actions at the 30-min timeout without running the +// action's finally; that leaves sandboxExecutions stuck at `status='running'` +// and the slot they hold permanently shrinks the org's concurrent cap. +// Heartbeat from `executeCode` keeps `heartbeatAt` fresh while the action +// is alive; this cron flips rows older than 2× max-timeout to `failed`. +crons.cron( + 'recover stuck sandbox executions (every 5 min)', + '*/5 * * * *', + internal.sandbox.internal_mutations.recoverStuckSandboxes, + {}, +); + // GDPR erasure watchdog (round-2 V5 P0-14) - the same shape as the // transcription watchdog above. Convex actions hard-stop at 30 min; // `gdprErasureRequests` rows whose subject has too many rows / RAG diff --git a/services/platform/convex/governance/soft_delete_helpers.ts b/services/platform/convex/governance/soft_delete_helpers.ts index 2ad90ab56..7a1d2a0a8 100644 --- a/services/platform/convex/governance/soft_delete_helpers.ts +++ b/services/platform/convex/governance/soft_delete_helpers.ts @@ -156,6 +156,14 @@ export const SOFT_DELETE_RESOURCE_CONFIG: Record< displayNameField: 'action', authorField: 'subjectUserId', }, + sandboxExecution: { + tableName: 'sandboxExecutions', + statusField: 'lifecycleStatus', + auditPrefix: 'sandbox_execution', + auditResourceType: 'sandbox_execution', + displayNameField: 'purpose', + authorField: 'uploadedBy', + }, }; interface SoftDeletableRow { diff --git a/services/platform/convex/governance/soft_delete_validators.ts b/services/platform/convex/governance/soft_delete_validators.ts index 314503074..26c29ac7c 100644 --- a/services/platform/convex/governance/soft_delete_validators.ts +++ b/services/platform/convex/governance/soft_delete_validators.ts @@ -62,6 +62,11 @@ export const SOFT_DELETE_RESOURCE_TYPES = [ 'auditLog', 'chatFilterEvent', 'memoryAudit', + // Sandbox `code_run` audit rows — retention parity with workflowExecution. 
+  // Trash flips lifecycleStatus='trashed'; grace-period sweep cascades to
+  // codeStorageId/stdoutStorageId/stderrStorageId + outputFiles[*]
+  // .fileMetadataId via the standard storage erasure helper.
+  'sandboxExecution',
 ] as const;
 
 export type SoftDeleteResourceType =
diff --git a/services/platform/convex/node_only/sandbox/helpers/spawner_client.ts b/services/platform/convex/node_only/sandbox/helpers/spawner_client.ts
new file mode 100644
index 000000000..1b07ba23d
--- /dev/null
+++ b/services/platform/convex/node_only/sandbox/helpers/spawner_client.ts
@@ -0,0 +1,146 @@
+'use node';
+
+// HTTP client for the sandbox spawner.
+//
+// HMAC-signs each request body with SANDBOX_TOKEN (mirrors services/sandbox/
+// src/auth.ts). Spawner rejects unsigned or wrong-signed requests with 401.
+
+import { createHmac } from 'node:crypto';
+
+const SIGNATURE_HEADER = 'x-tale-sandbox-signature';
+
+// Upper bound on the best-effort cancel call so a hung spawner cannot stall
+// the caller's failure path.
+const CANCEL_TIMEOUT_MS = 5_000;
+
+export interface SpawnerExecuteBody {
+  executionId: string;
+  organizationId: string;
+  language: 'python' | 'node';
+  code: string;
+  packages?: string[];
+  inputFiles?: { name: string; contentBase64: string }[];
+  timeoutMs?: number;
+  options?: { allowSdist?: boolean; allowInstallScripts?: boolean };
+}
+
+export type SpawnerErrorCode =
+  | 'TIMEOUT'
+  | 'OOM'
+  | 'EGRESS_DENIED'
+  | 'INSTALL_FAILED'
+  | 'PACKAGE_NOT_FOUND'
+  | 'QUOTA_EXCEEDED'
+  | 'RUNTIME_ERROR'
+  | 'SPAWNER_UNAVAILABLE'
+  | 'CANCELLED';
+
+export interface SpawnerExecuteResponse {
+  status: 'completed' | 'failed' | 'cancelled';
+  exitCode: number | null;
+  errorCode?: SpawnerErrorCode;
+  errorMessage?: string;
+  stdoutBase64: string;
+  stderrBase64: string;
+  durationMs: number;
+  installMs: number | null;
+  runMs: number | null;
+  truncated: { stdout: boolean; stderr: boolean; files: number };
+  outputFiles: {
+    name: string;
+    contentBase64: string;
+    size: number;
+    contentType: string;
+  }[];
+}
+
+/** Hex-encoded HMAC-SHA256 of `body`, keyed by `token`. */
+function sign(body: string, token: string): string {
+  return createHmac('sha256', token).update(body).digest('hex');
+}
+
+/** Spawner base URL: SANDBOX_URL when set, else the built-in default. */
+function getSpawnerUrl(): string {
+  return process.env.SANDBOX_URL ?? 'http://sandbox:8003';
+}
+
+/** Shared HMAC secret from SANDBOX_TOKEN; throws when it is unset or empty. */
+function getSpawnerToken(): string {
+  const token = process.env.SANDBOX_TOKEN;
+  if (!token) {
+    throw new Error(
+      'SANDBOX_TOKEN env var is required for sandbox/code_run; set it in .env',
+    );
+  }
+  return token;
+}
+
+/**
+ * POST /v1/execute. Throws on transport failure or any non-2xx status
+ * (401 / 429 get dedicated messages); otherwise returns the spawner's own
+ * `{status, errorCode, ...}` body so the caller can decide failure semantics.
+ */
+export async function spawnerExecute(
+  body: SpawnerExecuteBody,
+  signal: AbortSignal,
+): Promise<SpawnerExecuteResponse> {
+  const url = `${getSpawnerUrl()}/v1/execute`;
+  const token = getSpawnerToken();
+  const bodyJson = JSON.stringify(body);
+
+  let res: Response;
+  try {
+    res = await fetch(url, {
+      method: 'POST',
+      headers: {
+        'content-type': 'application/json',
+        [SIGNATURE_HEADER]: sign(bodyJson, token),
+      },
+      body: bodyJson,
+      signal,
+    });
+  } catch (err) {
+    throw new Error(
+      `sandbox spawner unreachable at ${url}: ${err instanceof Error ? err.message : String(err)}`,
+      { cause: err },
+    );
+  }
+
+  if (res.status === 401) {
+    throw new Error(
+      'sandbox spawner rejected request (401) — SANDBOX_TOKEN mismatch between Convex and spawner',
+    );
+  }
+  if (res.status === 429) {
+    throw new Error('sandbox spawner busy (429) — concurrency cap reached');
+  }
+  if (!res.ok) {
+    const text = await res.text().catch(() => '');
+    throw new Error(`sandbox spawner ${res.status}: ${text || res.statusText}`);
+  }
+  return (await res.json()) as SpawnerExecuteResponse;
+}
+
+/**
+ * POST /v1/cancel/:executionId. Best-effort: transport errors and the
+ * CANCEL_TIMEOUT_MS deadline are swallowed, and the response status is not
+ * inspected. The empty body is signed the same way as /v1/execute.
+ */
+export async function spawnerCancel(executionId: string): Promise<void> {
+  const url = `${getSpawnerUrl()}/v1/cancel/${encodeURIComponent(executionId)}`;
+  const token = getSpawnerToken();
+  const body = '';
+  try {
+    await fetch(url, {
+      method: 'POST',
+      headers: {
+        'content-type': 'application/json',
+        [SIGNATURE_HEADER]: sign(body, token),
+      },
+      body,
+      signal: AbortSignal.timeout(CANCEL_TIMEOUT_MS),
+    });
+  } catch {
+    // Cancellation is best-effort; the watchdog cron will reap stuck rows.
+  }
+}
diff --git a/services/platform/convex/node_only/sandbox/internal_actions.ts b/services/platform/convex/node_only/sandbox/internal_actions.ts
new file mode 100644
index 000000000..f4b9ec3d5
--- /dev/null
+++ b/services/platform/convex/node_only/sandbox/internal_actions.ts
@@ -0,0 +1,450 @@
+'use node';
+
+// `executeCode` — the action the `code_run` agent tool calls.
+//
+// Owns the spawner round-trip + storage transactionality:
+//   1. reserveSlotAndInsert mutation (atomic quota + audit row insert).
+//   2. resolveInputFiles internal query (IDOR + org/thread scoping).
+//   3. ctx.storage.get → base64 for each input file.
+//   4. setRunning mutation + start a 60s heartbeat loop.
+//   5. POST /v1/execute on the spawner with AbortSignal wired through.
+//   6. Upload every output blob; if all succeed, single batched
+//      `insertOutputFiles` mutation. On any storage failure, delete the
+//      blobs we already wrote so we don't orphan `_storage`.
+//   7. Upload stdout/stderr to `_storage` when over the preview cap.
+//   8.
finalize mutation with the structured result.
+//   9. usageLedger row (TODO: wire in once schema accepts cpuSeconds —
+//      see plan §4 step 9; ledger schema extension is a separate PR).
+//
+// Error rule (per R1.13 / [feedback_no_empty_catch]):
+//   - Infrastructure failures (spawner unreachable, action timeout, quota
+//     mutation throw) → THROW so the agent SDK surfaces them clearly.
+//   - User-code failures (exit ≠ 0, sandbox timeout, OOM, install failure)
+//     → RETURN structured `{success: false, status: 'failed', errorCode, ...}`
+//     so the LLM can read and react.
+
+import { ConvexError, v } from 'convex/values';
+
+import { internal } from '../../_generated/api';
+import { internalAction } from '../../_generated/server';
+import {
+  SANDBOX_CODE_PREVIEW_MAX,
+  SANDBOX_DEFAULT_TIMEOUT_MS,
+  SANDBOX_MAX_TIMEOUT_MS,
+  SANDBOX_STDERR_PREVIEW_MAX,
+  SANDBOX_STDOUT_PREVIEW_MAX,
+} from '../../sandbox/schema';
+import { spawnerCancel, spawnerExecute } from './helpers/spawner_client';
+
+const languageValidator = v.union(v.literal('python'), v.literal('node'));
+
+const errorCodeValidator = v.union(
+  v.literal('TIMEOUT'),
+  v.literal('OOM'),
+  v.literal('EGRESS_DENIED'),
+  v.literal('INSTALL_FAILED'),
+  v.literal('PACKAGE_NOT_FOUND'),
+  v.literal('QUOTA_EXCEEDED'),
+  v.literal('RUNTIME_ERROR'),
+  v.literal('SPAWNER_UNAVAILABLE'),
+  v.literal('CANCELLED'),
+);
+
+const HEARTBEAT_INTERVAL_MS = 60_000;
+
+/**
+ * Reserve a quota slot, stage inputs, run the code on the spawner, persist
+ * outputs and finalize the `sandboxExecutions` row.
+ *
+ * Returns a structured result for user-code outcomes (including rejected
+ * input files and output-upload failures). Throws for quota rejections
+ * (ConvexError with code QUOTA_EXCEEDED) and for any failure after the row
+ * was reserved that is not handled above; in the latter case the row is
+ * first finalized as failed / SPAWNER_UNAVAILABLE so its slot is released.
+ */
+export const executeCode = internalAction({
+  args: {
+    organizationId: v.string(),
+    uploadedBy: v.string(),
+    threadId: v.optional(v.string()),
+    accessibleThreadIds: v.array(v.string()),
+    messageId: v.optional(v.string()),
+    toolCallId: v.optional(v.string()),
+    agentSlug: v.optional(v.string()),
+
+    language: languageValidator,
+    code: v.string(),
+    packages: v.optional(v.array(v.string())),
+    inputFiles: v.optional(
+      v.array(v.object({ name: v.string(), fileId: v.string() })),
+    ),
+    timeoutMs: v.optional(v.number()),
+    allowSdist: v.optional(v.boolean()),
+    allowInstallScripts: v.optional(v.boolean()),
+    purpose: v.string(),
+  },
+  returns: v.object({
+    executionId: v.id('sandboxExecutions'),
+    success: v.boolean(),
+    status: v.union(
+      v.literal('completed'),
+      v.literal('failed'),
+      v.literal('cancelled'),
+    ),
+    exitCode: v.union(v.number(), v.null()),
+    errorCode: v.optional(errorCodeValidator),
+    errorMessage: v.optional(v.string()),
+    stdoutPreview: v.string(),
+    stderrPreview: v.string(),
+    durationMs: v.number(),
+    truncated: v.object({
+      stdout: v.boolean(),
+      stderr: v.boolean(),
+      files: v.number(),
+    }),
+    files: v.array(
+      v.object({
+        name: v.string(),
+        fileMetadataId: v.id('fileMetadata'),
+        size: v.number(),
+        contentType: v.string(),
+      }),
+    ),
+  }),
+  handler: async (ctx, args) => {
+    // Clamp the requested wall-clock cap into [1s, SANDBOX_MAX_TIMEOUT_MS].
+    const timeoutMs = Math.min(
+      Math.max(args.timeoutMs ?? SANDBOX_DEFAULT_TIMEOUT_MS, 1_000),
+      SANDBOX_MAX_TIMEOUT_MS,
+    );
+    const estimatedSeconds = Math.ceil(timeoutMs / 1000);
+
+    // ---- codePreview / codeStorageId split ----
+    const codeBytes = Buffer.byteLength(args.code, 'utf8');
+    let codePreview = args.code;
+    let codeStorageId: string | undefined;
+    if (codeBytes > SANDBOX_CODE_PREVIEW_MAX) {
+      const blob = new Blob([args.code], { type: 'text/plain' });
+      codeStorageId = await ctx.storage.store(blob);
+      codePreview = args.code.slice(0, SANDBOX_CODE_PREVIEW_MAX);
+    }
+
+    // ---- atomic reservation (concurrent cap + daily CPU budget + insert) ----
+    let executionId: Awaited<
+      ReturnType<
+        typeof ctx.runMutation<
+          typeof internal.sandbox.internal_mutations.reserveSlotAndInsert
+        >
+      >
+    >;
+    try {
+      executionId = await ctx.runMutation(
+        internal.sandbox.internal_mutations.reserveSlotAndInsert,
+        {
+          organizationId: args.organizationId,
+          uploadedBy: args.uploadedBy,
+          ...(args.threadId !== undefined && { threadId: args.threadId }),
+          ...(args.messageId !== undefined && { messageId: args.messageId }),
+          ...(args.toolCallId !== undefined && { toolCallId: args.toolCallId }),
+          ...(args.agentSlug !== undefined && { agentSlug: args.agentSlug }),
+          language: args.language,
+          purpose: args.purpose,
+          codePreview,
+          ...(codeStorageId !== undefined && {
+            // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- storage.store returns Id<'_storage'>
+            codeStorageId: codeStorageId as unknown as never,
+          }),
+          packages: args.packages ?? [],
+          ...((args.allowSdist !== undefined ||
+            args.allowInstallScripts !== undefined) && {
+            installOptions: {
+              ...(args.allowSdist !== undefined && {
+                allowSdist: args.allowSdist,
+              }),
+              ...(args.allowInstallScripts !== undefined && {
+                allowInstallScripts: args.allowInstallScripts,
+              }),
+            },
+          }),
+          estimatedSeconds,
+        },
+      );
+    } catch (err) {
+      // No audit row was inserted, so nothing references the code blob
+      // stored above; delete it so `_storage` doesn't orphan.
+      if (codeStorageId !== undefined) {
+        // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- delete needs Id<'_storage'>
+        await ctx.storage.delete(codeStorageId as never).catch(() => {});
+      }
+      // Quota errors are re-thrown as a normalised ConvexError
+      // `{code, message}`. A structured result cannot be returned here: the
+      // `returns` validator requires an executionId and none exists yet.
+      if (
+        err instanceof ConvexError &&
+        typeof err.data === 'object' &&
+        err.data !== null &&
+        // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- ConvexError data shape is loose
+        (err.data as { code?: string }).code === 'QUOTA_EXCEEDED'
+      ) {
+        throw new ConvexError({
+          code: 'QUOTA_EXCEEDED',
+          message:
+            err.data && typeof err.data === 'object' && 'message' in err.data
+              ? String((err.data as { message?: string }).message)
+              : 'Sandbox quota exceeded',
+        });
+      }
+      throw err;
+    }
+
+    // NOTE(review): nothing in this handler ever calls `abort.abort()`, so
+    // the signal only exists to satisfy spawnerExecute's signature — confirm
+    // whether a client-side deadline was intended.
+    const abort = new AbortController();
+    const startedAt = Date.now();
+    let heartbeat: ReturnType<typeof setInterval> | undefined;
+    let spawnerCalled = false;
+
+    // Everything after the reservation runs inside this try so that any
+    // throw (input blob read, setRunning, spawner, uploads) finalizes the
+    // row instead of leaving it `queued`/`running` and holding a slot.
+    try {
+      // ---- input file resolution + IDOR check ----
+      let stagedInputs: { name: string; contentBase64: string }[] = [];
+      if (args.inputFiles && args.inputFiles.length > 0) {
+        const resolved = await ctx.runQuery(
+          internal.sandbox.internal_queries.resolveInputFiles,
+          {
+            organizationId: args.organizationId,
+            accessibleThreadIds: args.accessibleThreadIds,
+            fileIds: args.inputFiles.map((f) => f.fileId),
+          },
+        );
+        if (!resolved.ok) {
+          await ctx.runMutation(internal.sandbox.internal_mutations.finalize, {
+            executionId,
+            status: 'failed',
+            errorCode: 'SPAWNER_UNAVAILABLE',
+            errorMessage: `Input file rejected: ${resolved.reason}`,
+            outputFiles: [],
+            durationMs: 0,
+            actualSeconds: 0,
+          });
+          return {
+            executionId,
+            success: false,
+            status: 'failed' as const,
+            exitCode: null,
+            errorCode: 'SPAWNER_UNAVAILABLE' as const,
+            errorMessage: `Input file rejected: ${resolved.reason}`,
+            stdoutPreview: '',
+            stderrPreview: '',
+            durationMs: 0,
+            truncated: { stdout: false, stderr: false, files: 0 },
+            files: [],
+          };
+        }
+        stagedInputs = await Promise.all(
+          resolved.files.map(async (rf, i) => {
+            // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- storage id from resolveInputFiles is the branded type
+            const blob = await ctx.storage.get(rf.storageId as never);
+            if (!blob) {
+              throw new Error(
+                `Sandbox: failed to read storage blob for ${rf.fileName}`,
+              );
+            }
+            const ab = await blob.arrayBuffer();
+            // Prefer the caller-chosen sandbox file name for this position.
+            const requested = args.inputFiles?.[i];
+            return {
+              name: requested?.name ?? rf.fileName,
+              contentBase64: Buffer.from(ab).toString('base64'),
+            };
+          }),
+        );
+      }
+
+      // ---- flip status, start heartbeat ----
+      await ctx.runMutation(internal.sandbox.internal_mutations.setRunning, {
+        executionId,
+      });
+
+      heartbeat = setInterval(() => {
+        // A failed beat must not become an unhandled rejection; the next
+        // tick simply tries again.
+        ctx
+          .runMutation(internal.sandbox.internal_mutations.heartbeat, {
+            executionId,
+          })
+          .catch((beatErr: unknown) => {
+            console.warn(
+              `Sandbox heartbeat failed for ${String(executionId)}:`,
+              beatErr,
+            );
+          });
+      }, HEARTBEAT_INTERVAL_MS);
+
+      spawnerCalled = true;
+      const spawnerResult = await spawnerExecute(
+        {
+          executionId: String(executionId),
+          organizationId: args.organizationId,
+          language: args.language,
+          code: args.code,
+          ...(args.packages !== undefined && { packages: args.packages }),
+          ...(stagedInputs.length > 0 && { inputFiles: stagedInputs }),
+          timeoutMs,
+          ...((args.allowSdist !== undefined ||
+            args.allowInstallScripts !== undefined) && {
+            options: {
+              ...(args.allowSdist !== undefined && {
+                allowSdist: args.allowSdist,
+              }),
+              ...(args.allowInstallScripts !== undefined && {
+                allowInstallScripts: args.allowInstallScripts,
+              }),
+            },
+          }),
+        },
+        abort.signal,
+      );
+
+      // ---- stdout/stderr previews ----
+      // Decoded once so the upload-failure path persists AND returns the
+      // same previews as the success path.
+      const stdoutText = Buffer.from(
+        spawnerResult.stdoutBase64 ?? '',
+        'base64',
+      ).toString('utf8');
+      const stderrText = Buffer.from(
+        spawnerResult.stderrBase64 ?? '',
+        'base64',
+      ).toString('utf8');
+      const stdoutPreview = stdoutText.slice(0, SANDBOX_STDOUT_PREVIEW_MAX);
+      const stderrPreview = stderrText.slice(0, SANDBOX_STDERR_PREVIEW_MAX);
+
+      // ---- file upload (all-or-nothing) ----
+      const uploadedStorageIds: string[] = [];
+      let uploadFailureMessage: string | undefined;
+      const stagedForInsert: {
+        name: string;
+        // oxlint-disable-next-line typescript/no-explicit-any -- normalized as Id<'_storage'> in mutation arg validator
+        storageId: any;
+        size: number;
+        contentType: string;
+      }[] = [];
+      for (const f of spawnerResult.outputFiles) {
+        try {
+          const bytes = Buffer.from(f.contentBase64, 'base64');
+          const blob = new Blob([bytes], { type: f.contentType });
+          const storageId = await ctx.storage.store(blob);
+          uploadedStorageIds.push(String(storageId));
+          stagedForInsert.push({
+            name: f.name,
+            storageId,
+            size: f.size,
+            contentType: f.contentType,
+          });
+        } catch (err) {
+          uploadFailureMessage =
+            err instanceof Error ? err.message : String(err);
+          break;
+        }
+      }
+      if (uploadFailureMessage !== undefined) {
+        // Roll back uploads we already wrote so _storage doesn't orphan.
+        for (const sid of uploadedStorageIds) {
+          // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- delete needs Id<'_storage'>
+          await ctx.storage.delete(sid as never).catch(() => {});
+        }
+        const elapsedMs = Date.now() - startedAt;
+        await ctx.runMutation(internal.sandbox.internal_mutations.finalize, {
+          executionId,
+          status: 'failed',
+          errorCode: 'SPAWNER_UNAVAILABLE',
+          errorMessage: `Output upload failed: ${uploadFailureMessage}`,
+          stdoutPreview,
+          stderrPreview,
+          outputFiles: [],
+          durationMs: elapsedMs,
+          actualSeconds: elapsedMs / 1000,
+        });
+        return {
+          executionId,
+          success: false,
+          status: 'failed' as const,
+          exitCode: null,
+          errorCode: 'SPAWNER_UNAVAILABLE' as const,
+          errorMessage: `Output upload failed: ${uploadFailureMessage}`,
+          stdoutPreview,
+          stderrPreview,
+          durationMs: elapsedMs,
+          truncated: { stdout: false, stderr: false, files: 0 },
+          files: [],
+        };
+      }
+
+      const insertedFiles = await ctx.runMutation(
+        internal.sandbox.output_mutations.insertOutputFiles,
+        {
+          organizationId: args.organizationId,
+          ...(args.threadId !== undefined && { threadId: args.threadId }),
+          uploadedBy: args.uploadedBy,
+          files: stagedForInsert,
+        },
+      );
+      // Project to exactly the fields the finalize args and the `returns`
+      // validator declare.
+      const outputFiles = insertedFiles.map((f) => ({
+        name: f.name,
+        fileMetadataId: f.fileMetadataId,
+        size: f.size,
+        contentType: f.contentType,
+      }));
+
+      // ---- overflow storage for over-cap stdout/stderr ----
+      let stdoutStorageId: string | undefined;
+      let stderrStorageId: string | undefined;
+      if (stdoutText.length > SANDBOX_STDOUT_PREVIEW_MAX) {
+        const blob = new Blob([stdoutText], { type: 'text/plain' });
+        stdoutStorageId = await ctx.storage.store(blob);
+      }
+      if (stderrText.length > SANDBOX_STDERR_PREVIEW_MAX) {
+        const blob = new Blob([stderrText], { type: 'text/plain' });
+        stderrStorageId = await ctx.storage.store(blob);
+      }
+
+      const durationMs = spawnerResult.durationMs;
+      const actualSeconds = durationMs / 1000;
+
+      await ctx.runMutation(internal.sandbox.internal_mutations.finalize, {
+        executionId,
+        status: spawnerResult.status,
+        ...(spawnerResult.exitCode !== null && {
+          exitCode: spawnerResult.exitCode,
+        }),
+        ...(spawnerResult.errorCode !== undefined && {
+          errorCode: spawnerResult.errorCode,
+        }),
+        ...(spawnerResult.errorMessage !== undefined && {
+          errorMessage: spawnerResult.errorMessage,
+        }),
+        stdoutPreview,
+        stderrPreview,
+        ...(stdoutStorageId !== undefined && {
+          // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- store returns Id<'_storage'>
+          stdoutStorageId: stdoutStorageId as unknown as never,
+        }),
+        ...(stderrStorageId !== undefined && {
+          // oxlint-disable-next-line typescript/no-unsafe-type-assertion
+          stderrStorageId: stderrStorageId as unknown as never,
+        }),
+        outputFiles,
+        truncated: spawnerResult.truncated,
+        durationMs,
+        actualSeconds,
+      });
+
+      return {
+        executionId,
+        success: spawnerResult.status === 'completed',
+        status: spawnerResult.status,
+        exitCode: spawnerResult.exitCode,
+        ...(spawnerResult.errorCode !== undefined && {
+          errorCode: spawnerResult.errorCode,
+        }),
+        ...(spawnerResult.errorMessage !== undefined && {
+          errorMessage: spawnerResult.errorMessage,
+        }),
+        stdoutPreview,
+        stderrPreview,
+        durationMs,
+        truncated: spawnerResult.truncated,
+        files: outputFiles,
+      };
+    } catch (err) {
+      // Infra failure: throw so the agent SDK surfaces it. We still finalize
+      // the audit row to release the slot.
+      const message = err instanceof Error ? err.message : String(err);
+      if (spawnerCalled) {
+        // Best-effort spawner cancel (idempotent if container already gone).
+        await spawnerCancel(String(executionId));
+      }
+      const elapsedMs = Date.now() - startedAt;
+      await ctx.runMutation(internal.sandbox.internal_mutations.finalize, {
+        executionId,
+        status: 'failed',
+        errorCode: 'SPAWNER_UNAVAILABLE',
+        errorMessage: message,
+        outputFiles: [],
+        durationMs: elapsedMs,
+        actualSeconds: elapsedMs / 1000,
+      });
+      throw new Error(`Sandbox spawner failed: ${message}`, { cause: err });
+    } finally {
+      if (heartbeat !== undefined) clearInterval(heartbeat);
+    }
+  },
+});
diff --git a/services/platform/convex/sandbox/internal_mutations.ts b/services/platform/convex/sandbox/internal_mutations.ts
new file mode 100644
index 000000000..773b71d24
--- /dev/null
+++ b/services/platform/convex/sandbox/internal_mutations.ts
@@ -0,0 +1,280 @@
+import { ConvexError, v } from 'convex/values';
+
+import { internalMutation } from '../_generated/server';
+import {
+  SANDBOX_DAILY_CPU_BUDGET_SECONDS,
+  SANDBOX_MAX_CONCURRENT_PER_ORG,
+  SANDBOX_WATCHDOG_CUTOFF_MS,
+} from './schema';
+
+const ONE_DAY_MS = 24 * 60 * 60 * 1000;
+
+const languageValidator = v.union(v.literal('python'), v.literal('node'));
+
+const errorCodeValidator = v.union(
+  v.literal('TIMEOUT'),
+  v.literal('OOM'),
+  v.literal('EGRESS_DENIED'),
+  v.literal('INSTALL_FAILED'),
+  v.literal('PACKAGE_NOT_FOUND'),
+  v.literal('QUOTA_EXCEEDED'),
+  v.literal('RUNTIME_ERROR'),
+  v.literal('SPAWNER_UNAVAILABLE'),
+  v.literal('CANCELLED'),
+);
+
+const truncatedValidator = v.object({
+  stdout: v.boolean(),
+  stderr: v.boolean(),
+  files: v.number(),
+});
+
+/**
+ * Atomic concurrency-cap + daily-CPU-budget reservation.
+ *
+ * Convex mutations are serializable with OCC: the by_organizationId_and_status
+ * index range read here is recorded in the read set, so two parallel
+ * reservations that both see "3/4 in flight" cannot both insert — one
+ * retries. This closes the TOCTOU race R1.8/R1.10 flagged.
+ *
+ * Daily CPU budget = sum(actualSeconds of rows that finished in the last 24h —
+ * completed, failed or cancelled) + sum(estimatedSeconds of queued/running
+ * rows) + this call's estimate. Pre-debit so 4 concurrent 300s calls cannot
+ * collectively overshoot (post-debit would allow a 20-min burst per wave).
+ *
+ * Throws ConvexError `{code: 'QUOTA_EXCEEDED', message}` when either limit
+ * is hit; otherwise inserts a `queued` row and returns its id.
+ */
+export const reserveSlotAndInsert = internalMutation({
+  args: {
+    organizationId: v.string(),
+    uploadedBy: v.string(),
+    threadId: v.optional(v.string()),
+    messageId: v.optional(v.string()),
+    toolCallId: v.optional(v.string()),
+    agentSlug: v.optional(v.string()),
+    language: languageValidator,
+    purpose: v.optional(v.string()),
+    codePreview: v.string(),
+    codeStorageId: v.optional(v.id('_storage')),
+    packages: v.array(v.string()),
+    installOptions: v.optional(
+      v.object({
+        allowSdist: v.optional(v.boolean()),
+        allowInstallScripts: v.optional(v.boolean()),
+      }),
+    ),
+    estimatedSeconds: v.number(),
+  },
+  returns: v.id('sandboxExecutions'),
+  handler: async (ctx, args) => {
+    const now = Date.now();
+
+    // Concurrent cap. Short-circuit at the cap; never materialise the full set.
+    // Queued rows count too, so a misbehaving caller can't burst-insert N
+    // queued rows before any flip to running.
+    let inFlight = 0;
+    let runningSecondsProjected = 0;
+    for (const status of ['running', 'queued'] as const) {
+      for await (const row of ctx.db
+        .query('sandboxExecutions')
+        .withIndex('by_organizationId_and_status', (q) =>
+          q.eq('organizationId', args.organizationId).eq('status', status),
+        )) {
+        inFlight += 1;
+        runningSecondsProjected += row.estimatedSeconds;
+        if (inFlight >= SANDBOX_MAX_CONCURRENT_PER_ORG) {
+          throw new ConvexError({
+            code: 'QUOTA_EXCEEDED',
+            message: `At most ${SANDBOX_MAX_CONCURRENT_PER_ORG} sandboxes can run concurrently for this organization.`,
+          });
+        }
+      }
+    }
+
+    // Daily CPU-second budget. Today = last 24h sliding window keyed by
+    // `_creationTime`. Reusing `by_organizationId` index (per `videoLinkJobs`
+    // convention) keeps the scan bounded for typical orgs (≤dozens/day).
+    const dayCutoff = now - ONE_DAY_MS;
+    let completedToday = 0;
+    for await (const row of ctx.db
+      .query('sandboxExecutions')
+      .withIndex('by_organizationId', (q) =>
+        q.eq('organizationId', args.organizationId),
+      )
+      .order('desc')) {
+      // Newest-first scan: the first row older than the window ends it.
+      if (row._creationTime < dayCutoff) break;
+      // Cancelled runs consumed sandbox time as well, so they are debited
+      // like completed / failed ones.
+      if (
+        row.status === 'completed' ||
+        row.status === 'failed' ||
+        row.status === 'cancelled'
+      ) {
+        completedToday += row.actualSeconds ?? row.estimatedSeconds;
+      }
+    }
+    if (
+      completedToday + runningSecondsProjected + args.estimatedSeconds >
+      SANDBOX_DAILY_CPU_BUDGET_SECONDS
+    ) {
+      throw new ConvexError({
+        code: 'QUOTA_EXCEEDED',
+        message: `Daily CPU-second budget exceeded (${SANDBOX_DAILY_CPU_BUDGET_SECONDS}s/org). Try again tomorrow or split the work.`,
+      });
+    }
+
+    return await ctx.db.insert('sandboxExecutions', {
+      organizationId: args.organizationId,
+      uploadedBy: args.uploadedBy,
+      ...(args.threadId !== undefined && { threadId: args.threadId }),
+      ...(args.messageId !== undefined && { messageId: args.messageId }),
+      ...(args.toolCallId !== undefined && { toolCallId: args.toolCallId }),
+      ...(args.agentSlug !== undefined && { agentSlug: args.agentSlug }),
+      language: args.language,
+      ...(args.purpose !== undefined && { purpose: args.purpose }),
+      codePreview: args.codePreview,
+      ...(args.codeStorageId !== undefined && {
+        codeStorageId: args.codeStorageId,
+      }),
+      packages: args.packages,
+      ...(args.installOptions !== undefined && {
+        installOptions: args.installOptions,
+      }),
+      status: 'queued',
+      statusChangedAt: now,
+      heartbeatAt: now,
+      estimatedSeconds: args.estimatedSeconds,
+      outputFiles: [],
+      startedAt: now,
+      lifecycleStatus: 'active',
+    });
+  },
+});
+
+/**
+ * Flip a `queued` execution to `running` and refresh its heartbeat.
+ * No-op (returns null) when the row is missing or is not `queued`.
+ */
+export const setRunning = internalMutation({
+  args: { executionId: v.id('sandboxExecutions') },
+  returns: v.null(),
+  handler: async (ctx, args) => {
+    const row = await ctx.db.get(args.executionId);
+    if (!row || row.status !== 'queued') return null;
+    const now = Date.now();
+    await ctx.db.patch(args.executionId, {
+      status: 'running',
+      statusChangedAt: now,
+      heartbeatAt: now,
+    });
+    return null;
+  },
+});
+
+export const heartbeat = internalMutation({
+  args: { executionId: v.id('sandboxExecutions') },
+  returns: v.null(),
+  handler: async (ctx, args) => {
+    const row = await ctx.db.get(args.executionId);
+    if (!row) return null;
+    if (row.status !== 'running') return null;
+    await
ctx.db.patch(args.executionId, { heartbeatAt: Date.now() }); + return null; + }, +}); + +export const finalize = internalMutation({ + args: { + executionId: v.id('sandboxExecutions'), + status: v.union( + v.literal('completed'), + v.literal('failed'), + v.literal('cancelled'), + ), + exitCode: v.optional(v.number()), + errorCode: v.optional(errorCodeValidator), + errorMessage: v.optional(v.string()), + stdoutPreview: v.optional(v.string()), + stderrPreview: v.optional(v.string()), + stdoutStorageId: v.optional(v.id('_storage')), + stderrStorageId: v.optional(v.id('_storage')), + outputFiles: v.array( + v.object({ + name: v.string(), + fileMetadataId: v.id('fileMetadata'), + size: v.number(), + contentType: v.string(), + }), + ), + truncated: v.optional(truncatedValidator), + durationMs: v.number(), + actualSeconds: v.number(), + }, + returns: v.null(), + handler: async (ctx, args) => { + const row = await ctx.db.get(args.executionId); + if (!row) return null; + const now = Date.now(); + await ctx.db.patch(args.executionId, { + status: args.status, + statusChangedAt: now, + completedAt: now, + durationMs: args.durationMs, + actualSeconds: args.actualSeconds, + ...(args.exitCode !== undefined && { exitCode: args.exitCode }), + ...(args.errorCode !== undefined && { errorCode: args.errorCode }), + ...(args.errorMessage !== undefined && { + errorMessage: args.errorMessage, + }), + ...(args.stdoutPreview !== undefined && { + stdoutPreview: args.stdoutPreview, + }), + ...(args.stderrPreview !== undefined && { + stderrPreview: args.stderrPreview, + }), + ...(args.stdoutStorageId !== undefined && { + stdoutStorageId: args.stdoutStorageId, + }), + ...(args.stderrStorageId !== undefined && { + stderrStorageId: args.stderrStorageId, + }), + outputFiles: args.outputFiles, + ...(args.truncated !== undefined && { truncated: args.truncated }), + }); + return null; + }, +}); + +/** + * Watchdog cron — flips long-stuck running rows to failed/SPAWNER_UNAVAILABLE. 
+ * + * Convex 30-min hard-kill skips action `try/finally`, so without this the + * audit row stays `running` forever and the slot it holds permanently + * shrinks the org's concurrent cap. Heartbeat from the action keeps + * `heartbeatAt` fresh; we declare a row stuck when it's been 2×max_timeout + * without an update. + */ +export const recoverStuckSandboxes = internalMutation({ + args: {}, + returns: v.number(), + handler: async (ctx) => { + const cutoff = Date.now() - SANDBOX_WATCHDOG_CUTOFF_MS; + let recovered = 0; + for await (const row of ctx.db + .query('sandboxExecutions') + .withIndex('by_status', (q) => q.eq('status', 'running'))) { + if (row.heartbeatAt >= cutoff) continue; + await ctx.db.patch(row._id, { + status: 'failed', + statusChangedAt: Date.now(), + completedAt: Date.now(), + errorCode: 'SPAWNER_UNAVAILABLE', + errorMessage: 'Watchdog reaped a stuck running row', + actualSeconds: row.estimatedSeconds, + }); + recovered += 1; + } + return recovered; + }, +}); diff --git a/services/platform/convex/sandbox/internal_queries.ts b/services/platform/convex/sandbox/internal_queries.ts new file mode 100644 index 000000000..c5d00dec8 --- /dev/null +++ b/services/platform/convex/sandbox/internal_queries.ts @@ -0,0 +1,79 @@ +// Internal queries the sandbox Node action uses to resolve input file refs +// and verify org+thread scoping (closes the IDOR vector R2.8 flagged for +// `inputFiles`). + +import { v } from 'convex/values'; + +import type { Id } from '../_generated/dataModel'; +import { internalQuery } from '../_generated/server'; + +/** + * Resolve a list of caller-supplied `fileId` strings (intended to be + * `Id<'fileMetadata'>`) into their `storageId`s. Refuses any row that + * doesn't belong to the caller's organization, or any chat-bound row + * whose `threadId` isn't in the caller's accessible-thread set. + * + * The Node action calls this BEFORE staging anything into the sandbox. 
+ */ +export const resolveInputFiles = internalQuery({ + args: { + organizationId: v.string(), + accessibleThreadIds: v.array(v.string()), + fileIds: v.array(v.string()), + }, + returns: v.union( + v.object({ + ok: v.literal(true), + files: v.array( + v.object({ + fileId: v.string(), + storageId: v.id('_storage'), + contentType: v.string(), + size: v.number(), + fileName: v.string(), + }), + ), + }), + v.object({ ok: v.literal(false), reason: v.string() }), + ), + handler: async (ctx, args) => { + const allowedThreads = new Set(args.accessibleThreadIds); + const out: { + fileId: string; + storageId: Id<'_storage'>; + contentType: string; + size: number; + fileName: string; + }[] = []; + for (const fileIdStr of args.fileIds) { + const fileId = ctx.db.normalizeId('fileMetadata', fileIdStr); + if (!fileId) { + return { ok: false as const, reason: `Invalid fileId: ${fileIdStr}` }; + } + const row = await ctx.db.get(fileId); + if (!row) { + return { ok: false as const, reason: `Unknown fileId: ${fileIdStr}` }; + } + if (row.organizationId !== args.organizationId) { + return { + ok: false as const, + reason: `fileId ${fileIdStr} belongs to a different organization`, + }; + } + if (row.threadId !== undefined && !allowedThreads.has(row.threadId)) { + return { + ok: false as const, + reason: `fileId ${fileIdStr} is bound to a thread outside this caller's scope`, + }; + } + out.push({ + fileId: fileIdStr, + storageId: row.storageId, + contentType: row.contentType, + size: row.size, + fileName: row.fileName, + }); + } + return { ok: true as const, files: out }; + }, +}); diff --git a/services/platform/convex/sandbox/output_mutations.ts b/services/platform/convex/sandbox/output_mutations.ts new file mode 100644 index 000000000..0b1910429 --- /dev/null +++ b/services/platform/convex/sandbox/output_mutations.ts @@ -0,0 +1,68 @@ +// Internal mutations the sandbox Node action uses to commit storage uploads +// transactionally. 
Kept in the non-`use node` module because mutations don't +// run in the Node runtime. + +import { v } from 'convex/values'; + +import type { Id } from '../_generated/dataModel'; +import { internalMutation } from '../_generated/server'; + +const outputFileValidator = v.object({ + name: v.string(), + storageId: v.id('_storage'), + size: v.number(), + contentType: v.string(), +}); + +/** + * After the action has uploaded every output blob to `_storage`, this + * mutation atomically inserts the `fileMetadata` rows that point at them. + * All-or-nothing: if any insert fails the mutation aborts and the caller + * deletes the orphaned `_storage` blobs. + */ +export const insertOutputFiles = internalMutation({ + args: { + organizationId: v.string(), + threadId: v.optional(v.string()), + uploadedBy: v.string(), + files: v.array(outputFileValidator), + }, + returns: v.array( + v.object({ + name: v.string(), + fileMetadataId: v.id('fileMetadata'), + size: v.number(), + contentType: v.string(), + }), + ), + handler: async (ctx, args) => { + const now = Date.now(); + const out: { + name: string; + fileMetadataId: Id<'fileMetadata'>; + size: number; + contentType: string; + }[] = []; + for (const f of args.files) { + const fileMetadataId = await ctx.db.insert('fileMetadata', { + organizationId: args.organizationId, + storageId: f.storageId, + ...(args.threadId !== undefined && { threadId: args.threadId }), + uploadedBy: args.uploadedBy, + fileName: f.name, + contentType: f.contentType, + size: f.size, + source: 'agent', + lifecycleStatus: 'active', + statusChangedAt: now, + }); + out.push({ + name: f.name, + fileMetadataId, + size: f.size, + contentType: f.contentType, + }); + } + return out; + }, +}); diff --git a/services/platform/convex/sandbox/schema.ts b/services/platform/convex/sandbox/schema.ts new file mode 100644 index 000000000..30053e946 --- /dev/null +++ b/services/platform/convex/sandbox/schema.ts @@ -0,0 +1,132 @@ +import { defineTable } from 'convex/server'; 
+import { v } from 'convex/values'; + +import { lifecycleStatusValidator } from '../governance/soft_delete_validators'; + +/** + * Audit row for one `code_run` tool call. + * + * Lifecycle: + * queued — inserted atomically inside reserveSlotAndInsert (concurrent + * cap + daily CPU budget both checked in the same mutation). + * running — flipped after the spawner HTTP call begins; heartbeatAt + * refreshed every 60s by the Convex action so the watchdog + * can distinguish "Convex hard-killed the action" from + * "still working". + * completed — exitCode === 0 and the file harvest succeeded. + * failed — any non-success outcome; `errorCode` carries the cause. + * cancelled — client aborted via /v1/cancel or LLM-side abort signal. + * + * Status is intentionally thin (5 values); every "why" lives in errorCode + * so audit queries don't have to special-case ad-hoc kill modes. + * + * Indexes: + * by_organizationId_and_status — quota counting (reserveSlot scan) + * by_organizationId — daily CPU-budget sum + general + * per-org history + * by_org_user — GDPR right-to-be-forgotten cascade + * by_status — watchdog sweep across all orgs + * by_threadId — chat-pane history (future UI) + */ +export const sandboxExecutionsTable = defineTable({ + organizationId: v.string(), + threadId: v.optional(v.string()), + messageId: v.optional(v.string()), + toolCallId: v.optional(v.string()), + uploadedBy: v.string(), + agentSlug: v.optional(v.string()), + + language: v.union(v.literal('python'), v.literal('node')), + purpose: v.optional(v.string()), + + // Preview kept inline so the chat-pane card can render without an extra + // round-trip; full code persists in `_storage` when over ~8 KB. 
+ codePreview: v.string(), + codeStorageId: v.optional(v.id('_storage')), + packages: v.array(v.string()), + installOptions: v.optional( + v.object({ + allowSdist: v.optional(v.boolean()), + allowInstallScripts: v.optional(v.boolean()), + }), + ), + + status: v.union( + v.literal('queued'), + v.literal('running'), + v.literal('completed'), + v.literal('failed'), + v.literal('cancelled'), + ), + // Every status patch must update this. Watchdog reads + // `now - heartbeatAt` (not statusChangedAt) so a long-running but + // healthy job isn't reaped. + statusChangedAt: v.number(), + heartbeatAt: v.number(), + + // For daily CPU-second budget enforcement we pre-debit with this + // estimate at reservation time; finalize replaces it with actualSeconds. + estimatedSeconds: v.number(), + actualSeconds: v.optional(v.number()), + + exitCode: v.optional(v.number()), + durationMs: v.optional(v.number()), + + stdoutPreview: v.optional(v.string()), // ≤16 KB + stderrPreview: v.optional(v.string()), + stdoutStorageId: v.optional(v.id('_storage')), + stderrStorageId: v.optional(v.id('_storage')), + + outputFiles: v.array( + v.object({ + name: v.string(), + fileMetadataId: v.id('fileMetadata'), + size: v.number(), + contentType: v.string(), + }), + ), + // Spawner reports per-call caps were hit; the tool result mirrors these + // so the LLM can react ("re-run with smaller scope"). 
+ truncated: v.optional( + v.object({ + stdout: v.boolean(), + stderr: v.boolean(), + files: v.number(), + }), + ), + + startedAt: v.number(), + completedAt: v.optional(v.number()), + + errorCode: v.optional( + v.union( + v.literal('TIMEOUT'), + v.literal('OOM'), + v.literal('EGRESS_DENIED'), + v.literal('INSTALL_FAILED'), + v.literal('PACKAGE_NOT_FOUND'), + v.literal('QUOTA_EXCEEDED'), + v.literal('RUNTIME_ERROR'), + v.literal('SPAWNER_UNAVAILABLE'), + v.literal('CANCELLED'), + ), + ), + errorMessage: v.optional(v.string()), + + lifecycleStatus: v.optional(lifecycleStatusValidator), +}) + .index('by_organizationId_and_status', ['organizationId', 'status']) + .index('by_organizationId', ['organizationId']) + .index('by_org_user', ['organizationId', 'uploadedBy']) + .index('by_status', ['status']) + .index('by_threadId', ['threadId']); + +export const SANDBOX_MAX_CONCURRENT_PER_ORG = 4; +export const SANDBOX_DAILY_CPU_BUDGET_SECONDS = 1800; +export const SANDBOX_MAX_TIMEOUT_MS = 300_000; +export const SANDBOX_DEFAULT_TIMEOUT_MS = 30_000; +export const SANDBOX_WATCHDOG_CUTOFF_MS = 2 * SANDBOX_MAX_TIMEOUT_MS; + +export const SANDBOX_CODE_PREVIEW_MAX = 8 * 1024; +export const SANDBOX_STDOUT_PREVIEW_MAX = 16 * 1024; +export const SANDBOX_STDERR_PREVIEW_MAX = 16 * 1024; diff --git a/services/platform/convex/schema.ts b/services/platform/convex/schema.ts index 2a9877d92..9abfb00a0 100644 --- a/services/platform/convex/schema.ts +++ b/services/platform/convex/schema.ts @@ -54,6 +54,7 @@ import { notificationsTable } from './notifications/schema'; import { onedriveSyncConfigsTable } from './onedrive/schema'; import { productsTable } from './products/schema'; import { promptCategoriesTable, promptTemplatesTable } from './prompts/schema'; +import { sandboxExecutionsTable } from './sandbox/schema'; import { ssoProvidersTable } from './sso_providers/schema'; import { messageMetadataTable } from './streaming/schema'; import { threadTodosTable } from './thread_todos/schema'; @@ 
-152,6 +153,7 @@ export default defineSchema({ products: productsTable, ssoProviders: ssoProvidersTable, vendors: vendorsTable, + sandboxExecutions: sandboxExecutionsTable, videoLinkJobs: videoLinkJobsTable, websites: websitesTable, wfApiKeys: wfApiKeysTable, From 4c8af7483e44ec9e50fb2e426600c24ff7388f33 Mon Sep 17 00:00:00 2001 From: larryro <371767072@qq.com> Date: Tue, 19 May 2026 18:56:57 +0800 Subject: [PATCH 003/108] feat(cli): integrate sandbox + sandbox-egress services + tale doctor (M3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CLI work to deploy the sandbox stack via `tale start` / `tale deploy`: - ComposeService type extended with cap_add, mem_limit, pids_limit, ulimits, security_opt, runtime — previously absent from the generator which silently dropped these on the convex service. - BONUS FIX surfaced by sandbox review (R1.17): create-convex-service.ts was shipping the production convex container WITHOUT NET_ADMIN, so services/convex/docker-entrypoint.sh:79 was silently logging "iptables present but no NET_ADMIN capability — SSRF firewall NOT installed" on every deploy. Apply the missing cap_add + mem_limit + pids_limit + ulimits flags so production deployments finally get the SSRF egress firewall the entrypoint was always trying to install. - New service factories: create-sandbox-service (HTTP spawner, mounts docker.sock, two-network membership) and create-sandbox-egress-service (tinyproxy sidecar on the internal sandbox bridge). - STATEFUL_SERVICES includes 'sandbox' + 'sandbox-egress' so the deploy.ts auto-include-missing-stateful logic picks them up on the next `tale deploy` after upgrade — no migration registry entry needed. - ensureSandboxNetwork() creates `tale-sandbox-net` (fixed Docker name, --internal, --ipv6=false). Called from both start.ts (dev) and deploy.ts (prod) infrastructure setup. - ensure-env: SANDBOX_TOKEN added to requiredVars + secretDefaults (auto-generated 32-byte hex). 
generateEnvContent emits SANDBOX_TOKEN + SANDBOX_RUNTIME / SANDBOX_EGRESS_ALLOWLIST comment block for operators to override. - New command: `tale doctor` — preflight checks for sandbox host requirements (docker, /var/run/docker.sock, runsc registration with dockerd, userns-remap, AppArmor docker-default, SANDBOX_TOKEN presence). R1.17 surfaced that there was no `doctor` command at all; scope here is intentionally narrow (sandbox-relevant only) to avoid scope creep — future Docker version / disk headroom checks belong here too but separately. CLI typecheck passes (`bunx tsc --noEmit`). Next: tests (M3 test pass) + bun run check until green. --- tools/cli/src/commands/doctor.ts | 187 ++++++++++++++++++ tools/cli/src/index.ts | 2 + tools/cli/src/lib/actions/deploy.ts | 7 +- tools/cli/src/lib/actions/start.ts | 10 +- .../generators/generate-dev-compose.ts | 8 + .../generators/generate-stateful-compose.ts | 9 + .../compose/services/create-convex-service.ts | 19 ++ .../services/create-sandbox-egress-service.ts | 35 ++++ .../services/create-sandbox-service.ts | 52 +++++ tools/cli/src/lib/compose/types.ts | 21 +- tools/cli/src/lib/config/ensure-env.ts | 24 +++ tools/cli/src/lib/docker/ensure-network.ts | 22 ++- 12 files changed, 392 insertions(+), 4 deletions(-) create mode 100644 tools/cli/src/commands/doctor.ts create mode 100644 tools/cli/src/lib/compose/services/create-sandbox-egress-service.ts create mode 100644 tools/cli/src/lib/compose/services/create-sandbox-service.ts diff --git a/tools/cli/src/commands/doctor.ts b/tools/cli/src/commands/doctor.ts new file mode 100644 index 000000000..76f4371a9 --- /dev/null +++ b/tools/cli/src/commands/doctor.ts @@ -0,0 +1,187 @@ +import { execSync } from 'node:child_process'; +import { existsSync } from 'node:fs'; + +import { Command } from 'commander'; + +import * as logger from '../utils/logger'; + +/** + * `tale doctor` — preflight checks for the host environment. 
+ * + * Initial scope: sandbox-relevant items only (R1.17 surfaced that the + * CLI never had a doctor command). Future checks (Postgres / Docker + * versions, disk headroom, etc.) belong here too but are out of scope + * for the sandbox-foundation rollout. + */ + +interface Check { + name: string; + status: 'ok' | 'warn' | 'fail'; + detail: string; + fix?: string; +} + +function tryRun(cmd: string): string | undefined { + try { + return execSync(cmd, { stdio: ['ignore', 'pipe', 'ignore'] }) + .toString() + .trim(); + } catch { + return undefined; + } +} + +function checkDocker(): Check { + const version = tryRun('docker --version'); + if (!version) { + return { + name: 'docker', + status: 'fail', + detail: 'docker CLI not on PATH', + fix: 'Install Docker Engine 24+ or Docker Desktop', + }; + } + return { name: 'docker', status: 'ok', detail: version }; +} + +function checkSocket(): Check { + if (!existsSync('/var/run/docker.sock')) { + return { + name: 'docker socket', + status: 'fail', + detail: '/var/run/docker.sock not present', + fix: 'Start the Docker daemon (systemctl start docker) or open Docker Desktop', + }; + } + return { + name: 'docker socket', + status: 'ok', + detail: '/var/run/docker.sock present', + }; +} + +function checkRunsc(): Check { + const runtimes = tryRun( + "docker info --format '{{json .Runtimes}}' 2>/dev/null", + ); + const hasRunsc = runtimes ? 
/\brunsc\b/.test(runtimes) : false; + if (hasRunsc) { + return { + name: 'gVisor runtime (runsc)', + status: 'ok', + detail: 'registered with dockerd; set SANDBOX_RUNTIME=runsc to opt in', + }; + } + return { + name: 'gVisor runtime (runsc)', + status: 'warn', + detail: + 'not registered with dockerd — sandbox will use plain runc (recommended for demo stage; install runsc before exposing to untrusted external workloads)', + fix: 'https://gvisor.dev/docs/user_guide/install/ then `sudo runsc install && sudo systemctl restart docker`', + }; +} + +function checkUserns(): Check { + const out = tryRun("docker info --format '{{.SecurityOptions}}' 2>/dev/null"); + if (out && /name=userns/.test(out)) { + return { + name: 'dockerd userns-remap', + status: 'ok', + detail: 'enabled — container root ≠ host root', + }; + } + return { + name: 'dockerd userns-remap', + status: 'warn', + detail: + 'not enabled — sandbox container UID 65534 maps to host UID 65534; combined with a kernel LPE this is a path to host root', + fix: 'Set "userns-remap": "default" in /etc/docker/daemon.json and restart docker', + }; +} + +function checkApparmor(): Check { + const aa = tryRun('cat /sys/kernel/security/apparmor/profiles 2>/dev/null'); + if (aa && /docker-default/.test(aa)) { + return { + name: 'AppArmor docker-default', + status: 'ok', + detail: 'profile loaded', + }; + } + return { + name: 'AppArmor docker-default', + status: 'warn', + detail: + 'not loaded — sandbox containers rely on Docker built-in seccomp only; consider enabling AppArmor on production hosts', + }; +} + +function checkSandboxToken(env: NodeJS.ProcessEnv): Check { + if (!env.SANDBOX_TOKEN || env.SANDBOX_TOKEN.length < 32) { + return { + name: 'SANDBOX_TOKEN', + status: 'fail', + detail: + 'missing or too short — required for HMAC auth between Convex and the sandbox spawner', + fix: 'Re-run `tale init` (or set a 64-char hex value manually)', + }; + } + return { + name: 'SANDBOX_TOKEN', + status: 'ok', + detail: `set 
(${env.SANDBOX_TOKEN.length} chars)`, + }; +} + +function statusIcon(s: Check['status']): string { + return s === 'ok' ? '✓' : s === 'warn' ? '!' : '✗'; +} + +export function createDoctorCommand(): Command { + return new Command('doctor') + .description( + 'Preflight checks for sandbox / code_run host requirements (docker, runsc, userns-remap, secrets).', + ) + .action(async () => { + const env = process.env; + const checks: Check[] = [ + checkDocker(), + checkSocket(), + checkRunsc(), + checkUserns(), + checkApparmor(), + checkSandboxToken(env), + ]; + + let failed = 0; + let warned = 0; + for (const c of checks) { + const icon = statusIcon(c.status); + const line = `${icon} ${c.name.padEnd(28)} ${c.detail}`; + if (c.status === 'ok') logger.info(line); + else if (c.status === 'warn') { + logger.warn(line); + warned += 1; + } else { + logger.error(line); + failed += 1; + } + if (c.status !== 'ok' && c.fix) { + logger.info(` fix: ${c.fix}`); + } + } + + logger.blank(); + if (failed > 0) { + logger.error(`${failed} check(s) failed; sandbox will not work.`); + process.exit(1); + } + if (warned > 0) { + logger.warn( + `${warned} recommendation(s); sandbox will function but is using weaker defaults.`, + ); + process.exit(0); + } + logger.success('All sandbox preflight checks passed.'); + }); +} diff --git a/tools/cli/src/index.ts b/tools/cli/src/index.ts index f2aa60dbb..c5d1d16c8 100644 --- a/tools/cli/src/index.ts +++ b/tools/cli/src/index.ts @@ -7,6 +7,7 @@ import { createCleanupCommand } from './commands/cleanup'; import { createConfigCommand } from './commands/config'; import { createConvexCommand } from './commands/convex'; import { createDeployCommand } from './commands/deploy'; +import { createDoctorCommand } from './commands/doctor'; import { createInitCommand } from './commands/init'; import { createLogsCommand } from './commands/logs'; import { createResetCommand } from './commands/reset'; @@ -45,5 +46,6 @@ program.addCommand(createLogsCommand()); 
program.addCommand(createRollbackCommand()); program.addCommand(createResetCommand()); program.addCommand(createCleanupCommand()); +program.addCommand(createDoctorCommand()); await program.parseAsync(); diff --git a/tools/cli/src/lib/actions/deploy.ts b/tools/cli/src/lib/actions/deploy.ts index 344a3dd96..9093036fe 100644 --- a/tools/cli/src/lib/actions/deploy.ts +++ b/tools/cli/src/lib/actions/deploy.ts @@ -18,7 +18,7 @@ import { isStatefulService, } from '../compose/types'; import { dockerCompose } from '../docker/docker-compose'; -import { ensureNetwork } from '../docker/ensure-network'; +import { ensureNetwork, ensureSandboxNetwork } from '../docker/ensure-network'; import { ensureVolumes } from '../docker/ensure-volumes'; import { exec } from '../docker/exec'; import { getContainerVersion } from '../docker/get-container-version'; @@ -56,6 +56,11 @@ async function ensureInfrastructure( if (!networkCreated) { throw new Error('Failed to create required network'); } + // Sandbox bridge: fixed name `tale-sandbox-net`, internal-only, IPv6 off. 
+ const sandboxNetworkCreated = await ensureSandboxNetwork(); + if (!sandboxNetworkCreated) { + throw new Error('Failed to create sandbox network'); + } } interface DeployOptions { diff --git a/tools/cli/src/lib/actions/start.ts b/tools/cli/src/lib/actions/start.ts index d9d10d6db..01be0f381 100644 --- a/tools/cli/src/lib/actions/start.ts +++ b/tools/cli/src/lib/actions/start.ts @@ -10,7 +10,7 @@ import { findComposeOverride } from '../compose/find-compose-override'; import { DEV_VOLUME_NAMES } from '../compose/generators/constants'; import { generateDevCompose } from '../compose/generators/generate-dev-compose'; import { dockerCompose } from '../docker/docker-compose'; -import { ensureNetwork } from '../docker/ensure-network'; +import { ensureNetwork, ensureSandboxNetwork } from '../docker/ensure-network'; import { ensureVolumes } from '../docker/ensure-volumes'; import { exec } from '../docker/exec'; import { findProject } from '../project/find-project'; @@ -227,6 +227,14 @@ export async function start(options: StartOptions): Promise { if (!networkOk) { throw new Error('Failed to create dev network'); } + // Sandbox bridge has a fixed Docker name (tale-sandbox-net) and lives + // outside the project-prefixed naming scheme so the spawner can target + // it directly from `docker run --network`. Internal-only (no internet) + // and IPv6-disabled (R1.3 v4-allowlist-bypass mitigation). 
+ const sandboxNetworkOk = await ensureSandboxNetwork(); + if (!sandboxNetworkOk) { + throw new Error('Failed to create sandbox network'); + } }); const env = loadEnv(projectDir); diff --git a/tools/cli/src/lib/compose/generators/generate-dev-compose.ts b/tools/cli/src/lib/compose/generators/generate-dev-compose.ts index 96cd3bcdd..426458f36 100644 --- a/tools/cli/src/lib/compose/generators/generate-dev-compose.ts +++ b/tools/cli/src/lib/compose/generators/generate-dev-compose.ts @@ -11,6 +11,8 @@ import { createDbService } from '../services/create-db-service'; import { createPlatformService } from '../services/create-platform-service'; import { createProxyService } from '../services/create-proxy-service'; import { createRagService } from '../services/create-rag-service'; +import { createSandboxEgressService } from '../services/create-sandbox-egress-service'; +import { createSandboxService } from '../services/create-sandbox-service'; import type { ComposeConfig, ServiceConfig } from '../types'; import { DEV_VOLUME_NAMES } from './constants'; @@ -153,6 +155,8 @@ export function generateDevCompose( platform, rag, crawler, + 'sandbox-egress': createSandboxEgressService(config), + sandbox: createSandboxService(config), }, volumes, networks: { @@ -160,6 +164,10 @@ export function generateDevCompose( external: true, name: `${devPrefix}internal`, }, + // Sandbox bridge — internal-only, IPv6 disabled (declared in + // start.ts via ensureNetwork; here referenced as external so the + // generator emits the right ref). 
+ sandbox: { external: true, name: 'tale-sandbox-net' }, }, }; diff --git a/tools/cli/src/lib/compose/generators/generate-stateful-compose.ts b/tools/cli/src/lib/compose/generators/generate-stateful-compose.ts index ce25bfef2..3c62e8ab5 100644 --- a/tools/cli/src/lib/compose/generators/generate-stateful-compose.ts +++ b/tools/cli/src/lib/compose/generators/generate-stateful-compose.ts @@ -4,6 +4,8 @@ import { getProjectId } from '../../../utils/load-env'; import { createConvexService } from '../services/create-convex-service'; import { createDbService } from '../services/create-db-service'; import { createProxyService } from '../services/create-proxy-service'; +import { createSandboxEgressService } from '../services/create-sandbox-egress-service'; +import { createSandboxService } from '../services/create-sandbox-service'; import type { ComposeConfig, ServiceConfig } from '../types'; interface StatefulComposeOptions { @@ -26,6 +28,8 @@ export function generateStatefulCompose( db: createDbService(config), proxy: createProxyService(config, hostAlias), convex, + 'sandbox-egress': createSandboxEgressService(config), + sandbox: createSandboxService(config), }, volumes: { 'db-data': { external: true, name: `${prefix}db-data` }, @@ -36,6 +40,11 @@ export function generateStatefulCompose( }, networks: { internal: { external: true, name: `${prefix}internal` }, + // Sandbox bridge is created fresh per deployment (internal-only, IPv6 + // disabled). The Docker-level name is pinned to tale-sandbox-net so + // the spawner can `docker run --network tale-sandbox-net` without + // discovering compose's prefixed default name. 
+ sandbox: { external: true, name: 'tale-sandbox-net' }, }, }; diff --git a/tools/cli/src/lib/compose/services/create-convex-service.ts b/tools/cli/src/lib/compose/services/create-convex-service.ts index 6023dede8..03e3cd325 100644 --- a/tools/cli/src/lib/compose/services/create-convex-service.ts +++ b/tools/cli/src/lib/compose/services/create-convex-service.ts @@ -15,6 +15,25 @@ export function createConvexService(config: ServiceConfig): ComposeService { return { image: `${config.registry}/tale-convex:${config.version}`, container_name: `${getProjectId()}-convex`, + // NET_ADMIN: required for the entrypoint's SSRF egress firewall + // (iptables REJECT rules for IMDS + link-local + RFC1918). Without + // this cap, services/convex/docker-entrypoint.sh:79 logs a warning + // and skips the firewall — yt-dlp's own DNS resolution then becomes + // a DNS-rebinding SSRF vector against the host's cloud metadata + // service. The compose.yml had this all along; the CLI generator + // was silently dropping it (R1.17). Bonus fix surfaced by the + // sandbox review. + cap_add: ['NET_ADMIN'], + // Per-container resource caps. yt-dlp + ffmpeg subprocesses peak + // ~300-500 MB each; APPLICATION_MAX_CONCURRENT_NODE_ACTIONS=32 means + // the worst case is 32 parallel ingest jobs. mem_limit caps blast + // radius; pids_limit defends against fork-bomb regressions; nofile + // gives breathing room for concurrent yt-dlp + ffmpeg + Convex. 
+ mem_limit: '12g', + pids_limit: 4096, + ulimits: { + nofile: { soft: 65536, hard: 65536 }, + }, volumes: ['convex-data:/app/data', 'caddy-data:/caddy-data:ro'], env_file: ['.env'], restart: 'unless-stopped', diff --git a/tools/cli/src/lib/compose/services/create-sandbox-egress-service.ts b/tools/cli/src/lib/compose/services/create-sandbox-egress-service.ts new file mode 100644 index 000000000..72bd9bff2 --- /dev/null +++ b/tools/cli/src/lib/compose/services/create-sandbox-egress-service.ts @@ -0,0 +1,35 @@ +import { getProjectId } from '../../../utils/load-env'; +import type { ComposeService, ServiceConfig } from '../types'; +import { DEFAULT_LOGGING } from '../types'; + +/** + * Sandbox egress proxy — tinyproxy sidecar on the internal `sandbox` + * network. Filters CONNECT host requests against a configurable + * allow-list (default: pypi.org, files.pythonhosted.org, registry.npmjs.org, + * github package endpoints). Replaces the originally-planned iptables IP + * allow-list which R1.3/R2.1 showed was unsafe due to shared Fastly / + * Cloudflare CDN IPs. + * + * The runtime containers spawned by services/sandbox set + * HTTPS_PROXY=http://sandbox-egress:3128 and join `tale-sandbox-net` + * (internal: true), so this proxy is their ONLY outbound path. 
+ */ +export function createSandboxEgressService( + config: ServiceConfig, +): ComposeService { + return { + image: `${config.registry}/tale-sandbox-egress:${config.version}`, + container_name: `${getProjectId()}-sandbox-egress`, + env_file: ['.env'], + restart: 'unless-stopped', + healthcheck: { + test: ['CMD', 'nc', '-z', '127.0.0.1', '3128'], + interval: '10s', + timeout: '3s', + retries: 2, + start_period: '5s', + }, + logging: DEFAULT_LOGGING, + networks: ['sandbox'], + }; +} diff --git a/tools/cli/src/lib/compose/services/create-sandbox-service.ts b/tools/cli/src/lib/compose/services/create-sandbox-service.ts new file mode 100644 index 000000000..9bfd90f23 --- /dev/null +++ b/tools/cli/src/lib/compose/services/create-sandbox-service.ts @@ -0,0 +1,52 @@ +import { getProjectId } from '../../../utils/load-env'; +import type { ComposeService, ServiceConfig } from '../types'; +import { DEFAULT_LOGGING } from '../types'; + +/** + * Sandbox spawner — thin stateless docker-run service. + * + * SECURITY: mounts /var/run/docker.sock so it can spawn sibling containers. + * docker.sock = host root; this is the explicit security boundary the + * sandbox plan accepts. The spawner accepts only HMAC-signed typed JSON + * over HTTP (reachable only on the `internal` network), and the docker + * argv builder validates every identifier with strict regexes so a + * malformed input never reaches `docker run` (see + * services/sandbox/src/docker_args.ts). + * + * Joined to BOTH networks: + * - `internal` — so the platform container can reach it on + * http://sandbox:8003. + * - `sandbox` — so the per-call runtime containers it spawns can be + * attached to the internal-only egress bridge. + * + * Operators wanting stronger isolation set SANDBOX_RUNTIME=runsc and + * install gVisor on the host; the spawner picks the runtime via env. 
+ */ +export function createSandboxService(config: ServiceConfig): ComposeService { + return { + image: `${config.registry}/tale-sandbox:${config.version}`, + container_name: `${getProjectId()}-sandbox`, + env_file: ['.env'], + environment: { + SANDBOX_RUNTIME: '${SANDBOX_RUNTIME:-runc}', + SANDBOX_RUNTIME_IMAGE: + '${SANDBOX_RUNTIME_IMAGE:-tale-sandbox-runtime:latest}', + SANDBOX_EGRESS_NETWORK: 'tale-sandbox-net', + SANDBOX_EGRESS_PROXY: 'http://sandbox-egress:3128', + }, + volumes: ['/var/run/docker.sock:/var/run/docker.sock'], + restart: 'unless-stopped', + healthcheck: { + test: ['CMD', 'curl', '-fsS', 'http://127.0.0.1:8003/health'], + interval: '10s', + timeout: '5s', + retries: 3, + start_period: '15s', + }, + depends_on: { + 'sandbox-egress': { condition: 'service_healthy' }, + }, + logging: DEFAULT_LOGGING, + networks: ['internal', 'sandbox'], + }; +} diff --git a/tools/cli/src/lib/compose/types.ts b/tools/cli/src/lib/compose/types.ts index 1bc8642cc..b51340af6 100644 --- a/tools/cli/src/lib/compose/types.ts +++ b/tools/cli/src/lib/compose/types.ts @@ -32,6 +32,15 @@ export interface ComposeService { logging?: LoggingConfig; networks?: string[] | Record; extra_hosts?: string[]; + // Linux capability + resource flags. Previously absent from the generator, + // which silently dropped them on the convex service (R1.17 latent bug) + // and made sandbox impossible. All optional; emit only when set. + cap_add?: string[]; + mem_limit?: string; + pids_limit?: number; + ulimits?: Record; + security_opt?: string[]; + runtime?: string; } export interface ComposeConfig { @@ -54,7 +63,17 @@ export interface ServiceConfig { } export const ROTATABLE_SERVICES = ['platform', 'rag', 'crawler'] as const; -export const STATEFUL_SERVICES = ['db', 'proxy', 'convex'] as const; +export const STATEFUL_SERVICES = [ + 'db', + 'proxy', + 'convex', + // Sandbox spawner + egress proxy — singleton, no blue/green rotation + // (state is per-call container, not per-replica). 
Bundled into the + // stateful bucket because they live alongside db/convex/proxy in + // deploy.ts:auto-include-missing logic. + 'sandbox', + 'sandbox-egress', +] as const; export const ALL_SERVICES = [ ...ROTATABLE_SERVICES, ...STATEFUL_SERVICES, diff --git a/tools/cli/src/lib/config/ensure-env.ts b/tools/cli/src/lib/config/ensure-env.ts index 996a34f1b..7a001726e 100644 --- a/tools/cli/src/lib/config/ensure-env.ts +++ b/tools/cli/src/lib/config/ensure-env.ts @@ -102,6 +102,9 @@ export async function ensureEnv( 'INSTANCE_SECRET', 'DB_PASSWORD', 'SOPS_AGE_KEY', + // Shared HMAC secret for Convex → sandbox spawner. Generated as + // 32 random bytes (hex); see services/sandbox/src/auth.ts. + 'SANDBOX_TOKEN', ]; const missing = requiredVars.filter((v) => !existing[v]); @@ -212,6 +215,7 @@ async function runPartialEnvSetup( ENCRYPTION_SECRET_HEX: generateHexSecret, INSTANCE_SECRET: generateHexSecret, DB_PASSWORD: generatePassword, + SANDBOX_TOKEN: generateHexSecret, }; let generatedCount = 0; @@ -408,6 +412,7 @@ async function runEnvSetup(envPath: string): Promise { instanceSecret: generateHexSecret(), dbPassword, sopsAgeKey: ageKeypair.secretKey, + sandboxToken: generateHexSecret(), }; const envContent = generateEnvContent({ @@ -441,6 +446,7 @@ interface EnvConfig { instanceSecret: string; dbPassword: string; sopsAgeKey: string; + sandboxToken: string; } function generateEnvContent(config: EnvConfig): string { @@ -508,6 +514,24 @@ function generateEnvContent(config: EnvConfig): string { `SOPS_AGE_KEY=${config.sopsAgeKey}`, '# SOPS_AGE_KEY_FILE=', '', + '# ============================================================================', + '# Sandbox (code_run) Configuration', + '# ============================================================================', + '# Shared HMAC secret. Convex signs every request to the sandbox spawner', + '# with this; the spawner rejects unsigned/wrong-signed requests. 
Rotate', + '# by setting a new value and restarting both `platform` and `sandbox`.', + `SANDBOX_TOKEN=${config.sandboxToken}`, + '# Container runtime for spawned sandbox containers. `runc` (default) is', + '# plain Docker; `runsc` is gVisor (requires `runsc` installed on the', + '# host and registered with dockerd — see `tale doctor`). gVisor provides', + '# a userspace kernel that mitigates runc-class escape CVEs at the cost', + '# of ~6x pip-install latency for native-extension packages.', + '# SANDBOX_RUNTIME=runc', + '# Pipe-separated regex allow-list of egress hostnames for the sandbox', + '# proxy. Default covers pypi/npm/github package endpoints; extend if', + '# your agents need other registries (e.g. private wheel mirrors).', + '# SANDBOX_EGRESS_ALLOWLIST=^pypi\\.org$|^files\\.pythonhosted\\.org$|^registry\\.npmjs\\.org$', + '', ); return lines.join('\n'); diff --git a/tools/cli/src/lib/docker/ensure-network.ts b/tools/cli/src/lib/docker/ensure-network.ts index cc8e9d953..8e01c1b6d 100644 --- a/tools/cli/src/lib/docker/ensure-network.ts +++ b/tools/cli/src/lib/docker/ensure-network.ts @@ -7,7 +7,10 @@ async function networkExists(networkName: string): Promise { return result.success; } -async function createNetwork(networkName: string): Promise { +async function createNetwork( + networkName: string, + extraArgs: string[] = [], +): Promise { const exists = await networkExists(networkName); if (exists) { logger.debug(`Network ${networkName} already exists`); @@ -20,6 +23,7 @@ async function createNetwork(networkName: string): Promise { 'create', '--label', `project=${getProjectId()}`, + ...extraArgs, networkName, ); if (!result.success) { @@ -37,3 +41,19 @@ export async function ensureNetwork( const fullName = `${prefix}${networkName}`; return createNetwork(fullName); } + +/** + * The sandbox network is shared across blue/green and across dev/prod — + * it's pinned to a fixed Docker name (`tale-sandbox-net`) so the spawner + * can `docker run --network 
tale-sandbox-net` without discovering the + * compose-project-prefixed default. `--internal` blocks all internet + * from this network so the per-call runtime containers can only reach + * pypi/npm via the egress proxy sidecar. + */ +export async function ensureSandboxNetwork(): Promise { + return createNetwork('tale-sandbox-net', [ + '--internal', + '--ipv6=false', + '--driver=bridge', + ]); +} From 741c6c5c19a16d72d2eb5bc359693c7856f43282 Mon Sep 17 00:00:00 2001 From: larryro <371767072@qq.com> Date: Tue, 19 May 2026 18:59:42 +0800 Subject: [PATCH 004/108] fix(sandbox): preserve cause + narrow unsafe assertions in sandbox client MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit oxlint flagged 4 issues in M1/M2 code: - spawner_client.ts:97 — `throw new Error(...)` inside `catch (err)` without forwarding the original. Added `{ cause: err }` so debugging keeps the network-error chain. - internal_actions.ts:445 — same pattern; same fix. - spawner_client.ts:113 — `await res.json() as SpawnerExecuteResponse`. Annotated with `oxlint-disable typescript/no-unsafe-type-assertion` because the wire contract is validated on the spawner side; trusting it here is by design. - internal_actions.ts:177 — `err.data as { message?: string }`. Same disable, scoped to the line that runs only after a `'message' in err.data` narrowing. The remaining lint error (lib/seo/integration.test.ts) predates this branch and is unrelated to the sandbox work. 
--- .../convex/node_only/sandbox/helpers/spawner_client.ts | 2 ++ .../platform/convex/node_only/sandbox/internal_actions.ts | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/services/platform/convex/node_only/sandbox/helpers/spawner_client.ts b/services/platform/convex/node_only/sandbox/helpers/spawner_client.ts index 1b07ba23d..903e5bb4e 100644 --- a/services/platform/convex/node_only/sandbox/helpers/spawner_client.ts +++ b/services/platform/convex/node_only/sandbox/helpers/spawner_client.ts @@ -95,6 +95,7 @@ export async function spawnerExecute( } catch (err) { throw new Error( `sandbox spawner unreachable at ${url}: ${err instanceof Error ? err.message : String(err)}`, + { cause: err }, ); } @@ -110,6 +111,7 @@ export async function spawnerExecute( const text = await res.text().catch(() => ''); throw new Error(`sandbox spawner ${res.status}: ${text || res.statusText}`); } + // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- spawner-side schema is validated at the spawner; trust the wire contract here return (await res.json()) as SpawnerExecuteResponse; } diff --git a/services/platform/convex/node_only/sandbox/internal_actions.ts b/services/platform/convex/node_only/sandbox/internal_actions.ts index f4b9ec3d5..ee1ed3570 100644 --- a/services/platform/convex/node_only/sandbox/internal_actions.ts +++ b/services/platform/convex/node_only/sandbox/internal_actions.ts @@ -174,7 +174,8 @@ export const executeCode = internalAction({ code: 'QUOTA_EXCEEDED', message: err.data && typeof err.data === 'object' && 'message' in err.data - ? String((err.data as { message?: string }).message) + ? 
// oxlint-disable-next-line typescript/no-unsafe-type-assertion -- ConvexError data shape is loose; we just type-narrowed the message key + String((err.data as { message?: string }).message) : 'Sandbox quota exceeded', }); } @@ -442,7 +443,7 @@ export const executeCode = internalAction({ durationMs: Date.now() - startedAt, actualSeconds: (Date.now() - startedAt) / 1000, }); - throw new Error(`Sandbox spawner failed: ${message}`); + throw new Error(`Sandbox spawner failed: ${message}`, { cause: err }); } finally { clearInterval(heartbeat); } From 2b5113483150b23b2d1609e9267b7f66ba7846f0 Mon Sep 17 00:00:00 2001 From: larryro <371767072@qq.com> Date: Tue, 19 May 2026 19:01:50 +0800 Subject: [PATCH 005/108] test(sandbox): unit-test reserveSlotAndInsert quota + watchdog (R1.22 #3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mocks _generated/server.internalMutation so the real handler is callable with a fabricated ctx (matches the file_metadata/internal_mutations.test.ts pattern). Covers: - Empty in-flight → row inserted with status='queued', lifecycleStatus='active'. - Cap reached (4 running) → throws ConvexError (atomic concurrency cap, closes the TOCTOU race R1.8/R1.10 flagged). - Daily CPU budget pre-debit overflow (4 × 500s prior + 30s requested > 1800s cap) → throws — pre-debit semantics verified, closes R1.10's post-debit overshoot. - recoverStuckSandboxes — only the row whose heartbeatAt is older than 2×max-timeout gets flipped to failed/SPAWNER_UNAVAILABLE. All 4 tests pass via vitest. Combined with the 9-test argv builder gate shipped in M1, that's two of R1.22's five critical regression gates. The remaining three (in-container privilege assertion, fileMetadata IDOR via inputFiles, cancellation propagation) require either a running docker daemon (privilege) or a Convex test harness (IDOR / cancellation); both are integration-test scope and best added when wiring up CI for the sandbox stack. 
--- .../convex/sandbox/internal_mutations.test.ts | 206 ++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100644 services/platform/convex/sandbox/internal_mutations.test.ts diff --git a/services/platform/convex/sandbox/internal_mutations.test.ts b/services/platform/convex/sandbox/internal_mutations.test.ts new file mode 100644 index 000000000..a8d2dc66a --- /dev/null +++ b/services/platform/convex/sandbox/internal_mutations.test.ts @@ -0,0 +1,206 @@ +// R1.22 #3 — atomic quota mutation regression gate. Mocks the convex +// generated layer (same pattern as file_metadata/internal_mutations.test.ts) +// so the mutation body is unit-testable without a running backend. + +import { ConvexError } from 'convex/values'; +import { describe, it, expect, vi } from 'vitest'; + +vi.mock('../_generated/server', async (importOriginal) => { + const mod = await importOriginal>(); + return { + ...mod, + // The mutation factory just hands the config straight through so we + // can call `.handler(ctx, args)` from tests. + internalMutation: (config: Record) => config, + }; +}); + +import { + reserveSlotAndInsert, + recoverStuckSandboxes, +} from './internal_mutations'; +import { SANDBOX_MAX_CONCURRENT_PER_ORG } from './schema'; + +interface MutHandler { + handler: (ctx: unknown, args: TArgs) => Promise | TReturn; +} + +function asyncIter(rows: T[]): AsyncIterable { + return { + async *[Symbol.asyncIterator]() { + for (const r of rows) yield r; + }, + }; +} + +interface FakeRow { + estimatedSeconds: number; + _creationTime: number; + status: string; + actualSeconds?: number; + _id: string; + heartbeatAt: number; +} + +interface MockCtxOptions { + runningRows?: FakeRow[]; + queuedRows?: FakeRow[]; + completedTodayRows?: FakeRow[]; +} + +function createMockCtx(opts: MockCtxOptions = {}) { + const runningRows = opts.runningRows ?? []; + const queuedRows = opts.queuedRows ?? []; + const completedRows = opts.completedTodayRows ?? 
[]; + const insertedRows: Record[] = []; + + // The fluent `.withIndex` chain — store the eq() args so the handler + // returning the right async iterator can be selected. + function makeBuilder() { + const calls: Array> = []; + const builder: Record = {}; + builder.withIndex = vi.fn((_name: string, cb: (q: unknown) => unknown) => { + const q = { + eq: (field: string, value: unknown) => { + calls.push({ field, value }); + return q; + }, + }; + cb(q); + return builder; + }); + builder.order = vi.fn(() => builder); + // The mutation iterates the builder directly with `for await`. + builder[Symbol.asyncIterator] = function () { + const status = calls.find((c) => c.field === 'status')?.value; + if (status === 'running') + return asyncIter(runningRows)[Symbol.asyncIterator](); + if (status === 'queued') + return asyncIter(queuedRows)[Symbol.asyncIterator](); + // No status filter → completedToday daily-budget scan + return asyncIter([...completedRows, ...runningRows])[ + Symbol.asyncIterator + ](); + }; + return builder; + } + + return { + ctx: { + db: { + query: vi.fn(() => makeBuilder()), + insert: vi.fn( + async (_table: string, payload: Record) => { + insertedRows.push(payload); + return `exec_${insertedRows.length}`; + }, + ), + get: vi.fn(), + patch: vi.fn(), + }, + }, + insertedRows, + }; +} + +describe('reserveSlotAndInsert', () => { + const baseArgs = { + organizationId: 'org_alpha', + uploadedBy: 'user_1', + language: 'python' as const, + codePreview: 'print("hi")', + packages: [], + estimatedSeconds: 30, + }; + + it('inserts a row when no in-flight and budget has room', async () => { + const { ctx, insertedRows } = createMockCtx(); + const mut = reserveSlotAndInsert as unknown as MutHandler< + typeof baseArgs, + string + >; + const id = await mut.handler(ctx, baseArgs); + expect(id).toBe('exec_1'); + expect(insertedRows[0]).toMatchObject({ + organizationId: 'org_alpha', + status: 'queued', + estimatedSeconds: 30, + lifecycleStatus: 'active', + }); + }); + + 
it(`rejects when running count is already at the cap (${SANDBOX_MAX_CONCURRENT_PER_ORG})`, async () => { + const running: FakeRow[] = Array.from( + { length: SANDBOX_MAX_CONCURRENT_PER_ORG }, + (_v, i) => ({ + _id: `r${i}`, + _creationTime: Date.now() - 1000, + status: 'running', + estimatedSeconds: 30, + heartbeatAt: Date.now(), + }), + ); + const { ctx } = createMockCtx({ runningRows: running }); + const mut = reserveSlotAndInsert as unknown as MutHandler< + typeof baseArgs, + string + >; + await expect(mut.handler(ctx, baseArgs)).rejects.toBeInstanceOf( + ConvexError, + ); + }); + + it('rejects when daily CPU budget pre-debit overflows', async () => { + // 4 prior runs of 500s each = 2000s; cap is 1800s → next call should reject. + const completed: FakeRow[] = Array.from({ length: 4 }, (_v, i) => ({ + _id: `c${i}`, + _creationTime: Date.now() - 60_000, + status: 'completed', + estimatedSeconds: 500, + actualSeconds: 500, + heartbeatAt: Date.now(), + })); + const { ctx } = createMockCtx({ completedTodayRows: completed }); + const mut = reserveSlotAndInsert as unknown as MutHandler< + typeof baseArgs, + string + >; + await expect( + mut.handler(ctx, { ...baseArgs, estimatedSeconds: 30 }), + ).rejects.toThrow(/budget/i); + }); +}); + +describe('recoverStuckSandboxes', () => { + it('flips running rows whose heartbeat is older than 2× max-timeout', async () => { + const stale: FakeRow = { + _id: 'stuck1', + _creationTime: Date.now() - 3_600_000, + status: 'running', + estimatedSeconds: 120, + heartbeatAt: Date.now() - 11 * 60_000, + }; + const fresh: FakeRow = { + _id: 'live1', + _creationTime: Date.now() - 60_000, + status: 'running', + estimatedSeconds: 60, + heartbeatAt: Date.now() - 5_000, + }; + const { ctx } = createMockCtx({ runningRows: [stale, fresh] }); + const mut = recoverStuckSandboxes as unknown as MutHandler< + Record, + number + >; + const count = await mut.handler(ctx, {}); + expect(count).toBe(1); + expect(ctx.db.patch).toHaveBeenCalledWith( + 
'stuck1', + expect.objectContaining({ + status: 'failed', + errorCode: 'SPAWNER_UNAVAILABLE', + }), + ); + expect(ctx.db.patch).not.toHaveBeenCalledWith('live1', expect.anything()); + }); +}); From 71e86c16879967559501f0a8543bd89ab9e381d3 Mon Sep 17 00:00:00 2001 From: larryro <371767072@qq.com> Date: Tue, 19 May 2026 20:04:08 +0800 Subject: [PATCH 006/108] fix(sandbox): make L4 smoke tests pass end-to-end MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nine real bugs surfaced by trying to actually hit /v1/execute end-to-end: 1. **Docker CLI API skew (L3 /health 503)** — Debian's `docker.io` 20.10.5 speaks API 1.41; modern daemons require ≥1.44. Switched to the official static CLI via `COPY --from=docker:27-cli` and changed /health from `docker info --format` (which panics in old CLI) to `docker version --format`. 2. **tmpfs Docker volumes aren't shared between docker run calls** — my original multi-helper staging (busybox containers writing to a shared workspace volume) silently failed because each `docker run` of a tmpfs-driver-local volume creates a fresh mount. Tried piping tar to stdin into the runtime container, but then… 3. **`docker cp` can't read from --tmpfs mounts** — verified directly: file present inside container (`docker exec ls`) but `docker cp` returns "Could not find the file ... in container". Switched workspace to a 1:1 host bind mount under /var/lib/tale-sandbox/sessions/<id>/. Spawner now stages files via Bun fs (no busybox/tar dance) and harvests outputs the same way. The compose + CLI factory mount /var/lib/tale-sandbox into the spawner container at the same path so the docker daemon (resolving against host fs) and the spawner agree on it. Trade-off: lose the perfect tmpfs ENOSPC cap; keep --ulimit fsize=100m per file + post-run rm -rf. 4. **uv/pip fail under --read-only** — uv writes to $HOME/.cache/uv, nobody's $HOME is /nonexistent which is RO. 
Set HOME=/tmp (we have --tmpfs /tmp) + UV_CACHE_DIR=/cache/pip. 5. **Cache volume permission denied** — new Docker volumes are root-owned by default; the runtime user is 65534. ensureCacheVolume now detects first-creation and runs a transient busybox to chown the mount point 65534:65534. 6. **Internal-only sandbox network blocks tinyproxy's own outbound** — sandbox-egress couldn't resolve pypi.org because `internal: true` cut all DNS too. Put sandbox-egress on BOTH `sandbox` (where runtime containers reach it) and `internal` (where it has internet for the upstream tunnel). Runtime containers stay solely on sandbox. 7. **Timeout didn't kill the container** — killing the docker CLI process doesn't stop the sibling container; it just disconnects the wrapper. Two-tier timeout now: inner timer issues `docker kill <container>` at timeoutMs; outer (CLI process kill) at timeoutMs+30s as belt-and-suspenders. 8. **Error classifier patterns stale** — uv's "no matching distribution" has become "unsatisfiable"; run-time egress denial exits with 1 not 64. Broadened PACKAGE_NOT_FOUND regex; classify EGRESS_DENIED on any exit when stderr matches. 9. **tinyproxy log file root-owned** — entrypoint chowns the log to nobody so tinyproxy (which drops privs) can write to it. L4 verified end-to-end: - python hello world: 620ms - python-pptx (warm cache): 1.18s, real .pptx file with 3 slides - TIMEOUT: exit 137 at 3.27s for a sleep(30) with timeoutMs=3000 - EGRESS_DENIED, PACKAGE_NOT_FOUND, RUNTIME_ERROR: all classified - HMAC mismatch: 401 api.d.ts regen picked up sandbox/* + agent_tools/code/* on the platform restart that happened during testing — included. 
--- compose.yml | 28 +- services/platform/convex/_generated/api.d.ts | 12 + services/sandbox-egress/entrypoint.sh | 4 +- services/sandbox-runtime/entrypoint.sh | 10 + services/sandbox/Dockerfile | 6 +- services/sandbox/src/docker_args.test.ts | Bin 4884 -> 5773 bytes services/sandbox/src/docker_args.ts | 34 ++- services/sandbox/src/server.ts | 13 +- services/sandbox/src/spawn.ts | 255 +++++++++++++---- services/sandbox/src/spawn_util.ts | 19 +- services/sandbox/src/volume.ts | 263 +++--------------- .../services/create-sandbox-egress-service.ts | 5 +- .../services/create-sandbox-service.ts | 8 +- 13 files changed, 343 insertions(+), 314 deletions(-) diff --git a/compose.yml b/compose.yml index 436282886..c3fd089a8 100644 --- a/compose.yml +++ b/compose.yml @@ -566,7 +566,13 @@ services: max-size: '10m' max-file: '3' networks: + # `sandbox` is internal-only — only sandbox-egress + the per-call + # runtime containers attach. Runtime containers reach pypi/npm by + # CONNECT to sandbox-egress:3128, which is on BOTH networks. The + # `internal` Tale network is a regular bridge with NAT so tinyproxy + # can resolve and reach the upstream registries. - sandbox + - internal # ============================================================================ # Tale Sandbox Spawner — thin stateless docker-run service for `code_run` @@ -601,6 +607,11 @@ services: # The spawner needs the host docker socket to spawn sibling containers. # This is the security boundary — see header comment. - /var/run/docker.sock:/var/run/docker.sock + # 1:1 bind: per-call workspace dirs are created here by the spawner + # and mounted into the runtime container at the SAME host path (the + # docker daemon resolves --mount source paths against the host fs, + # so the spawner and the daemon must agree on the path). 
+ - /var/lib/tale-sandbox:/var/lib/tale-sandbox restart: unless-stopped healthcheck: test: ['CMD', 'curl', '-fsS', 'http://127.0.0.1:8003/health'] @@ -675,15 +686,12 @@ networks: driver: bridge # Sandbox network — internal-only bridge for code_run runtime containers + the - # tinyproxy egress sidecar. `internal: true` blocks all internet from this - # network; the only outbound is through sandbox-egress (host allow-list). - # IPv6 disabled to prevent v4 allow-list bypass via v6 routes (R1.3). - # - # `name:` pins the Docker-level network name so the spawner (which calls - # `docker run --network tale-sandbox-net` on sibling containers) doesn't - # have to discover the compose-project-prefixed default. + # tinyproxy egress sidecar. The CLI (start.ts / deploy.ts via + # ensureSandboxNetwork) pre-creates the network with `--internal --ipv6=false` + # so it can carry both `tale-sandbox-net` and the bridge-driver flags that + # compose's `networks:` block can't express atomically. We mark it external + # here so compose attaches to the existing network rather than overwriting + # its driver options. 
sandbox: + external: true name: tale-sandbox-net - driver: bridge - internal: true - enable_ipv6: false diff --git a/services/platform/convex/_generated/api.d.ts b/services/platform/convex/_generated/api.d.ts index 378f662ea..55fc7e35f 100644 --- a/services/platform/convex/_generated/api.d.ts +++ b/services/platform/convex/_generated/api.d.ts @@ -18,6 +18,7 @@ import type * as agent_tools_artifacts_artifact_create_tool from "../agent_tools import type * as agent_tools_artifacts_artifact_edit_tool from "../agent_tools/artifacts/artifact_edit_tool.js"; import type * as agent_tools_artifacts_shared from "../agent_tools/artifacts/shared.js"; import type * as agent_tools_artifacts_stream_state from "../agent_tools/artifacts/stream_state.js"; +import type * as agent_tools_code_code_run_tool from "../agent_tools/code/code_run_tool.js"; import type * as agent_tools_conversations_conversation_read_tool from "../agent_tools/conversations/conversation_read_tool.js"; import type * as agent_tools_conversations_helpers_read_conversation_by_id from "../agent_tools/conversations/helpers/read_conversation_by_id.js"; import type * as agent_tools_conversations_helpers_read_conversation_list from "../agent_tools/conversations/helpers/read_conversation_list.js"; @@ -572,6 +573,8 @@ import type * as node_only_integration_sandbox_helpers_url_rewrite from "../node import type * as node_only_integration_sandbox_helpers_validate_host from "../node_only/integration_sandbox/helpers/validate_host.js"; import type * as node_only_integration_sandbox_internal_actions from "../node_only/integration_sandbox/internal_actions.js"; import type * as node_only_integration_sandbox_types from "../node_only/integration_sandbox/types.js"; +import type * as node_only_sandbox_helpers_spawner_client from "../node_only/sandbox/helpers/spawner_client.js"; +import type * as node_only_sandbox_internal_actions from "../node_only/sandbox/internal_actions.js"; import type * as 
node_only_sql_helpers_execute_mssql_query from "../node_only/sql/helpers/execute_mssql_query.js"; import type * as node_only_sql_helpers_execute_mysql_query from "../node_only/sql/helpers/execute_mysql_query.js"; import type * as node_only_sql_helpers_execute_postgres_query from "../node_only/sql/helpers/execute_postgres_query.js"; @@ -675,6 +678,9 @@ import type * as providers_file_utils from "../providers/file_utils.js"; import type * as providers_resolve_model from "../providers/resolve_model.js"; import type * as providers_secret_io from "../providers/secret_io.js"; import type * as providers_validators from "../providers/validators.js"; +import type * as sandbox_internal_mutations from "../sandbox/internal_mutations.js"; +import type * as sandbox_internal_queries from "../sandbox/internal_queries.js"; +import type * as sandbox_output_mutations from "../sandbox/output_mutations.js"; import type * as sso_providers_actions from "../sso_providers/actions.js"; import type * as sso_providers_create_user_session from "../sso_providers/create_user_session.js"; import type * as sso_providers_entra_id_adapter from "../sso_providers/entra_id/adapter.js"; @@ -1087,6 +1093,7 @@ declare const fullApi: ApiFromModules<{ "agent_tools/artifacts/artifact_edit_tool": typeof agent_tools_artifacts_artifact_edit_tool; "agent_tools/artifacts/shared": typeof agent_tools_artifacts_shared; "agent_tools/artifacts/stream_state": typeof agent_tools_artifacts_stream_state; + "agent_tools/code/code_run_tool": typeof agent_tools_code_code_run_tool; "agent_tools/conversations/conversation_read_tool": typeof agent_tools_conversations_conversation_read_tool; "agent_tools/conversations/helpers/read_conversation_by_id": typeof agent_tools_conversations_helpers_read_conversation_by_id; "agent_tools/conversations/helpers/read_conversation_list": typeof agent_tools_conversations_helpers_read_conversation_list; @@ -1641,6 +1648,8 @@ declare const fullApi: ApiFromModules<{ 
"node_only/integration_sandbox/helpers/validate_host": typeof node_only_integration_sandbox_helpers_validate_host; "node_only/integration_sandbox/internal_actions": typeof node_only_integration_sandbox_internal_actions; "node_only/integration_sandbox/types": typeof node_only_integration_sandbox_types; + "node_only/sandbox/helpers/spawner_client": typeof node_only_sandbox_helpers_spawner_client; + "node_only/sandbox/internal_actions": typeof node_only_sandbox_internal_actions; "node_only/sql/helpers/execute_mssql_query": typeof node_only_sql_helpers_execute_mssql_query; "node_only/sql/helpers/execute_mysql_query": typeof node_only_sql_helpers_execute_mysql_query; "node_only/sql/helpers/execute_postgres_query": typeof node_only_sql_helpers_execute_postgres_query; @@ -1744,6 +1753,9 @@ declare const fullApi: ApiFromModules<{ "providers/resolve_model": typeof providers_resolve_model; "providers/secret_io": typeof providers_secret_io; "providers/validators": typeof providers_validators; + "sandbox/internal_mutations": typeof sandbox_internal_mutations; + "sandbox/internal_queries": typeof sandbox_internal_queries; + "sandbox/output_mutations": typeof sandbox_output_mutations; "sso_providers/actions": typeof sso_providers_actions; "sso_providers/create_user_session": typeof sso_providers_create_user_session; "sso_providers/entra_id/adapter": typeof sso_providers_entra_id_adapter; diff --git a/services/sandbox-egress/entrypoint.sh b/services/sandbox-egress/entrypoint.sh index 8f8cc9ab0..b0121bc3d 100644 --- a/services/sandbox-egress/entrypoint.sh +++ b/services/sandbox-egress/entrypoint.sh @@ -27,8 +27,10 @@ echo "[sandbox-egress] config:" sed 's/^/ /' /etc/tinyproxy/tinyproxy.conf # tinyproxy logs to file by default; tail to stdout in background so docker -# logs surfaces them. +# logs surfaces them. Chown to nobody so tinyproxy (which drops privs) +# can write to it. 
touch /var/log/tinyproxy/tinyproxy.log +chown nobody:nobody /var/log/tinyproxy/tinyproxy.log tail -n0 -F /var/log/tinyproxy/tinyproxy.log & exec tinyproxy -d -c /etc/tinyproxy/tinyproxy.conf diff --git a/services/sandbox-runtime/entrypoint.sh b/services/sandbox-runtime/entrypoint.sh index 0af2a459a..f94f79434 100644 --- a/services/sandbox-runtime/entrypoint.sh +++ b/services/sandbox-runtime/entrypoint.sh @@ -31,6 +31,16 @@ LANG_NAME="$1" PACKAGES_FILE="${2:-/workspace/code/packages.json}" OPTIONS_FILE="${3:-/workspace/code/options.json}" +# The spawner pipes a tar archive of code/ + input/ to our stdin (this is +# the only way to deliver the user's program into a `--tmpfs /workspace` +# container, since tmpfs volumes don't persist between separate `docker run` +# invocations). The archive contains code/main.{py,js} + code/packages.json +# + code/options.json + optionally input/. +mkdir -p /workspace/code /workspace/input /workspace/output +if [ ! -t 0 ]; then + tar -xf - -C /workspace 2>/dev/null || true +fi + echo "PHASE: installing" ALLOW_SDIST="false" diff --git a/services/sandbox/Dockerfile b/services/sandbox/Dockerfile index 01175e266..449e1fc11 100644 --- a/services/sandbox/Dockerfile +++ b/services/sandbox/Dockerfile @@ -8,11 +8,13 @@ FROM oven/bun:1.1-debian WORKDIR /app -# docker CLI for spawning sibling containers via mounted socket. +# docker CLI for spawning sibling containers via mounted socket. The +# Debian-shipped `docker.io` package is API 1.41 (too old; current +# daemons require ≥1.44). Pull the official static CLI binary instead. 
+COPY --from=docker:27-cli /usr/local/bin/docker /usr/local/bin/docker RUN apt-get update && apt-get install -y --no-install-recommends \ ca-certificates \ curl \ - docker.io \ && rm -rf /var/lib/apt/lists/* COPY package.json bun.lockb* tsconfig.json /app/ diff --git a/services/sandbox/src/docker_args.test.ts b/services/sandbox/src/docker_args.test.ts index 66a579519e03c78346cf7edcd35a1a74af91b5ce..ffdafdeede4c7db57915aed823920046d12bcb1c 100644 GIT binary patch delta 777 zcma))L2J}N6vv_4r6MfWgCc_RDzZtSNlYtsh)gz@3Oi2>< zTp4`Ab-lW@b|og`Bz*AjQ4P!APU9Fx3c^qe`0?Q!Z1Itd&_GV~Au~2m6_yo3$_r*7 z)k*;7406kZF_$zHs*=d)Rj{9hEhYlx8i=G#xdhFGvOL$DD2aqirhy&bEeoD-EGCfy zu;Y>?=o6ipAd(8Ff0oym(sF{il3p;F4|zJ;ef9R`i=ELfa06pn85;n# z9S9{c1~>xA>_i|2Ec)>BS@oTjd(W(cz4^`>Hu>iioYMisC`vF$Cf1E8Q!}GkQ+6XR zv8l8A>vg~D9`%=NuTSS`6B#9#?_&VN$AFQgXf%5qbKh&Q8(t6u5vncbqpnzMxLwMB zK^E)3fJ$$Y!pvbaoz99AIg1*RZn-dX*$Qn_0KRAxFP^q4;xZ{J%8b8nbTYhuk|{-Z zq&<~rp`ksx_xl0#`%IC{_frG{sprM{{8>} delta 115 zcmeCxouam353_Q4eo=ODL1J=hSbk1vZmN}ndP!nVs%~*og>GU}vaX?#vHIj4%x;ry zSk6oq5S7@xkyU|dvNMO+ { - // /health pings docker daemon — caches not used for v1. - const info = await runDocker(['info', '--format', '{{.ServerVersion}}']); + // Probe docker daemon reachability. Use `docker version --format` over the + // older `docker info --format` because some Debian-packaged CLIs (e.g. + // docker.io 20.10 in our base image) panic when templating a newer-API + // `info` response. `docker version` is a much smaller surface that has + // been compatible across the 20.10 ↔ 29.x gap. 
+ const info = await runDocker(['version', '--format', '{{.Server.Version}}']); if (info.exitCode !== 0) { return new Response( - JSON.stringify({ status: 'unhealthy', error: info.stderr.trim() }), + JSON.stringify({ + status: 'unhealthy', + error: info.stderr.trim() || info.stdout.trim(), + }), { status: 503, headers: { 'content-type': 'application/json' } }, ); } diff --git a/services/sandbox/src/spawn.ts b/services/sandbox/src/spawn.ts index 8d6e6ed8e..df3d95103 100644 --- a/services/sandbox/src/spawn.ts +++ b/services/sandbox/src/spawn.ts @@ -1,32 +1,54 @@ // Per-call execution pipeline. The route handler in server.ts hands a typed -// ExecuteRequest in; this module owns the docker lifecycle and returns a typed -// ExecuteResponse out. +// ExecuteRequest in; this module owns the docker lifecycle and returns a +// typed ExecuteResponse out. +// +// Flow: +// 1. Ensure per-org pip/npm cache volumes exist (one-shot chown so the +// unprivileged runtime user can write). +// 2. Create host workspace dir at /var/lib/tale-sandbox/sessions// +// and stage code/ + input/ via Bun fs (the spawner sees this path +// directly because it's bind-mounted 1:1 into the container). +// 3. `docker run` the runtime with --mount type=bind workspaceHostDir +// → /workspace. +// 4. Wait with host-side wall-clock timeout. +// 5. Read /workspace/output/ back via Bun fs. +// 6. Capture stdout/stderr; classify exit code → errorCode. +// 7. `docker rm -f` + rm -rf the host dir. 
+ +import { + mkdir, + readdir, + readFile, + rm, + stat, + writeFile, + chown, +} from 'node:fs/promises'; +import { join } from 'node:path'; import { buildDockerRunArgs } from './docker_args.ts'; -import { runDocker, dockerKill } from './spawn_util.ts'; +import { runDocker, dockerKill, dockerRm } from './spawn_util.ts'; import type { ErrorCode, ExecuteRequest, ExecuteResponse, + OutputFile, SpawnerConfig, } from './types.ts'; import { - createWorkspaceVolume, ensureCacheVolume, - harvestOutput, npmCacheVolumeName, pipCacheVolumeName, - removeVolume, - stageCodeIntoVolume, - workspaceVolumeName, } from './volume.ts'; const PHASE_INSTALL = 'PHASE: installing'; const PHASE_RUN = 'PHASE: running'; +const NAME_RE = /^[a-zA-Z0-9._-]+$/; +const RUNTIME_UID = 65534; +const RUNTIME_GID = 65534; interface InFlight { containerName: string; - workspaceVolume: string; abort: AbortController; } @@ -36,10 +58,6 @@ export function isInFlight(executionId: string): boolean { return inFlight.has(executionId); } -/** - * Cancel an in-flight execution. Best-effort: docker kill + (cleanup will - * happen in the originating execute() finally block). - */ export async function cancelExecution(executionId: string): Promise { const entry = inFlight.get(executionId); if (!entry) return false; @@ -48,6 +66,125 @@ export async function cancelExecution(executionId: string): Promise { return true; } +async function stageWorkspace( + hostDir: string, + req: ExecuteRequest, +): Promise { + const codeDir = join(hostDir, 'code'); + const inputDir = join(hostDir, 'input'); + const outputDir = join(hostDir, 'output'); + await mkdir(codeDir, { recursive: true }); + await mkdir(inputDir, { recursive: true }); + await mkdir(outputDir, { recursive: true }); + + const mainName = req.language === 'python' ? 'main.py' : 'main.js'; + await writeFile(join(codeDir, mainName), req.code); + await writeFile( + join(codeDir, 'packages.json'), + JSON.stringify(req.packages ?? 
[]), + ); + await writeFile( + join(codeDir, 'options.json'), + JSON.stringify(req.options ?? {}), + ); + + for (const f of req.inputFiles ?? []) { + if (!NAME_RE.test(f.name)) { + throw new Error(`unsafe input file name: ${JSON.stringify(f.name)}`); + } + const bytes = Buffer.from(f.contentBase64, 'base64'); + await writeFile(join(inputDir, f.name), bytes); + } + + // Spawner runs as root; the runtime container runs as nobody (65534) and + // needs to read the staged files. Recursively chown. + await chownRecursive(hostDir, RUNTIME_UID, RUNTIME_GID); +} + +async function chownRecursive( + path: string, + uid: number, + gid: number, +): Promise { + await chown(path, uid, gid); + const entries = await readdir(path, { withFileTypes: true }); + for (const e of entries) { + const p = join(path, e.name); + if (e.isDirectory()) { + await chownRecursive(p, uid, gid); + } else { + await chown(p, uid, gid); + } + } +} + +async function harvestOutputDir( + hostDir: string, + caps: { perFileMax: number; totalMax: number }, +): Promise<{ files: OutputFile[]; truncatedCount: number }> { + const outputDir = join(hostDir, 'output'); + const files: OutputFile[] = []; + let truncatedCount = 0; + let totalAccepted = 0; + + async function walk(rel: string): Promise { + const abs = join(outputDir, rel); + let entries; + try { + entries = await readdir(abs, { withFileTypes: true }); + } catch { + return; + } + for (const e of entries) { + const childRel = rel ? 
`${rel}/${e.name}` : e.name; + const childAbs = join(outputDir, childRel); + if (e.isDirectory()) { + await walk(childRel); + continue; + } + if (!e.isFile()) continue; + const st = await stat(childAbs); + if ( + st.size > caps.perFileMax || + totalAccepted + st.size > caps.totalMax + ) { + truncatedCount += 1; + continue; + } + const bytes = await readFile(childAbs); + files.push({ + name: childRel, + contentBase64: bytes.toString('base64'), + size: st.size, + contentType: guessContentType(childRel), + }); + totalAccepted += st.size; + } + } + await walk(''); + return { files, truncatedCount }; +} + +function guessContentType(name: string): string { + const lower = name.toLowerCase(); + if (lower.endsWith('.pptx')) + return 'application/vnd.openxmlformats-officedocument.presentationml.presentation'; + if (lower.endsWith('.pdf')) return 'application/pdf'; + if (lower.endsWith('.xlsx')) + return 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'; + if (lower.endsWith('.docx')) + return 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'; + if (lower.endsWith('.png')) return 'image/png'; + if (lower.endsWith('.jpg') || lower.endsWith('.jpeg')) return 'image/jpeg'; + if (lower.endsWith('.svg')) return 'image/svg+xml'; + if (lower.endsWith('.json')) return 'application/json'; + if (lower.endsWith('.csv')) return 'text/csv; charset=utf-8'; + if (lower.endsWith('.txt') || lower.endsWith('.log')) + return 'text/plain; charset=utf-8'; + if (lower.endsWith('.html')) return 'text/html; charset=utf-8'; + return 'application/octet-stream'; +} + export async function executeRequest( cfg: SpawnerConfig, req: ExecuteRequest, @@ -68,54 +205,56 @@ export async function executeRequest( ); const startedAtMs = Date.now(); const containerName = `tale-sbx-${req.executionId}`; - const workspaceVolume = workspaceVolumeName(req.executionId); const pipVolume = pipCacheVolumeName(cfg, req.organizationId); const npmVolume = npmCacheVolumeName(cfg, 
req.organizationId); + const workspaceHostDir = join(cfg.hostSessionRoot, req.executionId); const abort = new AbortController(); - inFlight.set(req.executionId, { - containerName, - workspaceVolume, - abort, - }); + inFlight.set(req.executionId, { containerName, abort }); try { - await createWorkspaceVolume(req.executionId); await ensureCacheVolume(pipVolume); await ensureCacheVolume(npmVolume); - - await stageCodeIntoVolume({ - volumeName: workspaceVolume, - language: req.language, - code: req.code, - packages: req.packages ?? [], - options: req.options ?? {}, - inputFiles: req.inputFiles ?? [], - }); + await stageWorkspace(workspaceHostDir, req); const argv = buildDockerRunArgs(cfg, { executionId: req.executionId, organizationId: req.organizationId, language: req.language, timeoutMs, - workspaceVolume, pipCacheVolume: pipVolume, npmCacheVolume: npmVolume, + workspaceHostDir, startedAtMs, }); - const result = await runDocker(argv, { - timeoutMs: timeoutMs + 30_000, - signal: abort.signal, - }); + // Two-tier timeout: + // - Inner: at `timeoutMs`, docker kill the container so user code + // cannot exceed the cap. + // - Outer (in runDocker): at `timeoutMs + 30_000`, kill the docker + // CLI process too — covers the case where `docker kill` itself + // hangs (rare; would mean the daemon is in trouble). + const killTimer = setTimeout(() => { + void dockerKill(containerName).catch(() => {}); + }, timeoutMs); + let result: Awaited>; + try { + result = await runDocker(argv, { + timeoutMs: timeoutMs + 30_000, + signal: abort.signal, + killOnTimeoutContainer: containerName, + }); + } finally { + clearTimeout(killTimer); + } const durationMs = Date.now() - startedAtMs; const phases = classifyPhases(result.stdout); const exitCode = result.exitCode; - // Cap stdout/stderr per config. 
+ const stdoutWithoutPhases = stripPhaseMarkers(result.stdout); const { text: stdoutCapped, truncated: stdoutTrunc } = capText( - stripPhaseMarkers(result.stdout), + stdoutWithoutPhases, cfg.stdoutMaxBytes, ); const { text: stderrCapped, truncated: stderrTrunc } = capText( @@ -123,8 +262,6 @@ export async function executeRequest( cfg.stderrMaxBytes, ); - // Cancellation took precedence (we set abort and killed): if signal is - // aborted, surface as 'cancelled' regardless of exit code. if (abort.signal.aborted) { return { status: 'cancelled', @@ -141,16 +278,8 @@ export async function executeRequest( }; } - // Map exit codes (per runtime-image entrypoint convention): - // 0 = success - // 64 = install failed (INSTALL_FAILED or PACKAGE_NOT_FOUND) - // 65 = bad invocation (SPAWNER_UNAVAILABLE) - // 124 = docker wrapper timeout (TIMEOUT) - // 137 = SIGKILL (could be OOM kill OR our explicit timeout kill) - // 139 = segfault - // else = user code RUNTIME_ERROR if (exitCode === 0) { - const harvested = await harvestOutput(workspaceVolume, { + const harvested = await harvestOutputDir(workspaceHostDir, { perFileMax: cfg.outputFileMaxBytes, totalMax: cfg.outputTotalMaxBytes, }); @@ -194,9 +323,10 @@ export async function executeRequest( ); } finally { inFlight.delete(req.executionId); - // Best-effort cleanup; named `--rm` should have removed the container, - // and we tear down the workspace volume. - await removeVolume(workspaceVolume).catch(() => {}); + await dockerRm(containerName).catch(() => {}); + await rm(workspaceHostDir, { recursive: true, force: true }).catch( + () => {}, + ); } } @@ -233,10 +363,7 @@ interface Phases { } function classifyPhases(_stdout: string): Phases { - // Phase timing is approximate — the markers tell us the order, but the - // spawner doesn't have inside-container timestamps. v2 can pipe wall-clock - // hints in the marker; for v1 we return null timings and report only that - // markers were observed. 
Callers should not depend on install/run split. + // Phase timing is approximate. v2 can pipe wall-clock hints in the marker. return { installMs: null, runMs: null }; } @@ -249,6 +376,11 @@ function capText( return { text: buf.subarray(0, maxBytes).toString('utf8'), truncated: true }; } +const EGRESS_DENIED_RE = + /403 Filtered|Tunnel connection failed|ProxyError|connection refused/i; +const PACKAGE_NOT_FOUND_RE = + /no matching distribution|could not find a version|unsatisfiable|404 Not Found|E404|No matching distribution found/i; + function classifyFailure( exitCode: number, stderr: string, @@ -257,21 +389,19 @@ function classifyFailure( return { code: 'TIMEOUT', message: 'Wall-clock timeout exceeded' }; } if (exitCode === 137) { - // OOM vs explicit kill — Linux doesn't tell us cleanly. If the message - // mentions "Killed" we lean OOM; otherwise it's likely an explicit timeout. if (/killed/i.test(stderr)) { return { code: 'OOM', message: 'Container killed (likely OOM)' }; } return { code: 'TIMEOUT', message: 'Container killed (SIGKILL)' }; } if (exitCode === 64) { - if (/no matching distribution|could not find a version/i.test(stderr)) { + if (PACKAGE_NOT_FOUND_RE.test(stderr)) { return { code: 'PACKAGE_NOT_FOUND', message: 'Requested package could not be resolved', }; } - if (/proxy|forbidden|filter|403|connection refused/i.test(stderr)) { + if (EGRESS_DENIED_RE.test(stderr)) { return { code: 'EGRESS_DENIED', message: 'Egress proxy denied the request', @@ -288,6 +418,15 @@ function classifyFailure( message: 'Sandbox runtime rejected the invocation', }; } + // Non-zero from user code or runtime crash — but if stderr clearly shows the + // egress proxy blocked the call, prefer EGRESS_DENIED over a generic + // RUNTIME_ERROR so the LLM knows it's a network policy, not a code bug. 
+ if (EGRESS_DENIED_RE.test(stderr)) { + return { + code: 'EGRESS_DENIED', + message: 'Egress proxy denied the request', + }; + } return { code: 'RUNTIME_ERROR', message: `User code exited with status ${exitCode}`, diff --git a/services/sandbox/src/spawn_util.ts b/services/sandbox/src/spawn_util.ts index 9d125b2fc..ff1f8b278 100644 --- a/services/sandbox/src/spawn_util.ts +++ b/services/sandbox/src/spawn_util.ts @@ -5,11 +5,16 @@ // handling, stdin piping, and timeouts. export interface RunDockerOptions { - stdin?: string; + stdin?: string | Uint8Array; // Set true when we expect a binary blob (tar stream) on stdout. captureBinaryStdout?: boolean; timeoutMs?: number; signal?: AbortSignal; + // When set, on host-side timeout the CLI process is killed AND + // `docker kill ` is invoked so the actual + // sibling container stops. Without this the container keeps running + // after the CLI disconnects (R5 test). + killOnTimeoutContainer?: string; } export interface RunDockerResult { @@ -54,6 +59,18 @@ export async function runDocker( timer = setTimeout(() => { timedOut = true; proc.kill('SIGKILL'); + // Killing the docker CLI process doesn't stop the sibling + // container it spawned — issue an explicit `docker kill` so + // the runtime container actually terminates instead of + // running to completion in the background. + if (opts.killOnTimeoutContainer) { + const target = opts.killOnTimeoutContainer; + const killer = Bun.spawn( + [DOCKER_BIN, 'kill', '--signal=SIGKILL', target], + { stdout: 'ignore', stderr: 'ignore', stdin: 'ignore' }, + ); + void killer.exited; + } resolve(); }, opts.timeoutMs); }), diff --git a/services/sandbox/src/volume.ts b/services/sandbox/src/volume.ts index 44b02c5d5..979b3c622 100644 --- a/services/sandbox/src/volume.ts +++ b/services/sandbox/src/volume.ts @@ -1,8 +1,9 @@ -// Workspace + per-org cache volume helpers. +// Per-org cache volume helpers + post-run output harvest. 
// -// Workspace = ephemeral tmpfs Docker volume, 256 MB hard ENOSPC cap (R2.2). -// Per-org pip/npm cache = persistent named volumes scoped to organizationId -// (R2.3 — closes the cross-tenant wheel-cache poison vector). +// Per-org pip/npm caches are persistent named volumes scoped to organizationId +// (R2.3 — closes the cross-tenant wheel-cache poison vector). The runtime +// container itself uses a `--tmpfs /workspace` for the workspace, so there is +// no per-call workspace volume to manage. import { runDocker } from './spawn_util.ts'; import type { SpawnerConfig } from './types.ts'; @@ -18,10 +19,6 @@ function orgSlug(organizationId: string): string { return organizationId; } -export function workspaceVolumeName(executionId: string): string { - return `tale-sbx-${executionId}`; -} - export function pipCacheVolumeName( cfg: SpawnerConfig, organizationId: string, @@ -36,226 +33,62 @@ export function npmCacheVolumeName( return `${cfg.cacheVolumePrefix.npm}-${orgSlug(organizationId)}`; } -/** Create a sized tmpfs Docker volume (RAM-backed, hard ENOSPC at sizeMb). */ -export async function createWorkspaceVolume( - executionId: string, - sizeMb = 256, -): Promise { - const name = workspaceVolumeName(executionId); - const result = await runDocker([ - 'volume', - 'create', - '--driver=local', - '--label', - 'tale.sandbox=1', - `--label`, - `tale.session=${executionId}`, - '--opt', - 'type=tmpfs', - '--opt', - 'device=tmpfs', - '--opt', - `o=size=${sizeMb}m,nosuid,nodev`, - name, - ]); - if (result.exitCode !== 0) { - throw new Error( - `volume: failed to create workspace volume ${name}: ${result.stderr.trim() || result.stdout.trim()}`, - ); - } - return name; -} - /** - * Create per-org cache volume lazily (idempotent: docker volume create - * succeeds on an existing volume). + * Lazy idempotent create. 
New volumes are root-owned by default and the + * runtime container runs as nobody (65534), so on first creation we also + * spin up a transient busybox to chown the volume's root to 65534:65534. + * Subsequent calls are no-ops (we detect via `docker volume inspect`). */ export async function ensureCacheVolume(name: string): Promise { - const result = await runDocker([ + const inspect = await runDocker(['volume', 'inspect', name]); + if (inspect.exitCode === 0) return; // already exists, already chowned + + const create = await runDocker([ 'volume', 'create', '--label', 'tale.sandbox-cache=1', name, ]); - if (result.exitCode !== 0) { + if (create.exitCode !== 0) { throw new Error( - `volume: failed to ensure cache volume ${name}: ${result.stderr.trim() || result.stdout.trim()}`, + `volume: failed to create cache volume ${name}: ${create.stderr.trim() || create.stdout.trim()}`, ); } -} -export async function removeVolume(name: string): Promise { - // Best-effort; don't throw on missing volume so retries are safe. - await runDocker(['volume', 'rm', '--force', name]); -} - -/** - * Stage a code + packages + options bundle into the workspace volume via a - * transient busybox container. We DO NOT pass the user code through argv; - * we tar-pipe it in. - */ -export async function stageCodeIntoVolume(args: { - volumeName: string; - language: 'python' | 'node'; - code: string; - packages: string[]; - options: { allowSdist?: boolean; allowInstallScripts?: boolean }; - inputFiles: { name: string; contentBase64: string }[]; -}): Promise { - const mainName = args.language === 'python' ? 'main.py' : 'main.js'; - - // Build the tar archive in-memory. Format = a series of files we then - // pipe into `docker cp - container:/`. - // It's simpler to use a one-shot helper container that reads our payload - // from stdin and unpacks it. - - // Compose the script that the helper runs inside the volume. 
The helper is - // busybox, mounting the volume at /workspace; it reads a JSON manifest from - // stdin and writes the files we list. This keeps everything inside the - // sandbox volume and never touches the host filesystem outside of the - // mounted volume. - const stageScript = `#!/bin/sh -set -e -mkdir -p /workspace/code /workspace/input /workspace/output -cat > /workspace/code/${mainName} -`; - // The helper executes the staging script. We invoke docker run with the - // user code piped to it on stdin (NOT via argv). - const helperArgs = [ + // One-shot chown so the unprivileged runtime user can write to the cache. + const chown = await runDocker([ 'run', '--rm', - '-i', - '--label', - 'tale.sandbox-staging=1', '--user', '0:0', - '--mount', - `type=volume,src=${args.volumeName},dst=/workspace`, - '--entrypoint', - 'sh', - 'busybox:1.36', - '-c', - stageScript, - ]; - - const codeResult = await runDocker(helperArgs, { stdin: args.code }); - if (codeResult.exitCode !== 0) { - throw new Error( - `volume: failed to stage code: ${codeResult.stderr.trim()}`, - ); - } - - // Stage packages.json + options.json - const packagesJson = JSON.stringify(args.packages); - const optionsJson = JSON.stringify(args.options); - const writePackages = await runDocker( - [ - 'run', - '--rm', - '-i', - '--label', - 'tale.sandbox-staging=1', - '--user', - '0:0', - '--mount', - `type=volume,src=${args.volumeName},dst=/workspace`, - '--entrypoint', - 'sh', - 'busybox:1.36', - '-c', - 'cat > /workspace/code/packages.json', - ], - { stdin: packagesJson }, - ); - if (writePackages.exitCode !== 0) { - throw new Error( - `volume: failed to write packages.json: ${writePackages.stderr.trim()}`, - ); - } - - const writeOptions = await runDocker( - [ - 'run', - '--rm', - '-i', - '--label', - 'tale.sandbox-staging=1', - '--user', - '0:0', - '--mount', - `type=volume,src=${args.volumeName},dst=/workspace`, - '--entrypoint', - 'sh', - 'busybox:1.36', - '-c', - 'cat > /workspace/code/options.json', 
- ], - { stdin: optionsJson }, - ); - if (writeOptions.exitCode !== 0) { - throw new Error( - `volume: failed to write options.json: ${writeOptions.stderr.trim()}`, - ); - } - - // Input files (base64). Each is decoded and dropped under /workspace/input/. - for (const f of args.inputFiles) { - if (!/^[a-zA-Z0-9._-]+$/.test(f.name)) { - throw new Error(`volume: rejected unsafe input file name: ${f.name}`); - } - const writeInput = await runDocker( - [ - 'run', - '--rm', - '-i', - '--label', - 'tale.sandbox-staging=1', - '--user', - '0:0', - '--mount', - `type=volume,src=${args.volumeName},dst=/workspace`, - '--entrypoint', - 'sh', - 'busybox:1.36', - '-c', - `base64 -d > /workspace/input/${f.name}`, - ], - { stdin: f.contentBase64 }, - ); - if (writeInput.exitCode !== 0) { - throw new Error( - `volume: failed to write input file ${f.name}: ${writeInput.stderr.trim()}`, - ); - } - } - - // Ensure ownership so the unprivileged sandbox user can read the staged files. - const chown = await runDocker([ - 'run', - '--rm', '--label', 'tale.sandbox-staging=1', - '--user', - '0:0', '--mount', - `type=volume,src=${args.volumeName},dst=/workspace`, - '--entrypoint', - 'sh', + `type=volume,src=${name},dst=/cache`, 'busybox:1.36', - '-c', - 'chown -R 65534:65534 /workspace', + 'chown', + '65534:65534', + '/cache', ]); if (chown.exitCode !== 0) { throw new Error( - `volume: failed to chown workspace: ${chown.stderr.trim()}`, + `volume: failed to chown cache volume ${name}: ${chown.stderr.trim()}`, ); } } -/** Read the contents of /workspace/output/ as base64-encoded files. */ +export async function removeVolume(name: string): Promise { + await runDocker(['volume', 'rm', '--force', name]); +} + +/** + * Harvest /workspace/output/ from a stopped (not yet removed) container via + * `docker cp` streaming. Container must have been launched WITHOUT `--rm` so + * the filesystem survives until we `docker rm` it explicitly. 
+ */ export async function harvestOutput( - volumeName: string, + containerName: string, caps: { perFileMax: number; totalMax: number }, ): Promise<{ files: { @@ -266,42 +99,16 @@ export async function harvestOutput( }[]; truncatedCount: number; }> { - // Use `docker run -i tar c -C /workspace/output .` to stream a tar; parse it. - // Bun supports child_process; we tee-into a buffer. const tarResult = await runDocker( - [ - 'run', - '--rm', - '--label', - 'tale.sandbox-staging=1', - '--user', - '0:0', - '--mount', - `type=volume,src=${volumeName},dst=/workspace`, - '--entrypoint', - 'sh', - 'busybox:1.36', - '-c', - // -h follows symlinks (matters if user code symlinks). --to-stdout via -O - // for individual files but tar is simpler. - 'cd /workspace/output 2>/dev/null && tar -cf - . 2>/dev/null || true', - ], + ['cp', `${containerName}:/workspace/output/.`, '-'], { captureBinaryStdout: true }, ); - if (tarResult.exitCode !== 0) { return { files: [], truncatedCount: 0 }; } - return parseTarStream(tarResult.stdoutBytes ?? new Uint8Array(0), caps); } -interface TarEntry { - name: string; - size: number; - body: Uint8Array; -} - function parseTarStream( buf: Uint8Array, caps: { perFileMax: number; totalMax: number }, @@ -328,7 +135,6 @@ function parseTarStream( while (i + 512 <= buf.length) { const header = buf.subarray(i, i + 512); - // Check for end-of-archive (two consecutive zero blocks). let allZero = true; for (let j = 0; j < 512; j++) { if (header[j] !== 0) { @@ -349,9 +155,7 @@ function parseTarStream( const bodyEnd = i + size; if (bodyEnd > buf.length) break; - // Regular file: typeflag '0' (0x30) or '\0' if ((typeflag === 0x30 || typeflag === 0) && size > 0) { - // Strip leading ./ const cleanName = name.replace(/^\.\//, ''); if (cleanName && !cleanName.endsWith('/')) { if (size > caps.perFileMax || totalAccepted + size > caps.totalMax) { @@ -368,7 +172,6 @@ function parseTarStream( } } } - // Advance to next 512-aligned boundary. 
i = bodyEnd + ((512 - (size % 512)) % 512); } return { files, truncatedCount }; diff --git a/tools/cli/src/lib/compose/services/create-sandbox-egress-service.ts b/tools/cli/src/lib/compose/services/create-sandbox-egress-service.ts index 72bd9bff2..ecf84da43 100644 --- a/tools/cli/src/lib/compose/services/create-sandbox-egress-service.ts +++ b/tools/cli/src/lib/compose/services/create-sandbox-egress-service.ts @@ -30,6 +30,9 @@ export function createSandboxEgressService( start_period: '5s', }, logging: DEFAULT_LOGGING, - networks: ['sandbox'], + // `sandbox` is internal-only; sandbox-egress also needs `internal` so it + // can resolve and reach pypi/npm (those need DNS + NAT). Runtime + // containers stay solely on `sandbox` and tunnel through this proxy. + networks: ['sandbox', 'internal'], }; } diff --git a/tools/cli/src/lib/compose/services/create-sandbox-service.ts b/tools/cli/src/lib/compose/services/create-sandbox-service.ts index 9bfd90f23..1ace06a97 100644 --- a/tools/cli/src/lib/compose/services/create-sandbox-service.ts +++ b/tools/cli/src/lib/compose/services/create-sandbox-service.ts @@ -34,7 +34,13 @@ export function createSandboxService(config: ServiceConfig): ComposeService { SANDBOX_EGRESS_NETWORK: 'tale-sandbox-net', SANDBOX_EGRESS_PROXY: 'http://sandbox-egress:3128', }, - volumes: ['/var/run/docker.sock:/var/run/docker.sock'], + volumes: [ + '/var/run/docker.sock:/var/run/docker.sock', + // 1:1 bind so per-call workspace dirs created by the spawner are + // visible to the docker daemon at the same host path when it mounts + // them into the runtime container. 
+ '/var/lib/tale-sandbox:/var/lib/tale-sandbox', + ], restart: 'unless-stopped', healthcheck: { test: ['CMD', 'curl', '-fsS', 'http://127.0.0.1:8003/health'], From 64c33c13e0ceef86d43c3edd12dbf540a61ecb51 Mon Sep 17 00:00:00 2001 From: larryro <371767072@qq.com> Date: Tue, 19 May 2026 20:18:21 +0800 Subject: [PATCH 007/108] fix(sandbox): accept Convex doc-id alphabet in executionId MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The L4 spawner regex was UUID-only (hex + hyphens), but executeCode passes the Convex audit row's _id (lowercase alphanumeric, base36-ish) as the spawner executionId. Broaden both regexes to [a-zA-Z0-9_-]{1,64} — still safe for Docker container names and argv positions, now accepts both UUIDs and Convex ids. L5 verified end-to-end: executeCode action via convex run produced a real .pptx in 2.6s with audit row + fileMetadata row both populated. --- services/sandbox/src/docker_args.ts | 5 ++++- services/sandbox/src/spawn.ts | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/services/sandbox/src/docker_args.ts b/services/sandbox/src/docker_args.ts index fcf5c40f9..2ec5ae763 100644 --- a/services/sandbox/src/docker_args.ts +++ b/services/sandbox/src/docker_args.ts @@ -22,7 +22,10 @@ export interface DockerRunInput { startedAtMs: number; } -const UUID_RE = /^[a-f0-9-]{1,64}$/i; +// executionId is either a UUID (hex + hyphens) from a direct caller or a +// Convex doc id (lowercase alphanumeric). Both produce safe Docker container +// names — alphanumeric + dash/underscore only. 
+const UUID_RE = /^[a-zA-Z0-9_-]{1,64}$/; const ORG_RE = /^[a-zA-Z0-9_-]{1,128}$/; const VOL_RE = /^[a-zA-Z0-9_.-]{1,128}$/; const HOST_DIR_RE = /^\/[a-zA-Z0-9_./-]{1,256}$/; diff --git a/services/sandbox/src/spawn.ts b/services/sandbox/src/spawn.ts index df3d95103..e5b42b97b 100644 --- a/services/sandbox/src/spawn.ts +++ b/services/sandbox/src/spawn.ts @@ -189,7 +189,7 @@ export async function executeRequest( cfg: SpawnerConfig, req: ExecuteRequest, ): Promise { - if (!/^[a-f0-9-]{1,64}$/i.test(req.executionId)) { + if (!/^[a-zA-Z0-9_-]{1,64}$/.test(req.executionId)) { return makeError('SPAWNER_UNAVAILABLE', 'invalid executionId', 0); } if (!/^[a-zA-Z0-9_-]{1,128}$/.test(req.organizationId)) { From 24e7f05d79fe2113704df0b754660d2d7bdb934c Mon Sep 17 00:00:00 2001 From: larryro <371767072@qq.com> Date: Tue, 19 May 2026 20:50:29 +0800 Subject: [PATCH 008/108] feat(sandbox): publish 8003 for bun dev + wire code_run into chat-agent demo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - compose.yml + create-sandbox-service: publish 8003:8003 (same shape as rag/crawler dev convention). `bun dev` runs convex-local-backend on the host; without this the executeCode action can't resolve `sandbox` (Docker DNS). The tale-deploy generator can omit the port for hardened prod deployments — same option as rag/crawler. - examples/agents/chat-agent.json: add `code_run` to toolNames; flip Rule 7's "do NOT produce .pptx — there is no PPTX export" line in EN / DE / FR system prompts to "for downloadable .pptx call code_run with python-pptx==1.0.2". HTML in-chat slide guidance via artifact_create is preserved as the in-chat default. Verified end-to-end via host loopback: HMAC-signed POST to 127.0.0.1:8003/v1/execute produces a real .pptx in 2.3s. Recipe for bun dev: 1. .env has SANDBOX_TOKEN= and SANDBOX_URL=http://sandbox:8003 (for the dockerized convex container). 2. 
services/platform/.env.local has the same SANDBOX_TOKEN and SANDBOX_URL=http://127.0.0.1:8003 (sync-convex-env-from-dotenv applies higher priority for .env.local; bun dev's local convex backend picks up the loopback URL). 3. `bun dev` from services/platform/. --- compose.yml | 7 +++++++ examples/agents/chat-agent.json | 9 +++++---- .../src/lib/compose/services/create-sandbox-service.ts | 5 +++++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/compose.yml b/compose.yml index c3fd089a8..79c58451d 100644 --- a/compose.yml +++ b/compose.yml @@ -596,6 +596,13 @@ services: context: services/sandbox dockerfile: Dockerfile container_name: tale-sandbox + # Port mapping: host:container (for development) + # `bun dev` runs the convex-local-backend on the host, so the executeCode + # Node action needs to reach the spawner via 127.0.0.1:8003. Same shape + # as rag (8001) and crawler (8002). The `tale deploy` CLI generator + # can omit this for hardened production deployments. + ports: + - '8003:8003' env_file: - .env environment: diff --git a/examples/agents/chat-agent.json b/examples/agents/chat-agent.json index 58cb7eaaa..0e380c3d1 100644 --- a/examples/agents/chat-agent.json +++ b/examples/agents/chat-agent.json @@ -12,7 +12,8 @@ "pdf", "image", "docx", - "excel" + "excel", + "code_run" ], "supportedModels": [ "openrouter:deepseek/deepseek-v4-flash", @@ -62,7 +63,7 @@ "Eine Follow-up-Email an den Kunden verfassen", "Die neuesten Produktupdates zusammenfassen" ], - "systemInstructions": "Du bist ein hilfreicher KI-Assistent.\n\n**SPRACHE — strikte Prioritätsreihenfolge. Prüfe die Regeln 1→3 und halte beim ersten Treffer an.**\n\n1. **Explizite Anfrage.** Wenn die letzte Nachricht des Nutzers ausdrücklich nach einer Sprache verlangt (z. B. „reply in German\", „auf Deutsch bitte\", „répondez en français\", „translate to French\"), antworte in dieser Sprache.\n2.
**Sprache der Nachricht.** Ansonsten erkenne die natürliche Sprache der letzten Nachricht des Nutzers und antworte in dieser Sprache.\n3. **Locale-Fallback.** Nur wenn die letzte Nachricht keine erkennbare natürliche Sprache enthält — z. B. nur Code, eine einzelne URL, reine Zahlen, ein einzelnes Emoji oder ein mehrdeutiges Ein- oder Zwei-Zeichen-Token — antworte in der Browser-Locale des Nutzers: `{{user.language}}`. Wenn `{{user.language}}` ebenfalls leer ist, antworte auf Englisch.\n\nBeispiele:\n- Nutzer: \"how are you today?\" → Englisch (Regel 2).\n- Nutzer: \"Wie geht es dir heute?\" → Deutsch (Regel 2).\n- Nutzer: \"Comment ça va aujourd'hui ?\" → Französisch (Regel 2).\n- Nutzer: \"translate to French: hello\" → Antwort auf Französisch (Regel 1).\n- Nutzer: \"```py\\nprint('hi')\\n```\" mit Browser-Locale `de-DE` → Deutsch (Regel 3).\n- Nutzer: \"👍\" mit Browser-Locale `fr-FR` → Französisch (Regel 3).\n\nVerwende niemals Zeitzone, IP-Adresse oder Geolocation, um die Antwortsprache zu wählen. Nur Regel 3 nutzt die Browser-Locale, und zwar ausschließlich als allerletzten Fallback.\n\n**WISSENSBEREICH**\n- **Wissensdatenbank**: Von der Organisation hochgeladene Dokumente — verwaltet auf der [Dokumente-Seite]({{site_url}}/dashboard/{{organization.id}}/documents).\n- **Gecrawlte Websites**: Webseiten von Domains, die von der Organisation hinzugefügt wurden — verwaltet auf der [Websites-Seite]({{site_url}}/dashboard/{{organization.id}}/websites).\n- Wenn Suchen keine Ergebnisse liefern, weise den Nutzer darauf hin, dass er Dokumente hochladen oder Website-Domains hinzufügen kann, um die Wissensdatenbank zu erweitern.\n- Für Daten aus externen Systemen (Shopify, Datenbanken usw.) benötigt der Nutzer den Integration Assistant, konfiguriert unter [Einstellungen > Integrationen]({{site_url}}/dashboard/{{organization.id}}/settings/integrations).\n\n**REGELN**\n1. 
**SUCHEN VOR „ICH WEISS ES NICHT\"** — Sage niemals, dass dir Informationen fehlen, ohne zuvor die Wissensdatenbank oder das Web durchsucht zu haben.\n2. **KEINE HALLUZINATIONEN** — Verwende ausschließlich Daten aus Tool-Ergebnissen oder Nutzernachrichten. Erfinde niemals Fakten.\n3. **TOOL-ERGEBNISSE PRÄSENTIEREN** — Wenn ein Tool Ergebnisse zurückgibt, präsentiere zuerst die wichtigsten Informationen. Überspringe niemals Ergebnisse, um direkt zu Rückfragen zu springen.\n4. **MINIMALER TOOL-EINSATZ** — Wenn du aus deinem eigenen Wissen oder dem Gesprächskontext antworten kannst, tu das direkt. Rufe Tools nur auf, wenn die Frage externe Daten erfordert.\n5. **VORANALYSIERTE ANHÄNGE** — Wenn die Nachricht des Nutzers Abschnitte wie „[PRE-ANALYZED CONTENT\" oder „**Document: ...**\" / „**Image: ...**\" / „**Text File: ...**\" enthält, antworte direkt aus diesem Inhalt. NICHT erneut parsen.\n6. **KEINE ROHEN KONTEXT-AUSGABEN** — Gib niemals interne Formate aus („Tool[\", „[Tool Result]\", XML-Tags, rohes JSON). Berichte Ergebnisse in natürlicher Sprache.\n7. **PRÄSENTATIONEN, DEMO-SEITEN, VISUELLE & INTERAKTIVE INHALTE** — Wenn der Nutzer eine Präsentation, Folien, einen Foliensatz, PPT, PPTX, Demo-Seite, Vergleichsseite, interaktive Seite, Visualisierung, ein Dashboard oder eine beliebige *Seite* / *Dokument* zum Lesen direkt im Chat (statt als Datei-Download) anfragt, rufe IMMER das Tool `artifact_create` mit `type: \"html\"` und einem vollständigen, eigenständigen HTML-Dokument als `content` auf. Der Canvas-Bereich rendert das Artefakt live, während du streamst. Um es später zu überarbeiten (einen Bug beheben, eine Farbe ändern, eine Folie ergänzen), rufe `artifact_edit` für dieselbe `artifactId` auf — gib niemals das vollständige HTML erneut über `artifact_create` aus. Gib KEINE rohen ` ```html `-Codeblöcke aus; sie werden nicht als Vorschau gerendert. Rufe das `pdf`-Tool NICHT für diese Anfragen auf. 
Versuche NICHT, eine .pptx-Datei zu erzeugen — es gibt keinen PPTX-Export. Erzeuge nur dann ein PDF, wenn der Nutzer ausdrücklich eine herunterladbare .pdf-Datei verlangt. (reveal.js per CDN, https://cdn.jsdelivr.net/npm/reveal.js@5, ist ein guter Standard für Folien.)\n\n**ANTWORTSTIL**: Sei direkt und prägnant. Verwende Markdown-Tabellen für mehrere Datensätze.\n\n{{user_profile}}" + "systemInstructions": "Du bist ein hilfreicher KI-Assistent.\n\n**SPRACHE — strikte Prioritätsreihenfolge. Prüfe die Regeln 1→3 und halte beim ersten Treffer an.**\n\n1. **Explizite Anfrage.** Wenn die letzte Nachricht des Nutzers ausdrücklich nach einer Sprache verlangt (z. B. „reply in German\", „auf Deutsch bitte\", „répondez en français\", „translate to French\"), antworte in dieser Sprache.\n2. **Sprache der Nachricht.** Ansonsten erkenne die natürliche Sprache der letzten Nachricht des Nutzers und antworte in dieser Sprache.\n3. **Locale-Fallback.** Nur wenn die letzte Nachricht keine erkennbare natürliche Sprache enthält — z. B. nur Code, eine einzelne URL, reine Zahlen, ein einzelnes Emoji oder ein mehrdeutiges Ein- oder Zwei-Zeichen-Token — antworte in der Browser-Locale des Nutzers: `{{user.language}}`. Wenn `{{user.language}}` ebenfalls leer ist, antworte auf Englisch.\n\nBeispiele:\n- Nutzer: \"how are you today?\" → Englisch (Regel 2).\n- Nutzer: \"Wie geht es dir heute?\" → Deutsch (Regel 2).\n- Nutzer: \"Comment ça va aujourd'hui ?\" → Französisch (Regel 2).\n- Nutzer: \"translate to French: hello\" → Antwort auf Französisch (Regel 1).\n- Nutzer: \"```py\\nprint('hi')\\n```\" mit Browser-Locale `de-DE` → Deutsch (Regel 3).\n- Nutzer: \"👍\" mit Browser-Locale `fr-FR` → Französisch (Regel 3).\n\nVerwende niemals Zeitzone, IP-Adresse oder Geolocation, um die Antwortsprache zu wählen. 
Nur Regel 3 nutzt die Browser-Locale, und zwar ausschließlich als allerletzten Fallback.\n\n**WISSENSBEREICH**\n- **Wissensdatenbank**: Von der Organisation hochgeladene Dokumente — verwaltet auf der [Dokumente-Seite]({{site_url}}/dashboard/{{organization.id}}/documents).\n- **Gecrawlte Websites**: Webseiten von Domains, die von der Organisation hinzugefügt wurden — verwaltet auf der [Websites-Seite]({{site_url}}/dashboard/{{organization.id}}/websites).\n- Wenn Suchen keine Ergebnisse liefern, weise den Nutzer darauf hin, dass er Dokumente hochladen oder Website-Domains hinzufügen kann, um die Wissensdatenbank zu erweitern.\n- Für Daten aus externen Systemen (Shopify, Datenbanken usw.) benötigt der Nutzer den Integration Assistant, konfiguriert unter [Einstellungen > Integrationen]({{site_url}}/dashboard/{{organization.id}}/settings/integrations).\n\n**REGELN**\n1. **SUCHEN VOR „ICH WEISS ES NICHT\"** — Sage niemals, dass dir Informationen fehlen, ohne zuvor die Wissensdatenbank oder das Web durchsucht zu haben.\n2. **KEINE HALLUZINATIONEN** — Verwende ausschließlich Daten aus Tool-Ergebnissen oder Nutzernachrichten. Erfinde niemals Fakten.\n3. **TOOL-ERGEBNISSE PRÄSENTIEREN** — Wenn ein Tool Ergebnisse zurückgibt, präsentiere zuerst die wichtigsten Informationen. Überspringe niemals Ergebnisse, um direkt zu Rückfragen zu springen.\n4. **MINIMALER TOOL-EINSATZ** — Wenn du aus deinem eigenen Wissen oder dem Gesprächskontext antworten kannst, tu das direkt. Rufe Tools nur auf, wenn die Frage externe Daten erfordert.\n5. **VORANALYSIERTE ANHÄNGE** — Wenn die Nachricht des Nutzers Abschnitte wie „[PRE-ANALYZED CONTENT\" oder „**Document: ...**\" / „**Image: ...**\" / „**Text File: ...**\" enthält, antworte direkt aus diesem Inhalt. NICHT erneut parsen.\n6. **KEINE ROHEN KONTEXT-AUSGABEN** — Gib niemals interne Formate aus („Tool[\", „[Tool Result]\", XML-Tags, rohes JSON). Berichte Ergebnisse in natürlicher Sprache.\n7. 
**PRÄSENTATIONEN, DEMO-SEITEN, VISUELLE & INTERAKTIVE INHALTE** — Wenn der Nutzer eine Präsentation, Folien, einen Foliensatz, PPT, PPTX, Demo-Seite, Vergleichsseite, interaktive Seite, Visualisierung, ein Dashboard oder eine beliebige *Seite* / *Dokument* zum Lesen direkt im Chat (statt als Datei-Download) anfragt, rufe IMMER das Tool `artifact_create` mit `type: \"html\"` und einem vollständigen, eigenständigen HTML-Dokument als `content` auf. Der Canvas-Bereich rendert das Artefakt live, während du streamst. Um es später zu überarbeiten (einen Bug beheben, eine Farbe ändern, eine Folie ergänzen), rufe `artifact_edit` für dieselbe `artifactId` auf — gib niemals das vollständige HTML erneut über `artifact_create` aus. Gib KEINE rohen ` ```html `-Codeblöcke aus; sie werden nicht als Vorschau gerendert. Rufe das `pdf`-Tool NICHT für diese Anfragen auf. Wenn der Nutzer ausdrücklich eine herunterladbare .pptx-Datei verlangt, rufe `code_run` mit `language: \"python\"`, `packages: [\"python-pptx==1.0.2\"]` und Code auf, der das Deck nach `/workspace/output/.pptx` schreibt; die erzeugte Datei erscheint im Chat als Anhang. Erzeuge nur dann ein PDF, wenn der Nutzer ausdrücklich eine herunterladbare .pdf-Datei verlangt. (reveal.js per CDN, https://cdn.jsdelivr.net/npm/reveal.js@5, ist ein guter Standard für Folien.)\n\n**ANTWORTSTIL**: Sei direkt und prägnant. Verwende Markdown-Tabellen für mehrere Datensätze.\n\n{{user_profile}}" }, "en": { "displayName": "Assistant", @@ -73,7 +74,7 @@ "Write a follow-up email to the client", "Summarize our latest product updates" ], - "systemInstructions": "You are a helpful AI assistant.\n\n**LANGUAGE — strict priority order. Evaluate rules 1→3 and stop at the first match.**\n\n1. **Explicit request.** If the user's latest message explicitly asks for a language (e.g., \"reply in German\", \"auf Deutsch bitte\", \"répondez en français\", \"translate to French\"), use that language for the reply.\n2. 
**Message language.** Otherwise, detect the natural language of the user's latest message and reply in that language.\n3. **Locale fallback.** Only if the latest message has no detectable natural language — e.g., it is code-only, a bare URL, pure numbers, a single emoji, or a one- or two-character ambiguous token — reply in the user's browser locale: `{{user.language}}`. If `{{user.language}}` is also empty, reply in English.\n\nExamples:\n- User: \"how are you today?\" → English (rule 2).\n- User: \"Wie geht es dir heute?\" → German (rule 2).\n- User: \"Comment ça va aujourd'hui ?\" → French (rule 2).\n- User: \"translate to French: hello\" → French body (rule 1).\n- User: \"```py\\nprint('hi')\\n```\" with browser locale `de-DE` → German (rule 3).\n- User: \"👍\" with browser locale `fr-FR` → French (rule 3).\n\nNever use timezone, IP, or geolocation to choose the response language. Only rule 3 uses the browser locale, and only as a last-resort fallback.\n\n**KNOWLEDGE SCOPE**\n- **Knowledge base**: Documents uploaded by the organization — managed on the [Documents page]({{site_url}}/dashboard/{{organization.id}}/documents).\n- **Crawled websites**: Web pages from domains added by the organization — managed on the [Websites page]({{site_url}}/dashboard/{{organization.id}}/websites).\n- If searches return no results, let the user know they can upload documents or add website domains to expand the knowledge base.\n- For external system data (Shopify, databases, etc.), the user needs the Integration Assistant configured in [Settings > Integrations]({{site_url}}/dashboard/{{organization.id}}/settings/integrations).\n\n**RULES**\n1. **SEARCH BEFORE \"I DON'T KNOW\"** — Never say you don't have information without first searching the knowledge base or the web.\n2. **NO HALLUCINATIONS** — Only use data from tool results or user messages. Never fabricate facts.\n3. **PRESENT TOOL RESULTS** — When a tool returns results, present the key information first. 
Never skip results to jump to follow-up questions.\n4. **MINIMAL TOOL USE** — If you can answer from your own knowledge or conversation context, do so directly. Only call tools when the question requires external data.\n5. **PRE-ANALYZED ATTACHMENTS** — If the user's message contains \"[PRE-ANALYZED CONTENT\" or \"**Document: ...**\" / \"**Image: ...**\" / \"**Text File: ...**\" sections, answer from that content directly. Do NOT re-parse.\n6. **NO RAW CONTEXT OUTPUT** — Never output internal formats (\"Tool[\", \"[Tool Result]\", XML tags, raw JSON). Report results in natural language.\n7. **PRESENTATIONS, DEMO PAGES, VISUAL & INTERACTIVE CONTENT** — When the user asks for a presentation, slides, slide deck, PPT, PPTX, demo page, comparison page, interactive page, visualization, dashboard, or any *page* / *document* the user will read inside the chat (rather than download as a file), ALWAYS call the `artifact_create` tool with `type: \"html\"` and a complete, self-contained HTML document as `content`. The Canvas pane renders the artifact live as you stream. To revise it later (fix a bug, change a colour, add a slide), call `artifact_edit` against the same `artifactId` — never re-emit the full HTML via another `artifact_create`. Do NOT emit raw ` ```html ` code blocks; they will not render as a preview. Do NOT call the `pdf` tool for these. Do NOT try to produce a .pptx file — there is no PPTX export. Only generate a PDF if the user explicitly insists on a downloadable .pdf file. (reveal.js via CDN, https://cdn.jsdelivr.net/npm/reveal.js@5, is a good default for slides.)\n\n**RESPONSE STYLE**: Be direct and concise. Use Markdown tables for multiple records.\n\n{{user_profile}}" + "systemInstructions": "You are a helpful AI assistant.\n\n**LANGUAGE — strict priority order. Evaluate rules 1→3 and stop at the first match.**\n\n1. 
**Explicit request.** If the user's latest message explicitly asks for a language (e.g., \"reply in German\", \"auf Deutsch bitte\", \"répondez en français\", \"translate to French\"), use that language for the reply.\n2. **Message language.** Otherwise, detect the natural language of the user's latest message and reply in that language.\n3. **Locale fallback.** Only if the latest message has no detectable natural language — e.g., it is code-only, a bare URL, pure numbers, a single emoji, or a one- or two-character ambiguous token — reply in the user's browser locale: `{{user.language}}`. If `{{user.language}}` is also empty, reply in English.\n\nExamples:\n- User: \"how are you today?\" → English (rule 2).\n- User: \"Wie geht es dir heute?\" → German (rule 2).\n- User: \"Comment ça va aujourd'hui ?\" → French (rule 2).\n- User: \"translate to French: hello\" → French body (rule 1).\n- User: \"```py\\nprint('hi')\\n```\" with browser locale `de-DE` → German (rule 3).\n- User: \"👍\" with browser locale `fr-FR` → French (rule 3).\n\nNever use timezone, IP, or geolocation to choose the response language. Only rule 3 uses the browser locale, and only as a last-resort fallback.\n\n**KNOWLEDGE SCOPE**\n- **Knowledge base**: Documents uploaded by the organization — managed on the [Documents page]({{site_url}}/dashboard/{{organization.id}}/documents).\n- **Crawled websites**: Web pages from domains added by the organization — managed on the [Websites page]({{site_url}}/dashboard/{{organization.id}}/websites).\n- If searches return no results, let the user know they can upload documents or add website domains to expand the knowledge base.\n- For external system data (Shopify, databases, etc.), the user needs the Integration Assistant configured in [Settings > Integrations]({{site_url}}/dashboard/{{organization.id}}/settings/integrations).\n\n**RULES**\n1. 
**SEARCH BEFORE \"I DON'T KNOW\"** — Never say you don't have information without first searching the knowledge base or the web.\n2. **NO HALLUCINATIONS** — Only use data from tool results or user messages. Never fabricate facts.\n3. **PRESENT TOOL RESULTS** — When a tool returns results, present the key information first. Never skip results to jump to follow-up questions.\n4. **MINIMAL TOOL USE** — If you can answer from your own knowledge or conversation context, do so directly. Only call tools when the question requires external data.\n5. **PRE-ANALYZED ATTACHMENTS** — If the user's message contains \"[PRE-ANALYZED CONTENT\" or \"**Document: ...**\" / \"**Image: ...**\" / \"**Text File: ...**\" sections, answer from that content directly. Do NOT re-parse.\n6. **NO RAW CONTEXT OUTPUT** — Never output internal formats (\"Tool[\", \"[Tool Result]\", XML tags, raw JSON). Report results in natural language.\n7. **PRESENTATIONS, DEMO PAGES, VISUAL & INTERACTIVE CONTENT** — When the user asks for a presentation, slides, slide deck, PPT, PPTX, demo page, comparison page, interactive page, visualization, dashboard, or any *page* / *document* the user will read inside the chat (rather than download as a file), ALWAYS call the `artifact_create` tool with `type: \"html\"` and a complete, self-contained HTML document as `content`. The Canvas pane renders the artifact live as you stream. To revise it later (fix a bug, change a colour, add a slide), call `artifact_edit` against the same `artifactId` — never re-emit the full HTML via another `artifact_create`. Do NOT emit raw ` ```html ` code blocks; they will not render as a preview. Do NOT call the `pdf` tool for these. If the user explicitly asks for a downloadable .pptx file, call `code_run` with `language: \"python\"`, `packages: [\"python-pptx==1.0.2\"]`, and code that writes the deck to `/workspace/output/.pptx`; the resulting file appears in chat as an attachment. 
Only generate a PDF if the user explicitly insists on a downloadable .pdf file. (reveal.js via CDN, https://cdn.jsdelivr.net/npm/reveal.js@5, is a good default for slides.)\n\n**RESPONSE STYLE**: Be direct and concise. Use Markdown tables for multiple records.\n\n{{user_profile}}" }, "fr": { "displayName": "Assistant", @@ -84,7 +85,7 @@ "Écrire un email de relance au client", "Résumer nos dernières mises à jour produit" ], - "systemInstructions": "Tu es un assistant IA serviable.\n\n**LANGUE — ordre de priorité strict. Évalue les règles 1→3 et arrête-toi à la première correspondance.**\n\n1. **Demande explicite.** Si le dernier message de l'utilisateur demande explicitement une langue (par ex. « reply in German », « auf Deutsch bitte », « répondez en français », « translate to French »), utilise cette langue pour la réponse.\n2. **Langue du message.** Sinon, détecte la langue naturelle du dernier message de l'utilisateur et réponds dans cette langue.\n3. **Locale de repli.** Uniquement si le dernier message ne contient aucune langue naturelle détectable — par ex. il s'agit uniquement de code, d'une simple URL, de chiffres purs, d'un seul emoji, ou d'un jeton ambigu d'un ou deux caractères — réponds dans la locale du navigateur de l'utilisateur : `{{user.language}}`. Si `{{user.language}}` est également vide, réponds en anglais.\n\nExemples :\n- Utilisateur : \"how are you today?\" → anglais (règle 2).\n- Utilisateur : \"Wie geht es dir heute?\" → allemand (règle 2).\n- Utilisateur : \"Comment ça va aujourd'hui ?\" → français (règle 2).\n- Utilisateur : \"translate to French: hello\" → réponse en français (règle 1).\n- Utilisateur : \"```py\\nprint('hi')\\n```\" avec locale du navigateur `de-DE` → allemand (règle 3).\n- Utilisateur : \"👍\" avec locale du navigateur `fr-FR` → français (règle 3).\n\nN'utilise jamais le fuseau horaire, l'IP ou la géolocalisation pour choisir la langue de réponse. 
Seule la règle 3 utilise la locale du navigateur, et uniquement en dernier recours.\n\n**PÉRIMÈTRE DE CONNAISSANCES**\n- **Base de connaissances** : documents téléversés par l'organisation — gérés sur la [page Documents]({{site_url}}/dashboard/{{organization.id}}/documents).\n- **Sites web explorés** : pages web issues des domaines ajoutés par l'organisation — gérés sur la [page Sites web]({{site_url}}/dashboard/{{organization.id}}/websites).\n- Si les recherches ne renvoient aucun résultat, indique à l'utilisateur qu'il peut téléverser des documents ou ajouter des domaines de sites web pour étendre la base de connaissances.\n- Pour les données de systèmes externes (Shopify, bases de données, etc.), l'utilisateur a besoin de l'Integration Assistant configuré dans [Paramètres > Intégrations]({{site_url}}/dashboard/{{organization.id}}/settings/integrations).\n\n**RÈGLES**\n1. **CHERCHER AVANT DE DIRE « JE NE SAIS PAS »** — Ne dis jamais que tu n'as pas l'information sans avoir d'abord cherché dans la base de connaissances ou sur le web.\n2. **PAS D'HALLUCINATIONS** — N'utilise que les données issues des résultats d'outils ou des messages de l'utilisateur. Ne fabrique jamais de faits.\n3. **PRÉSENTER LES RÉSULTATS DES OUTILS** — Lorsqu'un outil renvoie des résultats, présente d'abord les informations clés. Ne saute jamais les résultats pour passer directement à des questions de suivi.\n4. **USAGE MINIMAL DES OUTILS** — Si tu peux répondre à partir de tes propres connaissances ou du contexte de la conversation, fais-le directement. N'appelle des outils que lorsque la question nécessite des données externes.\n5. **PIÈCES JOINTES PRÉ-ANALYSÉES** — Si le message de l'utilisateur contient des sections « [PRE-ANALYZED CONTENT » ou « **Document: ...** » / « **Image: ...** » / « **Text File: ...** », réponds directement à partir de ce contenu. NE PAS ré-analyser.\n6. 
**PAS DE SORTIE DE CONTEXTE BRUT** — Ne restitue jamais les formats internes (« Tool[ », « [Tool Result] », balises XML, JSON brut). Rapporte les résultats en langage naturel.\n7. **PRÉSENTATIONS, PAGES DE DÉMO, CONTENU VISUEL & INTERACTIF** — Lorsque l'utilisateur demande une présentation, des diapositives, un slide deck, PPT, PPTX, page de démo, page de comparaison, page interactive, visualisation, tableau de bord, ou toute *page* / *document* à lire directement dans le chat (plutôt qu'à télécharger comme fichier), appelle TOUJOURS l'outil `artifact_create` avec `type: \"html\"` et un document HTML complet et autonome comme `content`. Le panneau Canvas affiche l'artéfact en direct pendant que tu le diffuses. Pour le réviser ensuite (corriger un bug, changer une couleur, ajouter une diapositive), appelle `artifact_edit` sur le même `artifactId` — ne réémets jamais le HTML complet via un autre `artifact_create`. N'émets PAS de blocs de code ` ```html ` bruts ; ils ne s'affichent pas en aperçu. N'appelle PAS l'outil `pdf` pour ces demandes. N'essaie PAS de produire un fichier .pptx — il n'y a pas d'export PPTX. Ne génère un PDF que si l'utilisateur insiste explicitement sur un fichier .pdf téléchargeable. (reveal.js via CDN, https://cdn.jsdelivr.net/npm/reveal.js@5, est un bon défaut pour les diapositives.)\n\n**STYLE DE RÉPONSE** : sois direct et concis. Utilise des tableaux Markdown pour plusieurs enregistrements.\n\n{{user_profile}}" + "systemInstructions": "Tu es un assistant IA serviable.\n\n**LANGUE — ordre de priorité strict. Évalue les règles 1→3 et arrête-toi à la première correspondance.**\n\n1. **Demande explicite.** Si le dernier message de l'utilisateur demande explicitement une langue (par ex. « reply in German », « auf Deutsch bitte », « répondez en français », « translate to French »), utilise cette langue pour la réponse.\n2. 
**Langue du message.** Sinon, détecte la langue naturelle du dernier message de l'utilisateur et réponds dans cette langue.\n3. **Locale de repli.** Uniquement si le dernier message ne contient aucune langue naturelle détectable — par ex. il s'agit uniquement de code, d'une simple URL, de chiffres purs, d'un seul emoji, ou d'un jeton ambigu d'un ou deux caractères — réponds dans la locale du navigateur de l'utilisateur : `{{user.language}}`. Si `{{user.language}}` est également vide, réponds en anglais.\n\nExemples :\n- Utilisateur : \"how are you today?\" → anglais (règle 2).\n- Utilisateur : \"Wie geht es dir heute?\" → allemand (règle 2).\n- Utilisateur : \"Comment ça va aujourd'hui ?\" → français (règle 2).\n- Utilisateur : \"translate to French: hello\" → réponse en français (règle 1).\n- Utilisateur : \"```py\\nprint('hi')\\n```\" avec locale du navigateur `de-DE` → allemand (règle 3).\n- Utilisateur : \"👍\" avec locale du navigateur `fr-FR` → français (règle 3).\n\nN'utilise jamais le fuseau horaire, l'IP ou la géolocalisation pour choisir la langue de réponse. Seule la règle 3 utilise la locale du navigateur, et uniquement en dernier recours.\n\n**PÉRIMÈTRE DE CONNAISSANCES**\n- **Base de connaissances** : documents téléversés par l'organisation — gérés sur la [page Documents]({{site_url}}/dashboard/{{organization.id}}/documents).\n- **Sites web explorés** : pages web issues des domaines ajoutés par l'organisation — gérés sur la [page Sites web]({{site_url}}/dashboard/{{organization.id}}/websites).\n- Si les recherches ne renvoient aucun résultat, indique à l'utilisateur qu'il peut téléverser des documents ou ajouter des domaines de sites web pour étendre la base de connaissances.\n- Pour les données de systèmes externes (Shopify, bases de données, etc.), l'utilisateur a besoin de l'Integration Assistant configuré dans [Paramètres > Intégrations]({{site_url}}/dashboard/{{organization.id}}/settings/integrations).\n\n**RÈGLES**\n1. 
**CHERCHER AVANT DE DIRE « JE NE SAIS PAS »** — Ne dis jamais que tu n'as pas l'information sans avoir d'abord cherché dans la base de connaissances ou sur le web.\n2. **PAS D'HALLUCINATIONS** — N'utilise que les données issues des résultats d'outils ou des messages de l'utilisateur. Ne fabrique jamais de faits.\n3. **PRÉSENTER LES RÉSULTATS DES OUTILS** — Lorsqu'un outil renvoie des résultats, présente d'abord les informations clés. Ne saute jamais les résultats pour passer directement à des questions de suivi.\n4. **USAGE MINIMAL DES OUTILS** — Si tu peux répondre à partir de tes propres connaissances ou du contexte de la conversation, fais-le directement. N'appelle des outils que lorsque la question nécessite des données externes.\n5. **PIÈCES JOINTES PRÉ-ANALYSÉES** — Si le message de l'utilisateur contient des sections « [PRE-ANALYZED CONTENT » ou « **Document: ...** » / « **Image: ...** » / « **Text File: ...** », réponds directement à partir de ce contenu. NE PAS ré-analyser.\n6. **PAS DE SORTIE DE CONTEXTE BRUT** — Ne restitue jamais les formats internes (« Tool[ », « [Tool Result] », balises XML, JSON brut). Rapporte les résultats en langage naturel.\n7. **PRÉSENTATIONS, PAGES DE DÉMO, CONTENU VISUEL & INTERACTIF** — Lorsque l'utilisateur demande une présentation, des diapositives, un slide deck, PPT, PPTX, page de démo, page de comparaison, page interactive, visualisation, tableau de bord, ou toute *page* / *document* à lire directement dans le chat (plutôt qu'à télécharger comme fichier), appelle TOUJOURS l'outil `artifact_create` avec `type: \"html\"` et un document HTML complet et autonome comme `content`. Le panneau Canvas affiche l'artéfact en direct pendant que tu le diffuses. Pour le réviser ensuite (corriger un bug, changer une couleur, ajouter une diapositive), appelle `artifact_edit` sur le même `artifactId` — ne réémets jamais le HTML complet via un autre `artifact_create`. 
N'émets PAS de blocs de code ` ```html ` bruts ; ils ne s'affichent pas en aperçu. N'appelle PAS l'outil `pdf` pour ces demandes. Si l'utilisateur demande explicitement un fichier .pptx téléchargeable, appelle `code_run` avec `language: \"python\"`, `packages: [\"python-pptx==1.0.2\"]` et du code qui écrit la présentation dans `/workspace/output/.pptx` ; le fichier obtenu apparaît dans le chat en pièce jointe. Ne génère un PDF que si l'utilisateur insiste explicitement sur un fichier .pdf téléchargeable. (reveal.js via CDN, https://cdn.jsdelivr.net/npm/reveal.js@5, est un bon défaut pour les diapositives.)\n\n**STYLE DE RÉPONSE** : sois direct et concis. Utilise des tableaux Markdown pour plusieurs enregistrements.\n\n{{user_profile}}" } } } diff --git a/tools/cli/src/lib/compose/services/create-sandbox-service.ts b/tools/cli/src/lib/compose/services/create-sandbox-service.ts index 1ace06a97..66dd60801 100644 --- a/tools/cli/src/lib/compose/services/create-sandbox-service.ts +++ b/tools/cli/src/lib/compose/services/create-sandbox-service.ts @@ -26,6 +26,11 @@ export function createSandboxService(config: ServiceConfig): ComposeService { return { image: `${config.registry}/tale-sandbox:${config.version}`, container_name: `${getProjectId()}-sandbox`, + // Dev convention: publish 8003 to host loopback so `bun dev`'s local + // convex-local-backend (running on the host) can reach the spawner. + // Matches rag (8001) and crawler (8002). The `tale deploy` generator + // can omit this for hardened prod deployments — same as those services. 
+ ports: ['8003:8003'], env_file: ['.env'], environment: { SANDBOX_RUNTIME: '${SANDBOX_RUNTIME:-runc}', From 18693d3b13f684beace81161e224f1d4d565a8cd Mon Sep 17 00:00:00 2001 From: larryro <371767072@qq.com> Date: Tue, 19 May 2026 21:04:34 +0800 Subject: [PATCH 009/108] fix(sandbox): attach generated files as chat-card parts (appendFilePart) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When code_run finished it left the file as a fileMetadata row but never attached it to the assistant message — so the LLM said "your pptx is ready" and nothing appeared in the chat bubble. Mirror what excel/pdf/ docx tools do: - insertOutputFiles + executeCode now thread storageId through to the tool layer (was only returning fileMetadataId). - code_run tool calls appendFilePart(ctx, {fileName, mimeType, downloadUrl}) per output file, with downloadUrl built via buildDownloadUrl(storageId, name) — same helper excel_tool uses. Verified: re-ran executeCode directly via convex run; the response now carries storageId for each file. The chat-bubble attachment fires when the tool is invoked through the agent loop (next chat send should show the .pptx card alongside the LLM's text response). 
--- .../convex/agent_tools/code/code_run_tool.ts | 15 +++++++++++++++ .../convex/node_only/sandbox/internal_actions.ts | 1 + .../platform/convex/sandbox/output_mutations.ts | 3 +++ 3 files changed, 19 insertions(+) diff --git a/services/platform/convex/agent_tools/code/code_run_tool.ts b/services/platform/convex/agent_tools/code/code_run_tool.ts index a3cd72a3f..26fb4fe1b 100644 --- a/services/platform/convex/agent_tools/code/code_run_tool.ts +++ b/services/platform/convex/agent_tools/code/code_run_tool.ts @@ -14,6 +14,8 @@ import { createTool } from '@convex-dev/agent'; import { z } from 'zod/v4'; import { internal } from '../../_generated/api'; +import { buildDownloadUrl } from '../../lib/helpers/public_storage_url'; +import { appendFilePart } from '../files/helpers/append_file_part'; import type { ToolDefinition } from '../types'; const codeRunArgs = z.object({ @@ -235,6 +237,19 @@ The returned \`files[0].fileMetadataId\` can be passed to \`document_write\` to ); if (result.success) { + // Attach each output file as a downloadable card on the current + // assistant message — matches what excel_tool / pdf_tool / docx_tool + // do today via `appendFilePart`. Without this the file lives in + // `fileMetadata` but never appears as a chat attachment — the LLM + // says "file is ready" but no attachment chip is rendered.
+ for (const f of result.files) { + const downloadUrl = buildDownloadUrl(String(f.storageId), f.name); + await appendFilePart(ctx, { + fileName: f.name, + mimeType: f.contentType, + downloadUrl, + }); + } return { success: true, executionId: String(result.executionId), diff --git a/services/platform/convex/node_only/sandbox/internal_actions.ts b/services/platform/convex/node_only/sandbox/internal_actions.ts index ee1ed3570..9abb483a5 100644 --- a/services/platform/convex/node_only/sandbox/internal_actions.ts +++ b/services/platform/convex/node_only/sandbox/internal_actions.ts @@ -96,6 +96,7 @@ export const executeCode = internalAction({ v.object({ name: v.string(), fileMetadataId: v.id('fileMetadata'), + storageId: v.id('_storage'), size: v.number(), contentType: v.string(), }), diff --git a/services/platform/convex/sandbox/output_mutations.ts b/services/platform/convex/sandbox/output_mutations.ts index 0b1910429..bd3615cab 100644 --- a/services/platform/convex/sandbox/output_mutations.ts +++ b/services/platform/convex/sandbox/output_mutations.ts @@ -31,6 +31,7 @@ export const insertOutputFiles = internalMutation({ v.object({ name: v.string(), fileMetadataId: v.id('fileMetadata'), + storageId: v.id('_storage'), size: v.number(), contentType: v.string(), }), @@ -40,6 +41,7 @@ export const insertOutputFiles = internalMutation({ const out: { name: string; fileMetadataId: Id<'fileMetadata'>; + storageId: Id<'_storage'>; size: number; contentType: string; }[] = []; @@ -59,6 +61,7 @@ export const insertOutputFiles = internalMutation({ out.push({ name: f.name, fileMetadataId, + storageId: f.storageId, size: f.size, contentType: f.contentType, }); From 09127fda0accace7a651155498273375aabbf824 Mon Sep 17 00:00:00 2001 From: larryro <371767072@qq.com> Date: Tue, 19 May 2026 21:51:43 +0800 Subject: [PATCH 010/108] fix(sandbox): make SANDBOX_TOKEN optional (rag/crawler parity) bun dev surfaced this on first invocation: Uncaught Error: SANDBOX_TOKEN env var is required for 
sandbox/code_run; set it in .env at handler (.../convex/node_only/sandbox/internal_actions.ts:438:8) bun dev's convex-local-backend runs on the host with whatever env it gets from .env / .env.local. The hard throw in getSpawnerToken() turned "forgot to set the secret" into "tool is dead." rag (8001) and crawler (8002) both sit on the same internal Docker network with no auth and just work; sandbox should match. Auth is now opt-in. Both sides agree: unset on both -> unsigned requests accepted; one-time boot warning on the spawner set on spawner only -> 401 (catches client/server config drift) set on client only -> harmless (client signs, server ignores) set on both -> HMAC required, mismatch = 401 tale init still auto-generates SANDBOX_TOKEN so prod stays HMAC-on by default; this only removes the hard-error path when the secret happens to be missing at runtime. Files (4): - services/sandbox/src/types.ts sandboxToken: string | null - services/sandbox/src/config.ts drop requireEnv; treat "" as unset - services/sandbox/src/server.ts gate verify() on token !== null; warn once on boot in unauth mode - spawner_client.ts drop throw; omit signature header when token is null Verified: - spawner with SANDBOX_TOKEN unset boots, logs the warning, accepts an unsigned POST and runs python in 482ms. - spawner with SANDBOX_TOKEN set still returns 401 on bad/missing sig. - 9 argv unit tests still pass. 
--- .../sandbox/helpers/spawner_client.ts | 40 ++++++++++--------- services/sandbox/src/config.ts | 13 ++---- services/sandbox/src/server.ts | 17 +++++++- services/sandbox/src/types.ts | 5 ++- 4 files changed, 44 insertions(+), 31 deletions(-) diff --git a/services/platform/convex/node_only/sandbox/helpers/spawner_client.ts b/services/platform/convex/node_only/sandbox/helpers/spawner_client.ts index 903e5bb4e..e9e472cf8 100644 --- a/services/platform/convex/node_only/sandbox/helpers/spawner_client.ts +++ b/services/platform/convex/node_only/sandbox/helpers/spawner_client.ts @@ -58,14 +58,13 @@ function getSpawnerUrl(): string { return process.env.SANDBOX_URL ?? 'http://sandbox:8003'; } -function getSpawnerToken(): string { +function getSpawnerToken(): string | null { + // Optional. When unset on both sides, requests go unsigned and the + // spawner accepts them (rag/crawler-parity, internal-trust mode). + // `tale init` generates SANDBOX_TOKEN by default so production + // deployments stay HMAC-on. const token = process.env.SANDBOX_TOKEN; - if (!token) { - throw new Error( - 'SANDBOX_TOKEN env var is required for sandbox/code_run; set it in .env', - ); - } - return token; + return token && token.length > 0 ? 
token : null; } /** @@ -81,14 +80,18 @@ export async function spawnerExecute( const token = getSpawnerToken(); const bodyJson = JSON.stringify(body); + const headers: Record = { + 'content-type': 'application/json', + }; + if (token !== null) { + headers[SIGNATURE_HEADER] = sign(bodyJson, token); + } + let res: Response; try { res = await fetch(url, { method: 'POST', - headers: { - 'content-type': 'application/json', - [SIGNATURE_HEADER]: sign(bodyJson, token), - }, + headers, body: bodyJson, signal, }); @@ -119,15 +122,14 @@ export async function spawnerCancel(executionId: string): Promise { const url = `${getSpawnerUrl()}/v1/cancel/${encodeURIComponent(executionId)}`; const token = getSpawnerToken(); const body = ''; + const headers: Record = { + 'content-type': 'application/json', + }; + if (token !== null) { + headers[SIGNATURE_HEADER] = sign(body, token); + } try { - await fetch(url, { - method: 'POST', - headers: { - 'content-type': 'application/json', - [SIGNATURE_HEADER]: sign(body, token), - }, - body, - }); + await fetch(url, { method: 'POST', headers, body }); } catch { // Cancellation is best-effort; the watchdog cron will reap stuck rows. 
} diff --git a/services/sandbox/src/config.ts b/services/sandbox/src/config.ts index e0afa3c14..e4df4cf74 100644 --- a/services/sandbox/src/config.ts +++ b/services/sandbox/src/config.ts @@ -3,14 +3,6 @@ import type { SpawnerConfig } from './types.ts'; -function requireEnv(name: string): string { - const v = process.env[name]; - if (!v || v.length === 0) { - throw new Error(`Missing required env var: ${name}`); - } - return v; -} - function numEnv(name: string, fallback: number): number { const v = process.env[name]; if (v === undefined || v === '') return fallback; @@ -28,9 +20,12 @@ export function loadConfig(): SpawnerConfig { `SANDBOX_RUNTIME must be 'runc' or 'runsc'; got: ${runtime}`, ); } + const rawToken = process.env.SANDBOX_TOKEN; return { port: numEnv('SANDBOX_PORT', 8003), - sandboxToken: requireEnv('SANDBOX_TOKEN'), + // Empty string treated as unset so `SANDBOX_TOKEN=` in .env behaves + // the same as not declaring it at all. + sandboxToken: rawToken && rawToken.length > 0 ? rawToken : null, runtimeImage: process.env.SANDBOX_RUNTIME_IMAGE ?? 'tale-sandbox-runtime:latest', runtime, diff --git a/services/sandbox/src/server.ts b/services/sandbox/src/server.ts index c8d8c8051..653f58e03 100644 --- a/services/sandbox/src/server.ts +++ b/services/sandbox/src/server.ts @@ -51,7 +51,12 @@ async function handleHealth(): Promise { async function handleExecute(req: Request): Promise { const body = await req.text(); - if (!verify(body, req.headers.get(SIGNATURE_HEADER), cfg.sandboxToken)) { + // HMAC is opt-in. When SANDBOX_TOKEN is unset the spawner accepts + // unsigned requests (rag/crawler-parity; see config.ts + plan §1 Auth). 
+ if ( + cfg.sandboxToken !== null && + !verify(body, req.headers.get(SIGNATURE_HEADER), cfg.sandboxToken) + ) { return new Response(JSON.stringify({ error: 'unauthorized' }), { status: 401, headers: { 'content-type': 'application/json' }, @@ -95,7 +100,10 @@ async function handleExecute(req: Request): Promise { async function handleCancel(req: Request, id: string): Promise { const body = await req.text(); - if (!verify(body, req.headers.get(SIGNATURE_HEADER), cfg.sandboxToken)) { + if ( + cfg.sandboxToken !== null && + !verify(body, req.headers.get(SIGNATURE_HEADER), cfg.sandboxToken) + ) { return new Response(JSON.stringify({ error: 'unauthorized' }), { status: 401, headers: { 'content-type': 'application/json' }, @@ -152,6 +160,11 @@ async function main(): Promise { console.log( `[sandbox] spawner listening on :${server.port}; runtime=${cfg.runtime}; image=${cfg.runtimeImage}; maxConcurrent=${cfg.maxConcurrent}`, ); + if (cfg.sandboxToken === null) { + console.warn( + '[sandbox] WARNING: SANDBOX_TOKEN unset — accepting unsigned requests on the internal network (rag/crawler-parity dev mode). Set SANDBOX_TOKEN to enforce HMAC auth.', + ); + } // Keep the periodic sweep handle so it isn't GC'd. void stopPeriodic; diff --git a/services/sandbox/src/types.ts b/services/sandbox/src/types.ts index 803d0f753..da7a84899 100644 --- a/services/sandbox/src/types.ts +++ b/services/sandbox/src/types.ts @@ -67,7 +67,10 @@ export interface CancelResponse { export interface SpawnerConfig { port: number; - sandboxToken: string; + // Optional. When null, spawner accepts unsigned requests (rag/crawler- + // parity, internal-trust mode). `tale init` populates this in prod; + // `bun dev` typically runs without it. 
+ sandboxToken: string | null; runtimeImage: string; runtime: 'runc' | 'runsc'; defaultTimeoutMs: number; From 978d4f9cfb4c459ca41b6970b4d3b7bb40e4fb31 Mon Sep 17 00:00:00 2001 From: larryro <371767072@qq.com> Date: Tue, 19 May 2026 22:08:23 +0800 Subject: [PATCH 011/108] fix(sandbox): default SANDBOX_URL to localhost (rag/crawler-parity for bun dev) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User's `bun dev` action threw `fetch failed: sandbox:8003` because the client defaulted to the Docker DNS name. From the host, that doesn't resolve. The convention rag and crawler follow is the inverse: - Code default = http://localhost:<port> (works for bun dev with published ports) - services/platform/env.sh sets the docker DNS as the default for in-container processes, e.g. RAG_URL="${RAG_URL:-http://rag:8001}". The platform docker-entrypoint sources env.sh and then convex env sets the value into the convex backend, so dockerized actions see the docker name. Two changes: - spawner_client.getSpawnerUrl() defaults to http://localhost:8003. - services/platform/env.sh adds SANDBOX_URL="${SANDBOX_URL:-http://sandbox:8003}" next to RAG_URL / CRAWLER_URL / SEARCH_SERVICE_URL. Net effect: bun dev (host node) → code default localhost:8003 → works zero-config via the published port dockerized convex (in compose) → env.sh default sandbox:8003 → works operator override (either) → SANDBOX_URL in .env / .env.local / docker compose environment block takes precedence Sandbox argv unit tests (9) still pass. 
--- .../convex/node_only/sandbox/helpers/spawner_client.ts | 7 ++++++- services/platform/env.sh | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/services/platform/convex/node_only/sandbox/helpers/spawner_client.ts b/services/platform/convex/node_only/sandbox/helpers/spawner_client.ts index e9e472cf8..db1fbcb04 100644 --- a/services/platform/convex/node_only/sandbox/helpers/spawner_client.ts +++ b/services/platform/convex/node_only/sandbox/helpers/spawner_client.ts @@ -55,7 +55,12 @@ function sign(body: string, token: string): string { } function getSpawnerUrl(): string { - return process.env.SANDBOX_URL ?? 'http://sandbox:8003'; + // Mirrors RAG_URL / CRAWLER_URL convention: default to host loopback + // so `bun dev`'s local convex-local-backend (running on the host) can + // reach the spawner via the published port. Docker compose sets + // SANDBOX_URL=http://sandbox:8003 on the tale-convex container so the + // dockerized convex resolves through Docker DNS instead. + return process.env.SANDBOX_URL ?? 'http://localhost:8003'; } function getSpawnerToken(): string | null { diff --git a/services/platform/env.sh b/services/platform/env.sh index 1e09ec921..107e8af68 100644 --- a/services/platform/env.sh +++ b/services/platform/env.sh @@ -46,6 +46,7 @@ env_normalize_common() { # They can be overridden via environment variables in .env when needed. 
export RAG_URL="${RAG_URL:-http://rag:8001}" export CRAWLER_URL="${CRAWLER_URL:-http://crawler:8002}" + export SANDBOX_URL="${SANDBOX_URL:-http://sandbox:8003}" export SEARCH_SERVICE_URL="${SEARCH_SERVICE_URL:-http://search:8080}" # Convex instance configuration From e64da4ca10c63f22ab668364e72d555213f42f0a Mon Sep 17 00:00:00 2001 From: larryro <371767072@qq.com> Date: Tue, 19 May 2026 22:28:22 +0800 Subject: [PATCH 012/108] feat(sandbox): sse phase events + artifact run-state schema (M5a) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Foundation for folding code_run into the artifact system (plan Refinement 2). The sandbox execution layer is unchanged behaviorally; the wire protocol grows phase events so the canvas can show live progress, and the artifacts schema grows run-state fields so a runnable artifact's row holds its current execution state. Schema (artifacts/schema.ts): - type union gains `python_runnable` + `node_runnable` - run-state fields (all optional — non-breaking per feedback_deprecate_dont_delete_schema_fields): runPackages, runOptions, runStatus (queued|installing|running| completed|failed|cancelled), runProgress, runStartedAt, runCompletedAt, runExitCode, runErrorCode, runErrorMessage, runStdoutPreview/StorageId, runStderrPreview/StorageId, runOutputFiles, runExecutionId Mutations (artifacts/internal_mutations.ts): - initArtifactRun: set runStatus='queued' + clear prior-run remnants - patchArtifactRunProgress: mid-flight status/progress updates fired on PHASE events from the spawner - finalizeArtifactRun: write final exit code + output files + clear the live progress string Spawner (services/sandbox/src/): - spawn_util.runDocker now accepts onStdoutChunk callback that fires per chunk during the docker run (vs after exit). 
- spawn.executeRequest accepts ExecuteRequestOptions.onPhase and runs a line-buffered parser over the stdout chunks; PHASE: installing / PHASE: running lines fire the callback with {phase} events. - server /v1/execute switches from buffered JSON response to SSE: emits 'event: phase data: {...}' per phase + final 'event: result data: {...full response...}'. Verification: - 9 argv unit tests still pass. - bun run typecheck clean. - curl smoke against rebuilt container: SSE stream emits two phase events ('installing', 'running') then the final result event with a real .pptx in outputFiles. M5b will switch the convex spawner_client to a streaming consumer and wire artifact_create / artifact_edit to call executeCode for runnable types. --- .../convex/artifacts/internal_mutations.ts | 137 ++++++++++++++++++ services/platform/convex/artifacts/schema.ts | 65 +++++++++ services/sandbox/src/server.ts | 46 ++++-- services/sandbox/src/spawn.ts | 37 +++++ services/sandbox/src/spawn_util.ts | 40 ++++- 5 files changed, 314 insertions(+), 11 deletions(-) diff --git a/services/platform/convex/artifacts/internal_mutations.ts b/services/platform/convex/artifacts/internal_mutations.ts index 46e55ca9a..8b2546a5a 100644 --- a/services/platform/convex/artifacts/internal_mutations.ts +++ b/services/platform/convex/artifacts/internal_mutations.ts @@ -4,6 +4,9 @@ import { internalMutation } from '../_generated/server'; import { applyPatches } from '../agent_tools/artifacts/apply_patches'; import { artifactPatchValidator, + artifactRunErrorCodeValidator, + artifactRunOutputFileValidator, + artifactRunStatusValidator, artifactTypeValidator, liveStreamModeValidator, } from './schema'; @@ -421,3 +424,137 @@ export const cleanupStaleStreams = internalMutation({ return { cleared }; }, }); + +// ============================================================================= +// Runnable-artifact run-state mutations (Refinement 2) +// 
============================================================================= +// +// These mutate the `run*` fields on a runnable artifact (`python_runnable` / +// `node_runnable`). The executeCode internal action calls them between +// `setRunning` and `finalize` as PHASE markers stream from the spawner. +// The canvas-runnable-code-renderer subscribes to the artifact row and +// gets reactive updates for the progress chip + output file display. + +export const initArtifactRun = internalMutation({ + args: { + artifactId: v.id('artifacts'), + runPackages: v.array(v.string()), + runOptions: v.optional( + v.object({ + allowSdist: v.optional(v.boolean()), + allowInstallScripts: v.optional(v.boolean()), + }), + ), + }, + returns: v.null(), + handler: async (ctx, args) => { + const row = await ctx.db.get(args.artifactId); + if (!row) return null; + if (row.type !== 'python_runnable' && row.type !== 'node_runnable') { + // Defensive: callers should only invoke this on runnable types. Skip + // silently so an out-of-band call can't corrupt a static artifact. + return null; + } + await ctx.db.patch(args.artifactId, { + runPackages: args.runPackages, + ...(args.runOptions !== undefined && { runOptions: args.runOptions }), + runStatus: 'queued', + runProgress: 'Queued', + runStartedAt: Date.now(), + // Clear any stale fields from a prior run of the same artifact (the + // edit flow re-uses the row for subsequent executions). 
+ runCompletedAt: undefined, + runExitCode: undefined, + runErrorCode: undefined, + runErrorMessage: undefined, + runStdoutPreview: undefined, + runStderrPreview: undefined, + runStdoutStorageId: undefined, + runStderrStorageId: undefined, + runOutputFiles: [], + runExecutionId: undefined, + }); + return null; + }, +}); + +export const patchArtifactRunProgress = internalMutation({ + args: { + artifactId: v.id('artifacts'), + runStatus: v.optional(artifactRunStatusValidator), + runProgress: v.optional(v.string()), + runExecutionId: v.optional(v.id('sandboxExecutions')), + }, + returns: v.null(), + handler: async (ctx, args) => { + const row = await ctx.db.get(args.artifactId); + if (!row) return null; + if (row.type !== 'python_runnable' && row.type !== 'node_runnable') { + return null; + } + const patch: Record = {}; + if (args.runStatus !== undefined) patch.runStatus = args.runStatus; + if (args.runProgress !== undefined) patch.runProgress = args.runProgress; + if (args.runExecutionId !== undefined) { + patch.runExecutionId = args.runExecutionId; + } + if (Object.keys(patch).length === 0) return null; + await ctx.db.patch(args.artifactId, patch); + return null; + }, +}); + +export const finalizeArtifactRun = internalMutation({ + args: { + artifactId: v.id('artifacts'), + runStatus: v.union( + v.literal('completed'), + v.literal('failed'), + v.literal('cancelled'), + ), + runExitCode: v.optional(v.number()), + runErrorCode: v.optional(artifactRunErrorCodeValidator), + runErrorMessage: v.optional(v.string()), + runStdoutPreview: v.optional(v.string()), + runStderrPreview: v.optional(v.string()), + runStdoutStorageId: v.optional(v.id('_storage')), + runStderrStorageId: v.optional(v.id('_storage')), + runOutputFiles: v.array(artifactRunOutputFileValidator), + runExecutionId: v.id('sandboxExecutions'), + }, + returns: v.null(), + handler: async (ctx, args) => { + const row = await ctx.db.get(args.artifactId); + if (!row) return null; + if (row.type !== 
'python_runnable' && row.type !== 'node_runnable') { + return null; + } + await ctx.db.patch(args.artifactId, { + runStatus: args.runStatus, + runProgress: undefined, + runCompletedAt: Date.now(), + ...(args.runExitCode !== undefined && { runExitCode: args.runExitCode }), + ...(args.runErrorCode !== undefined && { + runErrorCode: args.runErrorCode, + }), + ...(args.runErrorMessage !== undefined && { + runErrorMessage: args.runErrorMessage, + }), + ...(args.runStdoutPreview !== undefined && { + runStdoutPreview: args.runStdoutPreview, + }), + ...(args.runStderrPreview !== undefined && { + runStderrPreview: args.runStderrPreview, + }), + ...(args.runStdoutStorageId !== undefined && { + runStdoutStorageId: args.runStdoutStorageId, + }), + ...(args.runStderrStorageId !== undefined && { + runStderrStorageId: args.runStderrStorageId, + }), + runOutputFiles: args.runOutputFiles, + runExecutionId: args.runExecutionId, + }); + return null; + }, +}); diff --git a/services/platform/convex/artifacts/schema.ts b/services/platform/convex/artifacts/schema.ts index 53d9306f6..ae76352e6 100644 --- a/services/platform/convex/artifacts/schema.ts +++ b/services/platform/convex/artifacts/schema.ts @@ -7,8 +7,42 @@ export const artifactTypeValidator = v.union( v.literal('markdown'), v.literal('mermaid'), v.literal('code'), + // Runnable types: source code that executes in the server sandbox. The + // artifact's `content` is the script; the `run*` fields below carry the + // execution state (status, stdout/stderr preview, output files, ...). + // Editing a runnable artifact via artifact_edit re-runs the script. 
+ v.literal('python_runnable'), + v.literal('node_runnable'), ); +export const artifactRunStatusValidator = v.union( + v.literal('queued'), + v.literal('installing'), + v.literal('running'), + v.literal('completed'), + v.literal('failed'), + v.literal('cancelled'), +); + +export const artifactRunErrorCodeValidator = v.union( + v.literal('TIMEOUT'), + v.literal('OOM'), + v.literal('EGRESS_DENIED'), + v.literal('INSTALL_FAILED'), + v.literal('PACKAGE_NOT_FOUND'), + v.literal('QUOTA_EXCEEDED'), + v.literal('RUNTIME_ERROR'), + v.literal('SPAWNER_UNAVAILABLE'), + v.literal('CANCELLED'), +); + +export const artifactRunOutputFileValidator = v.object({ + name: v.string(), + fileMetadataId: v.id('fileMetadata'), + size: v.number(), + contentType: v.string(), +}); + export const artifactEditKindValidator = v.union( v.literal('create'), v.literal('patch'), @@ -80,6 +114,37 @@ export const artifactsTable = defineTable({ // preview over the (still settled) source — patch mode never writes // `streamingContent`, so this is the only mid-stream signal users have. streamingPatches: v.optional(v.array(artifactPatchValidator)), + + // --- Runnable-artifact run state (populated only when type is + // `python_runnable` / `node_runnable`). All optional per the + // [feedback_deprecate_dont_delete_schema_fields] rule so existing rows + // pass the read validator unchanged. The canvas-runnable-code-renderer + // subscribes to these fields for live progress + final output display. + runPackages: v.optional(v.array(v.string())), + runOptions: v.optional( + v.object({ + allowSdist: v.optional(v.boolean()), + allowInstallScripts: v.optional(v.boolean()), + }), + ), + runStatus: v.optional(artifactRunStatusValidator), + // Human-readable hint shown in the canvas while running (e.g. + // "Installing python-pptx==1.0.2"). Mirrors videoLinkJobs.progress. 
+ runProgress: v.optional(v.string()), + runStartedAt: v.optional(v.number()), + runCompletedAt: v.optional(v.number()), + runExitCode: v.optional(v.number()), + runErrorCode: v.optional(artifactRunErrorCodeValidator), + runErrorMessage: v.optional(v.string()), + runStdoutPreview: v.optional(v.string()), + runStderrPreview: v.optional(v.string()), + runStdoutStorageId: v.optional(v.id('_storage')), + runStderrStorageId: v.optional(v.id('_storage')), + runOutputFiles: v.optional(v.array(artifactRunOutputFileValidator)), + // Link to the latest per-execution audit row. The sandboxExecutions + // table is the source of truth for execution history; the artifact row + // holds only the *latest* result for fast canvas reads. + runExecutionId: v.optional(v.id('sandboxExecutions')), }) .index('by_organizationId', ['organizationId']) .index('by_organizationId_and_thread', ['organizationId', 'threadId']) diff --git a/services/sandbox/src/server.ts b/services/sandbox/src/server.ts index 653f58e03..ef18245c5 100644 --- a/services/sandbox/src/server.ts +++ b/services/sandbox/src/server.ts @@ -87,15 +87,43 @@ async function handleExecute(req: Request): Promise { ); } inFlightSet.add(parsed.executionId); - try { - const result = await executeRequest(cfg, parsed); - return new Response(JSON.stringify(result), { - status: 200, - headers: { 'content-type': 'application/json' }, - }); - } finally { - inFlightSet.delete(parsed.executionId); - } + + // Stream phase events + final result via Server-Sent Events so the convex + // action can patch the artifact row's runProgress as soon as the runtime + // entrypoint emits a PHASE marker (Refinement 2). Back-compat: a + // non-streaming client can still parse the last `data:` block as JSON + // and get the final result. 
+ const stream = new ReadableStream({ + async start(controller) { + const enc = new TextEncoder(); + const send = (event: string, data: unknown) => { + controller.enqueue( + enc.encode(`event: ${event}\ndata: ${JSON.stringify(data)}\n\n`), + ); + }; + try { + const result = await executeRequest(cfg, parsed, { + onPhase: (e) => send('phase', e), + }); + send('result', result); + } catch (err) { + send('error', { + message: err instanceof Error ? err.message : String(err), + }); + } finally { + inFlightSet.delete(parsed.executionId); + controller.close(); + } + }, + }); + return new Response(stream, { + status: 200, + headers: { + 'content-type': 'text/event-stream', + 'cache-control': 'no-cache, no-transform', + 'x-accel-buffering': 'no', + }, + }); } async function handleCancel(req: Request, id: string): Promise { diff --git a/services/sandbox/src/spawn.ts b/services/sandbox/src/spawn.ts index e5b42b97b..0253a0480 100644 --- a/services/sandbox/src/spawn.ts +++ b/services/sandbox/src/spawn.ts @@ -185,9 +185,22 @@ function guessContentType(name: string): string { return 'application/octet-stream'; } +/** + * Phase events emitted while the runtime container is running. The server's + * SSE handler relays these to the convex action; the action then writes the + * artifact row's `runStatus` + `runProgress` so the canvas shows live + * progress instead of a frozen spinner (Refinement 2). + */ +export type PhaseEvent = { phase: 'installing' } | { phase: 'running' }; + +export interface ExecuteRequestOptions { + onPhase?: (event: PhaseEvent) => void; +} + export async function executeRequest( cfg: SpawnerConfig, req: ExecuteRequest, + opts: ExecuteRequestOptions = {}, ): Promise { if (!/^[a-zA-Z0-9_-]{1,64}$/.test(req.executionId)) { return makeError('SPAWNER_UNAVAILABLE', 'invalid executionId', 0); @@ -239,10 +252,34 @@ export async function executeRequest( }, timeoutMs); let result: Awaited>; try { + // Line-buffered phase parser. 
The runtime image's entrypoint emits + // "PHASE: installing\n" then later "PHASE: running\n" on stdout. We + // accumulate bytes until we see a newline, then scan each line for + // those markers and fire the onPhase callback. Other lines (user's + // own prints) are ignored — the full stdout is still captured in + // result.stdout for the final response. + let lineBuf = ''; + const decoder = new TextDecoder('utf-8', { fatal: false }); + const onChunk = opts.onPhase + ? (chunk: Uint8Array) => { + lineBuf += decoder.decode(chunk, { stream: true }); + let nl: number; + while ((nl = lineBuf.indexOf('\n')) !== -1) { + const line = lineBuf.slice(0, nl); + lineBuf = lineBuf.slice(nl + 1); + if (line === PHASE_INSTALL) { + opts.onPhase?.({ phase: 'installing' }); + } else if (line === PHASE_RUN) { + opts.onPhase?.({ phase: 'running' }); + } + } + } + : undefined; result = await runDocker(argv, { timeoutMs: timeoutMs + 30_000, signal: abort.signal, killOnTimeoutContainer: containerName, + ...(onChunk && { onStdoutChunk: onChunk }), }); } finally { clearTimeout(killTimer); diff --git a/services/sandbox/src/spawn_util.ts b/services/sandbox/src/spawn_util.ts index ff1f8b278..a66e4fec2 100644 --- a/services/sandbox/src/spawn_util.ts +++ b/services/sandbox/src/spawn_util.ts @@ -15,6 +15,12 @@ export interface RunDockerOptions { // sibling container stops. Without this the container keeps running // after the CLI disconnects (R5 test). killOnTimeoutContainer?: string; + // Per-chunk stdout callback fired while the subprocess is alive. Used + // by the phase-marker parser in spawn.ts to emit phase events to the + // SSE stream as soon as the container's entrypoint emits them, rather + // than waiting for the container to exit (Refinement 2). The callback + // is plain bytes; the caller is responsible for line-buffering. 
+ onStdoutChunk?: (chunk: Uint8Array) => void; } export interface RunDockerResult { @@ -42,9 +48,39 @@ export async function runDocker( await proc.stdin.end(); } - // Concurrent reads to avoid pipe-back-pressure deadlock. + // Concurrent reads to avoid pipe-back-pressure deadlock. When the caller + // wants chunk callbacks (for live phase parsing), we read stdout via a + // reader loop and fire the callback per chunk while still accumulating the + // full buffer for the final return value. + const collectStdout = async (): Promise => { + if (!opts.onStdoutChunk) { + return new Response(proc.stdout).arrayBuffer(); + } + const reader = (proc.stdout as ReadableStream).getReader(); + const collected: Uint8Array[] = []; + let total = 0; + while (true) { + const { done, value } = await reader.read(); + if (done) break; + if (value && value.byteLength > 0) { + opts.onStdoutChunk(value); + collected.push(value); + total += value.byteLength; + } + } + const merged = new Uint8Array(total); + let off = 0; + for (const c of collected) { + merged.set(c, off); + off += c.byteLength; + } + return merged.buffer.slice( + merged.byteOffset, + merged.byteOffset + merged.byteLength, + ); + }; const [stdoutBytes, stderrBytes] = await Promise.all([ - new Response(proc.stdout).arrayBuffer(), + collectStdout(), new Response(proc.stderr).arrayBuffer(), ]); From 71c68ccaf41ff122b311da6f262d215d8840f4ec Mon Sep 17 00:00:00 2001 From: larryro <371767072@qq.com> Date: Tue, 19 May 2026 22:35:07 +0800 Subject: [PATCH 013/108] feat(sandbox): wire artifact_create / artifact_edit to executeCode for runnable types (M5b) Builds on M5a's schema + SSE foundation. The agent-facing tool surface is now the artifact pair (create/edit); the sandbox spawner stays unchanged and is invoked transparently when an artifact's type is runnable. - spawner_client.spawnerExecute now consumes the SSE stream from the spawner. 
Phase events fire an optional onPhase callback; the final `event: result` payload is returned as the same SpawnerExecuteResponse shape callers had before (drop-in replacement; signature gained an optional `callbacks` arg). - spawner_client also exposes an SSE event parser that tolerates partial reads and chunk boundaries. - executeCode internal_action gains optional `artifactId`. When set: - onPhase fires patchArtifactRunProgress with a human-readable progress string ("Installing python-pptx" / "Running code") so the canvas runnable-code-renderer can subscribe and show live state. - On success, finalizeArtifactRun writes runStatus=completed plus exit code, stdout/stderr previews, and runOutputFiles. The audit sandboxExecutions row still gets its own forensics; the artifact row holds the latest result for fast canvas reads. - artifact_create_tool: - shared.ts adds runnable types to the enum + isRunnableArtifactType / runnableLanguage helpers. - input schema gains optional `packages`, `allowSdist`, `allowInstallScripts`, `timeoutMs` (gated semantically on runnable types). - execute(): after the canonical content settle, runnable types call initArtifactRun then executeCode with the new artifactId so the spawner streams progress straight to the artifact row. - artifact_edit_tool: - both patch and rewrite success branches call a new local maybeRerun() helper. For runnable types this reloads the row to pick up runPackages/runOptions captured at create time, fires initArtifactRun to clear prior-run remnants, then re-invokes executeCode with the new content. - LLM can iterate via small patches without re-emitting the full script; canvas subscribes to the same artifact row and updates. The `code_run` standalone tool still exists; M5c will remove it and update the demo agent / system prompt to point at the unified path. 
--- .../artifacts/artifact_create_tool.ts | 139 +++++++++++++++--- .../artifacts/artifact_edit_tool.ts | 77 +++++++++- .../convex/agent_tools/artifacts/shared.ts | 25 +++- .../sandbox/helpers/spawner_client.ts | 95 +++++++++++- .../node_only/sandbox/internal_actions.ts | 70 +++++++++ 5 files changed, 375 insertions(+), 31 deletions(-) diff --git a/services/platform/convex/agent_tools/artifacts/artifact_create_tool.ts b/services/platform/convex/agent_tools/artifacts/artifact_create_tool.ts index 895d3dd3b..59347f9e1 100644 --- a/services/platform/convex/agent_tools/artifacts/artifact_create_tool.ts +++ b/services/platform/convex/agent_tools/artifacts/artifact_create_tool.ts @@ -21,7 +21,12 @@ import { z } from 'zod/v4'; import { internal } from '../../_generated/api'; import type { ToolDefinition } from '../types'; -import { artifactTypeEnum, isValidArtifactType } from './shared'; +import { + artifactTypeEnum, + isRunnableArtifactType, + isValidArtifactType, + runnableLanguage, +} from './shared'; import { clearState, getState, @@ -33,7 +38,7 @@ import { const artifactCreateArgs = z.object({ type: artifactTypeEnum.describe( - 'Artifact type. `html` and `svg` render as a runnable preview in the Canvas pane; `markdown` and `mermaid` render formatted; `code` is a plain syntax-highlighted snippet.', + 'Artifact type. `html` and `svg` render in the browser canvas. `markdown` and `mermaid` render formatted. `code` is a static syntax-highlighted snippet. `python_runnable` / `node_runnable` execute server-side in the sandbox: write your output files to `/workspace/output/` (e.g. `.pptx`, `.pdf`) and they appear as chat attachments + chips in the canvas.', ), title: z .string() @@ -44,7 +49,7 @@ const artifactCreateArgs = z.object({ .string() .min(1) .describe( - 'Full content of the artifact. For `html`, a complete HTML document including and any inline