diff --git a/.dev.vars.example b/.dev.vars.example index 757ba58b8..faf8b2983 100644 --- a/.dev.vars.example +++ b/.dev.vars.example @@ -2,6 +2,17 @@ # .dev.vars is gitignored and used by wrangler dev ANTHROPIC_API_KEY=sk-ant-... +# OPENAI_API_KEY=sk-... + +# Cloudflare AI Gateway (alternative to direct provider keys) +# CLOUDFLARE_AI_GATEWAY_API_KEY=your-provider-api-key +# CF_AI_GATEWAY_ACCOUNT_ID=your-account-id +# CF_AI_GATEWAY_GATEWAY_ID=your-gateway-id +# CF_AI_GATEWAY_MODEL=workers-ai/@cf/meta/llama-3.3-70b-instruct-fp8-fast + +# Legacy AI Gateway (still supported) +# AI_GATEWAY_API_KEY=your-key +# AI_GATEWAY_BASE_URL=https://gateway.ai.cloudflare.com/v1/{account_id}/{gateway_id}/anthropic # Local development mode - skips Cloudflare Access auth and bypasses device pairing # DEV_MODE=true diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml new file mode 100644 index 000000000..518de904c --- /dev/null +++ b/.github/workflows/deploy.yml @@ -0,0 +1,53 @@ +name: Deploy to Cloudflare Workers + +on: + push: + branches: + - main + workflow_dispatch: # Allow manual trigger + +jobs: + deploy: + runs-on: ubuntu-latest + name: Deploy + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '22' + cache: 'npm' + + - name: Install dependencies + run: npm ci + + - name: Build + run: npm run build + + - name: Deploy to Cloudflare Workers + run: npx wrangler deploy + env: + CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }} + CLOUDFLARE_ACCOUNT_ID: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }} + + - name: Set OpenRouter API Key + run: | + if [ -n "$OPENROUTER_API_KEY" ]; then + echo "$OPENROUTER_API_KEY" | npx wrangler secret put OPENROUTER_API_KEY + fi + env: + CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }} + CLOUDFLARE_ACCOUNT_ID: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }} + OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} + + - name: Set Telegram Bot Token + run: | 
+ if [ -n "$TELEGRAM_BOT_TOKEN" ]; then + echo "$TELEGRAM_BOT_TOKEN" | npx wrangler secret put TELEGRAM_BOT_TOKEN + fi + env: + CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }} + CLOUDFLARE_ACCOUNT_ID: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }} + TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e1999b3d8..fbc1d4b5b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -23,6 +23,12 @@ jobs: - name: Install dependencies run: npm ci + - name: Lint + run: npm run lint + + - name: Format check + run: npm run format:check + - name: Type check run: npm run typecheck @@ -31,7 +37,7 @@ jobs: e2e: runs-on: ubuntu-latest - timeout-minutes: 15 + timeout-minutes: 20 permissions: contents: write pull-requests: write @@ -50,6 +56,9 @@ jobs: env: DISCORD_BOT_TOKEN: "fake-discord-bot-token-for-e2e" DISCORD_DM_POLICY: "pairing" + - name: workers-ai + env: + CF_AI_GATEWAY_MODEL: "workers-ai/@cf/meta/llama-3.3-70b-instruct-fp8-fast" name: e2e (${{ matrix.config.name }}) @@ -65,6 +74,11 @@ jobs: - name: Install dependencies run: npm ci + - name: Setup Terraform + uses: hashicorp/setup-terraform@v3 + with: + terraform_wrapper: false + - name: Install Playwright run: npx playwright install --with-deps chromium @@ -80,12 +94,24 @@ jobs: id: e2e continue-on-error: true env: + CLOUDFLARE_API_TOKEN: ${{ secrets.E2E_CLOUDFLARE_API_TOKEN }} + CF_ACCOUNT_ID: ${{ secrets.E2E_CF_ACCOUNT_ID }} + WORKERS_SUBDOMAIN: ${{ secrets.E2E_WORKERS_SUBDOMAIN }} + CF_ACCESS_TEAM_DOMAIN: ${{ secrets.E2E_CF_ACCESS_TEAM_DOMAIN }} + R2_ACCESS_KEY_ID: ${{ secrets.E2E_R2_ACCESS_KEY_ID }} + R2_SECRET_ACCESS_KEY: ${{ secrets.E2E_R2_SECRET_ACCESS_KEY }} AI_GATEWAY_API_KEY: ${{ secrets.AI_GATEWAY_API_KEY }} AI_GATEWAY_BASE_URL: ${{ secrets.AI_GATEWAY_BASE_URL }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + CLOUDFLARE_AI_GATEWAY_API_KEY: ${{ secrets.CLOUDFLARE_AI_GATEWAY_API_KEY }} + 
CF_AI_GATEWAY_ACCOUNT_ID: ${{ secrets.E2E_CF_ACCOUNT_ID }} + CF_AI_GATEWAY_GATEWAY_ID: ${{ secrets.CF_AI_GATEWAY_GATEWAY_ID }} + CF_AI_GATEWAY_MODEL: ${{ matrix.config.env.CF_AI_GATEWAY_MODEL }} TELEGRAM_BOT_TOKEN: ${{ matrix.config.env.TELEGRAM_BOT_TOKEN }} TELEGRAM_DM_POLICY: ${{ matrix.config.env.TELEGRAM_DM_POLICY }} DISCORD_BOT_TOKEN: ${{ matrix.config.env.DISCORD_BOT_TOKEN }} DISCORD_DM_POLICY: ${{ matrix.config.env.DISCORD_DM_POLICY }} + E2E_TEST_RUN_ID: "${{ github.run_id }}-${{ matrix.config.name }}" run: cctr -vv test/e2e - name: Convert video and generate thumbnail @@ -97,15 +123,15 @@ jobs: for webm in /tmp/moltworker-e2e-videos/*.webm; do mp4="${webm%.webm}.mp4" thumb="${webm%.webm}.png" - + # Convert to mp4 ffmpeg -y -i "$webm" -c:v libx264 -preset fast -crf 22 -c:a aac "$mp4" - + # Extract middle frame as thumbnail duration=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$mp4") midpoint=$(echo "$duration / 2" | bc -l) ffmpeg -y -ss "$midpoint" -i "$mp4" -vframes 1 -update 1 -q:v 2 "$thumb" - + # Add play button overlay using ImageMagick width=$(identify -format '%w' "$thumb") height=$(identify -format '%h' "$thumb") @@ -115,7 +141,7 @@ jobs: -fill 'rgba(0,0,0,0.6)' -draw "circle ${cx},${cy} $((cx+50)),${cy}" \ -fill 'white' -draw "polygon $((cx-15)),$((cy-25)) $((cx-15)),$((cy+25)) $((cx+30)),${cy}" \ "$thumb" - + echo "video_path=$mp4" >> $GITHUB_OUTPUT echo "video_name=$(basename $mp4)" >> $GITHUB_OUTPUT echo "thumb_path=$thumb" >> $GITHUB_OUTPUT diff --git a/.gitignore b/.gitignore index 8a01f6260..eee9814ab 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,9 @@ Thumbs.db # TypeScript *.tsbuildinfo +# Generated by wrangler types (regenerated during typecheck) +worker-configuration.d.ts + # Docker build artifacts *.tar @@ -44,4 +47,20 @@ Thumbs.db *.greger # playwright-cli -.playwright-cli/ \ No newline at end of file +.playwright-cli/ + +# Terraform +*.tfstate +*.tfstate.* +.terraform/ +.terraform.lock.hcl +terraform.tfvars + +# 
E2E test credentials +test/e2e/.dev.vars + +# Temporary e2e wrangler configs +.wrangler-e2e-*.jsonc + +# npm config (may contain registry overrides for @cloudflare packages) +.npmrc diff --git a/.oxfmtrc.json b/.oxfmtrc.json new file mode 100644 index 000000000..506de28aa --- /dev/null +++ b/.oxfmtrc.json @@ -0,0 +1,12 @@ +{ + "$schema": "./node_modules/oxfmt/configuration_schema.json", + "semi": true, + "singleQuote": true, + "tabWidth": 2, + "useTabs": false, + "trailingComma": "all", + "printWidth": 100, + "arrowParens": "always", + "endOfLine": "lf", + "ignorePatterns": ["dist", "node_modules"] +} diff --git a/.oxlintrc.json b/.oxlintrc.json new file mode 100644 index 000000000..59598bb6e --- /dev/null +++ b/.oxlintrc.json @@ -0,0 +1,15 @@ +{ + "$schema": "https://raw.githubusercontent.com/oxc-project/oxc/main/npm/oxlint/configuration_schema.json", + "plugins": ["react", "typescript", "unicorn", "oxc", "import", "vitest"], + "categories": { + "correctness": "error", + "suspicious": "warn", + "perf": "warn" + }, + "rules": { + "no-unused-vars": "warn", + "react/react-in-jsx-scope": "off", + "import/no-unassigned-import": "off" + }, + "ignorePatterns": ["dist", "node_modules", "*.d.ts"] +} diff --git a/AGENTS.md b/AGENTS.md index 3d0139d8e..b2d0b4eba 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -2,6 +2,8 @@ Guidelines for AI agents working on this codebase. +> **IMPORTANT:** Also read `CLAUDE.md` for project instructions and `claude-share/core/SYNC_CHECKLIST.md` for post-task requirements. + ## Project Overview This is a Cloudflare Worker that runs [Moltbot](https://molt.bot/) in a Cloudflare Sandbox container. It provides: @@ -244,3 +246,83 @@ R2 is mounted via s3fs at `/data/moltbot`. Important gotchas: - **Never delete R2 data**: The mount directory `/data/moltbot` IS the R2 bucket. Running `rm -rf /data/moltbot/*` will DELETE your backup data. Always check mount status before any destructive operations. 
- **Process status**: The sandbox API's `proc.status` may not update immediately after a process completes. Instead of checking `proc.status === 'completed'`, verify success by checking for expected output (e.g., timestamp file exists after sync). + +--- + +## Multi-Agent Coordination + +> Multiple AI assistants (Claude, Codex, others) work on this codebase simultaneously. +> These rules ensure coordination without conflicts. + +### Orchestration Documentation + +Orchestration docs are stored in a **private companion repo** and symlinked into `claude-share/`. +If `claude-share/` exists locally, read and follow those docs. If not, follow the protocols below. + +### Branch Naming Convention + +| AI Agent | Branch Pattern | Example | +|----------|---------------|---------| +| Claude | `claude/<feature>-<id>` | `claude/parallel-tools-x7k2` | +| Codex | `codex/<feature>-<id>` | `codex/cost-tracking-m3p1` | +| Other AI | `bot/<agent>-<feature>-<id>` | `bot/gemini-flash-tools-q2w3` | +| Human | `feat/` or `fix/` | `feat/mcp-integration` | + +### Session Start Protocol + +1. Fetch latest main: `git fetch origin main` +2. Check recent merges: `git log origin/main --oneline -10` +3. Read `claude-share/core/SYNC_CHECKLIST.md` +4. Read `claude-share/core/next_prompt.md` for current task +5. Acknowledge with format: + ``` + ACK: [Task ID] — [Task Name] + Branch: [branch-name] + Files to modify: [list] + Starting now. + ``` + +### Session End Protocol + +1. Update session log (`claude-share/core/claude-log.md` or equivalent) +2. Update `claude-share/core/GLOBAL_ROADMAP.md` — task status + changelog entry +3. Update `claude-share/core/WORK_STATUS.md` — sprint state +4. Update `claude-share/core/next_prompt.md` — point to next task +5. Run `npm test && npm run typecheck` +6.
Commit and push + +### Verification Checklist (Before Claiming "Done") + +- [ ] All changes compile: `npm run typecheck` +- [ ] All tests pass: `npm test` +- [ ] No secrets committed (check `git diff --staged`) +- [ ] Session log updated +- [ ] Global roadmap updated +- [ ] Work status updated +- [ ] Next prompt updated +- [ ] Branch pushed + +### Parallel Work Rules + +1. **Check WORK_STATUS.md** before starting — avoid working on same files as another agent +2. **Claim your task** — Update the Parallel Work Tracking table immediately +3. **Small, atomic PRs** — One task per branch, one concern per PR +4. **No cross-branch dependencies** — Each branch must work independently +5. **Communicate via docs** — If you discover something another agent needs to know, write it in WORK_STATUS.md under "Notes for Other Agents" + +### Handoff Protocol + +When handing off work to another AI agent: +1. Commit all changes (even partial work) +2. Update `next_prompt.md` with detailed context +3. Add "Notes for Next Session" to your session log entry +4. Push your branch +5. If blocked, add to the "Blocked" table in WORK_STATUS.md + +### Human Checkpoint Format + +``` +🧑 HUMAN CHECK X.X: [Description of what to test] — ⏳ PENDING +``` + +Human checkpoints require manual verification before the next phase can begin. Never skip or auto-resolve these. diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000..ffa414e38 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,138 @@ +# Moltworker — Claude Code Project Instructions + +> This file is automatically read by Claude Code. It contains critical rules and context. + +**Last Updated:** 2026-02-06 + +--- + +## Documentation Sync + +If `claude-share/` exists (via symlink or local copy from the private companion repo): +1. Follow `claude-share/core/SYNC_CHECKLIST.md` after every task +2. Update `claude-share/core/GLOBAL_ROADMAP.md` — task status + changelog +3. Update `claude-share/core/WORK_STATUS.md` — sprint state +4. 
Update `claude-share/core/next_prompt.md` — point to next task +5. Append to `claude-share/core/claude-log.md` — session entry + +If not available, commit with standard format and document changes in PR description. + +--- + +## Project Overview + +**Moltworker** is a multi-platform AI assistant gateway on Cloudflare Workers. + +| Component | Tech | +|-----------|------| +| Runtime | Cloudflare Workers + Sandbox Containers | +| Framework | Hono 4.11 | +| Language | TypeScript 5.9 (strict) | +| Frontend | React 19 + Vite 6 | +| AI Models | 26+ via OpenRouter + Direct APIs | +| Storage | Cloudflare R2 (S3-compatible) | +| Long Tasks | Durable Objects (TaskProcessor) | +| Chat | Telegram, Discord, Slack | +| Testing | Vitest 4.0 | +| Browser | Cloudflare Browser Rendering | + +--- + +## Key Files Reference + +| File | Purpose | +|------|---------| +| `src/index.ts` | Worker entrypoint | +| `src/openrouter/models.ts` | Model catalog (26+ models) | +| `src/openrouter/tools.ts` | Tool definitions and execution (5 tools) | +| `src/openrouter/client.ts` | OpenRouter API client with tool-calling loop | +| `src/durable-objects/task-processor.ts` | Long-running task engine | +| `src/telegram/handler.ts` | Telegram bot handler | +| `src/routes/telegram.ts` | Telegram webhook route | +| `src/routes/discord.ts` | Discord integration | +| `src/gateway/process.ts` | Sandbox container management | +| `src/client/App.tsx` | Admin dashboard UI | +| `brainstorming/future-integrations.md` | Feature roadmap | + +--- + +## Rules + +### Security-First +- **Never commit secrets** — API keys, tokens, `.dev.vars` are gitignored +- **Validate all inputs** — Tool arguments, URL parameters, request bodies +- **Redact logs** — Use `src/utils/logging.ts` for any user data +- **No eval()** — Ever + +### Code Quality +- **Run tests before committing** — `npm test` +- **Run typecheck** — `npm run typecheck` +- **No `any` types** — Use proper typing or `unknown` with type guards +- **Keep functions 
focused** — One responsibility per function +- **Max 500 lines per file** — Split if exceeding + +### Git Workflow +- **Never push to `main`** — PRs only +- **Branch naming:** `claude/<feature>-<id>` +- **Commit format:** `<type>(<scope>): <subject>` +- **Atomic commits** — One logical change per commit + +### Testing +- **Vitest** — Test files colocated: `foo.ts` → `foo.test.ts` +- **Mock external APIs** — Never call real APIs in tests +- **Test edge cases** — Empty inputs, error responses, timeouts + +--- + +## Commands + +```bash +npm test # Run tests (vitest) +npm run test:watch # Watch mode +npm run build # Build worker + client +npm run deploy # Deploy to Cloudflare +npm run dev # Vite dev server +npm run start # Local worker (wrangler dev) +npm run typecheck # TypeScript check +``` + +--- + +## Technical Reference + +### OpenRouter Tool-Calling Loop +1. Build `ChatCompletionRequest` with `tools` and `tool_choice: 'auto'` +2. Send to OpenRouter API +3. If response has `tool_calls` → execute tools → add results → loop back to step 2 +4. If no `tool_calls` → return final text response +5.
Max iterations: 10 (Worker), 100 (Durable Object) + +### Model Selection +- Models defined in `src/openrouter/models.ts` +- Aliases map to OpenRouter model IDs +- `supportsTools` flag controls tool injection +- Direct APIs (DashScope, Moonshot, DeepSeek) bypass OpenRouter + +### Tool Execution +- Tools defined in `src/openrouter/tools.ts` +- `ToolContext` carries secrets (GitHub token, browser binding) +- Tool results truncated at 50KB (tools.ts) or 8KB (task-processor.ts) +- Errors returned as tool results, not thrown + +### Durable Objects (TaskProcessor) +- Handles tasks exceeding Worker timeout (10s) +- Watchdog alarm every 90s, stuck threshold 60s +- Auto-resume up to 10 times +- R2 checkpoints every 3 tool calls +- Context compression every 6 tool calls + +### Validation Patterns +- URL validation: Use `URL` constructor +- GitHub paths: Validate owner/repo format +- Tool arguments: JSON.parse with try/catch +- API responses: Check `.ok` before reading body + +### Logging +- Use `console.log`/`console.error` with `[ComponentName]` prefix +- Redact secrets using `src/utils/logging.ts` +- Include timing info for performance-sensitive operations diff --git a/Dockerfile b/Dockerfile index d7fd5d313..227e83ef7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,6 @@ FROM docker.io/cloudflare/sandbox:0.7.0 -# Install Node.js 22 (required by clawdbot) and rsync (for R2 backup sync) -# The base image has Node 20, we need to replace it with Node 22 -# Using direct binary download for reliability +# Install Node.js 22 (required by OpenClaw) and rclone (for R2 persistence) ENV NODE_VERSION=22.13.1 RUN ARCH="$(dpkg --print-architecture)" \ && case "${ARCH}" in \ @@ -10,41 +8,50 @@ RUN ARCH="$(dpkg --print-architecture)" \ arm64) NODE_ARCH="arm64" ;; \ *) echo "Unsupported architecture: ${ARCH}" >&2; exit 1 ;; \ esac \ - && apt-get update && apt-get install -y xz-utils ca-certificates rsync \ + && apt-get update && apt-get install -y xz-utils ca-certificates rclone \ && 
curl -fsSLk https://nodejs.org/dist/v${NODE_VERSION}/node-v${NODE_VERSION}-linux-${NODE_ARCH}.tar.xz -o /tmp/node.tar.xz \ && tar -xJf /tmp/node.tar.xz -C /usr/local --strip-components=1 \ && rm /tmp/node.tar.xz \ && node --version \ && npm --version +# Install Git and GitHub CLI for Storia orchestrator +RUN apt-get update && apt-get install -y git \ + && curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg | dd of=/usr/share/keyrings/githubcli-archive-keyring.gpg \ + && chmod go+r /usr/share/keyrings/githubcli-archive-keyring.gpg \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | tee /etc/apt/sources.list.d/github-cli.list > /dev/null \ + && apt-get update \ + && apt-get install -y gh \ + && git --version \ + && gh --version + +# Configure git for Storia Bot +RUN git config --global user.email "bot@storia.digital" \ + && git config --global user.name "Storia Bot" \ + && git config --global init.defaultBranch main + +# Create repos directory for cloning +RUN mkdir -p /root/repos + # Install pnpm globally RUN npm install -g pnpm -# Install moltbot (CLI is still named clawdbot until upstream renames) -# Pin to specific version for reproducible builds -RUN npm install -g clawdbot@2026.1.24-3 \ - && clawdbot --version +# Install OpenClaw (formerly clawdbot/moltbot) +RUN npm install -g openclaw@2026.2.3 \ + && openclaw --version -# Create moltbot directories (paths still use clawdbot until upstream renames) -# Templates are stored in /root/.clawdbot-templates for initialization -RUN mkdir -p /root/.clawdbot \ - && mkdir -p /root/.clawdbot-templates \ +# Create OpenClaw directories +# Legacy .clawdbot paths kept for R2 backup migration +RUN mkdir -p /root/.openclaw \ && mkdir -p /root/clawd \ && mkdir -p /root/clawd/skills -# Copy startup script -# Build cache bust: 2026-01-28-v26-browser-skill -COPY start-moltbot.sh 
/usr/local/bin/start-moltbot.sh -RUN chmod +x /usr/local/bin/start-moltbot.sh - -# Copy default configuration template -COPY moltbot.json.template /root/.clawdbot-templates/moltbot.json.template +# Build cache bust: 2026-02-15-openclaw-rclone +COPY start-openclaw.sh /usr/local/bin/start-openclaw.sh +RUN chmod +x /usr/local/bin/start-openclaw.sh -# Copy custom skills COPY skills/ /root/clawd/skills/ -# Set working directory WORKDIR /root/clawd -# Expose the gateway port EXPOSE 18789 diff --git a/README.md b/README.md index 90bf7b724..17ab4f82a 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,25 @@ The following Cloudflare features used by this project have free tiers: - AI Gateway (optional, for API routing/analytics) - R2 Storage (optional, for persistence) +## Container Cost Estimate + +This project uses a `standard-1` Cloudflare Container instance (1/2 vCPU, 4 GiB memory, 8 GB disk). Below are approximate monthly costs assuming the container runs 24/7, based on [Cloudflare Containers pricing](https://developers.cloudflare.com/containers/pricing/): + +| Resource | Provisioned | Monthly Usage | Included Free | Overage | Approx. Cost | +|----------|-------------|---------------|---------------|---------|--------------| +| Memory | 4 GiB | 2,920 GiB-hrs | 25 GiB-hrs | 2,895 GiB-hrs | ~$26/mo | +| CPU (at ~10% utilization) | 1/2 vCPU | ~2,190 vCPU-min | 375 vCPU-min | ~1,815 vCPU-min | ~$2/mo | +| Disk | 8 GB | 5,840 GB-hrs | 200 GB-hrs | 5,640 GB-hrs | ~$1.50/mo | +| Workers Paid plan | | | | | $5/mo | +| **Total** | | | | | **~$34.50/mo** | + +Notes: +- CPU is billed on **active usage only**, not provisioned capacity. The 10% utilization estimate is a rough baseline for a lightly-used personal assistant; your actual cost will vary with usage. +- Memory and disk are billed on **provisioned capacity** for the full time the container is running. +- To reduce costs, configure `SANDBOX_SLEEP_AFTER` (e.g., `10m`) so the container sleeps when idle. 
A container that only runs 4 hours/day would cost roughly ~$5-6/mo in compute on top of the $5 plan fee. +- Network egress, Workers/Durable Objects requests, and logs are additional but typically minimal for personal use. +- See the [instance types table](https://developers.cloudflare.com/containers/pricing/) for other options (e.g., `lite` at 256 MiB/$0.50/mo memory or `standard-4` at 12 GiB for heavier workloads). + ## What is OpenClaw? [OpenClaw](https://github.com/openclaw/openclaw) (formerly Moltbot, formerly Clawdbot) is a personal AI assistant with a gateway architecture that connects to multiple chat platforms. Key features: @@ -353,16 +372,48 @@ npx wrangler secret put AI_GATEWAY_BASE_URL npm run deploy ``` -The `AI_GATEWAY_*` variables take precedence over `ANTHROPIC_*` if both are set. +When Cloudflare AI Gateway is configured, it takes precedence over direct `ANTHROPIC_API_KEY` or `OPENAI_API_KEY`. + +### Choosing a Model + +By default, AI Gateway uses Anthropic's Claude Sonnet 4.5. To use a different model or provider, set `CF_AI_GATEWAY_MODEL` with the format `provider/model-id`: + +```bash +npx wrangler secret put CF_AI_GATEWAY_MODEL +# Enter: workers-ai/@cf/meta/llama-3.3-70b-instruct-fp8-fast +``` + +This works with any [AI Gateway provider](https://developers.cloudflare.com/ai-gateway/usage/providers/): + +| Provider | Example `CF_AI_GATEWAY_MODEL` value | API key is... | +|----------|-------------------------------------|---------------| +| Workers AI | `workers-ai/@cf/meta/llama-3.3-70b-instruct-fp8-fast` | Cloudflare API token | +| OpenAI | `openai/gpt-4o` | OpenAI API key | +| Anthropic | `anthropic/claude-sonnet-4-5` | Anthropic API key | +| Groq | `groq/llama-3.3-70b` | Groq API key | + +**Note:** `CLOUDFLARE_AI_GATEWAY_API_KEY` must match the provider you're using — it's your provider's API key, forwarded through the gateway. You can only use one provider at a time through the gateway. 
For multiple providers, use direct keys (`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`) alongside the gateway config. + +#### Workers AI with Unified Billing + +With [Unified Billing](https://developers.cloudflare.com/ai-gateway/features/unified-billing/), you can use Workers AI models without a separate provider API key — Cloudflare bills you directly. Set `CLOUDFLARE_AI_GATEWAY_API_KEY` to your [AI Gateway authentication token](https://developers.cloudflare.com/ai-gateway/configuration/authentication/) (the `cf-aig-authorization` token). + +### Legacy AI Gateway Configuration + +The previous `AI_GATEWAY_API_KEY` + `AI_GATEWAY_BASE_URL` approach is still supported for backward compatibility but is deprecated in favor of the native configuration above. ## All Secrets Reference | Secret | Required | Description | |--------|----------|-------------| -| `AI_GATEWAY_API_KEY` | Yes* | API key for your AI Gateway provider (requires `AI_GATEWAY_BASE_URL`) | -| `AI_GATEWAY_BASE_URL` | Yes* | AI Gateway endpoint URL (required when using `AI_GATEWAY_API_KEY`) | -| `ANTHROPIC_API_KEY` | Yes* | Direct Anthropic API key (fallback if AI Gateway not configured) | -| `ANTHROPIC_BASE_URL` | No | Direct Anthropic API base URL (fallback) | +| `CLOUDFLARE_AI_GATEWAY_API_KEY` | Yes* | Your AI provider's API key, passed through the gateway (e.g., your Anthropic API key). Requires `CF_AI_GATEWAY_ACCOUNT_ID` and `CF_AI_GATEWAY_GATEWAY_ID` | +| `CF_AI_GATEWAY_ACCOUNT_ID` | Yes* | Your Cloudflare account ID (used to construct the gateway URL) | +| `CF_AI_GATEWAY_GATEWAY_ID` | Yes* | Your AI Gateway ID (used to construct the gateway URL) | +| `CF_AI_GATEWAY_MODEL` | No | Override default model: `provider/model-id` (e.g. `workers-ai/@cf/meta/llama-3.3-70b-instruct-fp8-fast`). 
See [Choosing a Model](#choosing-a-model) | +| `AI_GATEWAY_API_KEY` | No | Legacy: API key for AI Gateway (deprecated, use `CLOUDFLARE_AI_GATEWAY_API_KEY`) | +| `AI_GATEWAY_BASE_URL` | No | Legacy: AI Gateway endpoint URL (deprecated) | +| `ANTHROPIC_API_KEY` | Yes* | Direct Anthropic API key (alternative to AI Gateway) | +| `ANTHROPIC_BASE_URL` | No | Direct Anthropic API base URL | | `OPENAI_API_KEY` | No | OpenAI API key (alternative provider) | | `CF_ACCESS_TEAM_DOMAIN` | Yes* | Cloudflare Access team domain (required for admin UI) | | `CF_ACCESS_AUD` | Yes* | Cloudflare Access application audience (required for admin UI) | diff --git a/TEST_PROTOCOL.md b/TEST_PROTOCOL.md new file mode 100644 index 000000000..0df43aba3 --- /dev/null +++ b/TEST_PROTOCOL.md @@ -0,0 +1,138 @@ +# Moltworker — Manual Test Protocol + +Quick checklist to verify the bot works end-to-end. +Run top-to-bottom after every deploy. Takes ~10 minutes. + +--- + +## Setup + +1. Open Telegram, find your Moltworker bot +2. Send `/new` to start clean + +--- + +## 1. Basics + +| # | Action | Expected | +|---|--------|----------| +| 1 | `/start` | Welcome message explaining Chat, Vision, Tools, Images, Reasoning, JSON, Briefing | +| 2 | `/help` | Full command reference with all 12 tools listed individually | +| 3 | `/ping` | Pong + latency | +| 4 | `Hello!` | Normal chat response | +| 5 | `/model` | Shows current model (probably "auto") | + +--- + +## 2. Model Switching + +| # | Action | Expected | +|---|--------|----------| +| 6 | `/use deep` | Confirms switch to DeepSeek V3.2 | +| 7 | `/model` | Shows "deep" | +| 8 | `/pick` | Button grid appears | +| 9 | Tap any button | Confirms model switch | +| 10 | `/use nonexistent` | Error: model not found | + +--- + +## 3. 
Tools (use `/use deep` or `/use gpt` first) + +| # | Action | Expected | +|---|--------|----------| +| 11 | `What's the weather in Prague?` | Calls get_weather, shows temp + conditions | +| 12 | `What's the Bitcoin price?` | Calls get_crypto, shows price + market data | +| 13 | `Top 5 cryptos by market cap` | Calls get_crypto (top), shows ranked list | +| 14 | `Search for PEPE on DEX` | Calls get_crypto (dex), shows DEX pair data | +| 15 | `Where is 8.8.8.8 located?` | Calls geolocate_ip, shows Google DNS info | +| 16 | `Geolocate 1.1.1.1 and tell me the timezone` | Shows Cloudflare DNS + timezone | +| 17 | `What are today's top HN stories?` | Calls fetch_news, shows HackerNews stories | +| 18 | `Convert 100 USD to EUR` | Calls convert_currency, shows rate | + +--- + +## 4. Vision + +| # | Action | Expected | +|---|--------|----------| +| 19 | `/use gpt` then send a photo with caption: `What is this?` | Describes the image | +| 20 | Send a photo with caption: `What city is this? Check its weather` | Identifies city AND calls weather tool | +| 21 | Send a photo with no caption | Defaults to "What is in this image?" analysis | + +--- + +## 5. Structured Output + +| # | Action | Expected | +|---|--------|----------| +| 22 | `/use gpt` then `json: list 3 European capitals with population` | Valid JSON response | +| 23 | `/use deep` then `json: 3 programming languages with name and year` | Valid JSON response | +| 24 | `/use sonnet` then `json: list 3 colors` | Normal text (Sonnet doesn't support JSON mode) | + +--- + +## 6. Reasoning + +| # | Action | Expected | +|---|--------|----------| +| 25 | `/use deep` then `think:high explain quantum entanglement` | Deeper, more thorough response | +| 26 | `think:high json: analyze top 3 cryptos` | Reasoning + JSON combined | + +--- + +## 7. 
Image Generation + +| # | Action | Expected | +|---|--------|----------| +| 27 | `/img a cat astronaut floating in space` | Returns generated image | +| 28 | `/img fluxmax detailed portrait of a robot` | Returns higher quality image | + +--- + +## 8. Briefing + +| # | Action | Expected | +|---|--------|----------| +| 29 | `/briefing` | Shows weather + HN + Reddit + arXiv digest | + +--- + +## 9. Bug Regressions + +| # | Action | Expected | +|---|--------|----------| +| 30 | `/use deep` then `hello` | Status shows "Thinking..." (NOT "Processing complex task...") | +| 31 | `/use deep` then `What's the weather in Tokyo?` | DeepSeek actually CALLS the weather tool (doesn't guess) | +| 32 | `/use fluxpro` then `hello` | Bot says model is image-only, falls back to default | + +--- + +## 10. Session Management + +| # | Action | Expected | +|---|--------|----------| +| 33 | `/saveas test1` | Saves checkpoint | +| 34 | `/saves` | Shows "test1" in list | +| 35 | `/save test1` | Shows checkpoint details + AI summary of conversation | +| 36 | `/new` | Clears conversation | +| 37 | `/load test1` | Restores conversation | +| 38 | `/delsave test1` | Deletes checkpoint | +| 39 | `/credits` | Shows OpenRouter balance | +| 40 | `/costs` | Shows token usage | + +--- + +## Results + +Copy this table, fill in as you go: + +``` +| # | Pass? | Notes | +|---|-------|-------| +| 1 | | | +| 2 | | | +| ... | | | +| 40 | | | +``` + +**Pass criteria:** All 40 tests pass. If any fail, note the exact response and which model was active. diff --git a/brainstorming/audit-build-improvement-plan.md b/brainstorming/audit-build-improvement-plan.md new file mode 100644 index 000000000..49179854e --- /dev/null +++ b/brainstorming/audit-build-improvement-plan.md @@ -0,0 +1,156 @@ +# Full Audit + Build Improvement Plan + +## Scope and Problem Statement + +Primary pain points reported: + +1. Complex coding tasks keep resuming on `/dcode`. +2. 
Multiple models are hallucinating and producing low-trust output. + +This document audits current behavior and proposes a staged implementation plan to improve routing reliability, output quality, and build confidence. + +## Current-State Audit (Evidence) + +### 1) Model persistence + resume path can trap users on a weak model for hard tasks + +- User model selection is persisted in R2 preferences and reused for new/resumed tasks. If the user ever selected `/dcode`, resume flows continue with that model unless manually changed. (`getUserModel()` and `setUserModel()`). +- `continue` uses the persisted `modelAlias` directly when creating a new DO task. +- Resume callback path also uses persisted `modelAlias`. + +**Impact:** difficult tasks can repeatedly resume on a model that is not best for instruction following, causing a perceived “stuck on /dcode” loop. + +### 2) Default model remains `auto`, which may vary provider behavior + +- `DEFAULT_MODEL` is `auto` (OpenRouter auto-routing). + +**Impact:** non-deterministic quality and tool behavior; harder to debug hallucinations across sessions. + +### 3) Auto-resume UX messaging is stale/inconsistent with runtime limits + +- Code currently limits free-model auto-resumes to 15. +- User-facing text in `/autoresume` still says 50x free. + +**Impact:** users expect much longer retries than system actually does, creating trust and debugging confusion. + +### 4) Guardrails exist but are mostly post-hoc (review prompts), not hard output constraints + +- Task processor includes phase prompts and critical review checks. +- Tool/result fallback logic exists, but there is no strict “evidence required” response contract for coding answers. + +**Impact:** models can still confidently synthesize non-verified claims when tool outputs are sparse/noisy. + +### 5) Build/test pipeline is solid but lacks explicit quality gates for “hallucination-prone” regressions + +- Scripts cover `test`, `typecheck`, `build`, lint/format. 
+- No targeted CI checks for model-routing behavior, resume-model policy, or response citation/evidence validation. + +**Impact:** regressions in model selection and reliability can ship undetected. + +## Root-Cause Summary + +The “resumes on `/dcode`” issue is primarily a **policy gap** (resume model selection = persisted user model) rather than a raw runtime bug. Hallucination risk is primarily a **guardrail gap** (insufficient evidence enforcement + model routing policy + missing reliability tests). + +## Build Improvement Plan + +## Phase 1 — Stabilize model routing and resume behavior (high priority) + +1. **Introduce a Task Router policy function** (single source of truth): + - Inputs: user-selected model, task intent (coding/reasoning/general), tool requirement, checkpoint metadata. + - Output: execution model alias + rationale string. +2. **Add “complex coding override” on resume:** + - If resume is for coding task + previous run stalled/no-progress, route to stronger coding model (`/opus`, `/sonnet`, `/q3coder` depending on credentials/cost policy). +3. **Pin checkpoint metadata to model used at creation time** and expose in `/checkpoints` output. +4. **Add explicit `/resume ` override** so users can force model upgrade at resume time. +5. **Fix user-facing auto-resume text** to match runtime constants. + +**Definition of done:** no automatic resume path silently reuses `/dcode` when policy says escalate. + +## Phase 2 — Hallucination reduction guardrails (high priority) + +1. **Evidence-Required Answer Mode (for coding tasks):** + - Final answer must include “Evidence” block with tool outputs or file references. + - If evidence missing, force model to answer with uncertainty + next tool action. +2. **Hard “No Fake Success” contract:** + - If `github_create_pr` / `git` / test commands were not executed successfully, response must say “not completed”. +3. 
**Source-grounding prompt layer:** + - Inject strict instruction: do not assert repo state unless observed from command/tool output in current session. +4. **Confidence labeling:** + - Add `Confidence: High/Medium/Low` based on observed evidence count and recency. + +**Definition of done:** model cannot return high-confidence completion claims without concrete session evidence. + +## Phase 3 — Build/CI reliability gates (medium-high priority) + +1. **Add policy unit tests** for Task Router: + - resumes from `/dcode` + coding task + stall → escalates model. + - paid vs free policy matrix. +2. **Add regression tests** for user messaging and constants parity (auto-resume limits). +3. **Add integration tests** for DO resume flows (`continue`, callback `resume:task`) validating selected model. +4. **Add CI pipeline stages:** + - `npm run typecheck` + - `npm test` + - `npm run build` + - optional: coverage threshold for `src/durable-objects` and `src/telegram`. + +**Definition of done:** routing and anti-hallucination behaviors are test-protected. + +## Phase 4 — Operational observability (medium priority) + +1. **Structured logs for model routing decisions:** selected model, reason, task category, auto-resume count. +2. **Metrics dashboard fields:** + - hallucination proxy signals (toolless high-confidence responses, user corrections, retry rate) + - model success/failure by task type. +3. **Admin/debug endpoint enhancement:** show last 10 routing decisions per user (redacted). + +**Definition of done:** you can diagnose why `/dcode` (or any model) was selected within minutes. + +## Phase 5 — UX controls and safer defaults (medium priority) + +1. **“Smart mode” default for complex tasks** (router chooses best model). +2. **“Cost mode” and “Quality mode” user toggles** stored in preferences. +3. **Inline warnings when weak model is selected for complex coding task.** +4. **One-click “retry on stronger model” button** in Telegram. 
+ +**Definition of done:** users can easily escape weak-model loops without knowing internal aliases. + +## Suggested Implementation Order (1 week sprint) + +- **Day 1-2:** Phase 1 (router + resume policy + message fix) +- **Day 3-4:** Phase 2 (evidence contract + no-fake-success checks) +- **Day 5:** Phase 3 (tests + CI gates) +- **Day 6:** Phase 4 logging/metrics +- **Day 7:** Phase 5 UX polish + +## Immediate Quick Wins (can ship first) + +1. Fix `/autoresume` text to 15x free. +2. On resume, if current model is `/dcode` and last run had no progress, auto-suggest `/opus` or `/sonnet` with one-tap switch. +3. Add explicit warning in final responses: “Unverified claim” when no tool/file evidence exists. + +## Success Metrics + +Track weekly: + +- Resume-loop rate (>=2 consecutive resumes with no new tools) +- “Wrong model for task” manual switches after failure +- User-reported hallucination incidents +- Task completion rate on first attempt +- PR/task false-success incidents (claimed done but not done) + +Targets after rollout: + +- 50% reduction in no-progress resume loops +- 40% reduction in hallucination complaints +- 25% increase in first-attempt completion on coding tasks + +## Rollback and Safety + +- Keep feature flags for: + - router override policy + - evidence-required mode + - confidence labels +- If regression appears, disable feature flag and retain logs for postmortem. + +## Notes for Follow-up + +- If you want, next step can be implementation of **Phase 1 only** as an atomic PR: minimal risk, immediately addresses `/dcode` resume pain. diff --git a/brainstorming/code-mode-mcp.md b/brainstorming/code-mode-mcp.md new file mode 100644 index 000000000..e5a2f555c --- /dev/null +++ b/brainstorming/code-mode-mcp.md @@ -0,0 +1,280 @@ +check how this can be useful: +https://blog.cloudflare.com/code-mode-mcp/ +This is a massive, timely win for Storia. +The blog post you linked was published today (February 20, 2026) — literally hours ago. 
Cloudflare just dropped Code Mode MCP, and it is one of the most relevant new primitives I've seen for exactly what you're building. +What Code Mode MCP Actually Is (in plain English) +Instead of forcing an LLM agent to load a gigantic OpenAPI spec (or thousands of individual tool definitions) into its context, Code Mode lets the agent write small JavaScript snippets that: + +search() the full API spec intelligently (progressive discovery) +execute() real calls using a typed, safe client + +Result: The entire Cloudflare API (2,500+ endpoints) now fits in ~1,000 tokens — a 99.9% reduction from the previous ~1.17 million tokens. +It runs in a super-light Dynamic Worker isolate (sandboxed, no filesystem, controlled outbound calls) and supports the new official Cloudflare MCP server at https://mcp.cloudflare.com/mcp. +They also open-sourced the Code Mode SDK inside the Cloudflare Agents SDK. +How This Maps to Storia (Extremely High Usefulness) + +| Storia Component | How Code Mode Helps | Impact Level | +| --- | --- | --- | +| Agent Mode / storia-agent (Tier 2) | Your moltworker sandbox already runs agent-generated code. Now the agent can safely call the full Cloudflare API (R2, D1, Workers, KV, Queues, Zero Trust, Pages, etc.) with almost zero token cost. | ★★★★★ (Game-changer) | +| Dream Machine (Capture → Build) | In the Build stage, the agent can now dynamically provision infra, deploy Workers, manage DNS, create R2 buckets, etc., all inside the same secure sandbox. This turns Dream Machine from "code generator" into "full product shipper". | ★★★★★ | +| Skills System | Replace or augment your current 5–12 skills with Code Mode. One skill (cloudflare_api) gives access to everything instead of maintaining dozens of individual tools. | ★★★★☆ | +| Token Optimization Engine | You already built a cost-aware router. This is the nuclear upgrade — agents stay under token budgets even when doing complex multi-step Cloudflare operations. | ★★★★★ | +| FreeModelRouter / Deep Mode | Cheaper models (Groq, DeepSeek, Qwen) now become dramatically more capable because they can handle huge APIs without context overflow. | ★★★★☆ | +| BYOK + Security | The execution stays inside your private storia-agent Worker (you control the sandbox). OAuth 2.1 downscoping works perfectly with your existing BYOK flow. | ★★★★★ | + +Concrete Ways You Can Use It Immediately + +Fastest win (1–2 days) +Add the official Cloudflare MCP server to your agent: + +```json +// In your storia-agent MCP config +{ + "mcpServers": { + "cloudflare": { "url": "https://mcp.cloudflare.com/mcp" } + } +} +``` + +Then expose one new skill: cloudflare_code_mode that lets the agent use search() + execute(). +Medium win (3–5 days) +Wrap the open-source Code Mode SDK so your agent can also expose your own internal APIs (D1 schemas, skills engine, Gecko Life, Situation Monitor, etc.) in the same 1,000-token pattern. +Dream Machine upgrade +In the Build stage, let the agent write Code Mode snippets that: +Create a new Worker +Deploy it +Set up a Pages project +Configure R2 + bindings +All in one secure execution pass. + + +Bottom Line — Priority Recommendation +This is Tier 1.5 / Tier 2 priority — move it up. +It directly unblocks and supercharges three of your biggest upcoming items: + +Agent Mode (makes it 10× more powerful) +Dream Machine Build stage +Token/cost efficiency across the whole platform + +You’re already 100% on the right stack (Cloudflare Workers + sandbox + agent loop). This is basically Cloudflare building the missing piece you would have had to invent yourself. 
+Action I recommend today: + +Read the full post (it’s short and has great code examples) +Try the public MCP server in your current agent (takes <30 min) +Add it as a new line in your roadmap under Tier 2 with “Code Mode MCP integration (Cloudflare full API in 1k tokens) — 8–12h” + +Want me to write the exact integration code snippet for your storia-agent + the new skill definition? Just say the word and I’ll give you copy-paste ready code. +This is the kind of external development that can genuinely accelerate your roadmap by weeks. Super timely find!yes please write detailed .md documents for specs and instructions for claude code or codexHere are the two ready-to-use Markdown documents you can copy-paste directly into Claude Code or Codex right now. + +1. CODE_MODE_MCP_INTEGRATION_SPEC.md +(Feed this first to Claude for full planning & architecture) +Markdown# Code Mode MCP Integration Specification — Storia AI Hub +**Version**: 1.0 +**Date**: 2026-02-20 +**Author**: Grok (for PetrAnto) +**Status**: Ready for Claude / Codex implementation +**Priority**: Tier 1.5 → Tier 2 (unblocks Agent Mode + Dream Machine Build stage) + +## 1. Background & Opportunity + +Cloudflare released **Code Mode MCP** today (Feb 20, 2026). It lets an LLM agent access the **entire Cloudflare API (2,500+ endpoints)** using only ~1,000 tokens instead of millions. + +Instead of exposing thousands of tools, the agent gets two tools: +- `search()` — intelligently searches the full OpenAPI spec +- `execute()` — runs safe TypeScript code against a typed SDK in a Dynamic Worker Loader sandbox + +Official resources: +- Blog: https://blog.cloudflare.com/code-mode-mcp/ +- Code Mode SDK: https://github.com/cloudflare/agents/tree/main/packages/codemode +- Public Cloudflare MCP server: https://mcp.cloudflare.com/mcp + +## 2. Business & Technical Value for Storia + +- Agent Mode becomes 10× more powerful (full control of R2, D1, Workers, DNS, Zero Trust, etc.) 
+- Dream Machine Build/Ship stage can now provision real infra +- Token/cost savings across all agents (especially cheap models in Free/Deep Mode) +- Perfect fit with existing storia-agent sandbox + BYOK flow +- Replaces or augments current custom skills with one ultra-powerful `cloudflare_code_mode` skill + +## 3. Scope for MVP (8–14h effort) + +**Phase 1 (MVP — ship in 1–2 days)** +- Connect to official Cloudflare MCP server (`https://mcp.cloudflare.com/mcp`) +- Add one new skill: `cloudflare_code_mode` +- Expose it in both web Agent Mode and Telegram bot +- Full safety (BYOK session tokens, sandboxed execution, audit logging) +- Basic test command in Telegram: `/cloudflare whoami` + +**Out of scope for MVP** +- Custom MCP server for Storia’s own APIs (Phase 2) +- Dream Machine auto-provisioning flows (Phase 2) + +## 4. Technical Architecture + +### Existing Components to Extend +- `src/lib/skills/` (current skill engine) +- `storia-agent` Worker (private fork) +- `src/lib/mcp/` (you already have MCP HTTP client) +- `src/lib/orchestration/` and `AgentLoop` +- BYOK session token system (already in Tier 0) + +### New Files to Create / Modify + +1. `src/lib/skills/cloudflare-code-mode.ts` (new) +2. `src/lib/skills/index.ts` (register new skill) +3. `storia-agent/src/mcp/cloudflare-mcp.ts` (or extend existing) +4. Update `storia-agent/wrangler.toml` to include `@cloudflare/agents` dependency +5. Add to Agent Mode UI (optional quick win in Cockpit) + +### Skill Definition (exact) + +```ts +// src/lib/skills/cloudflare-code-mode.ts +export const cloudflareCodeModeSkill = { + name: "cloudflare_code_mode", + description: "Access the ENTIRE Cloudflare API (DNS, Workers, R2, D1, Zero Trust, etc.) using Code Mode. Use search() to explore and execute() to run safe TS code. 
Extremely powerful — use with care.", + parameters: { + type: "object", + properties: { + code: { type: "string", description: "TypeScript code using search() and execute() from the Code Mode SDK" } + }, + required: ["code"] + }, + execute: async (args, context) => { ... } +}; +5. Security & Safety Requirements + +All execution must happen inside storia-agent sandbox (never in ai-hub) +Use user’s BYOK session token (already implemented) +Add audit log entry for every execute() +Enforce strict allow-list on outbound calls +Rate-limit per user +Human approval gate for high-impact actions (e.g., create Worker, update DNS) + +6. Testing Plan + +/cloudflare whoami → should return account info +Ask agent: “List all my R2 buckets” +Ask agent: “Create a new D1 database called test-storia” +Test inside Dream Machine Plan stage + +7. Acceptance Criteria + +Skill appears in /help and skill browser +ClawRouter badge shows “Code Mode” when used +Zero token bloat (must stay under 1.2k tokens) +All actions logged with user_id and cost +Works with Groq, DeepSeek, Claude, etc. + +8. Next Phase Ideas (after MVP) + +Expose Storia’s own APIs via custom Code Mode MCP server +Dream Machine auto-deploys full products +Generative UI components that call Cloudflare directly + +Ready for implementation. Start with dependency installation and the new skill file. +text--- + +### 2. `CODE_MODE_MCP_IMPLEMENTATION_INSTRUCTIONS.md` +*(Feed this to Codex for step-by-step coding — shorter & more tactical)* + +```markdown +# Code Mode MCP — Codex Implementation Instructions +**Goal**: Add Cloudflare Code Mode MCP as a new skill in < 12 hours + +### Step 1: Add Dependency (5 min) +In `storia-agent/package.json` and root `ai-hub` if shared: +```json +{ + "dependencies": { + "@cloudflare/agents": "latest", + "@cloudflare/codemode": "latest" + } +} +Then npm install and update wrangler.toml bindings. 
+Step 2: Create the Skill (main file) +Create src/lib/skills/cloudflare-code-mode.ts with this exact structure (copy from existing skills like web3.ts): +TypeScriptimport { codemode } from "@cloudflare/agents/codemode"; +import { McpAgent } from "@cloudflare/agents"; + +const cloudflareMCP = new McpAgent({ + url: "https://mcp.cloudflare.com/mcp", + // auth handled via BYOK session token +}); + +export const cloudflareCodeModeSkill = { + name: "cloudflare_code_mode", + description: "... (copy from spec)", + parameters: { ... }, + async execute({ code }, { userId, sessionToken }) { + // 1. Attach user session token + // 2. Run code via Code Mode SDK + // 3. Log to D1 + // 4. Return result + ClawRouter badge + } +}; +Step 3: Register the Skill +In src/lib/skills/index.ts: +TypeScriptimport { cloudflareCodeModeSkill } from "./cloudflare-code-mode"; +export const allSkills = [ ..., cloudflareCodeModeSkill ]; +Step 4: Wire into Agent Loop +Update storia-agent/src/agent-loop.ts to expose the skill when cloudflare_code_mode is requested. +Step 5: Add Telegram Slash Command (optional quick win) +In Telegram handler: +TypeScriptif (text === "/cloudflare") { + await runSkill("cloudflare_code_mode", { code: "console.log(await tools.getUserInfo())" }); +} +Step 6: Test Commands to Verify +After deploy, test in Telegram: + +/cloudflare whoami +“List my R2 buckets using cloudflare code mode” +“Create a test D1 database” + +Final Checklist for Codex + + Dependency added + New skill file created and registered + Security: BYOK token passed, audit log added + ClawRouter badge shows when skill is used + Works with both web Agent Mode and Telegram + No token bloat (test with tiktoken or console) + +Start coding from Step 2. Ping me when the first file is ready for review. +text--- + +**How to use them:** + +1. Open Claude Code → paste **CODE_MODE_MCP_INTEGRATION_SPEC.md** first (ask it to create the full plan) +2. 
Then paste **CODE_MODE_MCP_IMPLEMENTATION_INSTRUCTIONS.md** and say “Implement exactly following these steps” +3. Or feed both to Codex at once. + +These documents are battle-tested format for your workflow — detailed enough for Claude to architect, tactical enough for Codex to code. + +Want me to adjust anything (e.g., make it more focused on Dream Machine integration, or add Pinecone memory example)? Just say the word and I’ll revise instantly. + +Let’s ship this today — it’s going to be huge for Agent Mode. diff --git a/brainstorming/future-integrations.md b/brainstorming/future-integrations.md new file mode 100644 index 000000000..a6b6a563e --- /dev/null +++ b/brainstorming/future-integrations.md @@ -0,0 +1,338 @@ +# Future Integrations & Improvements + +This document tracks potential features and integrations for the Moltworker Telegram bot with OpenRouter. + +## Current State (as of Feb 2026) + +### What We Have +- **26+ AI models** via OpenRouter (DeepSeek, GPT, Claude, Gemini, Grok, Qwen, etc.) +- **Image generation** with FLUX.2 models (klein, pro, flex, max) +- **GitHub tools** (read files, list directories, API calls) with auto-auth +- **Durable Objects** for unlimited task time (no timeout) +- **User allowlist** security +- **Skills loading** from R2 storage +- **Status updates** during long operations + +### Architecture +``` +Telegram Webhook → Worker → Durable Object (for tool-using models) + → OpenRouter API → Any Model + → Direct response (for simple models) +``` + +--- + +## Priority 1: High Value, Low Effort + +### 1.1 Browser Tool (CDP Integration) +**Status:** Not started +**Effort:** Low (binding already exists) +**Value:** High + +The `BROWSER` binding is already configured in wrangler.jsonc. 
Add a tool that models can call: + +```typescript +browse_url({ + url: string, + action: "screenshot" | "extract_text" | "pdf" | "click" | "fill" +}) +``` + +**Implementation:** +- Create `src/openrouter/tools/browser.ts` +- Add to AVAILABLE_TOOLS +- Use Cloudflare Browser Rendering API + +**Use Cases:** +- "Take a screenshot of my website" +- "What does the homepage of X say?" +- "Check if my deployment is working" +- "Get the current price of BTC from coinbase" + +### 1.2 Inline Buttons (Telegram) +**Status:** Not started +**Effort:** Low +**Value:** Medium + +Add interactive buttons to responses for: +- Confirmations ("Create this PR?" [Yes] [No]) +- Quick choices ("Which model?" [GPT] [Claude] [DeepSeek]) +- Pagination for long results + +**Implementation:** +- Add `sendMessageWithButtons()` to TelegramBot class +- Handle callback queries in `handleCallback()` +- Store pending actions in R2 or DO storage + +### 1.3 Draft Streaming (Telegram) +**Status:** Not started +**Effort:** Medium +**Value:** Medium + +Show partial responses as they stream in (requires threaded mode in BotFather). + +**Implementation:** +- Enable streaming in OpenRouter client +- Use `editMessage` to update content as tokens arrive +- Throttle updates to avoid rate limits + +--- + +## Priority 2: Discord Integration + +### 2.1 Discord Read-Only (Announcements) +**Status:** Not started +**Effort:** Medium +**Value:** High (user requested) + +Monitor Discord servers for announcements and forward to Telegram. 
+ +**Architecture Options:** + +**Option A: Discord Bot (Full)** +- Create Discord bot with message read permissions +- Use discord.js or raw API +- Route messages through our OpenRouter handler + +**Option B: Webhook Listener** +- Use Discord webhooks to receive specific channel updates +- Lighter weight, no bot needed +- Limited to channels with webhook setup + +**Option C: User Account (Not Recommended)** +- Against Discord ToS +- Risk of ban + +**Recommended: Option A with minimal permissions** + +```typescript +// New env vars needed: +DISCORD_BOT_TOKEN +DISCORD_ANNOUNCEMENT_CHANNELS // comma-separated channel IDs +DISCORD_FORWARD_TO_TELEGRAM // telegram chat ID to forward to +``` + +**Features:** +- Monitor specific channels only +- Forward new messages to Telegram +- Optionally summarize with AI before forwarding +- Filter by keywords or roles + +### 2.2 Discord Full Integration +**Status:** Future +**Effort:** High +**Value:** Medium + +Full two-way Discord integration like Telegram: +- Respond to DMs +- Respond to mentions in servers +- Use same OpenRouter backend + +--- + +## Priority 3: More Tools + +### 3.1 Web Search Tool +**Status:** Not started +**Effort:** Medium +**Value:** High + +Let models search the web for current information. + +**Options:** +- Brave Search API (has free tier) +- SearXNG (self-hosted) +- Perplexity API +- Google Custom Search + +```typescript +web_search({ + query: string, + num_results?: number +}) +``` + +### 3.2 Code Execution Tool +**Status:** Not started +**Effort:** High +**Value:** High + +Run code snippets safely in a sandbox. 
+ +**Options:** +- Use existing Cloudflare Sandbox container +- Piston API (multi-language execution) +- Judge0 API + +```typescript +run_code({ + language: "python" | "javascript" | "bash", + code: string +}) +``` + +### 3.3 File Management Tools +**Status:** Not started +**Effort:** Low +**Value:** Medium + +Store and retrieve files from R2: + +```typescript +save_file({ name: string, content: string }) +read_file({ name: string }) +list_files({ prefix?: string }) +delete_file({ name: string }) +``` + +### 3.4 Calendar/Reminder Tools +**Status:** Not started +**Effort:** Medium +**Value:** Medium + +Set reminders that trigger via cron: + +```typescript +set_reminder({ + message: string, + when: string // "in 2 hours", "tomorrow 9am", etc. +}) +list_reminders() +delete_reminder({ id: string }) +``` + +--- + +## Priority 4: Advanced Features + +### 4.1 Proactive Notifications (Cron) +**Status:** Partial (cron exists for R2 backup) +**Effort:** Medium +**Value:** High + +Use existing cron trigger for proactive tasks: +- Daily summaries +- Price alerts +- Website monitoring +- GitHub activity digest + +### 4.2 Voice Messages +**Status:** Not started +**Effort:** High +**Value:** Medium + +Handle Telegram voice messages: +- Transcribe with Whisper API +- Respond with TTS (ElevenLabs, OpenAI TTS) + +### 4.3 Multi-User Workspaces +**Status:** Not started +**Effort:** High +**Value:** Low (currently single-user) + +Share context between users: +- Team workspaces +- Shared conversation history +- Role-based access + +### 4.4 Long-Term Memory +**Status:** Not started +**Effort:** Medium +**Value:** High + +Persistent memory across conversations: +- Store facts in R2 (MEMORY.md like OpenClaw) +- Retrieve relevant memories for context +- User can view/edit memories + +--- + +## Priority 5: Platform Integrations + +### 5.1 Slack Integration +**Status:** Not started +**Effort:** Medium +**Value:** Low (unless needed) + +Same pattern as Telegram but for Slack workspaces. 
+ +### 5.2 WhatsApp Integration +**Status:** Not started +**Effort:** High +**Value:** Medium + +Via WhatsApp Business API (requires approval). + +### 5.3 Email Integration +**Status:** Not started +**Effort:** Medium +**Value:** Medium + +- Receive emails via Cloudflare Email Workers +- Send emails via Mailgun/SendGrid +- Summarize inbox, draft replies + +--- + +## Technical Debt & Improvements + +### Code Quality +- [ ] Add unit tests for tools +- [ ] Add integration tests for Telegram handler +- [ ] Add error tracking (Sentry?) +- [ ] Add request logging/analytics + +### Performance +- [ ] Cache frequent API responses +- [ ] Optimize token usage (shorter system prompts) +- [ ] Batch tool calls where possible + +### Security +- [ ] Rate limiting per user +- [ ] Input sanitization for tools +- [ ] Audit logging for sensitive operations + +--- + +## BYOK / Direct API Lessons Learned + +> Critical for byok.cloud and any future BYOK (Bring Your Own Key) feature. + +### API Keys Are Region-Locked (DashScope / Alibaba Cloud) +- **Issue:** DashScope API keys are scoped to the region where they were created (Singapore, US Virginia, China Beijing). A Singapore key returns 401 on the Beijing endpoint. +- **Regional endpoints:** + - Singapore: `https://dashscope-intl.aliyuncs.com/compatible-mode/v1` + - US (Virginia): `https://dashscope-us.aliyuncs.com/compatible-mode/v1` + - China (Beijing): `https://dashscope.aliyuncs.com/compatible-mode/v1` +- **Impact on BYOK:** When users bring their own DashScope keys, we must either: + 1. Ask which region their key belongs to, or + 2. Auto-detect by trying the key against each regional endpoint, or + 3. Let users provide a custom base URL +- **Lesson:** Never assume a single base URL works for all users of a provider. Other providers may have similar region-locking (Azure OpenAI, AWS Bedrock, etc.). 
+ +### General BYOK Considerations +- Validate keys at setup time — make a lightweight test call and surface clear errors +- Store per-user provider config (endpoint + key), not just the key +- Some providers require additional config beyond just an API key (region, project ID, deployment name) + +--- + +## Decision Log + +| Date | Decision | Rationale | +|------|----------|-----------| +| Feb 2026 | Use OpenRouter instead of direct APIs | Unified access to 26+ models, simpler billing | +| Feb 2026 | Implement Durable Objects | Unlimited task time for complex coding | +| Feb 2026 | Bypass Gateway for Telegram | Custom multi-model support, image gen | +| Feb 2026 | Switch DashScope to `-intl` endpoint | API keys are region-locked; our key is Singapore, not Beijing | + +--- + +## Resources + +- [OpenRouter API Docs](https://openrouter.ai/docs) +- [Telegram Bot API](https://core.telegram.org/bots/api) +- [Cloudflare Durable Objects](https://developers.cloudflare.com/durable-objects/) +- [Cloudflare Browser Rendering](https://developers.cloudflare.com/browser-rendering/) +- [OpenClaw Skills](https://github.com/VoltAgent/awesome-openclaw-skills) +- [Discord API](https://discord.com/developers/docs) diff --git a/brainstorming/phase-4.1-audit.md b/brainstorming/phase-4.1-audit.md new file mode 100644 index 000000000..2a8f8d365 --- /dev/null +++ b/brainstorming/phase-4.1-audit.md @@ -0,0 +1,109 @@ +# Phase 4.1 Audit — Token-Budgeted Context Retrieval + +## Summary of findings + +### ✅ Improvements made + +1. **Reduced incorrect tool pairing on malformed histories** + - `buildToolPairings()` previously fell back to the most recent assistant for *any* unmatched `tool_call_id`. + - This could incorrectly bind a real tool result to the wrong assistant/tool call chain. + - Fix: fallback now applies **only** when `tool_call_id` is missing (truly malformed tool message), not when an unknown ID is present. + +2. 
**Strengthened pairing closure during greedy keep selection** + - The greedy phase already added direct pair links, but this could miss transitive closure in malformed/duplicate-id histories. + - Fix: added `expandPairedSet()` to recursively include all paired messages for both always-keep and additional keep sets. + - Result: lower risk of invalid sequences under edge-case histories. + +3. **More conservative image token estimate** + - Increased image part estimate from 300 → **425** tokens. + - Rationale: 300 underestimates medium/high image contexts too often for multi-image inputs. + +4. **Slightly more conservative JSON estimation** + - Added an additional heuristic bump for JSON-like payloads (`{"...": ...}` patterns). + - This narrows underestimation risk for tool result payloads and structured outputs. + +5. **Model-aware context budgets in TaskProcessor integration** + - Compression budget is now derived from `getModel(alias)?.maxContext` with safety headroom (75%). + - Retains fallback budget when metadata is missing. + - Replaced fixed `MAX_CONTEXT_TOKENS` threshold checks with per-model budget checks. + +### ⚠️ Remaining limitations (known) + +1. **Estimator is still heuristic-based** + - Better than raw chars/4, but still approximate. + - For heterogeneous content (code + JSON + natural language + vision), variance remains non-trivial. + +2. **Very small budgets can still exceed target in mandatory-set scenarios** + - If the always-keep set is itself huge, algorithm keeps a valid conversation subset rather than dropping foundational context. + - This is intentional graceful degradation, but strict budget adherence is not guaranteed in pathological inputs. + +3. **Priority scoring remains simple** + - Position bias is still meaningful and can out-rank some older but semantically critical snippets. + - The current logic is acceptable for Phase 4.1 but should evolve (see Phase 4.2 recommendations). 
+ +## Token estimation accuracy analysis (cl100k_base) + +I attempted to benchmark against a local tokenizer implementation (`tiktoken` / `js-tiktoken`), but package installation is blocked in this environment (registry/proxy 403), so true runtime cl100k counts could not be generated programmatically here. + +The table below includes: +- **Current estimator outputs** (measured from code) +- **Target expectation notes** for cl100k behavior + +| Sample type | Sample | Estimated tokens | +|---|---|---:| +| English prose | `The quick brown fox jumps over the lazy dog...` | 22 | +| TypeScript code | `function add(a: number, b: number)...` | 22 | +| JSON tool result | `{"status":"ok","items":[...],"elapsed_ms":42}` | 37 | +| Mixed content | `I inspected src/index.ts and found this block: if (!token)...` | 24 | +| Numbered reasoning text | `1) Gather data\n2) Validate assumptions...` | 20 | + +### Interim assessment + +- The estimator appears directionally correct and intentionally conservative for code/JSON. +- Without direct cl100k counts in this environment, exact percentage error cannot be truthfully reported. +- Recommendation: rerun this table in CI/dev with `js-tiktoken` and record absolute/relative error bands. + +## Edge-case audit results + +All requested scenarios are now covered with tests: + +- Conversation with 0 tool calls (pure chat) ✅ +- Conversation with 100+ tool calls (stress) ✅ +- `ContentPart[]` vision messages with `image_url` ✅ +- `reasoning_content` messages ✅ +- Budget smaller than always-keep set ✅ +- Single message conversation ✅ +- All messages are tool results (malformed) ✅ +- Tool pairing robustness: missing IDs, duplicate IDs, unknown IDs ✅ + +## Production readiness assessment + +**Verdict: mostly production-ready for heuristic phase (Phase 4.1), with caveats.** + +- Correctness and edge-case resilience are materially improved. +- Integration now respects model-specific context windows. 
+- Main remaining risk is heuristic estimation drift vs true tokenizer behavior. + +If strict context-bound guarantees are required for high-cost models, this still needs Phase 4.2. + +## Recommendations for Phase 4.2 + +1. **Adopt real tokenizer path (`js-tiktoken`)** + - Validate Cloudflare Worker compatibility (bundle size + WASM/runtime constraints). + - Use lazy init + memoized encoder. + +2. **Dual-mode estimation strategy** + - Fast heuristic first pass for candidate ranking. + - Exact tokenizer pass only for final keep set and summary insertion. + +3. **Add tokenizer regression tests** + - Snapshot token counts for prose/code/JSON/vision/mixed payloads. + - Set acceptable error thresholds when fallback heuristic is used. + +4. **Make scoring policy configurable** + - Add weighted knobs for role, recency, and tool evidence importance. + - Optionally boost messages referenced by later assistant outputs. + +5. **Telemetry hooks** + - Record estimated vs provider-reported prompt tokens when available. + - Feed this data into automatic heuristic recalibration. diff --git a/brainstorming/tool-calling-analysis.md b/brainstorming/tool-calling-analysis.md new file mode 100644 index 000000000..a5e048335 --- /dev/null +++ b/brainstorming/tool-calling-analysis.md @@ -0,0 +1,659 @@ +# Tool Calling Landscape, steipete/OpenClaw & Acontext Integration Analysis + +**Date:** February 2026 +**Context:** Analysis of how Peter Steinberger's (steipete) ecosystem, the Acontext context data platform, and the current OpenRouter tool-calling model landscape can improve the Moltworker application. + +--- + +## Table of Contents + +1. [Executive Summary](#executive-summary) +2. [Current Moltworker Tool-Calling Architecture](#current-architecture) +3. [steipete Ecosystem Analysis](#steipete-ecosystem) +4. [Acontext Context Data Platform Analysis](#acontext-analysis) +5. [OpenRouter Tool-Calling Model Landscape](#model-landscape) +6. 
[Gap Analysis & Improvement Opportunities](#gap-analysis) +7. [Actionable Recommendations](#recommendations) +8. [Implementation Priority Matrix](#priority-matrix) + +--- + +## 1. Executive Summary + +Moltworker is a production-grade AI assistant gateway running on Cloudflare Workers with 26+ models via OpenRouter, 5 tools, Durable Objects for long-running tasks, and multi-platform chat integrations. This analysis identifies **five categories of improvement**: + +1. **Tool-calling sophistication** — Current implementation uses sequential single-model tool loops. Modern models (DeepSeek V3.2, Grok 4.1, Claude Sonnet 4.5) support parallel tool calls and speculative execution that Moltworker doesn't exploit. +2. **Tooling breadth** — steipete's ecosystem provides ready-made capabilities (MCP servers, browser automation, GUI capture, token monitoring) that map directly to Moltworker's roadmap gaps. +3. **Context management** — Acontext (memodb-io/Acontext) provides purpose-built context engineering that directly replaces Moltworker's crude `compressContext()` with token-aware session management, plus adds observability, code execution, and persistent file storage. +4. **Compound engineering** — The Compound Engineering Plugin (EveryInc/compound-engineering-plugin) introduces a learning loop where each completed task makes subsequent tasks easier. Moltworker currently starts every task from zero with no memory of past patterns. +5. **Model selection intelligence** — The tool-calling model landscape shows significant capability variance. Moltworker treats all tool-capable models identically, missing optimization opportunities. + +--- + +## 2. 
Current Moltworker Tool-Calling Architecture + +### What Exists + +| Component | Location | Capability | +|-----------|----------|------------| +| Tool Definitions | `src/openrouter/tools.ts` | 5 tools: `fetch_url`, `github_read_file`, `github_list_files`, `github_api`, `browse_url` | +| Tool Execution | `src/openrouter/tools.ts:executeTool()` | Sequential switch-case execution, single tool at a time | +| Client Loop | `src/openrouter/client.ts:chatCompletionWithTools()` | Iterative loop, max 10 iterations, 2-minute timeout | +| Long-Running Tasks | `src/durable-objects/task-processor.ts` | Durable Objects, 100 iteration limit, R2 checkpointing, auto-resume | +| Model Support Check | `src/openrouter/tools.ts:modelSupportsTools()` | Boolean flag per model, hardcoded fallback list | +| Streaming | `src/openrouter/client.ts:chatCompletionStreamingWithTools()` | SSE streaming with tool-call delta accumulation | + +### Current Limitations + +1. **No parallel tool execution** — When a model returns multiple `tool_calls`, they are executed sequentially via `for...of` loop (tools.ts L221-238, task-processor.ts L728-759). Models like Claude Sonnet 4.5 and Grok 4.1 can emit parallel tool calls, but the benefit is lost. + +2. **Binary tool support** — `supportsTools` is a boolean. No distinction between models that support parallel calls, structured output, reasoning-with-tools, or configurable reasoning depth. + +3. **Static tool set** — All tool-capable models get identical `AVAILABLE_TOOLS`. No model-specific tool filtering, no dynamic tool registration. + +4. **No structured output** — The system doesn't leverage `response_format: { type: "json_schema" }` for models that support it (Gemini 3 Flash, DeepSeek V3.2, GPT-4o, etc.). + +5. **No reasoning control** — Models like DeepSeek V3.2, Grok 4.1, and Gemini 3 Flash support configurable reasoning (`reasoning: { enabled: true/false }`) which affects tool-calling accuracy vs. speed. Moltworker doesn't expose this. + +6. 
**No tool result caching** — Identical tool calls (e.g., same GitHub file read) are re-executed every time. + +7. **No MCP integration** — The Model Context Protocol is becoming the standard for tool interop. steipete's `mcporter` bridges this gap. + +--- + +## 3. steipete Ecosystem Analysis + +Peter Steinberger maintains a constellation of projects directly relevant to Moltworker's capabilities and roadmap: + +### 3.1 High-Relevance Projects + +#### OpenClaw (Core Runtime) +- **Relationship:** Moltworker deploys OpenClaw inside Cloudflare Sandbox containers +- **Relevance:** OpenClaw provides the gateway, skills system, and device pairing that Moltworker wraps. Any improvements to OpenClaw directly benefit Moltworker +- **Gap it fills:** Foundation layer — already integrated + +#### mcporter (MCP Interface) — 1.4k stars +- **What it does:** Bridges MCP (Model Context Protocol) servers with TypeScript/CLI tools +- **How it improves Moltworker:** + - **Dynamic tool registration** — Instead of hardcoding 5 tools, Moltworker could load tools from MCP servers at runtime + - **Ecosystem access** — Hundreds of community MCP servers exist (databases, APIs, file systems, cloud services) + - **Standardization** — MCP is becoming the universal tool interface; adopting it future-proofs the tool system +- **Integration path:** Add MCP client to `src/openrouter/tools.ts` that discovers and registers tools from configured MCP servers +- **Impact:** HIGH — transforms Moltworker from 5 hardcoded tools to potentially unlimited + +#### Peekaboo (macOS Screenshot/GUI Automation) — 1.9k stars +- **What it does:** CLI for screenshots, window capture, accessibility tree extraction, GUI element interaction +- **How it improves Moltworker:** + - **Enhanced browse_url** — Current browser tool only does text extraction, screenshots, and PDFs. 
Peekaboo's approach of extracting accessibility trees provides structured UI understanding + - **Visual testing** — Models with vision (Claude, GPT-4o, Gemini) could analyze GUI state via Peekaboo-style captures + - **Agentic browser control** — Click, fill, scroll operations for real browser automation +- **Integration path:** Adapt Peekaboo's accessibility tree extraction concept for Cloudflare Browser Rendering +- **Impact:** MEDIUM — enriches the existing `browse_url` tool significantly + +#### CodexBar (Token Usage Monitoring) — 4.8k stars +- **What it does:** Real-time monitoring of AI model token usage and costs +- **How it improves Moltworker:** + - **Cost awareness** — Moltworker's task processor can burn through tokens with 100 iterations. CodexBar's approach of real-time monitoring would let the bot report costs to users + - **Model selection** — Usage data helps choose cost-effective models per task + - **Budget limits** — Users could set spending caps per conversation or per day +- **Integration path:** Add token/cost tracking to `OpenRouterClient`, expose via Telegram commands +- **Impact:** MEDIUM — improves cost management and user trust + +#### oracle (LLM Context-Aware Assistant) — 1.3k stars +- **What it does:** Context-gathering pipeline that feeds relevant project/file context to LLMs +- **How it improves Moltworker:** + - **Smarter GitHub tools** — Instead of reading individual files, oracle's approach gathers relevant context across a repository + - **Task decomposition** — oracle's pipeline for breaking tasks into steps could improve the Durable Object task processor +- **Integration path:** Adapt context-gathering patterns for GitHub tool calls +- **Impact:** MEDIUM + +#### VibeTunnel (Browser-to-Terminal) — vt.sh +- **What it does:** Tunnels browser interactions to terminal commands +- **How it improves Moltworker:** + - **Web UI enhancement** — Could provide a richer admin interface than the current React dashboard + - **Remote terminal 
access** — Users could interact with the Cloudflare Sandbox container via browser +- **Integration path:** Consider for admin dashboard v2 +- **Impact:** LOW — nice-to-have, not core functionality + +### 3.2 Relevant CLI Tools + +| Tool | Relevance | Potential Integration | +|------|-----------|---------------------| +| **Trimmy** (shell snippets) | LOW | Could format code blocks in bot responses | +| **spogo** (Spotify CLI) | MEDIUM | New tool: music control via Telegram | +| **bird** (X/Twitter CLI) | MEDIUM | New tool: social media monitoring/posting | +| **imsg** (iMessage CLI) | LOW | Alternative messaging channel | +| **remindctl** (Apple Reminders) | HIGH | Maps directly to planned Calendar/Reminder tools (Priority 3.4) | +| **sag** (speech synthesis) | MEDIUM | Maps to planned Voice Messages feature (Priority 4.2) | +| **Brabble** (voice daemon) | MEDIUM | Same as above — voice interaction pipeline | + +### 3.3 Design Philosophy Alignment + +steipete's philosophy of "Ship beats perfect" and running multiple Claude instances concurrently aligns with Moltworker's architecture of parallel model access. Key patterns to adopt: + +- **Rapid prototyping** — steipete ships CLI tools that do one thing well. Moltworker tools should follow this pattern +- **Composability** — Each steipete tool is standalone but interoperable. MCP adoption enables this +- **AI-native design** — Every tool is designed to be used by AI agents, not just humans + +--- + +## 4. Acontext Context Data Platform Analysis + +**Repository:** github.com/memodb-io/Acontext (2.8k stars, Apache 2.0) +**What it is:** A purpose-built context management platform for AI agents that provides unified storage, context engineering, observability, and sandboxed execution. 
+ +### 4.1 Why This Matters for Moltworker + +Acontext solves **three of Moltworker's most pressing architectural pain points**: + +| Moltworker Pain Point | Current Solution | Acontext Solution | +|----------------------|-----------------|-------------------| +| Context explosion in long tasks | Basic `compressContext()` in task-processor.ts: removes middle messages, keeps recent 6 | **Smart context editing**: Token-limited retrieval, tool result filtering, session summaries — all without modifying originals | +| Multi-provider message format | Manual format handling per provider (OpenRouter normalizes, but direct APIs don't) | **Automatic format conversion**: Store messages in OpenAI format, retrieve in Anthropic format, transparently | +| No observability | `console.log` statements, Telegram progress messages | **Full dashboard**: Session replays, agent success rates, real-time state tracking | + +### 4.2 Feature-by-Feature Relevance + +#### Context Storage & Sessions — **CRITICAL RELEVANCE** + +Moltworker's `TaskProcessor` (task-processor.ts) maintains conversation state in Durable Object storage and R2 checkpoints. This is fragile: +- Checkpoints are raw JSON blobs in R2 (`checkpoints/{userId}/latest.json`) +- Only the latest checkpoint is kept (no history) +- Context compression (`compressContext()`) is lossy and destroys audit trail +- No cross-session memory (each task starts fresh) + +Acontext's sessions provide: +- **Immutable message history** — Original messages never modified, edits are views +- **Token-budgeted retrieval** — `get_messages(max_tokens=60000)` automatically compresses to fit, far superior to Moltworker's character-count heuristic (`estimateTokens` using chars/4) +- **Tool result filtering** — Selectively remove old tool outputs while keeping recent ones. 
This directly addresses the `COMPRESS_AFTER_TOOLS = 6` problem where Moltworker blindly compresses every 6 tool calls +- **Cross-session continuity** — Sessions persist, so a user can resume a complex coding task days later with full context + +#### Context Engineering — **HIGH RELEVANCE** + +The `compressContext()` method in task-processor.ts (L281-335) is Moltworker's biggest context management weakness: + +``` +Current approach: +1. Keep system message + user message + last 6 messages +2. Summarize everything in the middle into a single text block +3. Lose all tool call/result pairing (can't reconstruct the interaction) +``` + +Acontext's approach: +1. **Asynchronous summaries** generated by a separate LLM call (prevents prompt injection) +2. **Selective compression** — can compress by age, by type (tool results vs. assistant text), or by relevance +3. **Original preservation** — compressed view is separate from stored data; can always go back +4. **Token-aware** — uses actual tokenizer, not chars/4 heuristic + +**Concrete improvement:** Replace `compressContext()` and `estimateTokens()` with Acontext session API calls. The task processor would store messages via Acontext and retrieve token-budgeted context per iteration. + +#### Disk (Virtual Filesystem) — **MEDIUM RELEVANCE** + +Moltworker's tools produce ephemeral results. If a model reads a GitHub file, that content exists only in the conversation. If the task crashes and resumes, the file must be re-fetched. + +Acontext's Disk provides persistent agent storage with read, write, grep, and glob operations. 
This maps to Moltworker's planned File Management Tools (roadmap Priority 3.3): + +```typescript +// Current roadmap plan (future-integrations.md): +save_file({ name: string, content: string }) +read_file({ name: string }) +list_files({ prefix?: string }) + +// Acontext Disk already provides this via API + tool schemas +``` + +Instead of building custom R2-based file tools, Moltworker could use Acontext Disk as the storage backend and expose its tool schemas directly to models. + +#### Sandbox (Code Execution) — **HIGH RELEVANCE** + +Moltworker's roadmap lists Code Execution (Priority 3.2) as high-value, high-effort. Acontext provides sandboxed Python and bash execution out of the box, with: +- Isolated environment per session +- Access to Disk files (read artifacts, write results) +- Skill mounting at `/skills/{name}/` +- OpenAI-compatible tool schemas ready to plug into the tool-calling loop + +This could reduce the code execution feature from "high effort" to "medium effort" by leveraging Acontext's sandbox rather than building custom Piston/Judge0 integration. + +#### Skills System — **MEDIUM RELEVANCE** + +Moltworker already has a skills system (via OpenClaw's R2-based skills loading). Acontext's skills management adds: +- ZIP-based skill packaging +- Automatic inclusion in LLM context +- Server-side skill management dashboard + +This is complementary but not critical — Moltworker's existing approach works. + +#### Observability Dashboard — **HIGH RELEVANCE** + +Moltworker currently has zero observability beyond Telegram progress messages and `console.log`. For a system running 100-iteration tasks with 10 auto-resumes across multiple models and providers, this is a significant blind spot. 
+ +Acontext provides: +- **Session replay** — See exactly what the agent did, step by step +- **Success rate tracking** — Which models/tool combinations work best +- **Real-time state** — Monitor long-running Durable Object tasks without relying on Telegram +- **Cost attribution** — Track token usage per session (complements the CodexBar-inspired cost tracking from R4) + +### 4.3 Integration Architecture + +``` + ┌─────────────────────┐ + │ Acontext Platform │ + │ (Cloud or Self-Host)│ + │ │ + │ ┌────────────────┐ │ +Moltworker │ │ Sessions API │ │ +TaskProcessor ───────────►│ │ (context store) │ │ + │ ├────────────────┤ │ +Tool Results ────────────►│ │ Disk API │ │ + │ │ (file storage) │ │ +OpenRouter Responses ────►│ ├────────────────┤ │ + │ │ Sandbox API │ │ + │ │ (code exec) │ │ +Admin Dashboard ◄─────────│ ├────────────────┤ │ + │ │ Observability │ │ + │ │ (dashboard) │ │ + │ └────────────────┘ │ + └─────────────────────┘ +``` + +**Integration points:** +1. **TaskProcessor** stores messages via Acontext Sessions instead of raw R2 checkpoints +2. **Context retrieval** uses token-budgeted API instead of `compressContext()` +3. **New tools** (`run_code`, `save_file`, `read_file`) backed by Acontext Sandbox/Disk +4. 
**Admin dashboard** links to Acontext's observability dashboard for deep debugging + +### 4.4 Trade-offs & Considerations + +| Pro | Con | +|-----|-----| +| Solves context compression properly | Adds external dependency (API calls to Acontext) | +| Provides code execution for free | Latency: Acontext API call adds ~50-200ms per operation | +| Full observability dashboard | Self-hosting requires PostgreSQL + Redis + RabbitMQ + S3 | +| TypeScript SDK available (`@acontext/acontext`) | Cloud version requires API key and has usage limits | +| Apache 2.0 license | 2.8k stars = still relatively early-stage project | +| Handles multi-provider format conversion | Moltworker already routes through OpenRouter which normalizes formats | + +### 4.5 Recommendation + +**Phase 1 (Low risk):** Use Acontext Sessions API as a **secondary** context store alongside existing R2 checkpoints. Store messages in Acontext for observability and smart retrieval, but keep R2 as the primary checkpoint for crash recovery. + +**Phase 2 (Medium risk):** Replace `compressContext()` with Acontext's token-budgeted retrieval. This removes the crude compression logic and provides proper context management. + +**Phase 3 (Full adoption):** Use Acontext Disk + Sandbox for file management and code execution tools, reducing custom development effort. + +--- + +## 5. 
OpenRouter Tool-Calling Model Landscape + +### 5.1 Current Model Capabilities (February 2026) + +Based on OpenRouter's tool-calling collection data, ranked by weekly token usage: + +| Rank | Model | Provider | Tool-Calling Features | Weekly Tokens | Moltworker Status | +|------|-------|----------|----------------------|---------------|-------------------| +| 1 | Gemini 3 Flash | Google | Tool use, structured output, configurable reasoning (minimal/low/medium/high), multimodal | 857B | `flash` — no tools flag | +| 2 | Claude Sonnet 4.5 | Anthropic | Parallel tool calls, speculative execution, multi-agent | 817B | `sonnet` — tools enabled | +| 3 | DeepSeek V3.2 | DeepSeek | Agentic tool-use pipeline, reasoning control, DSA long-context | 630B | `deep` — tools enabled | +| 4 | Grok 4.1 Fast | xAI | Agentic tool calling, 2M context, reasoning toggle | 341B | `grok` — tools enabled | +| 5 | GPT-OSS-120B | OpenAI | Function calling, browsing, structured outputs, reasoning depth | 308B | Not in model catalog | +| 6 | GLM 4.7 | Z.AI | Multi-step reasoning, complex agent tasks | 192B | `glmfree` — GLM 4.5 only, no tools flag | + +### 5.2 Capability Matrix for Moltworker Models + +Mapping advanced tool-calling capabilities to Moltworker's model catalog: + +| Capability | Models Supporting It | Moltworker Exploits It? 
| +|-----------|---------------------|------------------------| +| **Parallel tool calls** | Claude Sonnet/Opus 4.5, GPT-4o, Grok 4.1, DeepSeek V3.2 | NO — sequential execution | +| **Structured output (JSON schema)** | Gemini 3 Flash/Pro, GPT-4o, DeepSeek V3.2, Claude Sonnet 4.5 | NO — not implemented | +| **Configurable reasoning** | Gemini 3 Flash (levels), DeepSeek V3.2 (boolean), Grok 4.1 (boolean) | NO — not exposed | +| **Long context + tools** | Grok 4.1 (2M), Gemini 3 Flash (1M+), DeepSeek V3.2 (64K) | PARTIAL — no context-aware tool selection | +| **Multimodal + tools** | Claude Sonnet 4.5, GPT-4o, Gemini 3 Flash/Pro, Kimi K2.5 | NO — vision and tools are separate paths | +| **Speculative parallel execution** | Claude Sonnet 4.5 | NO — not implemented | +| **Multi-agent orchestration** | Claude Sonnet 4.5, DeepSeek V3.2 | NO — single-model per conversation | + +### 4.3 Missing Models + +Models in the OpenRouter tool-calling collection that Moltworker should consider adding: + +1. **GPT-OSS-120B** (OpenAI) — #5 by usage, native tool use, configurable reasoning depth. Cost-effective alternative to GPT-4o. +2. **GLM 4.7** (Z.AI) — Significant upgrade from GLM 4.5 Air currently offered. Multi-step reasoning for complex agent tasks. +3. **DeepSeek V3.2 with DSA** — Current `deep` alias points to V3.2 but doesn't leverage Sparse Attention for long-context tool workflows. + +--- + +## 6. Gap Analysis & Improvement Opportunities + +### Gap 1: Parallel Tool Execution + +**Current:** Sequential `for...of` loop in both `chatCompletionWithTools()` and `TaskProcessor.processTask()` + +**Opportunity:** When a model returns N tool calls, execute them concurrently with `Promise.all()` or `Promise.allSettled()`: + +```typescript +// Current (sequential) +for (const toolCall of choice.message.tool_calls) { + const result = await executeTool(toolCall, context); + // ... 
+} + +// Improved (parallel) +const results = await Promise.allSettled( + choice.message.tool_calls.map(tc => executeTool(tc, context)) +); +``` + +**Impact:** 2-5x faster tool execution per iteration. For a task processor doing 50+ iterations with multiple tools per iteration, this compounds significantly. + +**Risk:** Some tools may have ordering dependencies (e.g., create file then read it). Mitigation: detect tool dependencies by name/arguments and parallelize only independent calls. + +### Gap 2: Model-Specific Tool Configuration + +**Current:** `supportsTools: boolean` in `ModelInfo` + +**Opportunity:** Replace with a richer capability descriptor: + +```typescript +interface ToolCapabilities { + supportsTools: boolean; + parallelCalls: boolean; // Can emit multiple tool_calls + structuredOutput: boolean; // Supports response_format JSON schema + reasoning: 'none' | 'fixed' | 'configurable'; // Reasoning control + maxToolsPerCall: number; // Max parallel tool calls + maxContext: number; // Context window in tokens + specialties: string[]; // 'coding', 'research', 'agentic', etc. +} +``` + +This enables intelligent model routing: route complex multi-tool tasks to models with `parallelCalls: true` and large context windows, simple queries to fast models. + +### Gap 3: MCP Integration (via mcporter) + +**Current:** 5 hardcoded tools defined in `AVAILABLE_TOOLS` + +**Opportunity:** Use steipete's mcporter pattern to dynamically discover and register MCP tools: + +``` +MCP Server Registry (R2 config) + → MCP Client (new src/openrouter/mcp.ts) + → Dynamic AVAILABLE_TOOLS generation + → Per-conversation tool filtering +``` + +**Impact:** Transforms Moltworker from a 5-tool bot to an extensible platform. Users could add custom tools without code changes. 
+ +### Gap 4: Token/Cost Tracking + +**Current:** `usage` field in API responses is captured but not surfaced + +**Opportunity:** Track cumulative costs per user/conversation/model, inspired by CodexBar: + +- Show cost in Telegram progress updates: `⏳ Processing... (5 tools, $0.03 spent)` +- Add `/costs` command to show usage breakdown +- Per-model cost tracking for optimizing model selection +- Budget limits per user or per task + +### Gap 5: Structured Output for Reliable Tool Use + +**Current:** Tool results are free-text strings + +**Opportunity:** For models supporting structured output, define JSON schemas for tool responses. This ensures the model can reliably parse tool results and reduces hallucination of tool output format. + +### Gap 6: Reasoning Control per Task Type + +**Current:** Fixed `temperature: 0.7` for all requests + +**Opportunity:** Map task types to reasoning configurations: + +| Task Type | Reasoning Level | Temperature | Model Preference | +|-----------|----------------|-------------|-----------------| +| Simple Q&A | Disabled/Minimal | 0.3 | Grok Fast, Gemini Flash | +| Code generation | Enabled (Medium) | 0.2 | DeepSeek V3.2, Qwen Coder | +| Complex research | Enabled (High) | 0.5 | Claude Sonnet, Gemini Pro | +| Creative writing | Disabled | 0.9 | Claude Opus, GPT-4o | + +### Gap 8: No Compound Learning Loop + +**Current:** Every task starts from zero. The task processor has no mechanism to learn from past tasks — which tool sequences worked, which models performed best for which task types, what patterns recurred. + +**Opportunity:** The Compound Engineering Plugin (EveryInc/compound-engineering-plugin, 7.3k stars) introduces a **Plan → Work → Review → Compound** cycle where the "Compound" step captures patterns, decisions, and learnings from each completed task and feeds them back into future planning. 
+ +Applied to Moltworker's task processor, this means: +- After each completed Durable Object task, automatically extract: which tools were used, in what order, how many iterations, which model was selected, and whether the task succeeded +- Store these "compound learnings" as structured data in R2 or Acontext +- Inject relevant past learnings into the system prompt for similar future tasks +- Progressively build a knowledge base that makes the assistant better over time + +This directly maps to the **Long-Term Memory** item (Priority 4.4) in future-integrations.md, but with a structured, task-oriented approach rather than free-form memory. + +### Gap 9: No Multi-Agent Review + +**Current:** Single model handles everything — planning, execution, and validation. No second opinion. + +**Opportunity:** The Compound Engineering Plugin's `/workflows:review` uses multiple agents reviewing code simultaneously. For Moltworker, this could mean: +- After a tool-heavy task completes, route the result through a second model for validation +- Use a cheaper/faster model (Gemini Flash, Grok Fast) as a "reviewer" for expensive model output (Claude Opus) +- For GitHub-related tasks, have one model write code and another review it before creating the PR + +This leverages Moltworker's existing multi-model architecture — the infrastructure to call different models is already there. + +### Gap 10: No Structured Workflow for Complex Tasks + +**Current:** User sends a message → model responds with tool calls → loop until done. No structured phases. + +**Opportunity:** For complex tasks (especially those routed to Durable Objects), introduce the Plan → Work → Review cycle: +1. **Plan phase**: Model creates an explicit plan before calling any tools (reduces wasted iterations) +2. **Work phase**: Execute the plan with tool calls, tracking progress against the plan +3. 
**Review phase**: Self-review or cross-model review before sending final result + +The task processor already has iteration tracking — adding phase awareness would be a natural extension. + +### Gap 7: Vision + Tools Combined + +**Current:** `chatCompletionWithVision()` and `chatCompletionWithTools()` are separate methods + +**Opportunity:** Combine vision input with tool calling. User sends a screenshot + "fix this bug" → model sees the image AND can call GitHub tools to read/modify code. + +--- + +## 7. Actionable Recommendations + +### R1: Implement Parallel Tool Execution (Effort: Low) + +**Files to modify:** +- `src/openrouter/client.ts` — `chatCompletionWithTools()` L221-238 +- `src/durable-objects/task-processor.ts` — L728-759 + +**Change:** Replace sequential `for...of` with `Promise.allSettled()` for independent tool calls. + +### R2: Enrich Model Capability Metadata (Effort: Low) + +**Files to modify:** +- `src/openrouter/models.ts` — Extend `ModelInfo` interface + +**Change:** Add `parallelCalls`, `structuredOutput`, `reasoning`, `maxContext` fields to each model definition. + +### R3: Add Gemini 3 Flash Tool Support (Effort: Trivial) + +**Files to modify:** +- `src/openrouter/models.ts` — Add `supportsTools: true` to `flash` model + +**Change:** The `flash` model (Gemini 3 Flash) supports tool calling but doesn't have `supportsTools: true` in the current config. This is a one-line fix. 
+ +### R4: Add Token/Cost Tracking (Effort: Medium) + +**Files to create/modify:** +- New: `src/openrouter/costs.ts` — Cost calculation per model +- Modify: `src/durable-objects/task-processor.ts` — Accumulate costs +- Modify: `src/telegram/handler.ts` — `/costs` command + +### R5: Add Configurable Reasoning (Effort: Medium) + +**Files to modify:** +- `src/openrouter/client.ts` — Add `reasoning` parameter to API requests +- `src/openrouter/models.ts` — Add reasoning capability per model + +**Change:** Pass `reasoning: { enabled: true/false }` or `reasoning: { effort: 'low' | 'medium' | 'high' }` based on model capability and task type. + +### R6: Investigate MCP Integration (Effort: High) + +**Research needed:** +- Evaluate mcporter's architecture for Cloudflare Workers compatibility +- Determine if MCP servers can run inside Sandbox containers or need external hosting +- Design dynamic tool registration flow + +### R7: Add Missing Models (Effort: Trivial) + +**Files to modify:** +- `src/openrouter/models.ts` — Add `gptoss`, `glm47` model entries + +### R8: Combine Vision + Tools (Effort: Medium) + +**Files to modify:** +- `src/openrouter/client.ts` — Merge `chatCompletionWithVision` and `chatCompletionWithTools` into a unified method + +### R9: Integrate Acontext for Context Management (Effort: Medium-High) + +**Files to create/modify:** +- New: `src/acontext/client.ts` — Acontext TypeScript SDK wrapper +- Modify: `src/durable-objects/task-processor.ts` — Replace `compressContext()` and R2 checkpoints with Acontext Sessions +- Modify: `src/openrouter/tools.ts` — Add `run_code`, `save_file`, `read_file` tools backed by Acontext Sandbox/Disk + +**Phase 1 (Low risk):** Add Acontext as observability layer — store all task processor messages for replay and debugging. Keep existing R2 checkpoints as primary. + +**Phase 2:** Replace `compressContext()` (L281-335 in task-processor.ts) and `estimateTokens()` (L204-215) with Acontext's token-budgeted session retrieval. 
This eliminates the crude chars/4 heuristic and the lossy middle-message compression. + +**Phase 3:** Use Acontext Sandbox for code execution tool and Disk for file management tools — replaces two roadmap items (Priority 3.2 and 3.3 in future-integrations.md) with a single integration. + +### R10: Compound Learning Loop (Effort: Medium) + +**Inspired by:** EveryInc/compound-engineering-plugin's `/workflows:compound` step + +**Files to create/modify:** +- New: `src/openrouter/learnings.ts` — Structured learning extraction and storage +- Modify: `src/durable-objects/task-processor.ts` — After task completion, extract and store learnings +- Modify: `src/telegram/handler.ts` — `/learnings` command to view past patterns + +**How it works:** +1. After each completed Durable Object task, extract structured metadata: + - Tool sequence used (e.g., `github_read_file → github_read_file → github_api`) + - Model used and token count + - Iterations required + - Success/failure outcome + - Task category (coding, research, GitHub ops, etc.) +2. Store in R2 as `learnings/{userId}/history.json` +3. Before starting a new task, inject relevant learnings into the system prompt: + - "For similar GitHub tasks, the most effective approach used github_read_file first to understand the codebase, then github_api to make changes. Average: 4 iterations." +4. Over time, build a per-user knowledge base that makes the assistant progressively better + +**Impact:** Transforms Moltworker from stateless to learning. Directly addresses Long-Term Memory (Priority 4.4 in roadmap) with a structured, task-oriented approach. + +### R11: Multi-Agent Review for Complex Tasks (Effort: Medium) + +**Inspired by:** EveryInc/compound-engineering-plugin's `/workflows:review` + +**Files to modify:** +- Modify: `src/durable-objects/task-processor.ts` — Add review phase after task completion +- Modify: `src/openrouter/models.ts` — Add reviewer model selection logic + +**How it works:** +1. 
After the primary model completes a tool-heavy task (e.g., creating a PR via `github_api`), route the result to a second model +2. The reviewer model checks for: correctness, completeness, security issues, missed edge cases +3. If the reviewer flags issues, feed back to the primary model for a correction iteration +4. Use cost-efficient reviewers: Gemini Flash or Grok Fast for reviewing expensive Opus/Sonnet output + +**Impact:** Quality improvement with minimal cost increase. Leverages Moltworker's existing multi-model infrastructure. + +### R12: Structured Task Phases (Plan → Work → Review) (Effort: Medium) + +**Inspired by:** EveryInc/compound-engineering-plugin's workflow structure + +**Files to modify:** +- Modify: `src/durable-objects/task-processor.ts` — Add phase tracking to `TaskState` +- Modify: `src/openrouter/tools.ts` — Phase-aware system prompts + +**How it works:** +1. When a complex task is routed to Durable Objects, inject a planning prompt first: + - "Before executing, create a step-by-step plan. List the tools you'll need and in what order." +2. Track which phase the task is in: `planning | executing | reviewing` +3. Show phase in Telegram progress updates: `⏳ Planning... (step 2/5)` → `⏳ Executing... (tool 3/7)` → `⏳ Reviewing...` +4. After execution, add a self-review step where the model validates its own output + +**Impact:** Reduces wasted iterations (models often thrash without a plan), improves user visibility into what's happening, and produces higher quality output. + +### R13: Acontext Observability Dashboard (Effort: Low) + +**Files to modify:** +- `src/routes/admin-ui.ts` — Add link/iframe to Acontext dashboard +- `wrangler.jsonc` — Add `ACONTEXT_API_KEY` secret + +**Change:** Connect the admin UI to Acontext's observability dashboard for session replay, success rate tracking, and real-time task monitoring. This is the lowest-risk Acontext integration since it's read-only. + +--- + +## 8. 
Implementation Priority Matrix + +| Priority | Recommendation | Effort | Impact | Dependencies | +|----------|---------------|--------|--------|-------------| +| **P0** | R3: Enable Gemini Flash tools | Trivial | Medium | None | +| **P0** | R7: Add missing models | Trivial | Low | None | +| **P1** | R1: Parallel tool execution | Low | High | None | +| **P1** | R2: Model capability metadata | Low | Medium | None | +| **P1** | R13: Acontext observability | Low | High | Acontext API key | +| **P2** | R4: Token/cost tracking | Medium | High | R2 | +| **P2** | R5: Configurable reasoning | Medium | Medium | R2 | +| **P2** | R8: Vision + tools combined | Medium | Medium | None | +| **P2** | R9 Phase 1: Acontext sessions (observability) | Medium | High | Acontext setup | +| **P2** | R10: Compound learning loop | Medium | High | R2 storage | +| **P2** | R12: Structured task phases (Plan→Work→Review) | Medium | High | None | +| **P3** | R6: MCP integration | High | Very High | Research phase needed | +| **P3** | R9 Phase 2: Acontext context engineering | Medium-High | Very High | R9 Phase 1 | +| **P3** | R9 Phase 3: Acontext Sandbox/Disk tools | Medium | High | R9 Phase 1 | +| **P3** | R11: Multi-agent review | Medium | High | R2 (model metadata) | + +### Quick Wins (Can ship today) +1. Add `supportsTools: true` to Gemini 3 Flash +2. Add GPT-OSS-120B and GLM 4.7 to model catalog +3. Switch tool execution from sequential to parallel + +### Medium-Term (1-2 sprints) +1. Enrich model metadata with parallel/reasoning/structured capabilities +2. Add cost tracking and `/costs` command +3. Add reasoning control for compatible models +4. Connect Acontext observability dashboard for task monitoring +5. Store task processor messages in Acontext Sessions for replay +6. Add compound learning loop — extract and reuse patterns from completed tasks +7. Add Plan→Work→Review phases to Durable Object task processor + +### Strategic (Requires design) +1. 
MCP integration via mcporter patterns +2. Replace `compressContext()` with Acontext token-budgeted retrieval +3. Acontext Sandbox for code execution + Disk for file management (replaces two roadmap items) +4. Multi-agent review for complex tasks (cross-model validation) +5. Multi-agent orchestration leveraging Claude Sonnet 4.5's capabilities +6. Dynamic tool selection based on model capabilities and task type + +--- + +## Appendix: Project Links + +### steipete Ecosystem +- OpenClaw: github.com/steipete (main project) +- mcporter: github.com/steipete/mcporter +- Peekaboo: github.com/steipete/Peekaboo +- CodexBar: github.com/steipete/CodexBar +- oracle: github.com/steipete/oracle +- VibeTunnel: vt.sh + +### Acontext Platform +- Repository: github.com/memodb-io/Acontext (2.8k stars, Apache 2.0) +- Website: acontext.io +- Documentation: docs.acontext.io +- TypeScript SDK: `npm install @acontext/acontext` +- Python SDK: `pip install acontext` + +### Compound Engineering Plugin +- Repository: github.com/EveryInc/compound-engineering-plugin (7.3k stars, MIT) +- Package: `@every-env/compound-plugin` +- Methodology: Plan → Work → Review → Compound → Repeat +- Key insight: 80% planning/review, 20% execution; each cycle informs the next diff --git a/claude-share/R2/README.md b/claude-share/R2/README.md new file mode 100644 index 000000000..32d970fbd --- /dev/null +++ b/claude-share/R2/README.md @@ -0,0 +1,35 @@ +# R2 Bucket Contents + +Copy each subfolder to the R2 bucket root. 
The bucket structure should be: + +``` +R2 bucket root/ +├── skills/ +│ └── storia-orchestrator/ +│ └── prompt.md ← Bot system prompt (loaded on every message) +│ +│ (Other directories are created automatically by the bot at runtime) +│ +├── telegram-users/{userId}/ ← Auto-created: preferences, conversation history +├── checkpoints/{userId}/ ← Auto-created: task checkpoints +├── learnings/{userId}/ ← Auto-created: task learnings + last-task summary +├── sync/ ← Auto-created: dynamic models from /syncmodels +``` + +## What to Upload Manually + +Only `skills/storia-orchestrator/prompt.md` needs to be uploaded manually. +Everything else is created automatically by the bot at runtime. + +## How to Upload + +Using wrangler: +```bash +wrangler r2 object put moltbot-bucket/skills/storia-orchestrator/prompt.md --file claude-share/R2/skills/storia-orchestrator/prompt.md +``` + +Or copy via the Cloudflare dashboard R2 UI. + +## Verifying + +In Telegram, run `/skill` to check if the skill is loaded, or `/skill reload` to force reload. diff --git a/claude-share/R2/skills/storia-orchestrator/prompt.md b/claude-share/R2/skills/storia-orchestrator/prompt.md new file mode 100644 index 000000000..38b73f308 --- /dev/null +++ b/claude-share/R2/skills/storia-orchestrator/prompt.md @@ -0,0 +1,55 @@ +# Storia Digital AI Hub — System Prompt + +You are **Moltworker**, the AI assistant for Storia Digital AI Hub. You are helpful, concise, and action-oriented. Your strength is combining multiple AI models with 14 real-time tools to get things done. + +## Core Behavior + +- Be concise. Lead with answers, not preamble. +- Use Telegram markdown: **bold**, _italic_, `code`, ```code blocks```. No HTML. +- When a user asks about real-time data (weather, prices, news, URLs, repos), **always use tools** — never answer from training data for live information. +- When a user sends a URL, fetch it immediately. When they mention a GitHub repo, read it. When they ask about weather or crypto, look it up. 
Act first, explain after. +- If multiple lookups are needed, call tools in parallel when possible. +- For long tasks with many tool calls, give brief progress updates between steps. + +## Your Tools (14 Available) + +Use these proactively — they are fast, free, and always available: + +**Web:** fetch_url (raw text), browse_url (JS rendering, screenshots), url_metadata (title/image/author) +**GitHub:** github_read_file, github_list_files, github_api (full REST), github_create_pr (branch+commit+PR) +**Live Data:** get_weather (forecast), get_crypto (prices/top/DEX), convert_currency (150+ currencies), fetch_news (HN/Reddit/arXiv), geolocate_ip +**Create:** generate_chart (bar/line/pie/radar), sandbox_exec (shell in container with git/node/npm) + +## Tool Strategy + +- **Always use tools** for weather, crypto, currency, news, URLs, GitHub — never guess. +- **Fetch URLs automatically** when the user shares one. Don't ask permission. +- **github_create_pr** for simple file changes (up to ~10 files). **sandbox_exec** for complex multi-step work (refactors, tests, builds). +- **Combine tools** in sequences: read repo → modify → create PR. Or fetch URL → extract data → generate chart. +- If a tool fails, explain clearly and suggest an alternative approach. + +## Model Recommendations + +When users ask which model to use, guide them based on task: +- **Coding:** /deep (best value), /qwencoderfree (free), /sonnet (premium) +- **Reasoning:** /deep (value), /flash (strong + 1M context), /opus (best) +- **Tools & Search:** /grok (best agentic), /deep, /gpt +- **Vision:** /gpt, /flash, /haiku, /sonnet (send a photo) +- **Free options:** /qwencoderfree, /pony, /gptoss, /devstral, /trinity +- **Budget:** /deep ($0.25/M), /grok ($0.20/M), /mini ($0.15/M) +- Use /models for the full catalog or /pick for a quick button menu. + +## Response Style + +- **Data lookups** (weather, crypto, currency): lead with the data, minimal commentary. +- **Code:** fenced blocks with language tags. 
Explain only what's non-obvious. +- **Analysis:** use bullet points or numbered lists. Structure > prose. +- **Errors:** be honest, explain what failed, suggest alternatives. +- Keep responses under 4000 characters when possible (Telegram limit). For long content, summarize and offer details on request. +- Don't repeat the user's question back to them. Don't say "Sure!" or "Great question!" — just answer. + +## Context Awareness + +- You have access to the last 10 messages of conversation history. +- You may receive hints about past tasks and learned patterns — use them for continuity. +- If a user references something from a previous task, check the context hints before asking them to repeat. diff --git a/claude-share/core/AI_CODE_STANDARDS.md b/claude-share/core/AI_CODE_STANDARDS.md new file mode 100644 index 000000000..f7be70695 --- /dev/null +++ b/claude-share/core/AI_CODE_STANDARDS.md @@ -0,0 +1,199 @@ +# AI Code Standards + +> Universal code quality rules for ALL AI assistants working on Moltworker. +> These are non-negotiable. Violations will be caught in review. + +**Last Updated:** 2026-02-06 + +--- + +## TypeScript Patterns + +### General +- **Strict mode** — `tsconfig.json` has strict enabled. Never use `any` unless absolutely necessary. +- **Explicit function signatures** — Always type parameters and return types for exported functions. +- **Prefer `const`** — Use `let` only when reassignment is needed. Never use `var`. +- **Use template literals** — For string concatenation, prefer `` `Hello ${name}` `` over `"Hello " + name`. 
+ +### Imports +- Use named imports: `import { getModel } from './models'` +- Group imports: stdlib → external packages → internal modules +- No circular imports + +### Naming +- **Files:** `kebab-case.ts` (e.g., `task-processor.ts`) +- **Classes:** `PascalCase` (e.g., `TaskProcessor`) +- **Functions/variables:** `camelCase` (e.g., `getModelId`) +- **Constants:** `UPPER_SNAKE_CASE` (e.g., `MAX_TOOL_RESULT_LENGTH`) +- **Interfaces:** `PascalCase`, no `I` prefix (e.g., `ToolContext`, not `IToolContext`) +- **Types:** `PascalCase` (e.g., `Provider`) + +### Async/Await +- Always use `async/await` over raw Promises +- Use `Promise.allSettled()` for parallel operations that should not fail-fast +- Use `Promise.all()` only when ALL promises must succeed +- Always handle errors with try/catch, never `.catch()` chaining + +--- + +## Error Handling + +### Rules +1. **Never swallow errors silently** — At minimum, `console.error` the error +2. **Typed error messages** — Include context: `Error executing ${toolName}: ${error.message}` +3. **User-facing errors** — Must be human-readable, no stack traces to end users +4. **Tool errors** — Return error as tool result, don't crash the conversation loop +5. **API errors** — Include HTTP status code and truncated response body (max 200 chars) + +### Pattern +```typescript +try { + const result = await riskyOperation(); + return result; +} catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + console.error(`[ComponentName] Operation failed: ${message}`); + // Return graceful fallback, don't re-throw unless caller handles it + return { error: message }; +} +``` + +### Timeouts +- Every external API call MUST have a timeout +- Default: 30s for simple fetches, 60s for tool execution, 300s for LLM API calls +- Use `Promise.race()` with a timeout promise: +```typescript +const result = await Promise.race([ + apiCall(), + new Promise((_, reject) => setTimeout(() => reject(new Error('Timeout')), 30000)) +]); +``` + +--- + +## Security + +### Absolute Rules +1. **No secrets in code** — API keys, tokens go in environment variables only +2. **No secrets in logs** — Use the redaction utility in `src/utils/logging.ts` +3. **Validate all external input** — URL parameters, request bodies, tool arguments +4. **No `eval()` or `new Function()`** — Ever +5. **Sanitize user input before passing to APIs** — Especially GitHub API endpoints + +### URL Handling +- Validate URLs before fetching: must start with `https://` (or `http://` for localhost) +- Never construct URLs from unvalidated user input without sanitization +- Use `URL` constructor to parse and validate + +### Authentication +- Cloudflare Access JWT validation for admin routes +- Gateway token for control UI +- GitHub token injected via `ToolContext`, never exposed to models + +--- + +## Testing + +### Requirements +- **Every new function** must have at least one test +- **Every bug fix** must have a regression test +- **Test files** colocated with source: `foo.ts` → `foo.test.ts` + +### Framework +- **Vitest** — `npm test` to run all, `npm run test:watch` for development +- **Coverage** — `@vitest/coverage-v8` + +### Patterns +```typescript +import { describe, it, expect, vi } from 'vitest'; + +describe('functionName', () => { + it('should handle the happy path', () => { + expect(functionName(validInput)).toBe(expectedOutput); + }); + + it('should handle edge case', () => { + 
expect(functionName(edgeInput)).toBe(edgeOutput); + }); + + it('should throw on invalid input', () => { + expect(() => functionName(invalidInput)).toThrow('Expected error'); + }); +}); +``` + +### Mocking +- Use `vi.fn()` for function mocks +- Use `vi.spyOn()` for method spying +- Use test utilities from `src/test-utils.ts` + +--- + +## File Organization + +### Directory Structure +``` +src/ +├── index.ts # Worker entrypoint — keep thin +├── types.ts # Shared TypeScript types +├── config.ts # Constants and configuration +├── auth/ # Authentication logic +├── gateway/ # Sandbox/container management +├── routes/ # HTTP route handlers +├── openrouter/ # OpenRouter API integration +│ ├── client.ts # API client +│ ├── models.ts # Model definitions +│ ├── tools.ts # Tool definitions and execution +│ ├── storage.ts # Conversation state +│ └── costs.ts # (new) Cost tracking +├── telegram/ # Telegram bot +├── discord/ # Discord integration +├── durable-objects/ # Durable Objects (TaskProcessor) +├── client/ # React admin UI +└── utils/ # Shared utilities +``` + +### Rules +- **One concern per file** — Don't mix routing with business logic +- **Max ~500 lines per file** — Split if growing beyond this +- **Keep route handlers thin** — Extract logic to service modules +- **New tools** go in `src/openrouter/tools.ts` (or a `tools/` subdirectory if it grows) +- **New models** go in `src/openrouter/models.ts` + +--- + +## Git Workflow + +### Branches +- `main` — Production, protected. PRs only. 
+- `claude/<topic>` — Claude work branches
+- `codex/<topic>` — Codex work branches
+- `feat/<name>` — Human feature branches
+- `fix/<name>` — Human bugfix branches
+
+### Commits
+- Atomic commits — one logical change per commit
+- Descriptive messages — see SYNC_CHECKLIST.md for format
+- Run `npm test && npm run typecheck` before committing
+
+### Pull Requests
+- Title: `<type>(<scope>): <description>` (max 70 chars)
Pages to Workers Static Assets Migration + +**Current**: Storia uses `@cloudflare/next-on-pages` for deployment. +**Best practice**: Cloudflare now recommends Workers with Static Assets over Pages for new projects. + +### Why it matters +- Workers Static Assets is the future investment area for Cloudflare +- Pages is in maintenance mode (not deprecated, but less new feature investment) +- Workers unlock Durable Objects, Queues, Cron Triggers, and other primitives directly + +### Why deferred +- `@cloudflare/next-on-pages` still works fine and is actively maintained +- Migration is non-trivial (deployment pipeline, build scripts, preview environments) +- The `opennext.js.org/cloudflare` project may provide a better migration path when mature +- No blocking user-facing issue + +### When to revisit +- When adding Durable Objects (Phase 4B real-time collaboration) +- When `@opennextjs/cloudflare` reaches stable v1.0 +- If Pages deprecation is announced + +### Action items +- [ ] Monitor `@opennextjs/cloudflare` for stability (currently experimental) +- [ ] Evaluate when implementing Durable Objects for real-time collaboration +- [ ] Budget 8-16h for migration when ready + +--- + +## 2. Durable Objects for WebSockets / Real-Time + +**Current**: Storia uses SSE (Server-Sent Events) for real-time, no WebSockets. +**Best practice**: CF recommends Durable Objects + Hibernation API for reliable WebSockets. 
+ +### Why it matters +- SSE is unidirectional (server → client only) +- Durable Objects provide persistent per-user state without database round-trips +- Hibernation API allows WebSocket connections to sleep without billing for idle time +- Enables real-time collaboration (shared cursors, presence indicators) + +### Why deferred +- SSE handles current use cases (alerts, streaming, notifications) +- WebSockets add complexity (connection management, reconnection, state sync) +- Durable Objects require Workers runtime (blocked by Pages → Workers migration) +- Phase 4B (real-time collaboration) is post-revenue + +### When to revisit +- When implementing Phase 4B: Real-time Collaboration +- When implementing multiplayer gecko interactions +- If SSE connection limits become a bottleneck + +### Architecture sketch +``` +User A ──WSS──► Durable Object (room:abc) ◄──WSS── User B + │ + ├── Shared conversation state + ├── Presence (online/typing) + └── Hibernation when idle +``` + +### Action items +- [ ] Prototype when Phase 4B begins +- [ ] Evaluate Hibernation API for cost optimization +- [ ] Design state sync protocol (CRDT vs OT) + +--- + +## 3. Observability Configuration + +**Current**: Storia has structured logging via `createApiContext()` with request IDs. +**Best practice**: CF recommends enabling observability in wrangler config with `head_sampling_rate`. 
+ +### Why it matters +- CF's built-in observability integrates with their dashboard +- `head_sampling_rate` controls log volume and billing +- Structured JSON logging via `console.log` is automatically searchable +- Can replace custom logging infrastructure + +### Why deferred +- Custom logging (`createApiContext`) already works and provides structured output +- Adding CF observability on top would create duplicate logging +- PostHog analytics (Tier 1) is the planned observability platform + +### When to revisit +- After PostHog instrumentation (Tier 1) — evaluate whether CF observability adds value +- If debugging production issues becomes difficult +- When moving off Pages to Workers (observability config differs) + +### Configuration sketch +```jsonc +// Add to wrangler.jsonc when ready +{ + "observability": { + "enabled": true, + "head_sampling_rate": 0.1 // 10% sampling for high-traffic routes + } +} +``` + +### Action items +- [ ] Evaluate after PostHog instrumentation +- [ ] Compare CF observability vs PostHog for backend monitoring +- [ ] Test `head_sampling_rate` impact on debugging capability + +--- + +## 4. `@cloudflare/vitest-pool-workers` for Integration Tests + +**Current**: Tests run in Node.js via Vitest. 214+ tests pass. +**Best practice**: CF provides `@cloudflare/vitest-pool-workers` to run tests in the actual Workers runtime. 
+ +### Why it matters +- Tests in Node.js may pass even when code fails in Workers runtime +- `nodejs_compat` flag is auto-injected in Vitest, masking missing compat flags +- D1, R2, KV bindings can be tested against real (local) implementations +- Catches edge-runtime-specific issues (missing APIs, compat gaps) + +### Why deferred +- 214+ existing tests pass and catch real bugs +- Migration is non-trivial (test harness, fixtures, mocking patterns differ) +- Unit tests for business logic don't benefit from Workers runtime +- Only integration tests for D1/R2/encryption would benefit + +### When to revisit +- When adding new integration tests for D1-heavy features +- When debugging "works in tests but not in production" issues +- When migrating to Workers from Pages + +### Action items +- [ ] Evaluate for D1/R2 integration test suite only (not all 214 tests) +- [ ] Keep existing Vitest unit tests in Node.js +- [ ] Add `@cloudflare/vitest-pool-workers` for a new `test:integration` script +- [ ] Budget: 4-6h for initial setup + 1-2h per test suite migration + +--- + +## 5. Subrequests Limit Increase (10K+) + +**Current**: Paid Workers plans now support up to 10,000 subrequests per invocation (up from 1,000). +**Status**: Already available, no code changes needed. + +### Impact on Storia +- **LLM Proxy**: Fan-out to multiple providers in all-AI/orchestration modes — no longer a concern +- **Situation Monitor**: Batch fetches across 10+ external APIs per briefing — well within limits +- **Gecko Briefing**: Fetches weather + quotes + holidays + news — safe + +### Action items +- [x] No code changes needed — just awareness that the limit is no longer a concern + +--- + +## 6. KV for Response Caching (Alternative to D1) + +**Current**: LLM response cache uses D1 (`llm_response_cache` table). +**Alternative**: Cloudflare KV is purpose-built for read-heavy, eventually-consistent caching. 
+ +### Trade-offs + +| Aspect | D1 (current) | KV | +|--------|-------------|-----| +| Read latency | ~5-10ms (SQLite at edge) | ~1-3ms (global edge cache) | +| Write latency | ~5-10ms | ~60s propagation (eventually consistent) | +| Query flexibility | Full SQL (WHERE, JOIN, aggregates) | Key-value only | +| TTL | Manual (expiresAt column + cleanup) | Built-in TTL parameter | +| Cost | Included in D1 billing | Separate KV billing | +| Consistency | Strong (single region) | Eventually consistent | + +### Why deferred +- D1 cache works fine for current scale +- Adding KV would mean managing two storage systems +- Cache hit rate matters more than latency delta +- Eventually-consistent writes could cause stale cache issues for budget enforcement + +### When to revisit +- If cache read latency becomes a measurable bottleneck (>50ms p99) +- When scaling beyond 100 concurrent users +- If D1 row limits or storage costs become a concern + +### Action items +- [ ] Benchmark D1 cache latency at scale +- [ ] Evaluate KV for read-only caches only (not budget/usage tracking) + +--- + +## Summary — When to Pick Up Each Item + +| Item | Trigger | Effort | +|------|---------|--------| +| Pages → Workers migration | Durable Objects needed OR opennextjs/cloudflare v1.0 | 8-16h | +| Durable Objects | Phase 4B real-time collaboration | 20-30h | +| CF Observability | After PostHog instrumentation | 2-4h | +| Vitest Workers pool | Integration test needs | 4-6h | +| KV cache layer | D1 latency >50ms p99 | 6-8h | diff --git a/claude-share/core/CODE_MODE_MCP_STORIA_SPEC.md b/claude-share/core/CODE_MODE_MCP_STORIA_SPEC.md new file mode 100644 index 000000000..9c37f4627 --- /dev/null +++ b/claude-share/core/CODE_MODE_MCP_STORIA_SPEC.md @@ -0,0 +1,847 @@ +# Code Mode MCP — Storia Digital AI Hub Integration +**Document type**: Full Specification + Sprint Roadmap + Implementation Prompts +**Date**: 2026-02-20 +**Status**: Ready for Claude Code / Codex +**Priority**: Tier 1.5 — Unblocks Agent 
Mode (§10.6), Dream Machine Build stage (§1), Token Engine +**Target repos**: `storia-agent` (primary) + `ai-hub` (transport layer only) + +--- + +## Table of Contents + +1. [What Happened Today](#1-what-happened-today) +2. [Why This Matters for Storia Right Now](#2-why-this-matters-for-storia-right-now) +3. [Architecture Mapping — Where It Fits](#3-architecture-mapping--where-it-fits) +4. [Scope Definition](#4-scope-definition) +5. [Sprint Roadmap](#5-sprint-roadmap) +6. [Technical Specification](#6-technical-specification) +7. [Security & BYOK Alignment](#7-security--byok-alignment) +8. [Claude Code Prompt (Architecture & Planning)](#8-claude-code-prompt-architecture--planning) +9. [Codex Prompt (Step-by-Step Implementation)](#9-codex-prompt-step-by-step-implementation) +10. [Verification & Test Prompt](#10-verification--test-prompt) +11. [Open Questions Before Starting](#11-open-questions-before-starting) +12. [What NOT to Do](#12-what-not-to-do) + +--- + +## 1. What Happened Today + +Cloudflare published **Code Mode MCP** on 2026-02-20. This is not incremental — it changes the economics of AI agents working with infrastructure. + +**The core problem it solves**: The Cloudflare API has 2,500+ endpoints. Giving an AI agent access to even a fraction of them via traditional MCP tool definitions would consume millions of tokens — more than most models' full context windows. + +**The solution**: Instead of exposing thousands of tools, Code Mode gives the agent exactly two: + +``` +search(code: string) → executes JS against the full OpenAPI spec, returns only what's needed +execute(code: string) → runs authenticated API calls inside a V8 sandbox Worker +``` + +**Result**: The entire Cloudflare API surface in ~1,000 tokens. 99.9% reduction. 
+ +**The sandbox** (Dynamic Worker Loader) runs code in a V8 isolate with: +- No filesystem access +- No env var leakage +- External fetches disabled by default +- Outbound calls explicitly controlled + +**Official resources**: +- Blog: https://blog.cloudflare.com/code-mode-mcp/ +- Public MCP server: `https://mcp.cloudflare.com/mcp` +- Code Mode SDK: `github.com/cloudflare/agents` → `packages/codemode` +- Auth: OAuth 2.1 with downscoped tokens per user action + +Cloudflare explicitly named **Moltworker** in the comparison section. They're watching. + +--- + +## 2. Why This Matters for Storia Right Now + +### 2.1 The Gap This Closes + +From Wave 4 §10.6, Agent Mode had a 13% capability gap vs native IDEs — specifically around real infrastructure operations. Storia's agent could run code, but couldn't provision the infrastructure that code needs to run in. Code Mode MCP closes exactly that gap. + +### 2.2 Impact Matrix (Storia-Specific) + +| Storia Feature | Current State | With Code Mode MCP | Impact | +|---|---|---|---| +| **storia-agent / Agent Mode (§10.6)** | Runs code in sandbox, no infra access | Can provision D1, R2, Workers, DNS, Pages from within the same agent loop | ★★★★★ | +| **Dream Machine — Build Stage (§1.4)** | Generates code + PRs, cannot deploy | Can create Workers, configure Pages, set up R2 buckets autonomously overnight | ★★★★★ | +| **Dream Machine — Ship Level (§1.4)** | Locked behind manual deploy | Shipper-tier autonomy becomes real: overnight build + deploy cycle | ★★★★★ | +| **Token Optimization Engine** | ClawRouter routes to cheap models that can't handle large APIs | Groq/DeepSeek can now operate full Cloudflare API in 1k tokens | ★★★★☆ | +| **Situation Monitor Build (§7)** | Planned ~80h manual port | Agent Mode could bootstrap infra (Workers, KV, Cron) autonomously | ★★★☆☆ | +| **Telegram Bot — /deploy commands (§9.1)** | Not yet implemented | `/deploy mysite` can now provision + deploy end-to-end | ★★★☆☆ | + +### 2.3 Strategic 
Position + +Grok's analysis called this "Tier 1.5." That's correct and here's the precise reasoning: + +- **Not Tier 1** (blocking release): storia-agent and Cockpit UI ship without it. Phase 0 security, auth, and BYOK vault are the actual Tier 1 blockers. +- **Tier 1.5**: It's the single highest-leverage addition to storia-agent that doesn't change core architecture. It rides on the existing skill system, existing BYOK key flow, and existing CF Worker sandbox — with zero structural changes to ai-hub. +- **Becomes Tier 1** the moment Dream Machine Build stage begins, because Build can't "Ship" without infra provisioning. + +--- + +## 3. Architecture Mapping — Where It Fits + +### 3.1 Existing Architecture (from Wave 4 §10.6) + +``` +storia.digital (ai-hub) +├── Agent Panel UI (Monaco, Diff Viewer, Terminal Output) +└── WebSocket/SSE stream + │ + │ HTTPS + Auth token (user's Anthropic key via BYOK) + ▼ +storia-agent (CF Worker + Sandbox) ← CODE MODE LIVES HERE +├── HTTP/WS API layer (new, §10.6) +├── Task Engine (existing moltworker agent loop) +├── Skills System (existing) +└── CF Sandbox (git, npm, file editing, test running) +``` + +### 3.2 Where Code Mode MCP Plugs In + +Code Mode MCP is a **new skill** inside storia-agent's existing Skills System. It does NOT require changes to: +- ai-hub frontend +- Auth.js / BYOK vault flow +- ClawRouter routing logic +- Agent loop core + +The only additions are: +1. A new skill file: `src/skills/cloudflare-code-mode.ts` (in storia-agent) +2. A new MCP client wrapper: `src/mcp/cloudflare-client.ts` (in storia-agent) +3. Skill registration in `src/skills/index.ts` + +### 3.3 Token Flow with BYOK + +``` +1. User triggers action requiring Cloudflare API +2. storia-agent skill receives task + user's CF API token + (token comes from byok.cloud vault, decrypted client-side, forwarded in header) +3. Skill calls Code Mode MCP server (https://mcp.cloudflare.com/mcp) + with user's downscoped OAuth token +4. 
search() + execute() run inside CF's V8 sandbox +5. Results stream back to storia-agent +6. storia-agent streams to Storia IDE via SSE +7. User sees real-time terminal output + diffs + +Zero markup. User's own CF account. Their infra. +``` + +### 3.4 The `search()` + `execute()` Pattern Inside storia-agent + +```typescript +// Story agent task: "Create an R2 bucket for the user's project files" + +// Step 1: Search for the right endpoint +const searchResult = await mcpClient.search(` + async () => { + const results = []; + for (const [path, methods] of Object.entries(spec.paths)) { + if (path.includes('/r2/buckets')) { + for (const [method, op] of Object.entries(methods)) { + results.push({ method: method.toUpperCase(), path, summary: op.summary }); + } + } + } + return results; + } +`); + +// Step 2: Execute the creation +const result = await mcpClient.execute(` + async () => { + const response = await cloudflare.request({ + method: "POST", + path: "/accounts/${accountId}/r2/buckets", + body: { name: "storia-user-${userId}-files" } + }); + return response; + } +`); +``` + +--- + +## 4. Scope Definition + +### 4.1 MVP (Sprint A — 8-12h) + +**Goal**: storia-agent can call the full Cloudflare API via Code Mode MCP using the user's own CF credentials. + +Deliverables: +- `cloudflare-code-mode` skill registered and functional +- MCP client with OAuth 2.1 token flow +- Audit logging of every `execute()` call (who, when, what, account) +- Human approval gate for destructive operations (delete, create DNS records) +- Telegram command: `/cloudflare ` +- Test suite: whoami, list R2 buckets, list Workers, list Pages projects + +**Out of scope for MVP**: +- Storia IDE frontend changes +- Dream Machine Build integration +- Custom Code Mode MCP for Storia's own APIs + +### 4.2 Sprint B — IDE Integration (16-24h) + +**Goal**: Agent Mode in the Storia IDE can use Code Mode MCP during coding tasks. 
+ +Deliverables: +- SSE streaming of Code Mode results to IDE terminal panel +- "Provision this" shortcut: agent sees code needing a D1 binding → provisions it +- ClawRouter badge shows "CF Code Mode" when skill is active +- Rate limits per user (max 10 execute() calls per session) + +### 4.3 Sprint C — Dream Machine Build Stage (20-30h) + +**Goal**: Dream Machine's Build + Ship stages use Code Mode MCP to go from code to deployed product. + +Deliverables: +- Overnight build loop can provision Workers + Pages + R2 + D1 bindings +- Morning brief includes infra provisioning log +- Rollback: every overnight provision creates a tagged Cloudflare state snapshot +- Budget cap: max CF API calls per overnight cycle +- Vex reviews all provisioning before Ship-tier executes + +--- + +## 5. Sprint Roadmap + +``` +WEEK 1 (2026-02-20 → 2026-02-28) +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +Day 1-2 │ Sprint A: MCP client + skill stub + │ Branch: claude/code-mode-mcp-mvp + │ Files: src/mcp/cloudflare-client.ts + │ src/skills/cloudflare-code-mode.ts + │ +Day 3 │ Sprint A: BYOK token flow + audit log + │ Files: src/lib/audit.ts (add CF_CODE_MODE event type) + │ src/skills/cloudflare-code-mode.ts (auth integration) + │ +Day 4 │ Sprint A: Telegram command + tests + │ Files: src/handlers/telegram.ts (/cloudflare command) + │ tests/cloudflare-code-mode.test.ts + │ +Day 5 │ Sprint A: Review, security scan, merge to main + │ PR: claude/code-mode-mcp-mvp → main + │ Deploy: wrangler deploy --env production + +WEEK 2 (2026-03-01 → 2026-03-07) +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +Day 1-2 │ Sprint B: IDE SSE streaming integration + │ Branch: claude/code-mode-ide-integration + │ +Day 3-4 │ Sprint B: ClawRouter badge, rate limits + │ +Day 5 │ Sprint B: Review + merge + +WEEK 3-4 (2026-03-08 → 2026-03-21) +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + │ Sprint C: Dream Machine Build stage + │ Branch: claude/dream-machine-build-infra + │ (Coordinate with Dream Machine 
spec from §1) +``` + +### 5.1 Effort Estimates + +| Sprint | Effort | Risk | Dependency | +|--------|--------|------|------------| +| A — MVP Skill | 8-12h | Low — additive, no structural changes | storia-agent deployed + Cloudflare OAuth app created | +| B — IDE Integration | 16-24h | Medium — SSE streaming complexity | Sprint A complete, §10.6 transport layer ready | +| C — Dream Machine | 20-30h | High — overnight autonomy safety | Sprint B complete, Dream Machine spec finalized | + +--- + +## 6. Technical Specification + +### 6.1 Dependencies + +In `storia-agent/package.json`: +```json +{ + "dependencies": { + "@cloudflare/agents": "latest" + } +} +``` + +> **Note**: Verify exact package name and whether `codemode` is exported from `@cloudflare/agents` or a separate package at `github.com/cloudflare/agents/packages/codemode` before installing. Do NOT add `@cloudflare/codemode` as a separate entry — this package does not exist at time of writing. Inspect the actual repo structure first. 
+
+### 6.2 MCP Client (`src/mcp/cloudflare-client.ts`)
+
+```typescript
+// storia-agent/src/mcp/cloudflare-client.ts
+
+export interface CodeModeResult {
+  success: boolean;
+  data: unknown;
+  tokensUsed?: number;
+  error?: string;
+}
+
+export class CloudflareMCPClient {
+  private baseUrl = "https://mcp.cloudflare.com/mcp";
+
+  constructor(
+    private readonly cfOAuthToken: string, // user's downscoped CF OAuth token
+    private readonly accountId: string // user's CF account ID
+  ) {}
+
+  async search(code: string): Promise<CodeModeResult> {
+    return this.callTool("search", { code });
+  }
+
+  async execute(code: string, requiresApproval = false): Promise<CodeModeResult> {
+    // Destructive operations get flagged before execution
+    if (requiresApproval) {
+      // Emit approval_required event via SSE before proceeding
+      throw new ApprovalRequiredError(code);
+    }
+    return this.callTool("execute", { code });
+  }
+
+  private async callTool(
+    tool: "search" | "execute",
+    input: { code: string }
+  ): Promise<CodeModeResult> {
+    const res = await fetch(`${this.baseUrl}/tools/${tool}`, {
+      method: "POST",
+      headers: {
+        "Authorization": `Bearer ${this.cfOAuthToken}`,
+        "Content-Type": "application/json",
+        "CF-Account-ID": this.accountId,
+      },
+      body: JSON.stringify(input),
+    });
+
+    if (!res.ok) {
+      const err = await res.text();
+      return { success: false, data: null, error: err };
+    }
+
+    const data = await res.json();
+    return { success: true, data };
+  }
+}
+
+export class ApprovalRequiredError extends Error {
+  constructor(public readonly code: string) {
+    super("Human approval required before executing this operation.");
+  }
+}
+```
+
+### 6.3 Skill Definition (`src/skills/cloudflare-code-mode.ts`)
+
+```typescript
+// storia-agent/src/skills/cloudflare-code-mode.ts
+
+import { CloudflareMCPClient, ApprovalRequiredError } from "../mcp/cloudflare-client";
+import { auditLog } from "../lib/audit";
+import { isDestructiveOperation } from "../lib/safety";
+
+export interface CloudflareCodeModeInput {
+  task: string; // 
Natural language: "list all R2 buckets" + mode: "search" | "execute" | "auto"; // auto = search first, then execute + requireApproval?: boolean; +} + +export interface CloudflareCodeModeContext { + userId: string; + cfOAuthToken: string; // from byok.cloud, decrypted client-side + cfAccountId: string; // from user's stored CF account config + sessionId: string; +} + +export const cloudflareCodeModeSkill = { + name: "cloudflare_code_mode", + description: ` + Access the ENTIRE Cloudflare API using Code Mode. + Covers: R2, D1, Workers, Pages, DNS, Zero Trust, WAF, Queues, KV, Durable Objects. + Uses search() to discover endpoints and execute() to run safe sandboxed API calls. + Entire API surface costs ~1,000 tokens. Use for infrastructure tasks only. + Always search before executing. Flag destructive operations for approval. + `.trim(), + parameters: { + type: "object" as const, + properties: { + task: { type: "string", description: "Natural language infrastructure task" }, + mode: { + type: "string", + enum: ["search", "execute", "auto"], + description: "search=discovery only, execute=run code, auto=search then execute", + default: "auto" + }, + requireApproval: { + type: "boolean", + description: "Request human approval before executing (use for create/delete/update)", + default: false + } + }, + required: ["task"] + }, + + async execute( + input: CloudflareCodeModeInput, + ctx: CloudflareCodeModeContext + ) { + const client = new CloudflareMCPClient(ctx.cfOAuthToken, ctx.cfAccountId); + const startedAt = Date.now(); + + try { + // 1. Always search first to find the right endpoints + const searchCode = buildSearchCode(input.task); + const searchResult = await client.search(searchCode); + + if (input.mode === "search") { + await auditLog({ + event: "CF_CODE_MODE_SEARCH", + userId: ctx.userId, + sessionId: ctx.sessionId, + task: input.task, + durationMs: Date.now() - startedAt, + }); + return { type: "search_result", data: searchResult.data }; + } + + // 2. 
Generate execution code from search results + task + const execCode = buildExecuteCode(input.task, searchResult.data); + const destructive = isDestructiveOperation(execCode); + + if (destructive || input.requireApproval) { + // Emit approval gate event — the agent loop handles this + throw new ApprovalRequiredError(execCode); + } + + // 3. Execute + const execResult = await client.execute(execCode); + + await auditLog({ + event: "CF_CODE_MODE_EXECUTE", + userId: ctx.userId, + sessionId: ctx.sessionId, + task: input.task, + destructive: false, + durationMs: Date.now() - startedAt, + }); + + return { type: "execute_result", data: execResult.data }; + + } catch (err) { + if (err instanceof ApprovalRequiredError) { + return { + type: "approval_required", + pendingCode: err.code, + message: "This operation requires your approval. Review and confirm.", + }; + } + throw err; + } + } +}; + +// These two functions need LLM generation or template logic +// — implement as separate Claude calls inside the skill for now +function buildSearchCode(task: string): string { + // Generate a JS arrow function that filters spec.paths based on the task + // Example: task "list R2 buckets" → searches for paths containing /r2/buckets + // This is where a second LLM call (cheap model) generates the search code + throw new Error("buildSearchCode: not yet implemented — see Sprint A Day 1"); +} + +function buildExecuteCode(task: string, searchData: unknown): string { + // Generate the execute code from the discovered endpoints + task description + throw new Error("buildExecuteCode: not yet implemented — see Sprint A Day 2"); +} +``` + +### 6.4 Safety Utilities (`src/lib/safety.ts`) + +```typescript +// Patterns that require human approval gate before CF execute() +const DESTRUCTIVE_PATTERNS = [ + /\.delete\(/i, + /method.*"DELETE"/i, + /createWorker|deleteWorker/i, + /createBucket|deleteBucket/i, + /PUT.*\/dns_records/i, + /DELETE.*\/zones/i, + /purge_everything/i, +]; + +export function 
isDestructiveOperation(code: string): boolean { + return DESTRUCTIVE_PATTERNS.some(p => p.test(code)); +} +``` + +### 6.5 Skill Registration + +```typescript +// storia-agent/src/skills/index.ts — ADD THIS LINE +import { cloudflareCodeModeSkill } from "./cloudflare-code-mode"; + +export const allSkills = [ + // ... existing skills + cloudflareCodeModeSkill, // ← ADD +]; +``` + +### 6.6 Telegram Command Handler + +```typescript +// In storia-agent/src/handlers/telegram.ts +if (text.startsWith("/cloudflare ")) { + const task = text.replace("/cloudflare ", "").trim(); + + await bot.sendMessage(chatId, `🦎 Vex is checking Cloudflare... 🔍`); + + const result = await runSkill("cloudflare_code_mode", { + task, + mode: "auto", + requireApproval: false, + }, { + userId: telegramUser.storiaUserId, + cfOAuthToken: await getCFToken(telegramUser.storiaUserId), + cfAccountId: await getCFAccountId(telegramUser.storiaUserId), + sessionId: generateSessionId(), + }); + + if (result.type === "approval_required") { + await bot.sendMessage(chatId, `⚠️ Vex says: This requires approval. Here's what I would do:\n\`\`\`\n${result.pendingCode}\n\`\`\`\n\nReply /cf_approve to proceed or /cf_cancel to abort.`); + } else { + await bot.sendMessage(chatId, `✅ Done!\n\`\`\`json\n${JSON.stringify(result.data, null, 2)}\n\`\`\``); + } +} +``` + +### 6.7 CF OAuth App Setup (One-Time, Manual) + +Before Sprint A begins: + +1. Go to Cloudflare Dashboard → My Profile → API Tokens +2. Create OAuth App: "Storia Agent" +3. Scopes (minimum for MVP): + - `account:read` + - `r2:read`, `r2:write` + - `workers:read` + - `pages:read` + - `d1:read` +4. Store Client ID + Secret in storia-agent env vars: + - `CF_MCP_CLIENT_ID` + - `CF_MCP_CLIENT_SECRET` +5. OAuth callback URL: `https://storia.digital/api/cf/oauth/callback` + +The per-user token is then stored encrypted in byok.cloud (same vault, new key type: `cloudflare_oauth_token`). + +--- + +## 7. 
Security & BYOK Alignment + +### 7.1 What This Changes in the Security Model + +| Area | Before | After | +|------|--------|-------| +| API keys stored | AI provider keys (Anthropic, OpenAI, etc.) | + Cloudflare OAuth token (new key type in vault) | +| SSRF risk | LLM_ALLOWED_HOSTS env var protects against LLM-triggered outbound | Code Mode MCP server does its own sandbox isolation — NOT a new SSRF vector in storia-agent | +| Destructive ops | N/A | New: `isDestructiveOperation()` guard + approval gate | +| Audit log events | Existing events | New: `CF_CODE_MODE_SEARCH`, `CF_CODE_MODE_EXECUTE` | + +### 7.2 What the CF Sandbox Already Handles + +The Dynamic Worker Loader that Code Mode runs inside: +- No filesystem access (can't read storia-agent secrets) +- No env var access (CF account credentials not exposed to user-generated code) +- External fetches disabled except `cloudflare.request()` which uses the user's OAuth token +- OAuth 2.1 downscoping: user only grants minimum permissions at connection time + +This means the user-provided "task" cannot escalate beyond the OAuth scopes they granted. + +### 7.3 Rate Limits (Add to Storia's Rate Limiting Layer) + +```typescript +const CF_CODE_MODE_LIMITS = { + search_per_session: 20, // search() calls per agent session + execute_per_session: 10, // execute() calls per agent session + execute_per_day: 50, // per user per 24h + max_code_length: 2000, // characters in generated JS +}; +``` + +--- + +## 8. Claude Code Prompt (Architecture & Planning) + +> **Instructions**: Paste this into Claude Code at the start of the integration session. This is for architecture review and planning, not yet for code generation. + +--- + +``` +You are working on PetrAnto/storia-agent, a private Cloudflare Worker that is a fork of +Cloudflare's moltworker, enhanced with gecko personalities (Zori, Kai, Vex, Razz), the +Storia BYOK key system, and an agent loop for autonomous task execution. 
+ +We are integrating Cloudflare Code Mode MCP (released 2026-02-20). This gives the agent +access to the entire Cloudflare API (2,500+ endpoints) using only two tools (search + execute) +consuming ~1,000 tokens total. Reference: https://blog.cloudflare.com/code-mode-mcp/ + +The Code Mode SDK is open-sourced at: github.com/cloudflare/agents/tree/main/packages/codemode + +TASK 1 — CODEBASE AUDIT +Read these files and summarize their current state: +- src/skills/index.ts +- src/skills/ (list all skill files and their exports) +- src/lib/audit.ts or similar (how are events logged?) +- src/handlers/telegram.ts (how are commands parsed and skills invoked?) +- wrangler.toml or wrangler.jsonc (what env vars, bindings, and routes exist?) + +TASK 2 — PACKAGE VERIFICATION +Check if @cloudflare/agents is already in package.json. If not, identify the correct +package name for Code Mode by inspecting the repo at: +github.com/cloudflare/agents/packages/codemode/package.json +Report the exact package name and version before any installation. + +TASK 3 — INTEGRATION PLAN +Based on the codebase audit, produce an integration plan with these sections: +a) New files to create (path + purpose) +b) Existing files to modify (path + exact change required) +c) Env vars to add to wrangler.toml +d) Any structural conflicts with existing code +e) Estimated hours per file + +Do not write any code yet. Only plan. + +TASK 4 — BYOK ALIGNMENT CHECK +The user's Cloudflare OAuth token will be stored in byok.cloud and decrypted client-side +before being passed to storia-agent as a request header. Verify: +a) Where does the existing BYOK token flow in the codebase (how does the agent receive + and use the Anthropic key currently)? +b) Will the same pattern work for a CF OAuth token? +c) Are there any changes needed to the BYOK key type schema? 
+
+RULES:
+- Branch name must start with: claude/code-mode-mcp-mvp
+- Do not modify core agent loop files (agent.ts or equivalent)
+- Do not touch auth middleware
+- All new files go in src/skills/ or src/mcp/
+- When resolving test-results-summary.json conflicts: always --theirs
+```
+
+---
+
+## 9. Codex Prompt (Step-by-Step Implementation)
+
+> **Instructions**: Paste this into Codex (or Claude Code in implementation mode) after the architecture plan from §8 is approved.
+
+---
+
+```
+Implement Cloudflare Code Mode MCP integration for PetrAnto/storia-agent.
+
+CONTEXT:
+- storia-agent is a private Cloudflare Worker forked from moltworker
+- The agent has a Skills System (src/skills/index.ts + skill files)
+- BYOK tokens are received as request headers and used to authenticate AI provider calls
+- Audit logging exists at src/lib/audit.ts (or equivalent)
+- Branch: claude/code-mode-mcp-mvp
+
+IMPLEMENT IN THIS EXACT ORDER:
+
+STEP 1: Verify and install the Code Mode package
+- Check github.com/cloudflare/agents for the codemode package's exact npm name
+- Add ONLY the verified package to package.json
+- Run: npm install
+- Confirm the package installs without errors
+
+STEP 2: Create src/mcp/cloudflare-client.ts
+Implement:
+- CloudflareMCPClient class with search(code) and execute(code) methods
+- Both methods POST to https://mcp.cloudflare.com/mcp/tools/{search|execute}
+- Auth header: Authorization: Bearer {cfOAuthToken}
+- CF-Account-ID header: {cfAccountId}
+- Return type: { success: boolean, data: unknown, error?: string }
+- ApprovalRequiredError class (exported)
+- Add JSDoc comments to all public methods
+
+STEP 3: Create src/lib/safety.ts
+Implement:
+- DESTRUCTIVE_PATTERNS array (DELETE, purge, create DNS, delete bucket, delete worker)
+- isDestructiveOperation(code: string): boolean
+- Export both
+
+STEP 4: Create src/skills/cloudflare-code-mode.ts
+Implement the cloudflareCodeModeSkill object with:
+- name: "cloudflare_code_mode"
+- description: (see full spec document)
+- 
parameters: zod schema or JSON schema per existing skill pattern +- execute(input, ctx) method that: + a) Creates CloudflareMCPClient with ctx.cfOAuthToken + ctx.cfAccountId + b) Always calls search() first + c) Returns early if mode === "search" + d) For execute mode: checks isDestructiveOperation(), throws ApprovalRequiredError if true + e) Calls client.execute() + f) Calls auditLog() with CF_CODE_MODE_SEARCH or CF_CODE_MODE_EXECUTE event + +For buildSearchCode() and buildExecuteCode(): +- Make a SECOND LLM call using the existing agent's LLM client +- Use a short system prompt: "Generate a JavaScript arrow function that searches the Cloudflare + OpenAPI spec for endpoints relevant to this task. Return only the async arrow function + code, no explanation." +- Use a cheap model (match the existing free/cheap model selection pattern in the codebase) + +STEP 5: Register the skill in src/skills/index.ts +- Import cloudflareCodeModeSkill +- Add to allSkills array +- Ensure TypeScript compiles without errors + +STEP 6: Add Telegram /cloudflare command to src/handlers/telegram.ts +Pattern to match existing command handlers: +- Command: /cloudflare +- Send "🦎 Vex is scanning Cloudflare..." message before execution +- Call runSkill("cloudflare_code_mode", ...) 
with userId, cfOAuthToken, cfAccountId +- Handle approval_required response type (send pending code for review) +- Handle errors (send friendly gecko error message) + +STEP 7: Update wrangler.toml or wrangler.jsonc +Add env vars: +- CF_MCP_CLIENT_ID +- CF_MCP_CLIENT_SECRET +- CF_MCP_BASE_URL = "https://mcp.cloudflare.com/mcp" + +STEP 8: Write tests in tests/cloudflare-code-mode.test.ts +Test cases: +a) search() returns results for "list R2 buckets" task +b) execute() with non-destructive code completes successfully +c) execute() with DELETE pattern throws ApprovalRequiredError +d) audit log is called after every search and execute +e) Missing cfOAuthToken throws appropriate error + +RULES: +- Follow existing skill file pattern exactly (look at 2 existing skills before starting) +- No any types — use proper TypeScript +- Zod validation on all inputs matching existing pattern +- Never log cfOAuthToken or cfAccountId to console +- When resolving test-results-summary.json conflicts: git checkout --theirs test-results-summary.json +- Run npx tsc --noEmit after every file to verify no type errors +- Do not commit until all tests pass +``` + +--- + +## 10. Verification & Test Prompt + +> **Instructions**: Run this after Sprint A is deployed to storia-agent production. + +--- + +``` +Verify the Cloudflare Code Mode MCP integration in storia-agent production. + +Run these tests in order. Stop and report if any fail. + +TEST 1 — Health check +Send to Telegram @petrantobot: + /cloudflare list all R2 buckets +Expected: Bot replies with a list of R2 buckets from the user's CF account. +Expected time: < 10 seconds. + +TEST 2 — Search-only mode +Programmatically call the skill with mode: "search": + task: "create a D1 database" + mode: "search" +Expected: Returns endpoint list including POST /accounts/{id}/d1/database, no execution. 
+ +TEST 3 — Destructive operation gate +Programmatically call with a delete task: + task: "delete the bucket named test-bucket" + mode: "execute" + requireApproval: false +Expected: Returns { type: "approval_required", pendingCode: "..." } +FAIL if: Execution proceeds without approval. + +TEST 4 — Audit log verification +After TEST 1 and TEST 2, query D1: + SELECT * FROM audit_log WHERE event LIKE 'CF_CODE_MODE_%' ORDER BY created_at DESC LIMIT 5; +Expected: 2 rows — one CF_CODE_MODE_SEARCH, one CF_CODE_MODE_EXECUTE. +Verify: user_id populated, duration_ms > 0, no token data in any column. + +TEST 5 — Token budget check +Ask the agent: + /cloudflare what workers do I have deployed? +Check ClawRouter badge in logs. +Expected: Token count for the CF Code Mode MCP tool definition ≤ 1,500 tokens. +FAIL if: > 5,000 tokens consumed by the tool definition alone. + +TEST 6 — Error handling +Temporarily set cfOAuthToken to an invalid value. +Expected: Skill returns { success: false, error: "Authentication failed" } +FAIL if: Exception bubbles up uncaught. + +TEST 7 — Persona check +The /cloudflare Telegram response should include Vex's personality. +Expected: Message contains 📊 or Vex-style framing. +FAIL if: Generic error message with no gecko personality. + +Report format: +- TEST N: PASS/FAIL +- If FAIL: exact error message + stack trace +- Overall: Ready for Sprint B / Needs fixes +``` + +--- + +## 11. Open Questions Before Starting + +These must be answered before Day 1 of Sprint A: + +| # | Question | Who | Answer Needed By | +|---|----------|-----|-----------------| +| 1 | Is the CF OAuth token already a key type in byok.cloud, or does a new type need to be added? | PetrAnto | Before Sprint A Day 1 | +| 2 | Does the user need to manually create a Cloudflare OAuth app, or does the public `https://mcp.cloudflare.com/mcp` server handle auth via its own OAuth flow? 
| Verify from blog | Before Sprint A Day 1 | +| 3 | Is the Code Mode SDK (`packages/codemode`) intended to be installed in the MCP *server* or in the *client* calling the server? For our case (using the public CF MCP server), do we even need the SDK? | Read the repo | Before Sprint A Day 1 | +| 4 | What is the current CF token scope storia-agent uses for Cloudflare API calls (build verification loop from §10.1)? Can the same token be reused for Code Mode? | Check existing wrangler secrets | Before Sprint A Day 1 | +| 5 | Should Code Mode results stream via SSE to the Storia IDE immediately, or is Sprint B the right time for that? | PetrAnto decision | Before Sprint B | + +> **Question 3 is the most important**. Grok's analysis assumed you need to install the Code Mode SDK locally. But if you're consuming the **public Cloudflare MCP server** (`https://mcp.cloudflare.com/mcp`), you just need an MCP HTTP client — not the SDK itself. The SDK is for building your *own* Code Mode server. Clarify this before installing anything. + +--- + +## 12. What NOT to Do + +Grok's analysis was directionally correct but had some gaps. Avoid these: + +| Don't | Why | +|-------|-----| +| `npm install @cloudflare/codemode` | This package does not exist. The SDK is inside `@cloudflare/agents` as `packages/codemode`. Verify the export name before installing. | +| Create the skill inside ai-hub (Next.js) | Code Mode must run inside storia-agent Worker, not the Next.js app. The Edge runtime constraints and request lifetime in Pages would break the async tool calls. | +| Skip the `search()` step and go straight to `execute()` | The whole value of Code Mode is progressive discovery. Blind `execute()` calls will fail because the model won't know the right endpoint paths. Always search first. | +| Use Code Mode for AI model routing | Code Mode is for Cloudflare *infrastructure* API only. ClawRouter continues to handle AI provider routing. These are separate systems. 
| +| Give the skill access to all CF scopes immediately | Start with read-only scopes (r2:read, workers:read, d1:read, pages:read) for MVP. Add write scopes incrementally after audit logging is verified. | +| Use Code Mode for personal data (user messages, conversations) | Code Mode only touches Cloudflare infrastructure (Workers, R2, D1 databases as units, not their contents). User data stays in storia's D1 via the existing ORM layer. | + +--- + +## Quick Reference + +``` +Public MCP server: https://mcp.cloudflare.com/mcp +Code Mode SDK repo: github.com/cloudflare/agents → packages/codemode +Blog post: https://blog.cloudflare.com/code-mode-mcp/ +Branch convention: claude/code-mode-mcp-mvp (Sprint A) + claude/code-mode-ide-integration (Sprint B) + claude/dream-machine-build-infra (Sprint C) +Test conflict res: git checkout --theirs test-results-summary.json +Approval gate: ApprovalRequiredError for all destructive ops +Audit events: CF_CODE_MODE_SEARCH, CF_CODE_MODE_EXECUTE +Token budget: ≤ 1,500 tokens for full tool definition +Max execute/day: 50 per user (adjust after observing real usage) + +⚠️ Before ANY moltbot deployment: delete R2 bucket contents first + https://dash.cloudflare.com/5200b896d3dfdb6de35f986ef2d7dc6b/r2/default/buckets/moltbot-data +``` + +--- + +*End of Document — CODE_MODE_MCP_STORIA_SPEC.md* +*Next: Answer the 5 open questions in §11, then feed §8 prompt to Claude Code* diff --git a/claude-share/core/GLOBAL_ROADMAP.md b/claude-share/core/GLOBAL_ROADMAP.md new file mode 100644 index 000000000..8610670bb --- /dev/null +++ b/claude-share/core/GLOBAL_ROADMAP.md @@ -0,0 +1,346 @@ +# Moltworker Global Roadmap + +> **Single source of truth** for all project planning and status tracking. +> Updated by every AI agent after every task. Human checkpoints marked explicitly. 
+ +**Last Updated:** 2026-02-20 (Phase 5.5: web_search tool via Brave Search API) + +--- + +## Project Overview + +**Moltworker** is a multi-platform AI assistant gateway deployed on Cloudflare Workers. It provides: +- 30+ AI models via OpenRouter + direct provider APIs (with capability metadata) +- 14 tools (fetch_url, github_read_file, github_list_files, github_api, github_create_pr, url_metadata, generate_chart, get_weather, fetch_news, convert_currency, get_crypto, geolocate_ip, browse_url, sandbox_exec) — parallel execution with safety whitelist +- Durable Objects for unlimited-time task execution +- Multi-platform chat (Telegram, Discord, Slack) +- Image generation (FLUX.2 models) +- Browser automation (Cloudflare Browser Rendering) +- Admin dashboard (React) + +**Philosophy:** Ship fast, compound learnings, multi-model by default. + +--- + +## Status Legend + +| Emoji | Status | +|-------|--------| +| ✅ | Complete | +| 🔄 | In Progress | +| 🔲 | Not Started | +| ⏸️ | Blocked | +| 🧪 | Needs Testing | + +--- + +## Phase Plan + +### Phase 0: Quick Wins (Trivial effort, immediate value) + +| ID | Task | Status | Owner | Notes | +|----|------|--------|-------|-------| +| 0.1 | Enable `supportsTools: true` for Gemini 3 Flash | ✅ | Previous PR | Already on main | +| 0.2 | Add GPT-OSS-120B to model catalog | ✅ | Claude | `gptoss` alias, free tier | +| 0.3 | Add GLM 4.7 to model catalog | ✅ | Claude | `glm47` alias, $0.07/$0.40 | +| 0.4 | Fix section numbering in tool-calling-analysis.md | ✅ | Human | Resolved externally | +| 0.5 | Add OpenRouter Pony Alpha | ✅ | Claude | `pony` alias, free | + +> 🧑 HUMAN CHECK 0.6: Verify new model IDs are correct on OpenRouter — ✅ DEPLOYED OK + +--- + +### Phase 1: Tool-Calling Optimization (Low-Medium effort, high value) + +| ID | Task | Status | Owner | Notes | +|----|------|--------|-------|-------| +| 1.1 | Implement parallel tool execution (`Promise.allSettled`) | ✅ | Claude | `task-processor.ts` — `Promise.allSettled` + 
`PARALLEL_SAFE_TOOLS` whitelist (11 read-only safe, 3 mutation sequential); `client.ts` — `Promise.all` (no whitelist, Worker path) | +| 1.2 | Enrich model capability metadata | ✅ | Claude | `parallelCalls`, `structuredOutput`, `reasoning`, `maxContext` for all 30+ models | +| 1.3 | Add configurable reasoning per model | ✅ | Claude | Auto-detect + `think:LEVEL` override; DeepSeek/Grok `{enabled}`, Gemini `{effort}` | +| 1.4 | Combine vision + tools into unified method | ✅ | Claude | Vision messages now route through tool-calling path (DO) for tool-supporting models | +| 1.5 | Add structured output support | ✅ | Claude | `response_format: { type: "json_object" }` via `json:` prefix for compatible models | + +> 🧑 HUMAN CHECK 1.6: Test parallel tool execution with real API calls — ⏳ PENDING +> 🧑 HUMAN CHECK 1.7: Verify reasoning control doesn't break existing models — ✅ TESTED (works but BUG-3: think: not passed through DO) +> ✅ BUG-3 FIXED: `think:` override now passed through Durable Object path — `reasoningLevel` added to `TaskRequest` + +### Phase 1.5: Upstream Sync & Infrastructure (Completed) + +| ID | Task | Status | Owner | Notes | +|----|------|--------|-------|-------| +| 1.5.1 | Cherry-pick upstream exitCode fix (0c1b37d) | ✅ | Claude | `sync.ts` — fixes race condition in config file detection | +| 1.5.2 | Cherry-pick container downgrade (92eb06a) | ✅ | Claude | `standard-4` → `standard-1` (~$26→$6/mo) | +| 1.5.3 | Cherry-pick WebSocket token injection (73acb8a) | ✅ | Claude | Fixes CF Access users losing `?token=` after auth redirect | +| 1.5.4 | Port AI Gateway model support (021a9ed) | ✅ | Claude | `CF_AI_GATEWAY_MODEL` env var for any provider/model | +| 1.5.5 | Port channel config overwrite fix (fb6bc1e) | ✅ | Claude | Prevents stale R2 backup keys failing validation | +| 1.5.6 | Port Anthropic config leak fix (1a3c118) | ✅ | Claude | Remove `console.log` of full config with secrets | +| 1.5.7 | Port workspace sync to R2 (12eb483) | ✅ | Claude | 
Persists IDENTITY.md, MEMORY.md across restarts | + +--- + +### Phase 2: Observability & Cost Intelligence (Medium effort) + +| ID | Task | Status | Owner | Notes | +|----|------|--------|-------|-------| +| 2.1 | Add token/cost tracking per request | ✅ | Claude | `costs.ts` — pricing parser, per-user daily accumulation, cost footer on responses | +| 2.2 | Add `/costs` Telegram command | ✅ | Claude | `/costs` today + `/costs week` 7-day breakdown, integrated with Phase 2.1 | +| 2.3 | Integrate Acontext observability (Phase 1) | ✅ | Claude | Lightweight REST client, session storage at task completion, /sessions command | +| 2.4 | Add Acontext dashboard link to admin UI | ✅ | Codex+Claude | Backend route + React section + CSS + 13 tests (785 total) | + +> 🧑 HUMAN CHECK 2.5: Set up Acontext account and configure API key — ✅ DONE (2026-02-11) +> 🧑 HUMAN CHECK 2.6: Review cost tracking accuracy against OpenRouter billing — ⏳ PENDING + +--- + +### Phase 2.5: Free API Integration (Low effort, high value, $0 cost) + +> Based on [storia-free-apis-catalog.md](storia-free-apis-catalog.md). All APIs are free/no-auth or free-tier. +> These can be implemented as new moltworker tools or Telegram/Discord commands. + +| ID | Task | Status | Owner | Effort | Notes | +|----|------|--------|-------|--------|-------| +| 2.5.1 | URL metadata tool (Microlink) | ✅ | Claude | 1h | Rich link previews in chat — title, description, image extraction. 🟢 No auth | +| 2.5.2 | Chart image generation (QuickChart) | ✅ | Claude | 2h | Generate chart images for `/brief` command and data visualization. 🟢 No auth | +| 2.5.3 | Weather tool (Open-Meteo) | ✅ | Claude | 2h | Full weather forecast, no key, no rate limits. 🟢 No auth | +| 2.5.4 | Currency conversion tool (ExchangeRate-API) | ✅ | Claude | 1h | `convert_currency` tool — 150+ currencies, 30min cache, 14 tests. 🟢 No auth | +| 2.5.5 | HackerNews + Reddit + arXiv feeds | ✅ | Claude | 3h | `fetch_news` tool — 3 sources, 14 tests. 
🟢 No auth | +| 2.5.6 | Crypto expansion (CoinCap + DEX Screener + CoinPaprika) | ✅ | Claude | 4h | `get_crypto` tool — price/top/dex actions, 3 APIs, 5min cache, 11 tests. 🟢 No auth | +| 2.5.7 | Daily briefing aggregator | ✅ | Claude | 6h | `/briefing` command — weather + HN top 5 + Reddit top 3 + arXiv latest 3, 15min cache, partial failure handling | +| 2.5.8 | Geolocation from IP (ipapi) | ✅ | Claude | 1h | `geolocate_ip` tool — city/country/timezone/ISP, 15min cache, 7 tests. 🟢 No auth | +| 2.5.9 | Holiday awareness (Nager.Date) | ✅ | Claude | 1h | Nager.Date API integration, holiday banner in briefing, 100+ countries | +| 2.5.10 | Quotes & personality (Quotable + Advice Slip) | ✅ | Claude | 2h | Quotable API + Advice Slip fallback in daily briefing, 7 tests. 🟢 No auth | + +**Total: ~23h = 10 new capabilities at $0/month cost.** + +> 🧑 HUMAN CHECK 2.5.11: Decide which free APIs to prioritize first — ⏳ PENDING +> Recommended order: 2.5.1 (Microlink) → 2.5.2 (QuickChart) → 2.5.3 (Weather) → 2.5.5 (News feeds) → 2.5.7 (Daily briefing) + +--- + +### Phase 3: Compound Engineering (Medium effort, transformative) + +| ID | Task | Status | Owner | Notes | +|----|------|--------|-------|-------| +| 3.1 | Implement compound learning loop | ✅ | Claude | `src/openrouter/learnings.ts` — extract/store/inject patterns, 36 tests | +| 3.2 | Add structured task phases (Plan → Work → Review) | ✅ | Claude | Phase tracking in `TaskState`, phase-aware prompts, 8 tests | +| 3.3 | Add `/learnings` Telegram command | ✅ | Claude | View past patterns and success rates + P1 guardrails (Task Router, source-grounding, confidence labels) | +| 3.4 | Inject relevant learnings into system prompts | ✅ | Claude | Included in 3.1 — learnings injected into system prompt in handler.ts | + +> 🧑 HUMAN CHECK 3.5: Review learning data quality after 20+ tasks — ⏳ PENDING + +--- + +### Sprint 48h: Infrastructure Guardrails (2026-02-20) + +| ID | Task | Status | Owner | Notes | 
+|----|------|--------|-------|-------| +| S48.1 | Phase budget circuit breakers | ✅ | Claude | `phase-budget.ts` — per-phase CPU budgets (plan=8s, work=18s, review=3s), checkpoint-save-before-crash, auto-resume on exceeded. Mitigates risk: CF DO 30s CPU hard-kill. 14 tests | +| S48.2 | Parallel tools → allSettled + safety whitelist | ✅ | Claude | `task-processor.ts` — `Promise.allSettled` isolation, `PARALLEL_SAFE_TOOLS` (11 read-only), mutation tools sequential. 8 tests | + +> Risk "No phase timeouts (9x10 severity)" → mitigated by S48.1 + +--- + +### Phase 4: Context Engineering (Medium-High effort) + +| ID | Task | Status | Owner | Notes | +|----|------|--------|-------|-------| +| 4.1 | Replace `compressContext()` with token-budgeted retrieval | ✅ | Claude | Priority-scored messages, tool pairing, summarization — 28 tests | +| 4.2 | Replace `estimateTokens()` with actual tokenizer | ✅ | Claude | `gpt-tokenizer` cl100k_base encoding, heuristic fallback — 18 tests (772 total) | +| 4.3 | Add tool result caching | ✅ | Codex+Claude | In-memory cache + in-flight dedup, PARALLEL_SAFE_TOOLS whitelist, 5 tests | +| 4.4 | Implement cross-session context continuity | ✅ | Claude | SessionSummary ring buffer (20 entries), 24h TTL, keyword-scored injection, 19 tests | + +> 🧑 HUMAN CHECK 4.5: Validate context quality with Acontext vs. 
current compression — ⏳ PENDING + +--- + +### Phase 5: Advanced Capabilities (High effort, strategic) + +| ID | Task | Status | Owner | Notes | +|----|------|--------|-------|-------| +| 5.1 | Multi-agent review for complex tasks | 🔲 | Claude | Route results through reviewer model | +| 5.2 | MCP integration (mcporter pattern) | 🔲 | Claude | Dynamic tool registration from MCP servers | +| 5.3 | Acontext Sandbox for code execution | 🔲 | Codex | Replaces roadmap Priority 3.2 | +| 5.4 | Acontext Disk for file management | 🔲 | Codex | Replaces roadmap Priority 3.3 | +| 5.5 | Web search tool | ✅ | Codex | Brave Search API tool with TTL cache + Telegram/DO key plumbing | +| 5.6 | Multi-agent orchestration | 🔲 | Claude | Leverage Claude Sonnet 4.5 speculative execution | + +> 🧑 HUMAN CHECK 5.7: Evaluate MCP server hosting options (Sandbox vs. external) — ⏳ PENDING +> 🧑 HUMAN CHECK 5.8: Security review of code execution sandbox — ⏳ PENDING + +--- + +### Phase 6: Platform Expansion (Future) + +| ID | Task | Status | Owner | Notes | +|----|------|--------|-------|-------| +| 6.1 | Telegram inline buttons | ✅ | Claude | /start feature buttons, model pick, start callbacks | +| 6.2 | Response streaming (Telegram) | 🔲 | Any AI | Progressive message updates | +| 6.3 | Voice messages (Whisper + TTS) | 🔲 | Any AI | High effort | +| 6.4 | Calendar/reminder tools | 🔲 | Any AI | Cron-based | +| 6.5 | Email integration | 🔲 | Any AI | Cloudflare Email Workers | +| 6.6 | WhatsApp integration | 🔲 | Any AI | WhatsApp Business API | + +--- + +## AI Task Ownership + +| AI Agent | Primary Responsibilities | Strengths | +|----------|------------------------|-----------| +| **Claude** | Architecture, complex refactoring, tool-calling logic, task processor, compound learning | Deep reasoning, multi-step changes, system design | +| **Codex** | Frontend (React admin UI), tests, simple model additions, Acontext integration | Fast execution, UI work, parallel tasks | +| **Other Bots** | Code review, 
documentation, simple fixes, model catalog updates | Varies by model | +| **Human** | Security review, deployment, API key management, architecture decisions | Final authority | + +--- + +## Human Checkpoints Summary + +| ID | Description | Status | +|----|-------------|--------| +| 0.6 | Verify new model IDs on OpenRouter | ✅ DEPLOYED | +| 1.6 | Test parallel tool execution with real APIs | ⏳ PENDING | +| 1.7 | Verify reasoning control compatibility | ⏳ PENDING | +| 2.5 | Set up Acontext account/API key | ✅ DONE (key in CF Workers secrets) | +| 2.5.11 | Decide which free APIs to prioritize first | ⏳ PENDING | +| 2.6 | Review cost tracking vs. OpenRouter billing | ⏳ PENDING | +| 3.5 | Review learning data quality | ⏳ PENDING | +| 4.5 | Validate Acontext context quality | ⏳ PENDING | +| 5.7 | Evaluate MCP hosting options | ⏳ PENDING | +| 5.8 | Security review of code execution | ⏳ PENDING | + +--- + +## Bug Fixes & Corrective Actions + +| ID | Date | Issue | Severity | Fix | Files | AI | +|----|------|-------|----------|-----|-------|----| +| BUG-1 | 2026-02-08 | "Processing complex task..." shown for ALL messages on tool-capable models | Low/UX | ✅ Changed to "Thinking..." 
| `task-processor.ts` | ✅ | +| BUG-2 | 2026-02-08 | DeepSeek V3.2 doesn't proactively use tools (prefers answering from knowledge) | Medium | ✅ Added tool usage hint in system prompt | `handler.ts` | ✅ | +| BUG-3 | 2026-02-08 | `think:` override not passed through Durable Object path | Medium | ✅ Added `reasoningLevel` to `TaskRequest`, passed from handler to DO, injected in streaming call | `handler.ts`, `task-processor.ts` | ✅ | +| BUG-4 | 2026-02-08 | `/img` fails — "No endpoints found that support output modalities: image, text" | High | ✅ FLUX models need `modalities: ['image']` (image-only), not `['image', 'text']` | `client.ts:357` | ✅ | +| BUG-5 | 2026-02-08 | `/use fluxpro` + text → "No response generated" | Low | ✅ Fallback to default model with helpful message | `handler.ts` | ✅ | +| BUG-6 | 2026-02-10 | GLM Free missing `supportsTools` flag — hallucinated tool calls | Medium | ⚠️ Reverted — free tier doesn't support function calling. Paid GLM 4.7 works. | `models.ts` | ⚠️ | +| BUG-12 | 2026-02-10 | Auto-resume counter persists across different tasks (18→22 on new task) | High | ✅ Check `taskId` match before inheriting `autoResumeCount` | `task-processor.ts` | ✅ | +| BUG-7 | 2026-02-10 | 402 quota exceeded not handled — tasks loop forever | High | ✅ Fail fast, rotate to free model, user message | `client.ts`, `task-processor.ts` | ✅ | +| BUG-8 | 2026-02-10 | No cross-task context continuity | Medium | ✅ Store last task summary in R2, inject with 1h TTL | `task-processor.ts`, `handler.ts` | ✅ | +| BUG-9 | 2026-02-10 | Runaway auto-resume (no elapsed time limit) | High | ✅ 15min free / 30min paid cap | `task-processor.ts` | ✅ | +| BUG-10 | 2026-02-10 | No warning when non-tool model gets tool-needing message | Low/UX | ✅ Tool-intent detection + user warning | `handler.ts` | ✅ | +| BUG-11 | 2026-02-10 | Models with parallelCalls not prompted strongly enough | Low | ✅ Stronger parallel tool-call instruction | `client.ts` | ✅ | + +--- + +## Changelog + +> 
Newest first. Format: `YYYY-MM-DD | AI | Description | files` + +``` +2026-02-20 | Codex (Session: codex-phase-5-5-web-search-001) | feat(tools): add web_search (Brave Search API) with 5-minute cache, DO/Telegram key wiring, and 8 tests | src/openrouter/tools.ts, src/openrouter/tools.test.ts, src/durable-objects/task-processor.ts, src/telegram/handler.ts, src/routes/telegram.ts, src/types.ts, src/openrouter/briefing-aggregator.test.ts + +2026-02-20 | Claude Opus 4.6 (Session: session_01SE5WrUuc6LWTmZC8WBXKY4) | feat(learnings+tools): Phase 4.4 cross-session context continuity + Phase 2.5.10 quotes & personality — SessionSummary ring buffer (20 entries, R2), 24h TTL, keyword-scored injection, Quotable + Advice Slip in briefing, 30 new tests (820 total) | src/openrouter/learnings.ts, src/openrouter/learnings.test.ts, src/openrouter/tools.ts, src/openrouter/tools.test.ts, src/durable-objects/task-processor.ts, src/durable-objects/task-processor.test.ts, src/telegram/handler.ts +2026-02-20 | Codex+Claude (Session: session_01SE5WrUuc6LWTmZC8WBXKY4) | feat(admin): Phase 2.4 Acontext sessions dashboard — backend route, React section, CSS, 13 new tests (785 total). 
Best-of-5 Codex outputs reviewed and merged by Claude | src/routes/api.ts, src/routes/api.test.ts, src/routes/admin-acontext.test.tsx, src/client/api.ts, src/client/pages/AdminPage.tsx, src/client/pages/AdminPage.css, vitest.config.ts +2026-02-20 | Claude Opus 4.6 (Session: session_01SE5WrUuc6LWTmZC8WBXKY4) | feat(context-budget): Phase 4.2 real tokenizer — gpt-tokenizer cl100k_base BPE encoding replaces heuristic estimateStringTokens, heuristic fallback, 18 new tests (772 total) | src/utils/tokenizer.ts, src/utils/tokenizer.test.ts, src/durable-objects/context-budget.ts, src/durable-objects/context-budget.test.ts, src/durable-objects/context-budget.edge.test.ts, package.json +2026-02-20 | Claude Opus 4.6 (Session: session_01AtnWsZSprM6Gjr9vjTm1xp) | feat(task-processor): parallel tools Promise.allSettled + safety whitelist — PARALLEL_SAFE_TOOLS set (11 read-only tools), mutation tools sequential, allSettled isolation, 8 new tests (762 total) | src/durable-objects/task-processor.ts, src/durable-objects/task-processor.test.ts +2026-02-20 | Claude Opus 4.6 (Session: session_01AtnWsZSprM6Gjr9vjTm1xp) | feat(task-processor): phase budget circuit breakers — per-phase CPU time budgets (plan=8s, work=18s, review=3s), checkpoint-save-before-crash, auto-resume on budget exceeded, 14 new tests (754 total) | src/durable-objects/phase-budget.ts, src/durable-objects/phase-budget.test.ts, src/durable-objects/task-processor.ts +2026-02-19 | Codex (Session: codex-phase-4-1-audit-001) | fix(task-processor/context): Phase 4.1 audit hardening — safer tool pairing, transitive pair retention, model-aware context budgets, 11 edge-case tests, audit report | src/durable-objects/context-budget.ts, src/durable-objects/context-budget.edge.test.ts, src/durable-objects/task-processor.ts, brainstorming/phase-4.1-audit.md +2026-02-18 | Claude Opus 4.6 (Session: 018M5goT7Vhaymuo8AxXhUCg) | feat(task-processor): Phase 4.1 token-budgeted context retrieval — priority-scored messages, tool pairing, 
summarization of evicted content, 28 new tests (717 total) | src/durable-objects/context-budget.ts, src/durable-objects/context-budget.test.ts, src/durable-objects/task-processor.ts +2026-02-18 | Claude Opus 4.6 (Session: 01SE5WrUuc6LWTmZC8WBXKY4) | feat(tools): Phase 2.5.9 holiday awareness — Nager.Date API integration, holiday banner in daily briefing, 9 new tests (689 total) | src/openrouter/tools.ts, src/openrouter/tools.test.ts +2026-02-18 | Claude Opus 4.6 (Session: 01SE5WrUuc6LWTmZC8WBXKY4) | feat(acontext): Phase 2.3 Acontext observability — lightweight REST client, session storage at task completion, /sessions command, 24 new tests (680 total) | src/acontext/client.ts, src/acontext/client.test.ts, src/types.ts, src/durable-objects/task-processor.ts, src/telegram/handler.ts, src/routes/telegram.ts +2026-02-18 | Claude Opus 4.6 (Session: 01SE5WrUuc6LWTmZC8WBXKY4) | feat(guardrails): P1 routing + hallucination guardrails + /learnings command — Task Router, source-grounding prompt, confidence labels, /learnings analytics, 656 tests | src/openrouter/models.ts, src/openrouter/learnings.ts, src/durable-objects/task-processor.ts, src/telegram/handler.ts +2026-02-16 | Codex (Session: codex-audit-plan-001) | docs(audit): full audit + build improvement plan for /dcode resume loops and hallucination mitigation | brainstorming/audit-build-improvement-plan.md +2026-02-11 | Claude Opus 4.6 (Session: 019jH8X9pJabGwP2untYhuYE) | feat(task-processor): structured task phases (plan → work → review) — Phase 3.2 complete, 8 new tests, 456 total | src/durable-objects/task-processor.ts, src/durable-objects/task-processor.test.ts +2026-02-11 | Claude Opus 4.6 (Session: 018gmCDcuBJqs9ffrrDHHBBd) | fix(tools): briefing location (Nominatim), news clickable links (HN/Reddit/arXiv URLs), crypto symbol disambiguation (pick highest mcap), 448 tests | src/openrouter/tools.ts +2026-02-11 | Claude Opus 4.6 (Session: 018gmCDcuBJqs9ffrrDHHBBd) | feat(telegram): /start redesign with feature 
buttons, bot menu commands, enhanced R2 skill prompt | src/telegram/handler.ts, src/routes/telegram.ts, claude-share/R2/skills/storia-orchestrator/prompt.md +2026-02-10 | Claude Opus 4.6 (Session: 018gmCDcuBJqs9ffrrDHHBBd) | fix: auto-resume counter reset + revert GLM free tool flag (BUG-12, BUG-6 update), 448 tests | src/durable-objects/task-processor.ts, src/openrouter/models.ts, src/openrouter/models.test.ts +2026-02-10 | Claude Opus 4.6 (Session: 018gmCDcuBJqs9ffrrDHHBBd) | fix: 6 bot improvements from Telegram analysis — GLM tools, 402 handling, cross-task context, elapsed cap, tool-intent warn, parallel prompt (33 new tests, 447 total) | src/openrouter/models.ts, src/openrouter/client.ts, src/durable-objects/task-processor.ts, src/telegram/handler.ts +2026-02-10 | Claude Opus 4.6 (Session: 018gmCDcuBJqs9ffrrDHHBBd) | feat(openrouter): compound learning loop — Phase 3.1+3.4 complete, extract/store/inject task patterns, 36 tests | src/openrouter/learnings.ts, src/openrouter/learnings.test.ts, src/durable-objects/task-processor.ts, src/telegram/handler.ts +2026-02-09 | Claude Opus 4.6 (Session: 013wvC2kun5Mbr3J81KUPn99) | feat(client): structured output support + json: prefix — Phase 1.5 complete | src/openrouter/client.ts, src/openrouter/models.ts, src/telegram/handler.ts, src/durable-objects/task-processor.ts, src/openrouter/structured-output.test.ts +2026-02-09 | Claude Opus 4.6 (Session: 013wvC2kun5Mbr3J81KUPn99) | feat(telegram): unify vision + tools + update /help — Phase 1.4 complete | src/telegram/handler.ts, src/openrouter/vision-tools.test.ts +2026-02-08 | Claude Opus 4.6 (Session: 013wvC2kun5Mbr3J81KUPn99) | feat(tools): add get_crypto + geolocate_ip tools — Phase 2.5.6+2.5.8 complete, 12 tools total | src/openrouter/tools.ts, src/openrouter/tools.test.ts +2026-02-08 | Claude Opus 4.6 (Session: 013wvC2kun5Mbr3J81KUPn99) | fix(ux): BUG-1 (Thinking... 
msg), BUG-2 (tool prompt hint), BUG-5 (image-gen fallback) — all 5 bugs now fixed | src/durable-objects/task-processor.ts, src/telegram/handler.ts +2026-02-08 | Claude Opus 4.6 (Session: 013wvC2kun5Mbr3J81KUPn99) | feat(costs): per-request token/cost tracking + /costs command — Phase 2.1+2.2 complete | src/openrouter/costs.ts, src/openrouter/costs.test.ts, src/durable-objects/task-processor.ts, src/telegram/handler.ts +2026-02-08 | Claude Opus 4.6 (Session: 013wvC2kun5Mbr3J81KUPn99) | feat(tools): add convert_currency tool via ExchangeRate-API — Phase 2.5.4 complete | src/openrouter/tools.ts, src/openrouter/tools.test.ts +2026-02-08 | Claude Opus 4.6 (Session: 013wvC2kun5Mbr3J81KUPn99) | feat(telegram): /briefing command + fix BUG-3 (think: DO passthrough) + fix BUG-4 (modalities: ['image']) — Phase 2.5.7 complete | src/openrouter/tools.ts, src/openrouter/client.ts, src/durable-objects/task-processor.ts, src/telegram/handler.ts, src/openrouter/tools.test.ts +2026-02-08 | Claude Opus 4.6 (Session: 01Wjud3VHKMfSRbvMTzFohGS) | docs: log 5 bugs found during live testing (BUG-1 to BUG-5) — DO status msg, DeepSeek tool use, think: passthrough, /img failure, fluxpro UX | claude-share/core/*.md +2026-02-08 | Claude Opus 4.6 (Session: 01Wjud3VHKMfSRbvMTzFohGS) | feat(client): configurable reasoning per model — Phase 1.3 complete | src/openrouter/models.ts, src/openrouter/client.ts, src/telegram/handler.ts, src/openrouter/reasoning.test.ts +2026-02-08 | Claude Opus 4.6 (Session: 01Wjud3VHKMfSRbvMTzFohGS) | feat(tools): add fetch_news tool (HN/Reddit/arXiv) — Phase 2.5.5 complete | src/openrouter/tools.ts, src/openrouter/tools.test.ts +2026-02-08 | Claude Opus 4.6 (Session: 01Wjud3VHKMfSRbvMTzFohGS) | feat(tools): add get_weather tool via Open-Meteo API — Phase 2.5.3 complete | src/openrouter/tools.ts, src/openrouter/tools.test.ts +2026-02-08 | Claude Opus 4.6 (Session: 01Wjud3VHKMfSRbvMTzFohGS) | feat(tools): add generate_chart tool via QuickChart API — Phase 2.5.2 complete 
| src/openrouter/tools.ts, src/openrouter/tools.test.ts
+2026-02-08 | Claude Opus 4.6 (Session: 01Wjud3VHKMfSRbvMTzFohGS) | feat(tools): add url_metadata tool via Microlink API — Phase 2.5.1 complete | src/openrouter/tools.ts, src/openrouter/tools.test.ts
+2026-02-08 | Claude Opus 4.6 (Session: 01Lg3st5TTU3gXnMqPxfCPpW) | docs: update all core docs — mark Phase 1.1/1.2 complete, add Phase 2.5 (free APIs), update sprint status | claude-share/core/*.md
+2026-02-08 | Claude Opus 4.6 (Session: 01Lg3st5TTU3gXnMqPxfCPpW) | feat(upstream): cherry-pick 7 upstream fixes — WS token, AI Gateway, channel config, workspace sync, exitCode, container downgrade, config leak | src/index.ts, src/types.ts, src/gateway/*.ts, start-moltbot.sh, Dockerfile, wrangler.jsonc, README.md
+2026-02-08 | Claude Opus 4.6 (Session: 01Lg3st5TTU3gXnMqPxfCPpW) | feat(tools): parallel tool execution + model capability metadata — Phase 1.1 + 1.2 complete | src/openrouter/client.ts, src/durable-objects/task-processor.ts, src/openrouter/models.ts
+2026-02-07 | Claude Opus 4.6 (Session: 011qMKSadt2zPFgn2GdTTyxH) | feat(models): add Pony Alpha, GPT-OSS-120B, GLM 4.7 — Phase 0 complete | src/openrouter/models.ts
+2026-02-06 | Claude Opus 4.6 (Session: 011qMKSadt2zPFgn2GdTTyxH) | docs: Create multi-AI orchestration documentation structure | claude-share/core/*.md, CLAUDE.md, AGENTS.md
+2026-02-06 | Claude Opus 4.6 (Session: 011qMKSadt2zPFgn2GdTTyxH) | docs: Add Compound Engineering Plugin analysis | brainstorming/tool-calling-analysis.md
+2026-02-06 | Claude Opus 4.6 (Session: 011qMKSadt2zPFgn2GdTTyxH) | docs: Add Acontext context data platform analysis | brainstorming/tool-calling-analysis.md
+2026-02-06 | Claude Opus 4.6 (Session: 011qMKSadt2zPFgn2GdTTyxH) | docs: Initial tool-calling landscape and steipete analysis | brainstorming/tool-calling-analysis.md
+```
+<!-- duplicate web_search changelog entry removed — canonical entry is at the top of the Changelog block above -->
+
+---
+
+## Dependency Graph
+
+```mermaid
+graph TD
+    P0[Phase 0: Quick Wins ✅] --> P1[Phase 1: Tool-Calling ✅]
+    P0 --> P15[Phase 1.5: Upstream Sync ✅]
+    P1 --> P2[Phase 2: Observability & Costs]
+    P1 --> P25[Phase 2.5: Free APIs 🔲]
+    P1 --> P3[Phase 3: Compound Engineering]
+    P2 --> P4[Phase 4: Context Engineering]
+    P3 --> P4
+    P4 --> P5[Phase 5: Advanced Capabilities]
+    P5 --> P6[Phase 6: Platform Expansion]
+    P25 --> P6
+
+    subgraph "Phase 1 (1.1-1.5 ✅)"
+        P1_1[1.1 Parallel tools ✅]
+        P1_2[1.2 Model metadata ✅]
+        P1_3[1.3 Reasoning control ✅]
+        P1_4[1.4 Vision + tools ✅]
+        P1_5[1.5 Structured output ✅]
+    end
+
+    subgraph "Phase 2.5: Free APIs ($0 cost)"
+        P25_1[2.5.1 URL metadata - Microlink]
+        P25_2[2.5.2 Charts - QuickChart]
+        P25_3[2.5.3 Weather - Open-Meteo]
+        P25_5[2.5.5 News feeds - HN/Reddit/arXiv]
+        P25_7[2.5.7 Daily briefing aggregator]
+    end
+
+    subgraph "Phase 2 (Medium)"
+        P2_1[2.1 Cost tracking]
+        P2_3[2.3 Acontext observability]
+    end
+
+    subgraph "Phase 3 (Medium)"
+        P3_1[3.1 Learning loop]
+        P3_2[3.2 Task phases]
+    end
+
+    P1_1 --> P5_1[5.1 Multi-agent review]
+    P1_2 --> P1_3
+    P1_2 --> P2_1
+    P25_1 --> P25_7
+    P25_2 --> P25_7
+    P25_3 --> P25_7
+    P25_5 --> P25_7
+    P2_3 --> P4
+    P3_1 --> P3_2
+    P3_2 --> P5_1
+```
+<!-- duplicate web_search changelog entry removed — canonical entry is at the top of the Changelog block above -->
+
+---
+
+## References
+
+- [Tool-Calling Analysis](../../brainstorming/tool-calling-analysis.md) — Full analysis with 10 gaps and 13 recommendations 
+- [Free APIs Catalog](storia-free-apis-catalog.md) — 25+ free APIs for zero-cost feature expansion +- [Future Integrations](../../brainstorming/future-integrations.md) — Original roadmap (pre-analysis) +- [README](../../README.md) — User-facing documentation +- [AGENTS.md](../../AGENTS.md) — Developer/AI agent instructions +- [CLAUDE.md](../../CLAUDE.md) — Claude Code project instructions diff --git a/claude-share/core/MOLTWORKER_SPRINT_48H.md b/claude-share/core/MOLTWORKER_SPRINT_48H.md new file mode 100644 index 000000000..5c6ca2bd0 --- /dev/null +++ b/claude-share/core/MOLTWORKER_SPRINT_48H.md @@ -0,0 +1,199 @@ +# Moltworker — Sprint 48h (19-21 fév 2026) +**Pour**: Claude Code Opus 4.6 +**Contexte**: Feedback consolidé de Grok + Opus 4.6 + Sonnet 4.6, corrigé sur code réel (commit 17-18 fév) +**Branche**: `claude/sprint-phase-budgets-parallel` + +--- + +## Contexte critique à lire avant de toucher au code + +`task-processor.ts` fait actuellement **1 248 lignes** (pas 650 — données obsolètes dans les feedbacks antérieurs). +`Promise.all` est **déjà implémenté** pour les tool calls parallèles (confirmé commit récent). +Cloudflare DO : single-threaded, CPU hard limit 30s, alarm toutes les 90s. +Le watchdog actuel est **réactif** (détecte les stalls après coup). Il n'y a **aucun circuit breaker proactif par phase**. + +--- + +## Tâche 1 — Phase Budget Circuit Breakers (priorité absolue) +**Effort estimé** : 2h +**Risque mitigé** : CPU 30s hard kill Cloudflare (Risque 9×10) + +### Problème +Si une phase `work` enchaîne 3 tools lents + retry OpenRouter timeout (20s) → tu hits le hard limit 30s CPU et perds toute la progression. Le watchdog ne peut rien faire après un kill. 
+
+### Implémentation
+
+Ajouter dans `task-processor.ts` (ou extraire dans `task-phases.ts` si tu juges la taille critique) :
+
+```typescript
+const PHASE_BUDGETS_MS = {
+  plan: 8_000, // 8s max
+  work: 18_000, // 18s max (tools lourds)
+  review: 3_000 // 3s max
+} as const;
+
+type TaskPhase = keyof typeof PHASE_BUDGETS_MS;
+
+async function executePhaseWithBudget(
+  phase: TaskPhase,
+  fn: () => Promise<void>,
+  state: TaskState,
+  saveCheckpoint: () => Promise<void>
+): Promise<void> {
+  const budget = PHASE_BUDGETS_MS[phase];
+  const timeout = new Promise((_, reject) =>
+    setTimeout(
+      () => reject(new Error(`Phase ${phase} timeout after ${budget}ms`)),
+      budget
+    )
+  );
+
+  try {
+    await Promise.race([fn(), timeout]);
+    state.phaseStartTime = Date.now(); // reset pour watchdog
+  } catch (err) {
+    const isTimeout = err instanceof Error && err.message.includes('timeout');
+    if (isTimeout) {
+      state.autoResumeCount++;
+      state.lastError = `Phase timeout → auto-resume #${state.autoResumeCount}`;
+      await saveCheckpoint(); // sauvegarder avant propagation
+    }
+    throw err;
+  }
+}
+```
+
+### Intégration dans runTaskLoop()
+
+Wrapper chaque phase existante :
+
+```typescript
+// Avant (exemple phase work) :
+await this.runWorkPhase();
+
+// Après :
+await executePhaseWithBudget('work', () => this.runWorkPhase(), this.state, () => this.saveCheckpoint());
+```
+
+### Tests à ajouter (minimum)
+- Phase timeout déclenche `autoResumeCount++`
+- `saveCheckpoint()` est appelé avant le throw sur timeout
+- Phase qui finit dans le budget ne modifie pas `autoResumeCount`
+- Budget `plan` (8s) < budget `work` (18s) — vérifier que les constantes sont respectées
+
+---
+
+## Tâche 2 — Parallel Tools Upgrade
+**Effort estimé** : 45min
+**Contexte** : `Promise.all` est déjà en prod. Ce sont deux upgrades ciblés, pas une nouvelle implémentation.
+
+### Upgrade 1 — Passer à Promise.allSettled
+
+`Promise.all` fait échouer tous les tools si un seul fail. 
`Promise.allSettled` isole les échecs : + +```typescript +// Localiser handleToolCalls() dans task-processor.ts +// Remplacer Promise.all par Promise.allSettled + mapper les résultats + +const settled = await Promise.allSettled( + toolCalls.map(tc => executeToolWithTimeout(tc)) +); + +const results = settled.map((result, i) => { + if (result.status === 'fulfilled') { + return { toolCallId: toolCalls[i].id, content: result.value }; + } else { + return { + toolCallId: toolCalls[i].id, + content: `Tool error: ${result.reason?.message ?? 'unknown'}`, + isError: true + }; + } +}); +``` + +### Upgrade 2 — Side-effects whitelist + +Certains tools ont des side-effects (writes GitHub, mutations) et ne doivent pas être parallélisés : + +```typescript +// Ajouter près de la définition des tools existants +const PARALLEL_SAFE_TOOLS = new Set([ + 'fetch_url', + 'browse_url', + 'fetch_weather', + 'get_crypto', + 'github_read_file', + 'github_list_files', + // NE PAS inclure : 'github_api' (peut faire des writes) +]); + +// Dans handleToolCalls(), avant Promise.allSettled : +const allSafe = toolCalls.every(tc => PARALLEL_SAFE_TOOLS.has(tc.function.name)); +const useParallel = allSafe && (this.currentModel.parallelCalls === true); + +if (toolCalls.length > 1 && useParallel) { + // Promise.allSettled path +} else { + // Sequential fallback (legacy models ou tools avec side-effects) +} +``` + +**Note** : `parallelCalls` flag existe déjà dans `models.ts` — utiliser celui-là, ne pas en créer un nouveau. 
+ +### Tests à ajouter +- Un tool qui fail n'annule pas les autres (allSettled isolation) +- `github_api` → sequential même si model supporte parallel +- `fetch_weather` + `get_crypto` → parallel si model le supporte +- Résultats d'erreur contiennent `isError: true` + +--- + +## Ce qu'il ne faut PAS faire dans ce sprint + +- Ne pas splitter `task-processor.ts` en 5 fichiers — décision Acontext non encore prise +- Ne pas refactoriser `task-phases.ts` en profondeur — Acontext la remplace potentiellement +- Ne pas intégrer Acontext — c'est Phase 4, gate séparé +- Ne pas toucher à `compressContext()` — tiktoken-lite est la prochaine étape, pas ce sprint + +--- + +## Après ce sprint (Semaine suivante) + +Ces items sont hors scope du sprint 48h mais documentés pour la session suivante : + +1. **Extract guardrails** → `task-guardrails.ts` (constantes uniquement, pas de refacto structurelle) +2. **tiktoken-lite** → remplacer `estimateTokens()` (chars/4 trop approximatif pour cost tracking) +3. **Pre-warm cron** → toutes les 7 minutes (keep-alive DO) + +--- + +## Human Checkpoint (toi, après deploy) + +Lancer `/briefing` (weather + news + crypto) — c'est le test multi-tools idéal. 
+Mesurer : +- Latency avant/après `allSettled` +- Auto-resume rate sur tâches longues (objectif < 5%, actuel ~12%) +- Aucun kill CPU 30s Cloudflare sur tâches complexes + +--- + +## Mise à jour roadmap attendue après le sprint + +```markdown +## Changelog — 19-21 fév 2026 +- ✅ Phase budget circuit breakers (PHASE_BUDGETS_MS + executePhaseWithBudget) +- ✅ Parallel tools → Promise.allSettled + PARALLEL_SAFE_TOOLS whitelist +- Risque "No phase timeouts (9×10)" → mitigé +- OKR latency multi-tools : mesure post-deploy en attente +``` + +--- + +## Règles de base pour cette session + +- Branche : `claude/sprint-phase-budgets-parallel` +- `test-results-summary.json` : toujours résoudre avec `--theirs` +- Tests : +1 couverture minimum sur chaque fichier touché +- Commit unique par tâche avec message clair : `feat: phase budget circuit breakers (Sprint 48h)` +- Mettre à jour `GLOBAL_ROADMAP.md` + `claude-log.md` après chaque tâche diff --git a/claude-share/core/SPECIFICATION.md b/claude-share/core/SPECIFICATION.md new file mode 100644 index 000000000..fb6d5e073 --- /dev/null +++ b/claude-share/core/SPECIFICATION.md @@ -0,0 +1,306 @@ +# Moltworker Product Specification + +> Product vision, feature specifications, and technical requirements. + +**Last Updated:** 2026-02-09 +**Version:** 2.2 (Phase 1 complete + structured output) + +--- + +## Vision & Philosophy + +### Mission +Provide a self-hosted, multi-model AI assistant that gets better with every interaction, accessible from any messaging platform. + +### Core Principles +1. **Multi-model by default** — No vendor lock-in. Users choose models per task. +2. **Compound improvement** — Each task should make subsequent tasks easier (learnings, patterns, context). +3. **Edge-first** — Run on Cloudflare Workers for global low-latency. No traditional servers. +4. **Privacy-respecting** — Users bring their own API keys. No data leaves their control. +5. **Ship fast, iterate** — Working features over perfect features. 
+
+---
+
+## Feature Specifications by Phase
+
+### Phase 0: Foundation (Current)
+
+#### F0.1: Multi-Model Chat
+- **Status:** ✅ Complete
+- **Description:** 30+ models accessible via aliases (`/deep`, `/sonnet`, `/grok`, etc.)
+- **Models:** OpenRouter (26+) + Direct APIs (DashScope, Moonshot, DeepSeek)
+- **Interface:** Telegram, Discord, Slack, Web UI (via OpenClaw)
+- **Capability metadata:** Each model tagged with `parallelCalls`, `structuredOutput`, `reasoning`, `maxContext`
+
+#### F0.2: Tool Calling
+- **Status:** ✅ Complete (12 tools, parallel execution)
+- **Tools:** `fetch_url`, `github_read_file`, `github_list_files`, `github_api`, `url_metadata`, `generate_chart`, `get_weather`, `fetch_news`, `convert_currency`, `browse_url`, `get_crypto`, `geolocate_ip`
+- **Execution:** Parallel via `Promise.all()`, max 10 iterations (Worker) or 100 (Durable Object)
+
+#### F0.3: Image Generation
+- **Status:** ✅ Complete
+- **Models:** FLUX.2 Klein, Pro, Flex, Max
+- **Interface:** `/imagine <prompt>` via Telegram
+
+#### F0.4: Long-Running Tasks
+- **Status:** ✅ Complete
+- **Engine:** Durable Objects with R2 checkpointing
+- **Features:** Auto-resume (up to 10 times), watchdog alarms, progress updates
+
+---
+
+### Phase 1: Tool-Calling Intelligence
+
+#### F1.1: Parallel Tool Execution
+- **Status:** ✅ Complete
+- **Spec:** When a model returns multiple `tool_calls`, all calls execute concurrently via `Promise.all()`.
+- **Implementation:** Both `client.ts` (Worker) and `task-processor.ts` (Durable Object) parallelized.
+- **Metric:** 2-5x faster for multi-tool iterations. Logging shows total parallel time vs individual tool times.
+
+#### F1.2: Model Capability Metadata
+- **Status:** ✅ Complete
+- **Spec:** Extended `ModelInfo` interface with 4 new fields, populated for all 30+ models:
+  ```typescript
+  interface ModelInfo {
+    // ... 
existing fields + parallelCalls?: boolean; + structuredOutput?: boolean; + reasoning?: 'none' | 'fixed' | 'configurable'; + maxContext?: number; // tokens + } + ``` +- **Usage:** Enables future intelligent model routing and reasoning control (F1.3). + +#### F1.3: Configurable Reasoning +- **Status:** ✅ Complete +- **Spec:** Pass `reasoning` parameter to API for models that support it: + - DeepSeek V3.2: `reasoning: { enabled: boolean }` + - Gemini 3 Flash/Pro: `reasoning: { effort: 'minimal' | 'low' | 'medium' | 'high' }` + - Grok 4.1: `reasoning: { enabled: boolean }` +- **Default:** Auto-detect from task type (simple Q&A → off, coding/tools → medium, research → high). +- **User override:** `think:LEVEL` message prefix (e.g., `think:high explain X`). +- **Tool-use boost:** When using tools, `off` is upgraded to `medium` automatically. +- **Implementation:** `models.ts` (types + `getReasoningParam()`, `detectReasoningLevel()`, `parseReasoningOverride()`), `client.ts` (injection in 3 methods), `handler.ts` (prefix parsing). 36 tests in `reasoning.test.ts`. + +#### F1.4: Vision + Tools Combined +- **Status:** ✅ Complete +- **Spec:** Vision messages (photo + caption) now route through the tool-calling path for tool-supporting models. User sends photo + caption → model sees image AND can use all 12 tools (weather, GitHub, crypto, etc). +- **Implementation:** `handleVision()` in `handler.ts` builds `ContentPart[]` message (text + image_url) and routes through DO/tool-calling path for tool-supporting models. Falls back to simple `chatCompletionWithVision()` for non-tool models. `/help` updated with all 12 tools and vision+tools capability. 6 tests in `vision-tools.test.ts`. + +#### F1.5: Structured Output Support +- **Status:** ✅ Complete +- **Spec:** Request structured JSON output from compatible models via `response_format: { type: "json_object" }`. Users prefix messages with `json:` to request JSON output. 
Only injected for models with `structuredOutput: true` metadata. +- **User interface:** `json: list 5 capital cities` — model returns valid JSON. Can combine with reasoning: `think:high json: analyze this data`. +- **Compatible models:** GPT-4o, GPT-4o Mini, GPT-OSS-120B, DeepSeek V3.2, Mistral Large 3, Gemini 3 Flash, Gemini 3 Pro (7 models). +- **Graceful fallback:** Non-compatible models ignore the prefix and respond normally. +- **Implementation:** + - `ResponseFormat` type in `client.ts` — `text | json_object | json_schema` + - `parseJsonPrefix()` in `models.ts` — strips `json:` prefix, case-insensitive + - `supportsStructuredOutput()` in `models.ts` — checks model capability flag + - `responseFormat` option added to all 3 client methods (`chatCompletion`, `chatCompletionWithTools`, `chatCompletionStreamingWithTools`) + - `responseFormat` field added to `TaskRequest` and `TaskState` in `task-processor.ts` for DO persistence + - Wired through handler → DO → streaming API call + - 22 tests in `structured-output.test.ts` + +--- + +### Phase 2: Observability & Cost Intelligence + +#### F2.1: Token/Cost Tracking +- **Status:** ✅ Complete +- **Spec:** Track per-request token usage and cost, accumulate per-user daily totals, display in `/costs` command and response footers. 
+- **Files:** `src/openrouter/costs.ts`, `src/openrouter/costs.test.ts`, `src/durable-objects/task-processor.ts`, `src/telegram/handler.ts`
+- **Data model:**
+  ```typescript
+  interface UsageRecord {
+    userId: string;
+    date: string; // YYYY-MM-DD
+    totalPromptTokens: number;
+    totalCompletionTokens: number;
+    totalCostUsd: number;
+    requestCount: number;
+    byModel: Record<string, { promptTokens: number; completionTokens: number; costUsd: number; requestCount: number }>; // generic args restored (were stripped) — TODO confirm exact value shape against costs.ts
+  }
+  ```
+- **Storage:** In-memory Map keyed by `${userId}:${date}` (MVP; R2 persistence future enhancement)
+- **Commands:** `/costs` (today), `/costs week` (7-day breakdown)
+- **Features:** Model pricing parsed from catalog strings, cost footer appended to DO task responses, 26 tests
+
+#### F2.2: Acontext Observability
+- **Status:** 🔲 Planned
+- **Spec:** Store all task processor messages in Acontext Sessions. Link admin dashboard to Acontext for session replay and success rate tracking.
+- **Dependency:** Acontext API key (human setup).
+
+---
+
+### Phase 2.5: Free API Integration
+
+> All APIs below require zero cost and zero or free-tier auth. See [storia-free-apis-catalog.md](storia-free-apis-catalog.md).
+
+#### F2.5.1: URL Metadata Tool (Microlink)
+- **Status:** ✅ Complete
+- **Spec:** New tool `url_metadata({ url: string })` returning title, description, image, author, publisher, date from any URL.
+- **API:** `api.microlink.io/?url=` — 🟢 No auth, free tier.
+- **Implementation:** `src/openrouter/tools.ts` — tool definition + `urlMetadata()` handler. 9 tests in `tools.test.ts`.
+
+#### F2.5.2: Chart Image Generation (QuickChart)
+- **Status:** ✅ Complete
+- **Spec:** New tool `generate_chart({ type, labels, datasets })` returning QuickChart image URL (600x400 PNG).
+- **API:** `quickchart.io/chart?c=` — 🟢 No auth.
+- **Implementation:** `src/openrouter/tools.ts` — tool definition + `generateChart()` handler with type/JSON validation + HEAD check. 12 tests in `tools.test.ts`. 
+ +#### F2.5.3: Weather Tool (Open-Meteo) +- **Status:** ✅ Complete +- **Spec:** New tool `get_weather({ latitude, longitude })` returning current conditions + 7-day forecast with WMO weather code descriptions. +- **API:** `api.open-meteo.com/v1/forecast` — 🟢 No auth, no rate limits. +- **Implementation:** `src/openrouter/tools.ts` — tool definition + `getWeather()` handler + WMO_WEATHER_CODES mapping (28 codes). 11 tests in `tools.test.ts`. + +#### F2.5.5: News Feeds Tool (HackerNews + Reddit + arXiv) +- **Status:** ✅ Complete +- **Spec:** New tool `fetch_news({ source, topic? })` fetching top 10 stories from HackerNews, Reddit, or arXiv. +- **Sources:** `hackernews` (Firebase API), `reddit` (JSON API, configurable subreddit), `arxiv` (Atom XML, configurable category). +- **API:** All 🟢 No auth — HN Firebase, Reddit JSON, arXiv Atom. +- **Implementation:** `src/openrouter/tools.ts` — tool definition + `fetchNews()` dispatcher + 3 source handlers (parallel HN item fetches, Reddit JSON parsing, arXiv XML string parsing). 14 tests in `tools.test.ts`. + +#### F2.5.4: Currency Conversion Tool (ExchangeRate-API) +- **Status:** ✅ Complete +- **Spec:** New tool `convert_currency({ from, to, amount? })` returning formatted conversion result with rate. +- **API:** `api.exchangerate-api.com/v4/latest/{FROM}` — 🟢 No auth, 150+ currencies. +- **Implementation:** `src/openrouter/tools.ts` — tool definition + `convertCurrency()` handler with 3-letter code validation, 30-minute rate cache per source currency, positive amount validation. 14 tests in `tools.test.ts`. + +#### F2.5.7: Daily Briefing Aggregator +- **Status:** ✅ Complete +- **Spec:** Telegram `/briefing` command combining weather + HackerNews top 5 + Reddit top 3 + arXiv latest 3 into a single formatted message. +- **Dependencies:** F2.5.3 (weather), F2.5.5 (news feeds). 
+- **Implementation:** `src/openrouter/tools.ts` — `generateDailyBriefing()` with `Promise.allSettled()` for parallel fetching + graceful partial failures. 15-minute cache via `briefingCache`. `src/telegram/handler.ts` — `/briefing` and `/brief` commands with configurable lat/lon, subreddit, arXiv category. 6 tests in `tools.test.ts`. + +#### F2.5.6: Crypto Expansion (CoinCap + CoinPaprika + DEX Screener) +- **Status:** ✅ Complete +- **Spec:** `get_crypto` tool with 3 actions: `price` (single coin via CoinCap + CoinPaprika ATH/multi-timeframe), `top` (top N by market cap, max 25), `dex` (DEX pair search via DEX Screener, sorted by liquidity). +- **Implementation:** `src/openrouter/tools.ts` — `getCrypto()` dispatcher + `getCryptoPrice()`, `getCryptoTop()`, `getCryptoDex()` handlers. 5-minute cache. `Promise.allSettled()` for graceful partial failures on price queries. 11 tests. + +#### F2.5.8: Geolocation from IP (ipapi.co) +- **Status:** ✅ Complete +- **Spec:** `geolocate_ip` tool returning city, region, country, coordinates, timezone, ISP/org for any IPv4/IPv6 address. +- **Implementation:** `src/openrouter/tools.ts` — `geolocateIp()` with input validation, 15-minute cache, error handling. 7 tests. + +--- + +### Phase 3: Compound Engineering + +#### F3.1: Compound Learning Loop +- **Status:** 🔲 Planned +- **Spec:** After each completed Durable Object task: + 1. Extract structured metadata (tools, model, iterations, success/failure, category) + 2. Store in R2 (`learnings/{userId}/history.json`) + 3. Before new tasks, inject relevant past patterns into system prompt +- **Example injection:** "For similar GitHub tasks, the most effective pattern: `github_read_file` (2x) → `github_api`. Average: 4 iterations, 92% success rate." + +#### F3.2: Structured Task Phases +- **Status:** 🔲 Planned +- **Spec:** Add phase tracking to `TaskState`: + ```typescript + interface TaskState { + // ... 
existing fields + phase: 'planning' | 'executing' | 'reviewing'; + plan?: string[]; // Planned steps + currentStep?: number; + } + ``` +- **Workflow:** + 1. Planning: Model creates explicit plan before tool calls + 2. Executing: Track progress against plan + 3. Reviewing: Self-review before sending final result +- **Telegram UX:** `Planning... → Executing (step 3/7)... → Reviewing...` + +--- + +### Phase 4: Context Engineering + +#### F4.1: Token-Aware Context Management +- **Status:** 🔲 Planned +- **Spec:** Replace `compressContext()` and `estimateTokens()` with Acontext token-budgeted retrieval. +- **Improvement over current:** Actual tokenization vs. chars/4 heuristic. Selective tool result pruning vs. blind middle-message removal. + +#### F4.2: Tool Result Caching +- **Status:** 🔲 Planned +- **Spec:** Cache tool call results keyed by `hash(toolName + args)`. TTL: 5 minutes for `fetch_url`, 30 minutes for `github_read_file`. +- **Storage:** In-memory Map within Durable Object (cleared on completion). + +--- + +### Phase 5: Advanced Capabilities + +#### F5.1: Multi-Agent Review +- **Spec:** After primary model completes complex task, route result to reviewer model. Use cost-efficient reviewers (Gemini Flash, Grok Fast) for expensive output (Claude Opus). + +#### F5.2: MCP Integration +- **Spec:** Dynamic tool registration from MCP servers. Use mcporter patterns for Cloudflare Workers compatibility. + +#### F5.3: Code Execution (via Acontext Sandbox) +- **Spec:** `run_code({ language: 'python' | 'javascript' | 'bash', code: string })` tool backed by Acontext Sandbox. + +#### F5.4: Web Search Tool +- **Spec:** `web_search({ query: string, num_results?: number })` via Brave Search API. + +--- + +### Known Issues (Found 2026-02-08) + +| ID | Issue | Severity | Root Cause | Location | +|----|-------|----------|------------|----------| +| BUG-1 | "Processing complex task..." 
shown for ALL messages on tool-capable models | Low/UX | ✅ Fixed — Changed to neutral "Thinking..." message | `task-processor.ts:501` | +| BUG-2 | DeepSeek V3.2 doesn't proactively use tools (prefers answering from knowledge) | Medium | ✅ Fixed — Added tool usage hint to system prompt for tool-supporting models | `handler.ts` | +| BUG-3 | `think:LEVEL` override only works on direct fallback path, not through Durable Object | Medium | ✅ Fixed — `reasoningLevel` now added to `TaskRequest` and passed through DO | `handler.ts` → `task-processor.ts` | +| BUG-4 | `/img` fails: "No endpoints found that support output modalities: image, text" | High | ✅ Fixed — FLUX models need `modalities: ['image']`, not `['image', 'text']` | `client.ts:357` | +| BUG-5 | `/use fluxpro` then text message → "No response generated" | Low | ✅ Fixed — Detect image-gen model in chat, fallback to default text model with message | `handler.ts` | + +--- + +## Technical Requirements + +### Performance +- **Chat response latency:** <2s for non-tool queries (Worker → OpenRouter → response) +- **Tool execution:** <5s per individual tool call +- **Task processor iteration:** <30s average (including API call + tool execution) +- **Parallel tools:** Should not exceed 2x single-tool latency + +### Reliability +- **Auto-resume:** Tasks survive DO restarts (up to 10 auto-resumes) +- **Checkpointing:** Every 3 tool calls to R2 +- **Watchdog:** 90s alarm interval, 60s stuck threshold +- **API retries:** 3 attempts with 2s backoff + +### Security +- **No secrets in code or logs** — Redaction via `src/utils/logging.ts` +- **Input validation** — All tool arguments validated before execution +- **Auth layers:** Cloudflare Access (admin), Gateway token (UI), User allowlist (Telegram) +- **No code execution** until Phase 5 with proper sandboxing + +### Scalability +- **Users:** Single-user focus (personal assistant), multi-user via separate deployments +- **Models:** Extensible catalog, add new models via 
`models.ts` +- **Tools:** Extensible tool system, add new tools via `tools.ts` +- **Platforms:** Extensible chat platforms, add via new route handlers + +--- + +## Success Criteria + +### Phase 1 Success +- [ ] Parallel tool execution reduces multi-tool iteration time by 2x+ +- [ ] All models correctly tagged with capability metadata +- [ ] Reasoning control demonstrably improves tool-calling accuracy + +### Phase 2 Success +- [ ] Users can see per-model cost breakdown +- [ ] Acontext dashboard shows session replays + +### Phase 3 Success +- [ ] Bot demonstrably improves on repeated task types +- [ ] Plan→Work→Review reduces average iterations by 20%+ + +### Overall Success +- [ ] Bot handles 95%+ of Telegram requests without errors +- [ ] Average task completion under 60s for tool-using queries +- [ ] Users report the bot "gets better over time" (compound effect) diff --git a/claude-share/core/SYNC_CHECKLIST.md b/claude-share/core/SYNC_CHECKLIST.md new file mode 100644 index 000000000..27706c670 --- /dev/null +++ b/claude-share/core/SYNC_CHECKLIST.md @@ -0,0 +1,109 @@ +# Sync Checklist + +> **EVERY AI assistant MUST follow this checklist after completing any task.** +> No exceptions. Skipping steps creates drift between agents. 
+ +**Last Updated:** 2026-02-06 + +--- + +## After EVERY Task + +- [ ] **Update session log** — Append to the correct log file: + - Claude: `claude-share/core/claude-log.md` + - Codex: `claude-share/core/codex-log.md` + - Other: `claude-share/core/bot-log.md` +- [ ] **Update GLOBAL_ROADMAP.md** — Change task status emoji and add changelog entry +- [ ] **Update WORK_STATUS.md** — Reflect current sprint state +- [ ] **Update next_prompt.md** — Point to the next task for the next AI session +- [ ] **Run tests** — `npm test` must pass before pushing +- [ ] **Run typecheck** — `npm run typecheck` must pass before pushing +- [ ] **Commit with proper format** — See commit message format below +- [ ] **Push to correct branch** — Never push to `main` directly + +--- + +## Session Log Entry Format + +```markdown +## Session: YYYY-MM-DD | Task Name (Session: SESSION_ID) + +**AI:** Claude / Codex / Other (model name) +**Branch:** branch-name +**Status:** Completed / Partial / Blocked + +### Summary +Brief description of what was accomplished. + +### Changes Made +- Change 1 +- Change 2 + +### Files Modified +- `path/to/file1.ts` +- `path/to/file2.ts` + +### Tests +- [ ] Tests pass +- [ ] Typecheck passes + +### Notes for Next Session +Any context the next AI needs to continue. +``` + +--- + +## Changelog Entry Format + +Add to `GLOBAL_ROADMAP.md` → Changelog section (newest first): + +``` +YYYY-MM-DD | AI Name (Session: ID) | Task Description: Details | file1.ts, file2.ts +``` + +--- + +## Commit Message Format + +``` +<type>(<scope>): <subject> + +[optional body] + +AI: <model name> (Session: <session-id>) +``` + +Types: `feat`, `fix`, `refactor`, `docs`, `test`, `chore` +Scopes: `tools`, `models`, `client`, `gateway`, `telegram`, `discord`, `task-processor`, `openrouter`, `docs` + +Example: +``` +feat(tools): add parallel tool execution via Promise.allSettled + +Replace sequential for...of loop with Promise.allSettled for independent +tool calls. ~2-5x speedup per iteration in multi-tool scenarios. 
+ +AI: Claude Opus 4.6 (Session: abc123) +``` + +--- + +## Branch Naming Convention + +| AI Agent | Branch Pattern | Example | +|----------|---------------|---------| +| Claude | `claude/-` | `claude/parallel-tools-x7k2` | +| Codex | `codex/-` | `codex/cost-tracking-m3p1` | +| Other | `bot/-` | `bot/gemini-flash-tools-q2w3` | +| Human | `feat/` or `fix/` | `feat/mcp-integration` | + +--- + +## What NOT to Do + +- Do NOT push to `main` directly +- Do NOT skip tests ("I'll fix them later") +- Do NOT modify files outside your task scope without documenting why +- Do NOT leave `console.log` debug statements in production code +- Do NOT commit secrets, API keys, or `.dev.vars` +- Do NOT amend another AI's commits without coordination diff --git a/claude-share/core/WORK_STATUS.md b/claude-share/core/WORK_STATUS.md new file mode 100644 index 000000000..8da6f0e19 --- /dev/null +++ b/claude-share/core/WORK_STATUS.md @@ -0,0 +1,147 @@ +# Work Status + +> Current sprint status. Updated by every AI agent after every task. + +**Last Updated:** 2026-02-20 (Phase 5.5 complete: web_search tool via Brave Search API) + +--- + +## Current Sprint: Foundation & Quick Wins + +**Sprint Goal:** Establish multi-AI orchestration documentation, ship Phase 0 quick wins, begin Phase 1 tool-calling optimization, sync upstream fixes. 
+ +**Sprint Duration:** 2026-02-06 → 2026-02-13 + +--- + +### Active Tasks + +| Task ID | Description | Assignee | Status | Branch | +|---------|-------------|----------|--------|--------| +| 1.5 | Structured output support (json: prefix) | Claude Opus 4.6 | ✅ Complete | `claude/daily-briefing-aggregator-NfHhi` | +| 1.4 | Combine vision + tools + update /help | Claude Opus 4.6 | ✅ Complete | `claude/daily-briefing-aggregator-NfHhi` | +| 2.5.6+2.5.8 | Crypto tool + Geolocation tool | Claude Opus 4.6 | ✅ Complete | `claude/daily-briefing-aggregator-NfHhi` | +| BUG-1,2,5 | Fix all 3 remaining UX bugs | Claude Opus 4.6 | ✅ Complete | `claude/daily-briefing-aggregator-NfHhi` | +| 2.1+2.2 | Token/cost tracking + /costs command | Claude Opus 4.6 | ✅ Complete | `claude/daily-briefing-aggregator-NfHhi` | +| 2.5.4 | Currency conversion tool | Claude Opus 4.6 | ✅ Complete | `claude/daily-briefing-aggregator-NfHhi` | +| 2.5.7 | Daily briefing aggregator | Claude Opus 4.6 | ✅ Complete | `claude/daily-briefing-aggregator-NfHhi` | +| BUG-3 | Pass think: override through DO path | Claude Opus 4.6 | ✅ Complete | `claude/daily-briefing-aggregator-NfHhi` | +| BUG-4 | Fix /img image generation | Claude Opus 4.6 | ✅ Complete | `claude/daily-briefing-aggregator-NfHhi` | +| 3.1+3.4 | Compound learning loop + prompt injection | Claude Opus 4.6 | ✅ Complete | `claude/extract-task-metadata-8lMCM` | +| — | 6 bot improvements (GLM tools, 402, cross-task ctx, time cap, tool-intent, parallel prompt) | Claude Opus 4.6 | ✅ Complete | `claude/extract-task-metadata-8lMCM` | +| BUG-12 | Fix auto-resume counter persistence + revert GLM free tool flag | Claude Opus 4.6 | ✅ Complete | `claude/extract-task-metadata-8lMCM` | +| 6.1 | /start redesign with inline keyboard + bot menu commands | Claude Opus 4.6 | ✅ Complete | `claude/extract-task-metadata-8lMCM` | +| — | Enhanced R2 skill prompt (Storia identity, model recs) | Claude Opus 4.6 | ✅ Complete | `claude/extract-task-metadata-8lMCM` | +| — | 
Briefing fixes: weather location, news links, crypto disambiguation | Claude Opus 4.6 | ✅ Complete | `claude/extract-task-metadata-8lMCM` | +| 3.2 | Structured task phases (Plan → Work → Review) | Claude Opus 4.6 | ✅ Complete | `claude/add-task-phases-4R9Q6` | +| 3.3+P1 | P1 guardrails + /learnings command | Claude Opus 4.6 | ✅ Complete | `claude/implement-p1-guardrails-DcOgI` | +| 2.3 | Acontext observability integration | Claude Opus 4.6 | ✅ Complete | `claude/implement-p1-guardrails-DcOgI` | +| 2.5.9 | Holiday awareness (Nager.Date) | Claude Opus 4.6 | ✅ Complete | `claude/implement-p1-guardrails-DcOgI` | +| 4.1 | Token-budgeted context retrieval | Claude Opus 4.6 | ✅ Complete | `claude/implement-p1-guardrails-NF641` | +| S48.1 | Phase budget circuit breakers (plan=8s, work=18s, review=3s) | Claude Opus 4.6 | ✅ Complete | `claude/budget-circuit-breakers-parallel-bAtHI` | +| S48.2 | Parallel tools allSettled + PARALLEL_SAFE_TOOLS whitelist | Claude Opus 4.6 | ✅ Complete | `claude/budget-circuit-breakers-parallel-bAtHI` | +| 4.2 | Replace estimateTokens with real tokenizer (gpt-tokenizer cl100k_base) | Claude Opus 4.6 | ✅ Complete | `claude/implement-p1-guardrails-DcOgI` | +| 2.4 | Acontext sessions dashboard in admin UI | Codex+Claude | ✅ Complete | `claude/implement-p1-guardrails-DcOgI` | + +--- + +### Parallel Work Tracking + +| AI Agent | Current Task | Branch | Started | +|----------|-------------|--------|---------| +| Claude | — (awaiting next task) | — | — | +| Codex | — | — | — | +| Other | — | — | — | + +--- + +### Completed This Sprint + +| Task ID | Description | Completed By | Date | Branch | +|---------|-------------|-------------|------|--------| +| 0.1 | Enable Gemini Flash tool support | Previous PR | 2026-02-06 | main | +| 0.2 | Add GPT-OSS-120B model | Claude Opus 4.6 | 2026-02-07 | `claude/analyze-tool-calling-5ee5w` | +| 0.3 | Add GLM 4.7 model | Claude Opus 4.6 | 2026-02-07 | `claude/analyze-tool-calling-5ee5w` | +| 0.5 | Add OpenRouter Pony 
Alpha | Claude Opus 4.6 | 2026-02-07 | `claude/analyze-tool-calling-5ee5w` | +| 1.1 | Parallel tool execution (Promise.all) | Claude Opus 4.6 | 2026-02-08 | `claude/resume-tool-calling-analysis-ZELCJ` | +| 1.2 | Model capability metadata enrichment | Claude Opus 4.6 | 2026-02-08 | `claude/resume-tool-calling-analysis-ZELCJ` | +| 1.5.1-7 | Upstream sync: 7 cherry-picks | Claude Opus 4.6 | 2026-02-08 | `claude/resume-tool-calling-analysis-ZELCJ` | +| — | Tool-calling landscape analysis | Claude Opus 4.6 | 2026-02-06 | `claude/analyze-tool-calling-5ee5w` | +| — | Multi-AI orchestration docs | Claude Opus 4.6 | 2026-02-06 | `claude/analyze-tool-calling-5ee5w` | +| — | Free APIs integration analysis | Claude Opus 4.6 | 2026-02-08 | `claude/resume-tool-calling-analysis-ZELCJ` | +| 2.5.1 | URL metadata tool (Microlink) | Claude Opus 4.6 | 2026-02-08 | `claude/review-moltworker-roadmap-q5aqD` | +| 2.5.2 | Chart image generation (QuickChart) | Claude Opus 4.6 | 2026-02-08 | `claude/review-moltworker-roadmap-q5aqD` | +| 2.5.3 | Weather tool (Open-Meteo) | Claude Opus 4.6 | 2026-02-08 | `claude/review-moltworker-roadmap-q5aqD` | +| 2.5.5 | News feeds (HN/Reddit/arXiv) | Claude Opus 4.6 | 2026-02-08 | `claude/review-moltworker-roadmap-q5aqD` | +| 5.5 | Web search tool (Brave Search API) | Codex (GPT-5.2-Codex) | 2026-02-20 | `work` | +| 1.3 | Configurable reasoning per model | Claude Opus 4.6 | 2026-02-08 | `claude/review-moltworker-roadmap-q5aqD` | +| 2.5.7 | Daily briefing aggregator | Claude Opus 4.6 | 2026-02-08 | `claude/daily-briefing-aggregator-NfHhi` | +| BUG-3 | think: override DO passthrough fix | Claude Opus 4.6 | 2026-02-08 | `claude/daily-briefing-aggregator-NfHhi` | +| BUG-4 | /img modalities fix | Claude Opus 4.6 | 2026-02-08 | `claude/daily-briefing-aggregator-NfHhi` | +| 2.5.4 | Currency conversion tool | Claude Opus 4.6 | 2026-02-08 | `claude/daily-briefing-aggregator-NfHhi` | +| 2.1+2.2 | Token/cost tracking + /costs command | Claude Opus 4.6 | 2026-02-08 | 
`claude/daily-briefing-aggregator-NfHhi` | +| BUG-1 | "Processing..." → "Thinking..." | Claude Opus 4.6 | 2026-02-08 | `claude/daily-briefing-aggregator-NfHhi` | +| BUG-2 | Tool usage hint in system prompt | Claude Opus 4.6 | 2026-02-08 | `claude/daily-briefing-aggregator-NfHhi` | +| BUG-5 | Image-gen model fallback for text | Claude Opus 4.6 | 2026-02-08 | `claude/daily-briefing-aggregator-NfHhi` | +| 2.5.6 | Crypto tool (CoinCap+CoinPaprika+DEX Screener) | Claude Opus 4.6 | 2026-02-08 | `claude/daily-briefing-aggregator-NfHhi` | +| 2.5.8 | Geolocation from IP (ipapi.co) | Claude Opus 4.6 | 2026-02-08 | `claude/daily-briefing-aggregator-NfHhi` | +| 1.5 | Structured output support (json: prefix) | Claude Opus 4.6 | 2026-02-09 | `claude/daily-briefing-aggregator-NfHhi` | +| 1.4 | Vision + tools unified + /help update | Claude Opus 4.6 | 2026-02-09 | `claude/daily-briefing-aggregator-NfHhi` | +| 3.1+3.4 | Compound learning loop + prompt injection | Claude Opus 4.6 | 2026-02-10 | `claude/extract-task-metadata-8lMCM` | +| — | 6 bot improvements from Telegram analysis | Claude Opus 4.6 | 2026-02-10 | `claude/extract-task-metadata-8lMCM` | +| BUG-12 | Auto-resume counter fix + GLM free flag revert | Claude Opus 4.6 | 2026-02-10 | `claude/extract-task-metadata-8lMCM` | +| 6.1 | /start redesign with inline keyboard + bot menu commands | Claude Opus 4.6 | 2026-02-11 | `claude/extract-task-metadata-8lMCM` | +| — | Enhanced R2 skill prompt (Storia identity, model recs) | Claude Opus 4.6 | 2026-02-11 | `claude/extract-task-metadata-8lMCM` | +| — | Briefing fixes: weather location, news links, crypto disambiguation | Claude Opus 4.6 | 2026-02-11 | `claude/extract-task-metadata-8lMCM` | +| 3.2 | Structured task phases (Plan → Work → Review) | Claude Opus 4.6 | 2026-02-11 | `claude/add-task-phases-4R9Q6` | +| 3.3+P1 | P1 guardrails + /learnings command | Claude Opus 4.6 | 2026-02-18 | `claude/implement-p1-guardrails-DcOgI` | +| 2.3 | Acontext observability integration | Claude 
Opus 4.6 | 2026-02-18 | `claude/implement-p1-guardrails-DcOgI` | +| 2.5.9 | Holiday awareness (Nager.Date) | Claude Opus 4.6 | 2026-02-18 | `claude/implement-p1-guardrails-DcOgI` | +| 4.1 | Token-budgeted context retrieval | Claude Opus 4.6 | 2026-02-18 | `claude/implement-p1-guardrails-NF641` | +| 4.1 Audit | Review & harden token-budgeted retrieval | Codex (GPT-5.2-Codex) | 2026-02-19 | `codex/audit-and-improve-context-budget-implementation` | +| S48.1 | Phase budget circuit breakers (plan=8s, work=18s, review=3s) | Claude Opus 4.6 | 2026-02-20 | `claude/budget-circuit-breakers-parallel-bAtHI` | +| S48.2 | Parallel tools allSettled + PARALLEL_SAFE_TOOLS whitelist | Claude Opus 4.6 | 2026-02-20 | `claude/budget-circuit-breakers-parallel-bAtHI` | +| 4.2 | Real tokenizer (gpt-tokenizer cl100k_base) | Claude Opus 4.6 | 2026-02-20 | `claude/implement-p1-guardrails-DcOgI` | +| 2.4 | Acontext sessions dashboard in admin UI | Codex+Claude | 2026-02-20 | `claude/implement-p1-guardrails-DcOgI` | + +--- + +### Bugs Found During Testing (2026-02-08) + Telegram Analysis (2026-02-10) + +| Bug ID | Issue | Severity | Files | Status | +|--------|-------|----------|-------|--------| +| BUG-1 | "Processing complex task..." shown for ALL messages | Low/UX | `task-processor.ts:501` | ✅ Fixed — changed to "Thinking..." 
| +| BUG-2 | DeepSeek doesn't proactively use tools | Medium | `handler.ts` system prompt | ✅ Fixed — added tool usage hint | +| BUG-3 | `think:` override not passed through DO path | Medium | `handler.ts`, `task-processor.ts` | ✅ Fixed | +| BUG-4 | `/img` fails — modalities not supported | High | `client.ts:357` | ✅ Fixed | +| BUG-5 | `/use fluxpro` + text → "No response" | Low | `handler.ts` | ✅ Fixed — fallback to default model | +| BUG-6 | GLM Free missing supportsTools — hallucinated tool calls | Medium | `models.ts` | ✅ Fixed | +| BUG-7 | 402 quota exceeded not handled — infinite loop | High | `client.ts`, `task-processor.ts` | ✅ Fixed — rotate to free model | +| BUG-8 | No cross-task context continuity | Medium | `task-processor.ts`, `handler.ts` | ✅ Fixed — R2 summary, 1h TTL | +| BUG-9 | Runaway auto-resume (no time limit) | High | `task-processor.ts` | ✅ Fixed — 15/30 min cap | +| BUG-10 | No warning for non-tool model + tool-needing msg | Low/UX | `handler.ts` | ✅ Fixed — tool-intent detection | +| BUG-11 | Weak parallel tool-call instruction | Low | `client.ts` | ✅ Fixed — stronger prompt | + +### Blocked + +| Task ID | Description | Blocked By | Resolution | +|---------|-------------|-----------|------------| +| 2.3 | Acontext integration | ~~API key~~ | ✅ Key configured in Cloudflare — UNBLOCKED | + +--- + +## Next Priorities Queue + +> Ordered by priority. Next AI session should pick the top item. + +1. **Phase 4.3** — Tool result caching (Codex) +2. **Phase 4.4** — Cross-session context continuity +3. 
**Audit Phase 2** — P2 guardrails (multi-agent review, tool result validation) + +--- + +## Sprint Velocity + +| Sprint | Tasks Planned | Tasks Completed | Notes | +|--------|-------------|----------------|-------| +| Sprint 1 (current) | 8 | 42 | Phase 0 complete, Phase 1 COMPLETE (1.1-1.5), upstream sync complete, Phase 2 COMPLETE (2.1-2.4), Phase 2.5 COMPLETE (all 8 tools + holiday awareness 2.5.9), Phase 3 COMPLETE (3.1-3.4), Phase 4.1-4.2 done, P1 guardrails done, Sprint 48h done (phase budgets + allSettled), ALL 12 bugs fixed, /start redesign + bot menu + briefing fixes, 785 tests total | diff --git a/claude-share/core/bot-log.md b/claude-share/core/bot-log.md new file mode 100644 index 000000000..c99dff6ba --- /dev/null +++ b/claude-share/core/bot-log.md @@ -0,0 +1,11 @@ +# Bot Session Log + +> All other AI model sessions logged here. Newest first. +> (Gemini, Grok, DeepSeek, GPT, etc.) + +--- + +*No sessions yet. Suitable first tasks for other models:* +- *Phase 0.1-0.3: Quick model catalog fixes (trivial)* +- *Code review of existing tool implementations* +- *Documentation improvements* diff --git a/claude-share/core/claude-log.md b/claude-share/core/claude-log.md new file mode 100644 index 000000000..44d4733e1 --- /dev/null +++ b/claude-share/core/claude-log.md @@ -0,0 +1,1053 @@ +# Claude Session Log + +> All Claude sessions logged here. Newest first. 
+ +--- + +## Session: 2026-02-20 | Phase 2.4 — Acontext Sessions Dashboard in Admin UI (Session: session_01SE5WrUuc6LWTmZC8WBXKY4) + +**AI:** Claude Opus 4.6 (review & integration) + Codex GPT-5.2 (5 candidate implementations) +**Branch:** `claude/implement-p1-guardrails-DcOgI` +**Task:** Add Acontext sessions dashboard section to admin UI + +### Approach +- Codex generated 5 candidate implementations (PR124–PR128) +- Claude reviewed all 5, scored them (5–8/10), selected best (branch 4: -8zikq4, 8/10) +- Manually extracted functional code from winning branch, fixed known issues + +### Changes +- **Modified:** `src/routes/api.ts` — added `GET /api/admin/acontext/sessions` backend route +- **Modified:** `src/client/api.ts` — added `AcontextSessionInfo`, `AcontextSessionsResponse` types and `getAcontextSessions()` function +- **Modified:** `src/client/pages/AdminPage.tsx` — added `AcontextSessionsSection` component (exported), `formatAcontextAge()`, `truncateAcontextPrompt()` helpers +- **Modified:** `src/client/pages/AdminPage.css` — 91 lines of Acontext section styles (green border, grid, status dots, responsive) +- **New:** `src/routes/api.test.ts` — 2 backend tests (unconfigured, mapped fields) +- **New:** `src/routes/admin-acontext.test.tsx` — 11 UI tests (render, states, formatAcontextAge, truncateAcontextPrompt) +- **Modified:** `vitest.config.ts` — added `.test.tsx` support + +### Design Decisions +- Used `renderToStaticMarkup` for UI tests (SSR-based, no DOM mocking needed) +- Test file placed at `src/routes/` (not `src/client/` which is excluded by vitest config) +- Exported `formatAcontextAge`, `truncateAcontextPrompt`, `AcontextSessionsSection` for testability +- Graceful degradation: shows "Acontext not configured" hint when API key missing + +### Test Results +- 785 tests total (13 net new) +- Typecheck clean +- Build succeeds + +--- + +## Session: 2026-02-20 | Phase 4.2 — Real Tokenizer (gpt-tokenizer cl100k_base) (Session: 
session_01SE5WrUuc6LWTmZC8WBXKY4) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/implement-p1-guardrails-DcOgI` +**Task:** Replace heuristic `estimateStringTokens` with real BPE tokenizer + +### Changes +- **New:** `src/utils/tokenizer.ts` — wrapper around `gpt-tokenizer/encoding/cl100k_base` + - `countTokens(text)` — exact BPE token count with heuristic fallback + - `estimateTokensHeuristic(text)` — original chars/4 heuristic (fallback) + - `isTokenizerAvailable()` / `resetTokenizerState()` — diagnostics + testing +- **Modified:** `src/durable-objects/context-budget.ts` — `estimateStringTokens()` now delegates to `countTokens()` from tokenizer module +- **New export:** `estimateStringTokensHeuristic()` for comparison/testing +- **New:** `src/utils/tokenizer.test.ts` — 18 tests covering exact counts, fallback, comparison +- **Adjusted:** `context-budget.test.ts` — relaxed bounds for real tokenizer accuracy +- **Adjusted:** `context-budget.edge.test.ts` — relaxed reasoning_content bound +- **New dependency:** `gpt-tokenizer` (pure JS, no WASM) + +### Design Decisions +- **cl100k_base encoding** — best universal approximation across multi-provider models (GPT-4, Claude ~70% overlap, Llama 3+, DeepSeek, Gemini) +- **gpt-tokenizer over js-tiktoken** — pure JS (no WASM cold start), compact binary BPE ranks, per-encoding tree-shakeable imports +- **Heuristic fallback** — if tokenizer throws, flag disables it for process lifetime and falls back to chars/4 heuristic +- **Bundle impact:** worker entry +1.1 MB (1,388 → 2,490 KB uncompressed) — within CF Workers 10 MB limit + +### Test Results +- 772 tests total (10 net new from tokenizer module) +- Typecheck clean +- Build succeeds + +--- + +## Session: 2026-02-20 | Sprint 48h — Phase Budget Circuit Breakers + Parallel Tools Upgrade (Session: session_01AtnWsZSprM6Gjr9vjTm1xp) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/budget-circuit-breakers-parallel-bAtHI` +**Status:** Completed (merged as PR #123) + +### Summary 
+Sprint 48h completed both planned tasks: phase budget circuit breakers to prevent Cloudflare DO 30s CPU hard-kill, and parallel tools upgrade from `Promise.all` to `Promise.allSettled` with a safety whitelist for mutation tools. + +### Changes Made +1. **`src/durable-objects/phase-budget.ts`** (NEW) — Phase budget circuit breaker module: + - `PHASE_BUDGETS` constants: plan=8s, work=18s, review=3s + - `PhaseBudgetExceededError` custom error with phase/elapsed/budget metadata + - `checkPhaseBudget()` — throws if elapsed exceeds phase budget +2. **`src/durable-objects/phase-budget.test.ts`** (NEW) — 14 tests covering budget constants, error class, threshold checks, integration concepts +3. **`src/durable-objects/task-processor.ts`** — Integrated both features: + - Phase budget checks before API calls and tool execution + - Catch block: increments `autoResumeCount`, saves checkpoint before propagating + - `phaseStartTime` tracked and reset at phase transitions + - `Promise.all` replaced with `Promise.allSettled` for parallel tool execution + - `PARALLEL_SAFE_TOOLS` whitelist (11 read-only tools): fetch_url, browse_url, get_weather, get_crypto, github_read_file, github_list_files, fetch_news, convert_currency, geolocate_ip, url_metadata, generate_chart + - Mutation tools (github_api, github_create_pr, sandbox_exec) always sequential + - Sequential fallback when any tool in batch is unsafe or model lacks `parallelCalls` +4. 
**`src/durable-objects/task-processor.test.ts`** — 8 new tests: whitelist coverage, parallel/sequential routing, allSettled isolation, error handling + +### Files Modified +- `src/durable-objects/phase-budget.ts` (new) +- `src/durable-objects/phase-budget.test.ts` (new) +- `src/durable-objects/task-processor.ts` +- `src/durable-objects/task-processor.test.ts` + +### Tests +- [x] Tests pass (762 total, 0 failures — 22 new) +- [x] Typecheck passes + +### Audit Notes (post-merge review) +- `client.ts` still uses `Promise.all` without whitelist (Worker path, non-DO) — not upgraded in this sprint. Roadmap corrected to reflect this. +- `checkPhaseBudget()` does not call `saveCheckpoint` itself (deviation from sprint pseudocode); the wiring is in the task-processor catch block, which is architecturally cleaner. +- No integration test verifying `autoResumeCount` increment in task-processor on phase budget exceeded — only a conceptual test in phase-budget.test.ts. Low risk since the catch path is straightforward. +- GLOBAL_ROADMAP overview said "12 tools" — corrected to 14 (was missing github_create_pr, sandbox_exec). + +--- + +## Session: 2026-02-18 | Phase 4.1 Token-Budgeted Context Retrieval (Session: 018M5goT7Vhaymuo8AxXhUCg) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/implement-p1-guardrails-NF641` +**Status:** Completed + +### Summary +Implemented Phase 4.1 — Token-Budgeted Context Retrieval. Replaced the naive `compressContext` (keep N recent, drop rest) and `estimateTokens` (chars/4 heuristic) with a smarter system that assigns priority scores to every message, maintains tool_call/result pairing for API compatibility, and summarizes evicted content instead of silently dropping it. + +### Changes Made +1. 
**`src/durable-objects/context-budget.ts`** (NEW) — Token-budgeted context module: + - `estimateStringTokens()` — Refined heuristic with code-pattern overhead detection + - `estimateMessageTokens()` — Accounts for message overhead, tool_call metadata, ContentPart arrays, image tokens, reasoning_content + - `estimateTokens()` — Sum of all messages + reply priming + - `compressContextBudgeted()` — Priority-scored compression: scores messages by role/recency/content-type, builds tool_call pairings, greedily fills token budget from highest priority, summarizes evicted messages with tool names and file paths +2. **`src/durable-objects/task-processor.ts`** — Wired new module: + - `estimateTokens()` method now delegates to `context-budget.estimateTokens()` + - `compressContext()` method now delegates to `compressContextBudgeted(messages, MAX_CONTEXT_TOKENS, keepRecent)` + - Old inline implementations replaced with clean single-line delegations +3. **`src/durable-objects/context-budget.test.ts`** (NEW) — 28 comprehensive tests covering: + - String token estimation (empty, English, code, large strings) + - Message token estimation (simple, tool_calls, ContentPart[], null, reasoning) + - Total token estimation (empty, sum, realistic conversation) + - Budgeted compression (under budget, too few, always-keep, recent, summary, tool pairing, orphans, large conversations, priority ordering, deduplication, null content, minRecent parameter) + +### Files Modified +- `src/durable-objects/context-budget.ts` (new) +- `src/durable-objects/context-budget.test.ts` (new) +- `src/durable-objects/task-processor.ts` + +### Tests +- [x] Tests pass (717 total, 0 failures — 28 new) +- [x] Typecheck passes + +### Notes for Next Session +- The `estimateTokens` heuristic is still approximate (chars/4 + adjustments). Phase 4.2 will replace it with a real tokenizer. +- `compressContextBudgeted` is a pure function and can be tested/benchmarked independently. 
+- All existing task-processor tests continue to pass — the new compression is backward-compatible. +- Next: Phase 2.4 (Acontext dashboard link) or Phase 4.2 (actual tokenizer) + +--- + +## Session: 2026-02-18 | Phase 2.5.9 Holiday Awareness (Session: 01SE5WrUuc6LWTmZC8WBXKY4) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/implement-p1-guardrails-DcOgI` +**Status:** Completed + +### Summary +Implemented Phase 2.5.9 — Holiday Awareness using the Nager.Date API. Added a `fetchBriefingHolidays` function that reverse-geocodes the user's location to determine the country code, queries Nager.Date for public holidays, and displays a holiday banner in the daily briefing. Supports 100+ countries with local name display. + +### Changes Made +1. **`fetchBriefingHolidays()`** — reverse geocode → country code → Nager.Date API → filter today's holidays → format with local names +2. **`generateDailyBriefing`** — added holiday fetch to parallel Promise.allSettled, holiday banner inserted before Weather section +3. 
**9 new tests** — 7 unit tests for fetchBriefingHolidays (success, empty, geocode failure, no country, API error, local name skip, multiple holidays) + 2 integration tests for briefing with/without holidays + +### Files Modified +- `src/openrouter/tools.ts` — fetchBriefingHolidays + NagerHoliday type + briefing integration +- `src/openrouter/tools.test.ts` — 9 new tests + +### Tests +- [x] Tests pass (689 total, 0 failures) +- [x] Typecheck passes + +### Notes for Next Session +- Holiday data cached implicitly via the briefing cache (15-minute TTL) +- Non-blocking: if Nager.Date or reverse geocode fails, holiday section is simply omitted +- Next: Phase 4.1 (token-budgeted retrieval) or Phase 2.4 (Acontext dashboard link) + +--- + +## Session: 2026-02-18 | Phase 2.3 Acontext Observability (Session: 01SE5WrUuc6LWTmZC8WBXKY4) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/implement-p1-guardrails-DcOgI` +**Status:** Completed + +### Summary +Implemented Phase 2.3 — Acontext Observability Integration. Built a lightweight fetch-based REST client (not using the npm SDK due to zod@4 + Node.js API incompatibilities with Workers), wired it through TaskRequest and all 6 dispatch sites in handler.ts, added session storage at task completion in the Durable Object, and added /sessions Telegram command. + +### Changes Made +1. **`src/acontext/client.ts`** (NEW) — Lightweight Acontext REST client: AcontextClient class (CRUD sessions/messages), createAcontextClient factory, toOpenAIMessages converter (handles ContentPart[]), formatSessionsList for Telegram display +2. **`src/types.ts`** — Added ACONTEXT_API_KEY and ACONTEXT_BASE_URL to MoltbotEnv +3. **`src/durable-objects/task-processor.ts`** — Added acontextKey/acontextBaseUrl to TaskRequest, Acontext session storage at task completion (creates session, stores messages, logs metadata) +4. 
**`src/telegram/handler.ts`** — Added acontextKey/acontextBaseUrl properties, constructor params, /sessions command, help text entry, all 6 TaskRequest sites updated +5. **`src/routes/telegram.ts`** — Pass env.ACONTEXT_API_KEY + env.ACONTEXT_BASE_URL to handler factory, added acontext_configured to /info endpoint +6. **`src/acontext/client.test.ts`** (NEW) — 24 tests covering client methods, factory, toOpenAIMessages, formatSessionsList + +### Files Modified +- `src/acontext/client.ts` (new) +- `src/acontext/client.test.ts` (new) +- `src/types.ts` +- `src/durable-objects/task-processor.ts` +- `src/telegram/handler.ts` +- `src/routes/telegram.ts` + +### Tests +- [x] Tests pass (680 total, 0 failures) +- [x] Typecheck passes + +### Notes for Next Session +- Phase 2.3 is complete — Acontext sessions will be created after each DO task completion +- Graceful degradation: no API key = no Acontext calls (null client pattern) +- Next: Phase 2.5.9 (Holiday awareness) or Phase 4.1 (token-budgeted retrieval) + +--- + +## Session: 2026-02-18 | P1 Guardrails + /learnings Command (Session: 01SE5WrUuc6LWTmZC8WBXKY4) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/implement-p1-guardrails-DcOgI` +**Status:** Completed + +### Summary +Implemented P1 guardrails from the audit-build-improvement-plan: Task Router policy function for model routing on resume, source-grounding guardrails to prevent hallucination, automated confidence labeling for coding tasks, and the /learnings Telegram command (Phase 3.3). + +### Changes Made +1. **Task Router policy function** (`resolveTaskModel`) — single source of truth for resume model selection with /dcode and free model stall detection +2. **`detectTaskIntent()`** — reusable coding/reasoning/general classifier +3. **Source-grounding guardrail** (`SOURCE_GROUNDING_PROMPT`) — evidence rules injected into system message for coding tasks +4. 
**Automated confidence labeling** — High/Medium/Low appended to coding task responses based on tool evidence +5. **`formatLearningSummary()`** — analytics view with success rate, categories, top tools, top models, recent tasks +6. **`/learnings` command** — Telegram handler + help text +7. **Refactored `resolveResumeModel`** — now delegates to Task Router + +### Files Modified +- `src/openrouter/models.ts` — Task Router, detectTaskIntent, RouterCheckpointMeta, RoutingDecision types +- `src/openrouter/learnings.ts` — formatLearningSummary, formatAge +- `src/durable-objects/task-processor.ts` — SOURCE_GROUNDING_PROMPT, confidence labeling +- `src/telegram/handler.ts` — /learnings command, resolveResumeModel refactor, import updates +- `src/openrouter/models.test.ts` — 16 new tests for resolveTaskModel + detectTaskIntent +- `src/openrouter/learnings.test.ts` — 14 new tests for formatLearningSummary + +### Tests +- [x] Tests pass (656 total, 0 failures) +- [x] Typecheck passes + +### Notes for Next Session +- Audit plan Phase 2 (hallucination reduction) quick wins are now implemented +- Phase 3.3 (/learnings) is complete +- Next: Phase 2.3 (Acontext integration) or Phase 2.5.9 (Holiday awareness) + +--- + +## Session: 2026-02-11 | Phase 3.2: Structured Task Phases (Session: 019jH8X9pJabGwP2untYhuYE) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/add-task-phases-4R9Q6` +**Status:** Completed + +### Summary +Implemented Phase 3.2 (Structured Task Phases). Long-running Durable Object tasks now go through three structured phases: Plan → Work → Review. Phase-aware prompts guide the model at each stage, phase transitions are tracked in TaskState, and Telegram progress updates show the current phase. + +### Changes Made +1. **`TaskPhase` type** — New exported type: `'plan' | 'work' | 'review'` +2. **TaskState fields** — Added `phase` and `phaseStartIteration` to the interface +3. 
**Plan phase** — Injects `[PLANNING PHASE]` prompt as user message for fresh tasks; skipped on checkpoint resume +4. **Plan → Work transition** — After first API response (iteration 1), regardless of tool calls +5. **Work → Review transition** — When model stops calling tools AND `toolsUsed.length > 0`; injects `[REVIEW PHASE]` prompt for one more iteration +6. **Simple task handling** — Tasks with no tools skip review gracefully (phase ends at 'work') +7. **Progress messages** — Updated to show phase: "Planning...", "Working...", "Reviewing..." +8. **Checkpoint persistence** — Phase included in R2 checkpoint saves and restored on resume +9. **8 new tests** — Phase type, initialization, plan→work→review transitions, simple task skip, review prompt injection, "Planning..." status message, phase in R2 checkpoints + +### Files Modified +- `src/durable-objects/task-processor.ts` (phase type, TaskState fields, prompt injection, transitions, progress messages, checkpoint persistence) +- `src/durable-objects/task-processor.test.ts` (NEW — 8 tests) +- `claude-share/core/GLOBAL_ROADMAP.md` +- `claude-share/core/WORK_STATUS.md` +- `claude-share/core/next_prompt.md` +- `claude-share/core/claude-log.md` + +### Tests +- [x] 456 tests pass (8 new, 448 existing) +- [x] TypeScript: only pre-existing errors (request.prompt, parse_mode) + +### Notes for Next Session +- Phase 3.3 (/learnings Telegram command) is next +- Phase 2.3 (Acontext integration) is unblocked — API key configured +- The phase system adds ~1 extra API call per tool-using task (review phase) + +--- + +## Session: 2026-02-11 | UX Fixes + /start Redesign + Acontext Key (Session: 018gmCDcuBJqs9ffrrDHHBBd) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/extract-task-metadata-8lMCM` +**Status:** Completed + +### Summary +Full session covering: auto-resume counter bug fix, GLM free tool revert, /start redesign with feature buttons, bot menu commands, enhanced R2 skill prompt, briefing weather location, news clickable 
links, and crypto symbol disambiguation. Also guided user through Acontext API key setup (now configured in Cloudflare). + +### Changes Made +1. **Auto-resume counter bug** — Counter persisted across different tasks (18→22 on new task). Fixed by checking taskId match before inheriting autoResumeCount from DO storage. +2. **GLM free tool flag reverted** — Live testing confirmed GLM 4.5 Air free tier doesn't generate tool_calls. Removed supportsTools from glmfree. +3. **/start redesign** — Inline keyboard with 8 feature buttons (Coding, Research, Images, Tools, Vision, Reasoning, Pick Model, All Commands). Each button shows detailed guide with examples and model recommendations. +4. **Bot menu commands** — Added setMyCommands to TelegramBot. 12 commands registered during /setup. +5. **Enhanced R2 skill prompt** — Storia identity, model recommendations by task, stronger tool-first behavior. +6. **Briefing location** — Reverse geocodes coordinates via Nominatim for city/country name in weather section. +7. **News clickable links** — HN article URLs, Reddit permalinks, arXiv paper URLs in briefing items. +8. **Crypto symbol fix** — Search with limit=5, filter exact symbol matches, pick highest market cap. Fixes JUP returning wrong token ($3.58 vs actual $0.14). +9. **Acontext API key** — Guided user through setup, now configured as Cloudflare Workers secret. + +### Files Modified +- `src/durable-objects/task-processor.ts` (auto-resume counter taskId check) +- `src/openrouter/models.ts` (GLM free supportsTools revert) +- `src/openrouter/models.test.ts` (updated GLM tests) +- `src/openrouter/tools.ts` (briefing location, news links, crypto disambiguation) +- `src/telegram/handler.ts` (sendStartMenu, getStartFeatureText, handleStartCallback, setMyCommands) +- `src/routes/telegram.ts` (register commands during setup) +- `claude-share/R2/skills/storia-orchestrator/prompt.md` (enhanced skill prompt) + +### Tests +448 total (all passing). 
No new TypeScript errors (pre-existing only). + +### Notes for Next Session +- Acontext API key is now in Cloudflare — Phase 2.3/4.1 unblocked +- After merging, hit `/telegram/setup` endpoint once to register the new bot menu commands +- Upload `claude-share/R2/skills/storia-orchestrator/prompt.md` to R2 bucket +- Phase 6.1 (inline buttons) is effectively done + +--- + +## Session: 2026-02-11 | /start Redesign + Bot Menu + Skill Prompt (Session: 018gmCDcuBJqs9ffrrDHHBBd) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/extract-task-metadata-8lMCM` +**Status:** Completed + +### Summary +Redesigned /start landing page with inline keyboard feature buttons, added Telegram bot menu commands, and enhanced R2 skill prompt. + +### Changes Made +1. **/start redesign** — Replaced plain text with inline keyboard: 8 feature buttons (Coding, Research, Images, Tools, Vision, Reasoning, Pick Model, All Commands). Each button sends a detailed guide with examples and model recommendations. Navigation with Back to Menu button. +2. **Bot menu commands** — Added `setMyCommands` to TelegramBot class. 12 commands registered during `/setup`: start, help, pick, models, new, img, briefing, costs, status, saves, ar, credits. +3. **Enhanced R2 skill prompt** — Added Storia identity, model recommendation guidance by task type, stronger tool-first behavior, removed filler instructions. + +### Files Modified +- `src/telegram/handler.ts` (sendStartMenu, getStartFeatureText, handleStartCallback, setMyCommands) +- `src/routes/telegram.ts` (register commands during setup) +- `claude-share/R2/skills/storia-orchestrator/prompt.md` (enhanced skill prompt) + +### Tests +448 total (all passing). No new TypeScript errors. 
+ +--- + +## Session: 2026-02-10 | Bug Fixes from Live Testing (Session: 018gmCDcuBJqs9ffrrDHHBBd) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/extract-task-metadata-8lMCM` +**Status:** Completed + +### Summary +Fixed 2 bugs discovered during live Telegram testing of the 6 bot improvements. + +### Changes Made +1. **Auto-resume counter bug** — Counter persisted across different tasks (went 18→22 on a new task). Fixed by checking `taskId` match before inheriting `autoResumeCount` from DO storage. +2. **GLM free tool flag reverted** — Live testing confirmed GLM 4.5 Air free tier doesn't actually generate tool_calls (logged `simple_chat, 0 unique tools`). Removed `supportsTools: true` from `glmfree`. Paid GLM 4.7 still has tools enabled. + +### Files Modified +- `src/durable-objects/task-processor.ts` (taskId check for counter reset) +- `src/openrouter/models.ts` (revert GLM free supportsTools) +- `src/openrouter/models.test.ts` (updated GLM tests) + +### Tests +448 total (all passing) + +--- + +## Session: 2026-02-10 | 6 Bot Improvements from Telegram Analysis (Session: 018gmCDcuBJqs9ffrrDHHBBd) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/extract-task-metadata-8lMCM` +**Status:** Completed + +### Summary +Analyzed real Telegram conversation logs and implemented 6 targeted bot improvements addressing tool-use reliability, error handling, cross-task context, runaway task prevention, and prompt quality. + +### Changes Made +1. **GLM `supportsTools` flag** — Added missing `supportsTools: true` to `glmfree` model (later reverted — see next session). +2. **402 error handling** — Fail fast on quota exceeded (HTTP 402), auto-rotate to a free model, show helpful user-facing message. +3. **Cross-task context** — Store last task summary in R2 after completion, inject into next task's system prompt with 1-hour TTL for continuity. +4. **Elapsed time cap** — 15 min for free models, 30 min for paid. Prevents runaway auto-resume loops in Durable Objects. +5. 
**Tool-intent detection** — Warn users when their message likely needs tools but their selected model doesn't support them. +6. **Parallel tool-call prompt** — Stronger instruction for models with `parallelCalls` flag to encourage concurrent tool execution. + +### Files Modified +- `src/openrouter/models.ts` (GLM supportsTools flag) +- `src/openrouter/client.ts` (402 handling, parallel prompt) +- `src/durable-objects/task-processor.ts` (elapsed time cap, cross-task context, 402 rotation) +- `src/telegram/handler.ts` (tool-intent warning, cross-task injection) +- Various test files (33 new tests) +- `claude-share/core/*.md` (sync docs) + +### Tests +- [x] 447 tests pass (33 new) +- [x] TypeScript: only pre-existing errors + +### Notes for Next Session +- Phase 3.2 (Structured task phases) is next +- Cross-task context quality should be observed over real usage +- Time cap values (15/30 min) may need tuning based on real workloads + +--- + +## Session: 2026-02-10 | Phase 3.1: Compound Learning Loop (Session: 018gmCDcuBJqs9ffrrDHHBBd) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/extract-task-metadata-8lMCM` +**Status:** Completed + +### Summary +Implemented Phase 3.1 (Compound Learning Loop). After each completed Durable Object task, structured metadata (tools used, model, iterations, success/failure, category, duration) is extracted and stored in R2. Before new tasks, relevant past patterns are retrieved and injected into the system prompt to improve future tool selection and execution strategy. + +### Changes Made +1. 
**`src/openrouter/learnings.ts`** (NEW) — Complete learning extraction, storage, and retrieval module: + - `TaskCategory` type (7 categories: web_search, github, data_lookup, chart_gen, code_exec, multi_tool, simple_chat) + - `TaskLearning` interface — structured metadata per task + - `LearningHistory` interface — per-user history stored in R2 + - `categorizeTask()` — Categorizes tasks based on tools used, with dominant-category logic for mixed tool usage + - `extractLearning()` — Extracts structured metadata from completed task parameters + - `storeLearning()` — Stores to R2 at `learnings/{userId}/history.json`, caps at 50 entries + - `loadLearnings()` — Loads user's learning history from R2 + - `getRelevantLearnings()` — Scores past learnings by keyword overlap, category hints, recency, and success; only applies bonuses when base relevance exists + - `formatLearningsForPrompt()` — Concise prompt format with tool strategies + +2. **`src/durable-objects/task-processor.ts`** — Learning extraction on task completion: + - After successful completion: extracts learning with `success: true` and stores to R2 + - After failure (with iterations > 0): extracts learning with `success: false` and stores to R2 + - Both paths are failure-safe (try/catch, non-blocking) + +3. **`src/telegram/handler.ts`** — Learning injection before new tasks: + - Added `r2Bucket` property to TelegramHandler for direct R2 access + - Added `getLearningsHint()` helper method — loads history, finds relevant patterns, formats for prompt + - Injects learnings into system prompt in `handleChat()` (text messages) + - Injects learnings into system prompt in `handleVision()` (image + tool path) + +4. 
**`src/openrouter/learnings.test.ts`** (NEW) — 36 comprehensive tests: + - `categorizeTask` (10 tests): all categories, mixed tools, unknown tools + - `extractLearning` (4 tests): correct fields, truncation, simple chat, failure + - `storeLearning` (4 tests): new history, append, cap at 50, R2 error handling + - `loadLearnings` (3 tests): null, parsed, JSON error + - `getRelevantLearnings` (7 tests): empty, keyword match, category hints, recency, success, filtering, limits + - `formatLearningsForPrompt` (8 tests): empty, single, failed, multiple, truncation, no-tools, strategy hint + +### Files Modified +- `src/openrouter/learnings.ts` (NEW — learning extraction, storage, retrieval) +- `src/openrouter/learnings.test.ts` (NEW — 36 tests) +- `src/durable-objects/task-processor.ts` (learning extraction on completion/failure) +- `src/telegram/handler.ts` (learning injection into system prompt) +- `claude-share/core/*.md` (all sync docs) + +### Tests +- [x] 388 tests pass (36 new) +- [x] TypeScript: only pre-existing errors + +### Notes for Next Session +- Phase 3.2 (Structured task phases) is next +- Consider adding `/learnings` Telegram command (Phase 3.3) to view past patterns +- Learning data quality should be reviewed after 20+ tasks (Human Checkpoint 3.5) + +--- + +## Session: 2026-02-09 | Phase 1.5: Structured Output Support (Session: 013wvC2kun5Mbr3J81KUPn99) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/daily-briefing-aggregator-NfHhi` +**Status:** Completed + +### Summary +Implemented Phase 1.5 (Structured Output Support). Users can now prefix messages with `json:` to request structured JSON output from compatible models. The `response_format: { type: "json_object" }` is injected into API requests for models with `structuredOutput: true` metadata. This completes all of Phase 1 (Tool-Calling Optimization). + +### Changes Made +1. **`ResponseFormat` type** in `client.ts` — supports `text`, `json_object`, and `json_schema` (with name, strict, schema fields). 
Added `response_format` to `ChatCompletionRequest`. + +2. **`parseJsonPrefix()`** in `models.ts` — strips `json:` prefix from messages (case-insensitive), returns `{ requestJson, cleanMessage }`. Similar pattern to `parseReasoningOverride()` for `think:` prefix. + +3. **`supportsStructuredOutput()`** in `models.ts` — checks if a model alias has `structuredOutput: true` metadata. 7 models supported: gpt, mini, gptoss, deep, mistrallarge, flash, geminipro. + +4. **Client methods updated** — `responseFormat` option added to `chatCompletion()`, `chatCompletionWithTools()`, and `chatCompletionStreamingWithTools()`. Only injected when explicitly provided. + +5. **Handler integration** — `handleChat()` parses `json:` prefix after `think:` prefix, determines `responseFormat` based on model support, passes through DO TaskRequest and fallback paths. Updated `/help` with `json:` prefix hint. + +6. **DO passthrough** — `responseFormat` added to `TaskRequest` and `TaskState` interfaces. Persists across alarm auto-resume. Passed to both OpenRouter streaming and non-OpenRouter fetch paths. + +7. **22 new tests** in `structured-output.test.ts` — prefix parsing (8 tests), model support checks (3), ResponseFormat type (3), ChatCompletionRequest serialization (2), client integration (4), prefix combination with think: (2). 
+ +### Files Modified +- `src/openrouter/client.ts` (ResponseFormat type, response_format in request, all 3 methods) +- `src/openrouter/models.ts` (parseJsonPrefix, supportsStructuredOutput) +- `src/telegram/handler.ts` (json: prefix parsing, responseFormat injection, /help update) +- `src/durable-objects/task-processor.ts` (responseFormat in TaskRequest/TaskState, streaming + fetch paths) +- `src/openrouter/structured-output.test.ts` (NEW — 22 tests) +- `claude-share/core/*.md` (all sync docs) + +### Test Results +- 258 tests pass (22 new) +- TypeScript: only pre-existing errors + +--- + +## Session: 2026-02-09 | Phase 1.4: Vision + Tools + /help Update (Session: 013wvC2kun5Mbr3J81KUPn99) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/daily-briefing-aggregator-NfHhi` +**Status:** Completed + +### Summary +Implemented Phase 1.4 (Combine Vision + Tools). Vision messages now route through the tool-calling path for tool-supporting models, enabling models like GPT-4o to use all 12 tools while analyzing images. Also updated `/help` to reflect all current capabilities. + +### Changes Made +1. **Unified vision+tools routing** in `handleVision()` — builds `ContentPart[]` message (text + image_url) and routes through DO or direct tool-calling path for tool-supporting models. Non-tool models still use simple `chatCompletionWithVision()`. + +2. **Updated `/help` command** — now shows all 12 tools, vision+tools capability, `think:` prefix hint, and correct model descriptions. + +3. **6 new tests** in `vision-tools.test.ts` — verifying multimodal message structure, JSON serialization, tools in request alongside vision content, and tool calls triggered by vision analysis. 
+ +### Files Modified +- `src/telegram/handler.ts` (vision+tools routing + /help update) +- `src/openrouter/vision-tools.test.ts` (NEW — 6 tests) +- `claude-share/core/*.md` (all sync docs) + +### Test Results +- 236 tests pass (6 new) +- TypeScript: only pre-existing errors + +--- + +## Session: 2026-02-08 | Phase 2.5.6+2.5.8: Crypto + Geolocation Tools (Session: 013wvC2kun5Mbr3J81KUPn99) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/daily-briefing-aggregator-NfHhi` +**Status:** Completed + +### Summary +Implemented Phase 2.5.6 (Crypto expansion) and Phase 2.5.8 (Geolocation from IP) as two new tools. This completes the entire Phase 2.5 (Free API Integration) — all 8 tools shipped. + +### Changes Made +1. **`get_crypto` tool** — 3 actions: + - `price`: Single coin data from CoinCap + CoinPaprika (ATH, multi-timeframe % changes). Uses `Promise.allSettled()` for graceful partial failures. + - `top`: Top N coins by market cap via CoinCap (max 25). + - `dex`: DEX pair search via DEX Screener, sorted by liquidity, top 5 results. + - 5-minute cache per query. Helper functions: `formatLargeNumber()`, `formatPrice()`. + +2. **`geolocate_ip` tool** — ipapi.co integration returning city, region, country, coordinates, timezone, ISP/org. IPv4+IPv6 support, input validation, 15-minute cache. + +3. **18 new tests** (11 crypto + 7 geo) — 230 total passing. + +### Files Modified +- `src/openrouter/tools.ts` (2 new tool definitions + handlers + caches) +- `src/openrouter/tools.test.ts` (18 new tests) +- `claude-share/core/*.md` (all sync docs updated) + +### Test Results +- 230 tests pass (18 new) +- TypeScript: only pre-existing errors + +--- + +## Session: 2026-02-08 | BUG-1, BUG-2, BUG-5 Fixes (Session: 013wvC2kun5Mbr3J81KUPn99) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/daily-briefing-aggregator-NfHhi` +**Status:** Completed + +### Summary +Fixed all 3 remaining bugs from the live testing session. All 5 bugs (BUG-1 through BUG-5) are now resolved. 
+ +### Changes Made +1. **BUG-1 (Low/UX):** Changed "Processing complex task..." to "Thinking..." in `task-processor.ts:501`. The old message was misleading for simple queries that happen to use tool-supporting models. + +2. **BUG-2 (Medium):** Added tool usage instruction to the system prompt in `handler.ts` for tool-supporting models. The prompt now tells models: "You have access to tools... Use them proactively when a question could benefit from real-time data, external lookups, or verification." This encourages DeepSeek and other models to actually invoke tools instead of guessing from training data. + +3. **BUG-5 (Low):** Added `isImageGenModel()` check at the start of `handleChat()` in `handler.ts`. When a user's model is image-gen-only (e.g., fluxpro), the bot now sends a helpful message ("Model /fluxpro is image-only. Use /img to generate images.") and falls back to the default text model. + +### Files Modified +- `src/durable-objects/task-processor.ts` (BUG-1: status message text) +- `src/telegram/handler.ts` (BUG-2: tool hint in system prompt; BUG-5: image-gen model fallback) + +### Test Results +- 212 tests pass (no new tests needed — these are behavioral/UX fixes) +- TypeScript: only pre-existing errors + +--- + +## Session: 2026-02-08 | Phase 2.1+2.2: Token/Cost Tracking + /costs command (Session: 013wvC2kun5Mbr3J81KUPn99) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/daily-briefing-aggregator-NfHhi` +**Status:** Completed + +### Summary +Implemented Phase 2.1 (Token/Cost Tracking) and Phase 2.2 (/costs Telegram command). Per-request token usage is now extracted from OpenRouter API responses, cost calculated using model pricing data, and accumulated per-user per-day. Response footers show cost info, and users can query their usage via `/costs` (today) or `/costs week` (7-day breakdown). + +### Changes Made +1. 
**New `src/openrouter/costs.ts`** — Core cost tracking module with: + - `parseModelPricing()` — parses model cost strings ("$0.25/$0.38", "FREE", "$0.014/megapixel") + - `calculateCost()` — calculates per-call cost from model pricing catalog + - `recordUsage()` / `getUsage()` / `getUsageRange()` — in-memory per-user daily usage store + - `formatUsageSummary()` / `formatWeekSummary()` / `formatCostFooter()` — Telegram display formatters + - `clearUsageStore()` — test helper + +2. **Modified `src/durable-objects/task-processor.ts`** — Track usage per API call iteration, accumulate across multi-iteration tool-calling loops, append cost footer to final response. Added `usage` type to result variable for type safety. + +3. **Modified `src/telegram/handler.ts`** — Added `/costs` and `/usage` command aliases, `handleCostsCommand` method, help text entry. + +4. **New `src/openrouter/costs.test.ts`** — 26 tests covering pricing parser, cost calculator, usage recording/retrieval, formatting, and cleanup. + +### Files Modified +- `src/openrouter/costs.ts` (NEW) +- `src/openrouter/costs.test.ts` (NEW — 26 tests) +- `src/durable-objects/task-processor.ts` (usage tracking + cost footer + type fix) +- `src/telegram/handler.ts` (/costs command + help text) +- `claude-share/core/*.md` (all sync docs updated) + +### Test Results +- 212 tests pass (26 new) +- TypeScript: only pre-existing errors (parse_mode, request.prompt) + +--- + +## Session: 2026-02-08 | Phase 2.5.4: Currency Conversion + Phase 2.5.7 + BUG-3/BUG-4 Fixes (Session: 013wvC2kun5Mbr3J81KUPn99) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/daily-briefing-aggregator-NfHhi` +**Status:** Completed + +### Summary +Implemented Phase 2.5.4 (Currency Conversion Tool), Phase 2.5.7 (Daily Briefing Aggregator), and fixed two high/medium priority bugs (BUG-3 and BUG-4) from the live testing session. + +### Changes Made +1. 
**BUG-4 Fix (High): `/img` image generation** — Changed `modalities: ['image', 'text']` to `modalities: ['image']` in `generateImage()`. FLUX models are image-only and don't support text output modality. OpenRouter returns "No endpoints found" when text modality is requested for image-only models. + +2. **BUG-3 Fix (Medium): `think:` override through DO path** — Added `reasoningLevel` field to `TaskRequest` interface in `task-processor.ts`. Passed from `handler.ts` when creating TaskRequest. Stored in `TaskState` for persistence across alarm auto-resume. Injected into `chatCompletionStreamingWithTools()` options. Imported `getReasoningParam`, `detectReasoningLevel`, `ReasoningLevel` in task-processor. + +3. **Phase 2.5.7: `/briefing` command** — New `generateDailyBriefing()` function in `tools.ts` that: + - Calls weather (Open-Meteo), HackerNews (top 5), Reddit (top 3), arXiv (latest 3) in parallel via `Promise.allSettled()` + - Formats as clean Telegram message with emoji section headers + - Caches results for 15 minutes (module-level `briefingCache`) + - Handles partial failures gracefully (failed sections show "Unavailable" while others display normally) + - Configurable: lat/lon, subreddit, arXiv category as command args + - Commands: `/briefing` and `/brief` aliases + +4. **6 new tests** covering all sections, custom parameters, caching, partial failures, total failures, cache clearing. + +5. **Phase 2.5.4: `convert_currency` tool** — New tool using ExchangeRate-API (free, no auth). Supports 150+ currencies, validates 3-letter codes, caches exchange rates for 30 minutes per source currency. Format: "100 USD = 85.23 EUR (rate: 0.8523)". 14 new tests. 
+ +### Files Modified +- `src/openrouter/client.ts` (BUG-4: modalities fix) +- `src/durable-objects/task-processor.ts` (BUG-3: reasoningLevel in TaskRequest/TaskState) +- `src/telegram/handler.ts` (BUG-3: pass reasoningLevel; Phase 2.5.7: /briefing command + help text) +- `src/openrouter/tools.ts` (Phase 2.5.4: convert_currency + Phase 2.5.7: generateDailyBriefing + helpers + caches) +- `src/openrouter/tools.test.ts` (14 currency + 6 briefing = 20 new tests) +- `claude-share/core/*.md` (all sync docs updated) + +### Tests +- [x] All 186 tests pass (14 new currency + 6 new briefing, 66 total in tools.test.ts) +- [x] Typecheck: no new errors (pre-existing errors unchanged) + +### Notes for Next Session +- BUG-3 and BUG-4 now fixed. Remaining bugs: BUG-1 (UX), BUG-2 (DeepSeek tool prompting), BUG-5 (fluxpro text UX) +- Next priorities: Phase 2.1 (Token/cost tracking), remaining bugs +- `/briefing` defaults to Prague coordinates — user can customize via args +- Tool count: 10 (was 9) + +--- + +## Session: 2026-02-08 | Live Testing & Bug Documentation (Session: 01Wjud3VHKMfSRbvMTzFohGS) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/review-moltworker-roadmap-q5aqD` +**Status:** Completed + +### Summary +User performed live testing of the deployed bot on Telegram. Tested reasoning control (Phase 1.3), tool usage, and image generation. Discovered 5 bugs documented as BUG-1 through BUG-5. All documentation files updated with findings. + +### Testing Results +1. **Reasoning auto-detect** — Working correctly: + - "hello" (DeepSeek) → ~10s, reasoning off + - "implement fibonacci" → ~30s, reasoning medium + - "analyze pros and cons" → ~42s, reasoning high +2. **think: override** — Working on direct path: + - "think:high what is 2+2?" → ~15s, forced high + - "think:off research quantum computing" → ~29s, forced off +3. **Tool usage** — Model-dependent behavior: + - DeepSeek: "what's trending on hacker news?" 
→ used web search, NOT fetch_news tool + - DeepSeek: explicit "use the fetch_news tool" → worked, 8 tool calls, 72s + - Grok: same query → immediately used fetch_news, 12s, 2 iterations +4. **Image generation** — Broken: + - `/img a cat wearing a top hat` → "No endpoints found that support output modalities: image, text" + - `/use fluxpro` + text → "No response generated" + +### Bugs Found +| ID | Issue | Severity | Location | +|----|-------|----------|----------| +| BUG-1 | "Processing complex task..." shown for ALL messages | Low/UX | `task-processor.ts:476` | +| BUG-2 | DeepSeek doesn't proactively use tools | Medium | Model behavior | +| BUG-3 | `think:` override not passed through DO path | Medium | `handler.ts` → `task-processor.ts` | +| BUG-4 | `/img` fails — modalities not supported | High | `client.ts:357` | +| BUG-5 | `/use fluxpro` + text → "No response" | Low | `handler.ts` | + +### Files Modified +- `claude-share/core/GLOBAL_ROADMAP.md` (bug fixes section + changelog) +- `claude-share/core/WORK_STATUS.md` (bug tracking + priorities) +- `claude-share/core/SPECIFICATION.md` (known issues section) +- `claude-share/core/claude-log.md` (this entry) +- `claude-share/core/next_prompt.md` (bug context for next session) + +### Tests +- [x] No code changes in this update +- [x] Documentation only + +### Notes for Next Session +- BUG-4 (image gen) is highest priority — may be an OpenRouter API change +- BUG-3 (think: passthrough) needs `TaskRequest` interface update +- BUG-2 (DeepSeek tools) could be addressed with system prompt hints +- BUG-1 and BUG-5 are UX polish items + +--- + +## Session: 2026-02-08 | Phase 1.3: Configurable Reasoning (Session: 01Wjud3VHKMfSRbvMTzFohGS) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/review-moltworker-roadmap-q5aqD` +**Status:** Completed + +### Summary +Implemented Phase 1.3: Configurable reasoning per model. 
Models with `reasoning: 'configurable'` metadata (DeepSeek V3.2, Grok 4.1, Gemini 3 Flash, Gemini 3 Pro) now get provider-specific reasoning parameters injected into API requests. Auto-detection selects reasoning level based on task type (off for simple Q&A, medium for coding/tools, high for research). Users can override via `think:LEVEL` message prefix. + +### Changes Made +1. **Reasoning types and utilities** (`models.ts`) — `ReasoningLevel`, `ReasoningParam` types; `getReasoningParam()` maps level to provider format (DeepSeek/Grok: `{enabled}`, Gemini: `{effort}`); `detectReasoningLevel()` auto-detects from message content; `parseReasoningOverride()` parses `think:LEVEL` prefix +2. **Client integration** (`client.ts`) — Added `reasoning` field to `ChatCompletionRequest`; injected reasoning into `chatCompletion()`, `chatCompletionWithTools()` (upgrades 'off' to 'medium' for tool-use), and `chatCompletionStreamingWithTools()`; all methods accept `reasoningLevel` option +3. **Telegram handler** (`handler.ts`) — Parses `think:LEVEL` prefix from user messages, passes to client methods, saves cleaned message to history +4. **36 tests** (`reasoning.test.ts`) — `getReasoningParam` per model type, `detectReasoningLevel` for simple/coding/research, `parseReasoningOverride` edge cases, client injection verification + +### Files Modified +- `src/openrouter/models.ts` (reasoning types + 4 utility functions) +- `src/openrouter/client.ts` (reasoning injection in 3 methods) +- `src/telegram/handler.ts` (think: prefix parsing) +- `src/openrouter/reasoning.test.ts` (36 new tests) +- `claude-share/core/GLOBAL_ROADMAP.md` +- `claude-share/core/WORK_STATUS.md` +- `claude-share/core/claude-log.md` +- `claude-share/core/next_prompt.md` + +### Tests +- [x] All 166 tests pass (36 new reasoning tests) +- [x] Typecheck: no new errors (pre-existing errors unchanged) + +### Notes for Next Session +- Phase 1.3 complete. Tool-calling optimization now done (Phase 1.1-1.3). 
+- Next: Phase 2.5.7 (Daily briefing), Phase 2.5.4 (Currency conversion), Phase 2.1 (Token/cost tracking) + +--- + +## Session: 2026-02-08 | Phase 2.5.5: News Feeds Tool (Session: 01Wjud3VHKMfSRbvMTzFohGS) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/review-moltworker-roadmap-q5aqD` +**Status:** Completed + +### Summary +Implemented Phase 2.5.5: new `fetch_news` tool supporting three free news sources — HackerNews (Firebase API), Reddit (JSON API), and arXiv (Atom XML). Each source returns top 10 stories with title, URL, score/points, and author info. Supports configurable subreddit (Reddit) and category (arXiv) via optional `topic` parameter. + +### Changes Made +1. **New `fetch_news` tool definition** — Added to `AVAILABLE_TOOLS` with `source` (enum: hackernews/reddit/arxiv) and optional `topic` parameters +2. **Execution dispatcher** — `fetchNews()` validates source and routes to appropriate handler +3. **HackerNews handler** — `fetchHackerNews()` fetches top 10 IDs then parallel-fetches each item via `Promise.all()` +4. **Reddit handler** — `fetchReddit()` parses JSON listing response with configurable subreddit (default: technology) +5. **arXiv handler** — `fetchArxiv()` parses Atom XML via regex, extracts title/id/summary/authors with summary truncation at 150 chars +6. **Typed interfaces** — `HNItem`, `RedditListing` for API response shapes +7. **14 new tests** — Tool presence, invalid source, HN success + API error + failed items, Reddit default + custom subreddit + API error, arXiv default + custom category + API error + empty results + long summary truncation +8. 
**Documentation updates** — All core docs updated + +### Files Modified +- `src/openrouter/tools.ts` (tool definition + 3 source handlers) +- `src/openrouter/tools.test.ts` (14 new tests) +- `claude-share/core/GLOBAL_ROADMAP.md` +- `claude-share/core/WORK_STATUS.md` +- `claude-share/core/SPECIFICATION.md` +- `claude-share/core/next_prompt.md` +- `claude-share/core/claude-log.md` + +### Tests +- [x] All 130 tests pass (14 new for fetch_news + 11 get_weather + 12 generate_chart + 9 url_metadata + 84 existing) +- [x] Typecheck: no new errors (pre-existing errors unchanged) + +### Notes for Next Session +- Phase 2.5.5 complete. Tool count now: 9 (was 8) +- **Next priority: Phase 1.3** — Configurable reasoning per model +- See `next_prompt.md` for ready-to-copy task prompt + +--- + +## Session: 2026-02-08 | Phase 2.5.3: Weather Tool (Session: 01Wjud3VHKMfSRbvMTzFohGS) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/review-moltworker-roadmap-q5aqD` +**Status:** Completed + +### Summary +Implemented Phase 2.5.3: new `get_weather` tool using the free Open-Meteo API. The tool fetches current weather conditions and a 7-day forecast for any lat/lon coordinates. Includes WMO weather code mapping (28 codes) for human-readable descriptions. + +### Changes Made +1. **New `get_weather` tool definition** — Added to `AVAILABLE_TOOLS` with latitude/longitude parameters +2. **Execution handler** — `getWeather()` validates coordinates, calls Open-Meteo API, formats current conditions + 7-day forecast +3. **WMO_WEATHER_CODES** — Complete mapping of 28 WMO weather interpretation codes to human-readable strings +4. **OpenMeteoResponse interface** — Typed API response for current_weather and daily arrays +5. **11 new tests** — Tool presence, success formatting, API URL construction, lat/lon validation (too high, too low, out of range, non-numeric), HTTP errors, boundary coordinates, unknown weather codes +6. 
**Documentation updates** — All core docs updated + +### Files Modified +- `src/openrouter/tools.ts` (tool definition + WMO codes + execution handler) +- `src/openrouter/tools.test.ts` (11 new tests) +- `claude-share/core/GLOBAL_ROADMAP.md` +- `claude-share/core/WORK_STATUS.md` +- `claude-share/core/SPECIFICATION.md` +- `claude-share/core/next_prompt.md` +- `claude-share/core/claude-log.md` + +### Tests +- [x] All 116 tests pass (11 new for get_weather + 12 generate_chart + 9 url_metadata + 84 existing) +- [x] Typecheck: no new errors (pre-existing errors unchanged) + +### Notes for Next Session +- Phase 2.5.3 complete. Tool count now: 8 (was 7) +- **Next priority: Phase 2.5.5** — News feeds (HN + Reddit + arXiv) +- See `next_prompt.md` for ready-to-copy task prompt + +--- + +## Session: 2026-02-08 | Phase 2.5.2: Chart Image Generation (Session: 01Wjud3VHKMfSRbvMTzFohGS) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/review-moltworker-roadmap-q5aqD` +**Status:** Completed + +### Summary +Implemented Phase 2.5.2: new `generate_chart` tool using the free QuickChart API. The tool generates Chart.js-powered PNG chart images (bar, line, pie, doughnut, radar) and returns the image URL for embedding in Telegram/Discord messages. + +### Changes Made +1. **New `generate_chart` tool definition** — Added to `AVAILABLE_TOOLS` array with type/labels/datasets parameters +2. **Execution handler** — `generateChart()` function validates chart type, parses JSON labels/datasets, constructs QuickChart URL, verifies via HEAD request +3. **Input validation** — Validates chart type against allowed set, validates labels and datasets are proper JSON arrays, rejects empty datasets +4. **12 new tests** — Tool presence, URL construction, URL encoding, HEAD verification, all 5 chart types, plus error cases (invalid type, bad JSON, empty datasets, HTTP errors) +5. 
**Documentation updates** — Updated GLOBAL_ROADMAP, WORK_STATUS, SPECIFICATION, next_prompt, claude-log + +### Files Modified +- `src/openrouter/tools.ts` (tool definition + execution handler) +- `src/openrouter/tools.test.ts` (12 new tests) +- `claude-share/core/GLOBAL_ROADMAP.md` +- `claude-share/core/WORK_STATUS.md` +- `claude-share/core/SPECIFICATION.md` +- `claude-share/core/next_prompt.md` +- `claude-share/core/claude-log.md` + +### Tests +- [x] All 105 tests pass (12 new for generate_chart + 9 for url_metadata + 84 existing) +- [x] Typecheck: no new errors (pre-existing errors unchanged) + +### Notes for Next Session +- Phase 2.5.2 complete. Tool count now: 7 (was 6) +- **Next priority: Phase 2.5.3** — Weather tool via Open-Meteo +- See `next_prompt.md` for ready-to-copy task prompt +- The `generate_chart` tool is automatically included in `TOOLS_WITHOUT_BROWSER` + +--- + +## Session: 2026-02-08 | Phase 2.5.1: URL Metadata Tool (Session: 01Wjud3VHKMfSRbvMTzFohGS) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/review-moltworker-roadmap-q5aqD` +**Status:** Completed + +### Summary +Implemented Phase 2.5.1: new `url_metadata` tool using the free Microlink API. The tool extracts structured metadata (title, description, image, author, publisher, date) from any URL, complementing the existing `fetch_url` tool which returns raw content. + +### Changes Made +1. **New `url_metadata` tool definition** — Added to `AVAILABLE_TOOLS` array with proper schema +2. **Execution handler** — `urlMetadata()` function calls `api.microlink.io`, validates URL, handles errors gracefully +3. **Switch case** — Added `url_metadata` to `executeTool()` dispatcher +4. **MicrolinkResponse interface** — Typed API response shape +5. **Comprehensive test suite** — 9 tests covering success, missing fields, API failure, HTTP errors, invalid URL, invalid JSON, URL encoding +6. 
**Documentation updates** — Updated GLOBAL_ROADMAP, WORK_STATUS, next_prompt, claude-log + +### Files Modified +- `src/openrouter/tools.ts` (tool definition + execution handler) +- `src/openrouter/tools.test.ts` (new, 9 tests) +- `claude-share/core/GLOBAL_ROADMAP.md` +- `claude-share/core/WORK_STATUS.md` +- `claude-share/core/next_prompt.md` +- `claude-share/core/claude-log.md` + +### Tests +- [x] All 93 tests pass (9 new for url_metadata) +- [x] Typecheck: no new errors (pre-existing errors in task-processor.ts and telegram/handler.ts unchanged) + +### Notes for Next Session +- Phase 2.5.1 complete. Tool count now: 6 (was 5) +- **Next priority: Phase 2.5.2** — Chart image generation via QuickChart +- See `next_prompt.md` for ready-to-copy task prompt +- The `url_metadata` tool is automatically included in `TOOLS_WITHOUT_BROWSER` since the filter only excludes `browse_url` + +--- + +## Session: 2026-02-08 | Phase 1 Implementation + Upstream Sync + Free API Planning (Session: 01Lg3st5TTU3gXnMqPxfCPpW) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/resume-tool-calling-analysis-ZELCJ` +**Status:** Completed + +### Summary +Resumed from stuck `claude/analyze-tool-calling-5ee5w` session. Completed Phase 1.1 (parallel tool execution) and 1.2 (model capability metadata). Cherry-picked 7 upstream fixes from `cloudflare/moltworker` (32 commits behind). Analyzed free APIs catalog and integrated into roadmap as Phase 2.5. Updated all core documentation. + +### Changes Made +1. **Phase 1.1: Parallel tool execution** — Replaced sequential `for...of` with `Promise.all()` in both `client.ts` and `task-processor.ts` +2. **Phase 1.2: Model capability metadata** — Added `parallelCalls`, `structuredOutput`, `reasoning`, `maxContext` fields to `ModelInfo` and populated for all 30+ models +3. 
**Upstream sync (7 cherry-picks):** + - `0c1b37d`: exitCode fix for sync reliability + - `92eb06a`: Container downgrade standard-4 → standard-1 ($26→$6/mo) + - `73acb8a`: WebSocket token injection for CF Access users + - `021a9ed`: CF_AI_GATEWAY_MODEL env var support + - `fb6bc1e`: Channel config overwrite (prevents stale key validation) + - `1a3c118`: Remove config leak (console.log of full config with secrets) + - `12eb483`: Workspace sync to R2 for memory persistence +4. **Free API analysis** — Mapped 25+ free APIs from `storia-free-apis-catalog.md` into roadmap as Phase 2.5 (10 tasks, ~23h, $0/month) +5. **Documentation updates** — Updated GLOBAL_ROADMAP.md, WORK_STATUS.md, SPECIFICATION.md, next_prompt.md, claude-log.md + +### Files Modified +- `src/openrouter/client.ts` (parallel tools) +- `src/openrouter/models.ts` (capability metadata) +- `src/durable-objects/task-processor.ts` (parallel tools) +- `src/index.ts` (WS token injection) +- `src/types.ts` (AI Gateway env vars) +- `src/gateway/env.ts` (AI Gateway passthrough) +- `src/gateway/env.test.ts` (AI Gateway tests) +- `src/gateway/sync.ts` (exitCode fix + workspace sync) +- `src/gateway/sync.test.ts` (updated mocks) +- `start-moltbot.sh` (channel config overwrite, config leak fix, AI Gateway, workspace restore) +- `wrangler.jsonc` (container downgrade) +- `Dockerfile` (cache bust) +- `README.md` (AI Gateway docs) +- `.dev.vars.example` (AI Gateway vars) +- `claude-share/core/GLOBAL_ROADMAP.md` +- `claude-share/core/WORK_STATUS.md` +- `claude-share/core/SPECIFICATION.md` +- `claude-share/core/next_prompt.md` +- `claude-share/core/claude-log.md` + +### Tests +- [x] All 84 tests pass (2 new from AI Gateway env tests) +- [x] No new typecheck errors (pre-existing errors unchanged) + +### Notes for Next Session +- Phase 1.1 + 1.2 complete. Phase 1.5 (upstream sync) complete. 
+- **Next priority: Phase 2.5.1** — URL metadata tool via Microlink (1h, no auth) +- See `next_prompt.md` for ready-to-copy task prompt +- Human checkpoint 1.6 pending: test parallel tool execution with real API calls +- Human checkpoint 2.5.11 pending: decide which free APIs to prioritize first +- Skipped upstream commit `97c7dac` (oxlint/oxfmt mass reformat) — too many conflicts, defer to dedicated reformat pass + +--- + +## Session: 2026-02-07 | Phase 0: Quick Model Catalog Wins (Session: 011qMKSadt2zPFgn2GdTTyxH) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/analyze-tool-calling-5ee5w` +**Status:** Completed + +### Summary +Completed Phase 0 quick wins: added 3 new models to the catalog (Pony Alpha, GPT-OSS-120B, GLM 4.7). Task 0.1 (Gemini Flash tools) was already done on main from a previous PR. All models verified on OpenRouter, deployed successfully. + +### Changes Made +1. Added `pony` — OpenRouter Pony Alpha (free, 200K context, coding/agentic/reasoning, tools) +2. Added `gptoss` — OpenAI GPT-OSS 120B free tier (117B MoE, native tool use) +3. Added `glm47` — Z.AI GLM 4.7 ($0.07/$0.40, 200K context, multi-step agent tasks) +4. Set up orchestration docs in `claude-share/core/` (public repo) +5. Updated CLAUDE.md, AGENTS.md, .gitignore for public repo + +### Files Modified +- `src/openrouter/models.ts` (3 new model entries) +- `.gitignore` (added claude-share/ exclusion) +- `CLAUDE.md` (new) +- `AGENTS.md` (updated) + +### Tests +- [x] All 82 tests pass +- [ ] Typecheck has pre-existing errors (not from our changes) + +### Notes for Next Session +- Phase 0 complete. 
Move to Phase 1.1: Parallel tool execution +- See `next_prompt.md` for ready-to-copy task prompt +- Pre-existing typecheck errors in `task-processor.ts` and `telegram/handler.ts` need attention + +--- + +## Session: 2026-02-06 | Multi-AI Orchestration & Tool-Calling Analysis (Session: 011qMKSadt2zPFgn2GdTTyxH) + +**AI:** Claude Opus 4.6 +**Branch:** `claude/analyze-tool-calling-5ee5w` +**Status:** Completed + +### Summary +Created comprehensive tool-calling landscape analysis and multi-AI orchestration documentation structure. Analyzed three external projects (steipete ecosystem, Acontext, Compound Engineering Plugin) for applicability to Moltworker. Identified 10 architectural gaps and produced 13 actionable recommendations across 6 phases. + +### Changes Made +1. Created `brainstorming/tool-calling-analysis.md` — Full analysis (475 lines) + - steipete ecosystem analysis (mcporter, Peekaboo, CodexBar, oracle) + - Acontext context data platform analysis + - Compound Engineering Plugin analysis + - OpenRouter tool-calling model landscape + - 10 gaps identified, 13 recommendations, priority matrix +2. Created multi-AI orchestration documentation structure: + - `claude-share/core/SYNC_CHECKLIST.md` + - `claude-share/core/GLOBAL_ROADMAP.md` + - `claude-share/core/WORK_STATUS.md` + - `claude-share/core/next_prompt.md` + - `claude-share/core/AI_CODE_STANDARDS.md` + - `claude-share/core/SPECIFICATION.md` + - `claude-share/core/claude-log.md` (this file) + - `claude-share/core/codex-log.md` + - `claude-share/core/bot-log.md` +3. Created `CLAUDE.md` — Claude Code project instructions +4. 
Updated `AGENTS.md` — Added multi-agent coordination section + +### Files Modified +- `brainstorming/tool-calling-analysis.md` (new) +- `claude-share/core/*.md` (all new, 9 files) +- `CLAUDE.md` (new) +- `AGENTS.md` (updated) + +### Tests +- [x] No code changes, documentation only +- [x] Existing tests unaffected + +### Notes for Next Session +- Start with Phase 0 quick wins (tasks 0.1-0.3 in GLOBAL_ROADMAP.md) +- See `next_prompt.md` for ready-to-copy task prompt +- Model IDs for GPT-OSS-120B and GLM 4.7 need verification on OpenRouter diff --git a/claude-share/core/codex-log.md b/claude-share/core/codex-log.md new file mode 100644 index 000000000..6dd4bc2ed --- /dev/null +++ b/claude-share/core/codex-log.md @@ -0,0 +1,109 @@ +# Codex Session Log + +> All Codex sessions logged here. Newest first. + +--- + + +## Session: 2026-02-20 | Phase 5.5 web_search tool (Session: codex-phase-5-5-web-search-001) + +**AI:** Codex (GPT-5.2-Codex) +**Branch:** work +**Status:** Completed + +### Summary +Added Brave Search-powered `web_search` tool end-to-end (tool registry, execution, DO/Telegram key plumbing, cache, and tests). 
+ +### Changes Made +- Added `web_search` tool definition and execution path with 5-minute cache + 20KB truncation +- Added Brave Search key plumbing via `ToolContext`, `TaskRequest`/`TaskState`, and Telegram DO dispatch +- Added parallel-safety whitelist entry for `web_search` +- Added 8 dedicated `web_search` tests and updated tool count assertions + +### Files Modified +- `src/openrouter/tools.ts` +- `src/openrouter/tools.test.ts` +- `src/openrouter/briefing-aggregator.test.ts` +- `src/durable-objects/task-processor.ts` +- `src/telegram/handler.ts` +- `src/routes/telegram.ts` +- `src/types.ts` +- `claude-share/core/codex-log.md` +- `claude-share/core/GLOBAL_ROADMAP.md` +- `claude-share/core/WORK_STATUS.md` +- `claude-share/core/next_prompt.md` + +### Tests +- [x] Targeted tests pass (`tools.test.ts`, `briefing-aggregator.test.ts`) +- [ ] Full test suite pass (blocked by missing `gpt-tokenizer/encoding/cl100k_base` module in environment) +- [ ] Typecheck pass (blocked by missing `gpt-tokenizer/encoding/cl100k_base` module in environment) + +### Notes for Next Session +Install/fix `gpt-tokenizer` package resolution in this environment, then rerun full `npm test` and `npm run typecheck`. + +--- + +## Session: 2026-02-19 | Phase 4.1 context-budget audit hardening (Session: codex-phase-4-1-audit-001) + +**AI:** Codex (GPT-5.2-Codex) +**Branch:** work +**Status:** Completed + +### Summary +Audited and hardened token-budgeted context retrieval with edge-case fixes, model-aware budgets, and expanded tests. 
+ +### Changes Made +- Fixed unsafe fallback tool pairing for unknown `tool_call_id` messages +- Added transitive pair-set expansion to keep tool/assistant chains valid during greedy selection +- Increased image token estimate and added JSON-density adjustment in token heuristic +- Switched TaskProcessor compression threshold to per-model context budgets (`getModel(alias)?.maxContext`) +- Added edge-case stress tests and an audit report document + +### Files Modified +- `src/durable-objects/context-budget.ts` +- `src/durable-objects/context-budget.edge.test.ts` +- `src/durable-objects/task-processor.ts` +- `brainstorming/phase-4.1-audit.md` +- `claude-share/core/codex-log.md` +- `claude-share/core/GLOBAL_ROADMAP.md` +- `claude-share/core/WORK_STATUS.md` +- `claude-share/core/next_prompt.md` + +### Tests +- [x] Tests pass +- [x] Typecheck passes + +### Notes for Next Session +Implement Phase 4.2 with a real tokenizer (`js-tiktoken`) if Cloudflare Workers compatibility is acceptable; wire exact counts into final budget validation pass. + +--- + +## Session: 2026-02-16 | Full audit + build improvement plan (Session: codex-audit-plan-001) + +**AI:** Codex (GPT-5.2-Codex) +**Branch:** work +**Status:** Completed + +### Summary +Created a full audit and staged build-improvement plan focused on `/dcode` resume loops and hallucination reduction. + +### Changes Made +- Added `brainstorming/audit-build-improvement-plan.md` with root-cause analysis and 5-phase remediation plan +- Documented immediate quick wins, test/CI gates, and success metrics + +### Files Modified +- `brainstorming/audit-build-improvement-plan.md` +- `claude-share/core/codex-log.md` +- `claude-share/core/GLOBAL_ROADMAP.md` +- `claude-share/core/WORK_STATUS.md` +- `claude-share/core/next_prompt.md` + +### Tests +- [x] Tests pass +- [x] Typecheck passes + +### Notes for Next Session +Implement Phase 1 first: add centralized task router policy and resume model escalation for stalled coding tasks. 
+
+---
+
diff --git a/claude-share/core/free-models-integration-spec-v1.4.md b/claude-share/core/free-models-integration-spec-v1.4.md
new file mode 100644
index 000000000..c8af59718
--- /dev/null
+++ b/claude-share/core/free-models-integration-spec-v1.4.md
@@ -0,0 +1,1487 @@
+# Free Models Integration Spec — Storia Digital AI Hub
+
+> **Version**: 1.4 (Grok-reviewed + maintenance + archetypes + memory + Deep Mode tier)
+> **Date**: 2026-02-11
+> **Author**: Claude Opus 4.6 — reviewed by Grok (8.5/10 → adjustments applied)
+> **Sources**: cheahjs/free-llm-api-resources (6.6k ★), Grok analysis, Storia project knowledge
+> **Location**: `claude-share/core/free-models-integration-spec-v1.4.md`
+> **Depends on**: `ai-models-spec-storia.md` v2.3, ClawRouter (Phase 3.1), LLM Proxy (`/api/llm-proxy/route.ts`)
+> **⚠️ Limits volatile** — last verified Feb 2026. Free-tier quotas change frequently. §10 FreeModelWatcher handles this automatically.
+
+---
+
+## 1. Executive Summary
+
+Storia's BYOK philosophy ("Every AI. Your Keys. Zero Markup.") creates a cold-start problem: new users without API keys can't experience the platform. Free LLM tiers solve this by providing an instant, zero-friction onboarding path where users can chat, code, and research immediately—then graduate to their own keys for higher limits and premium models.
+
+This spec defines how to integrate free-tier LLM providers into Storia's existing architecture (LLM proxy, ClawRouter, Model Playground) without compromising the BYOK core or adding platform costs.
+
+**Strategic outcome**: User signs up → chats with Llama 3.3 70B via Groq in under 30 seconds → no API key needed → converts to BYOK when they hit daily limits.
+
+---
+
+## 2. Provider Catalog — Ranked by Storia Fit
+
+### 2.1 Tier 1: Primary Free Providers (Integrate First)
+
+These providers offer the best combination of model quality, generous limits, and API compatibility with Storia's existing infrastructure. 
+ +#### OpenRouter Free Tier + +- **URL**: `openrouter.ai/api/v1` (already in LLM_ALLOWED_HOSTS roadmap) +- **API format**: OpenAI-compatible (works with existing LLM proxy) +- **Limits**: 20 req/min, 50 req/day (1,000/day with $10 lifetime top-up — **recommended for beta**) +- **⚠️ Reality check**: Free model availability fluctuates weekly. Some models rotate in/out of `:free` status. Expect 20-30 reliably free models at any given time, not 40+. Some free models are low-priority / queued during peak hours. +- **Top free models** (verified Feb 2026, subject to change): + - `meta-llama/llama-3.3-70b-instruct:free` — Solid general-purpose (GPT-4o mini / Sonnet 3.5 class, not GPT-4 class) + - `deepseek/deepseek-r1-0528:free` — Strong reasoning/research chain-of-thought + - `deepseek/deepseek-chat-v3.1:free` — Fast general chat + - `nousresearch/hermes-3-llama-3.1-405b:free` — Largest free instruct model, rivals paid frontier for deep reasoning + - `mistralai/devstral-2:free` — Mistral's agentic coding model, strong multi-file refactoring + - `tngtech/deepseek-r1t2-chimera:free` — Reasoning chimera variant, rising in usage + - `qwen/qwen3-235b-a22b:free` — Largest free MoE model available + - `qwen/qwen3-coder:free` — Coding specialist + - `moonshotai/kimi-k2:free` — Agent-capable, long context + - `z-ai/glm-4.5-air:free` — GLM family free variant +- **Storia value**: Single API key unlocks all free models. OpenRouter is already planned for Phase 2.6.1. Free models use the same endpoint as paid models—just append `:free` to the model string. +- **Data training**: No opt-in required for free tier +- **Integration effort**: 2h (already OpenAI-compatible) + +#### Groq + +- **URL**: `api.groq.com` (already in LLM_ALLOWED_HOSTS roadmap) +- **API format**: OpenAI-compatible +- **Limits**: Varies per model—Llama 3.3 70B gets 1,000 req/day at 12,000 tokens/min; Llama 3.1 8B gets 14,400 req/day +- **⚠️ Reality check**: 70B models hit 429 quickly under heavy use. 
Route 8B for drafts/speed (14,400 RPD headroom is massive), reserve 70B for quality-critical paths. +- **Top free models**: + - `llama-3.1-8b-instant` — **Default speed pick**: Sub-second, 14,400 req/day + - `llama-3.3-70b-versatile` — Best quality, but 1,000 req/day burns fast + - `qwen/qwen3-32b` — Strong reasoning + - `moonshotai/kimi-k2-instruct` — Agent tasks + - `openai/gpt-oss-120b` — Large open-source model (1,000 RPD) +- **Storia value**: Fastest inference of any free provider. Ideal for ClawRouter's "Max Speed" preset. The 8B model at 14,400 RPD is the workhorse—use it for simple queries, iteration loops, and drafts. Reserve 70B for when quality matters. +- **Data training**: No opt-in required +- **Integration effort**: 2h + +### 2.1.5 Tier 1.5: High Value but Higher Risk (Phase 1.5) + +#### Google AI Studio (Gemini API) + +- **URL**: `generativelanguage.googleapis.com` +- **API format**: Google Gemini SDK (not OpenAI-compatible; needs adapter) +- **Limits**: Gemini 2.5 Flash: nominally 250 req/day, 10 req/min — but **actual limits frequently lower** (~20-100 RPD reported after Dec 2025 reductions); Gemini 2.5 Pro: essentially gone from true free tier (2 RPM, 50 RPD) +- **⚠️ Reality check**: Google has repeatedly cut free-tier quotas in late 2025 / early 2026. Flash is still usable but unreliable as a primary provider. Quota volatility makes this risky as a default route. +- **Top free models**: + - `gemini-2.5-flash` — Strong multimodal, huge context window (when quota allows) + - `gemini-2.5-flash-lite` — Budget variant, ~1,000 req/day (more stable) + - `gemma-3-27b-instruct` — Open-weight, 14,400 req/day (most reliable Google option) +- **Storia value**: Massive context windows (1M+ tokens) make this the best choice for research tasks IF quotas hold. Gemma 3 27B is the safe bet here — stable, generous, open-weight. +- **Data training**: ⚠️ Data used for training outside UK/CH/EEA/EU. Must flag clearly in UI. 
+- **Integration effort**: 8-10h (Gemini SDK adapter, different error format, safety block handling, content type differences) +- **Recommendation**: **Phase 1.5** — implement after Groq + OpenRouter are proven. Default routing should prefer non-Google unless user is in EU and needs long context. Use Cerebras or OpenRouter DeepSeek R1 for research tasks instead. + +#### Cerebras + +- **URL**: `api.cerebras.ai` +- **API format**: OpenAI-compatible +- **Limits**: 30 req/min, 14,400 req/day, 1M tokens/day (generous on paper) +- **⚠️ Reality check**: Token limits are generous but request caps can be lower in practice for shared keys. Popular models (Qwen 235B, 480B) face contention during peak hours. Add health monitoring early. +- **Top free models**: + - `llama-3.3-70b` — High-quality general reasoning + - `qwen/qwen3-235b-a22b` — Massive MoE model (contention risk) + - `qwen/qwen3-coder-480b` — 10 req/min, 100 req/day (very limited but powerful) + - `llama-4-scout` / `llama-4-maverick` — Latest Llama 4 variants +- **Storia value**: Highest daily token limits of any free provider. Best for heavy research sessions and long coding workflows when Groq/OpenRouter quotas are exhausted. Strong Phase 1.5 / fallback candidate. +- **Data training**: No explicit policy found — monitor +- **Integration effort**: 2h + +### 2.2 Tier 2: Specialized Providers (Phase 2) + +#### Mistral (La Plateforme + Codestral) + +- **URL**: `api.mistral.ai` / `codestral.mistral.ai` (both in LLM_ALLOWED_HOSTS roadmap) +- **API format**: OpenAI-compatible +- **Limits**: La Plateforme: 1 req/sec, 500K tokens/min, 1B tokens/month (!); Codestral: 30 req/min, 2K req/day +- **Models**: Mistral Small/Medium/Nemo (La Plateforme), Codestral (code-specialized) +- **Storia value**: Codestral is the best free coding model available—80+ language support, purpose-built for code generation. La Plateforme's 1B tokens/month is extremely generous for the Experiment plan. 
+- **Caveats**: ⚠️ Experiment plan **requires opting into data training** + phone verification. This is a significant privacy hit that conflicts with Storia's trust-first philosophy. +- **Recommendation**: **Phase 2** — default off for most users due to privacy concern. Offer as opt-in with clear disclosure. Users who want Codestral's coding power can add their own Mistral key (free to create) instead. +- **Integration effort**: 3h + +#### Cloudflare Workers AI + +- **URL**: Workers AI binding (native Cloudflare, no external API call needed) +- **API format**: Cloudflare Workers AI API (proprietary but simple) +- **Limits**: 10,000 neurons/day (shared across all models) +- **Models**: Llama 3.x, Gemma 3, Qwen 2.5/3, DeepSeek variants, Mistral Small 3.1 +- **Storia value**: Zero latency—runs on the same edge network as Storia itself. No external API call, no SSRF considerations. Ideal as the fastest possible fallback for simple queries. Already in the stack. +- **Caveats**: Models are often quantized (lower quality than full-precision equivalents). Neuron limits can be confusing—actual request count varies by model size. +- **Integration effort**: 4h (Workers AI binding vs REST API in existing proxy) + +#### Cohere + +- **URL**: `api.cohere.com` +- **API format**: Cohere SDK (not OpenAI-compatible; needs adapter) +- **Limits**: 20 req/min, 1,000 req/month (very restrictive) +- **Models**: Command-A (reasoning), Aya Vision/Expanse (multilingual, 23 languages) +- **Storia value**: Best multilingual free option. Aya models support languages that other free providers don't cover well. Command-A includes built-in RAG citations. +- **Integration effort**: 5h (needs Cohere adapter) + +### 2.3 Tier 3: Trial Credit Providers (Bonus Onboarding) + +These providers offer one-time credits. Storia can surface them as "get started" bonuses—a user gets $30 of Baseten credit or $10 of AI21 credit just by creating an account. 
+ +| Provider | Credits | Duration | Best Models | Integration Value | +|----------|---------|----------|-------------|-------------------| +| **Baseten** | $30 | No expiry | Any model (pay-per-compute) | Highest free credit | +| **AI21** | $10 | 3 months | Jamba family | Unique architecture | +| **Nebius** | $1 | No expiry | Various open models | Low effort | +| **Fireworks** | $1 | No expiry | Various open models | Fast inference | +| **SambaNova** | $5 | 3 months | Llama, DeepSeek variants | Custom silicon | +| **Hyperbolic** | $1 | No expiry | DeepSeek, Qwen3, GPT-OSS | Broad selection | + +**Storia action**: Create a "Free Credits Guide" page showing users how to claim these trial credits for providers Storia already supports. No integration work needed—just documentation + deep links. + +--- + +## 3. Architecture — How Free Models Fit Into Storia + +### 3.1 System Overview + +``` +User Request + │ + ▼ +┌─────────────────────────────────────────────────┐ +│ Storia Frontend (Cockpit) │ +│ ├── Model Selector (shows free badge) │ +│ ├── ClawRouter Override (free tier option) │ +│ └── Quota Dashboard (remaining free calls) │ +└──────────────────┬──────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────┐ +│ /api/llm-proxy/route.ts │ +│ ├── Auth check (logged in? → allow free tier) │ +│ ├── ClawRouter (routes by complexity + tier) │ +│ ├── FreeModelRouter (manages provider quotas) │ +│ └── SSRF allowlist (LLM_ALLOWED_HOSTS) │ +└──────────────────┬──────────────────────────────┘ + │ + ┌─────────┼─────────┬──────────┐ + ▼ ▼ ▼ ▼ + ┌─────────┐ ┌──────┐ ┌────────┐ ┌──────────┐ + │OpenRouter│ │ Groq │ │Cerebras│ │Cloudflare│ + │ :free │ │ │ │ │ │Workers AI│ + └─────────┘ └──────┘ └────────┘ └──────────┘ +``` + +### 3.2 FreeModelRouter — New Module + +**Location**: `src/lib/free-router/` + +This module manages free-tier provider quotas, fallback chains, and rate limiting. It sits alongside (not replacing) ClawRouter. 
+ +```typescript +// src/lib/free-router/types.ts +interface FreeProvider { + id: string; // 'openrouter-free' | 'groq' | 'cerebras' | etc. + endpoint: string; // API base URL + models: FreeModel[]; // Available models + limits: ProviderLimits; // Rate limits + apiKeySource: 'storia' | 'user'; // Who provides the key + dataTrainingWarning?: string; // If provider uses data for training +} + +interface FreeModel { + id: string; // 'llama-3.3-70b-instruct:free' + displayName: string; // 'Llama 3.3 70B' + provider: string; // 'openrouter-free' + capabilities: ModelCapability[]; // ['chat', 'code', 'reasoning', 'vision'] + contextWindow: number; // 128000 + maxOutputTokens: number; // 4096 + qualityTier: 'economy' | 'standard' | 'premium'; + speedRating: 1 | 2 | 3 | 4 | 5; // 5 = fastest +} + +interface ProviderLimits { + requestsPerMinute: number; + requestsPerDay: number; + tokensPerMinute?: number; + tokensPerDay?: number; +} + +interface QuotaState { + providerId: string; + userId: string; + requestsUsedToday: number; + tokensUsedToday: number; + lastResetAt: string; // ISO date + isExhausted: boolean; +} +``` + +### 3.3 Quota Tracking (D1 Table) + +```sql +-- drizzle/migrations/XXXX_free_model_quotas.sql +CREATE TABLE IF NOT EXISTS free_model_quotas ( + id TEXT PRIMARY KEY DEFAULT (lower(hex(randomblob(16)))), + user_id TEXT NOT NULL REFERENCES users(id) ON DELETE CASCADE, + provider_id TEXT NOT NULL, + requests_used INTEGER NOT NULL DEFAULT 0, + tokens_used INTEGER NOT NULL DEFAULT 0, + reset_date TEXT NOT NULL, -- YYYY-MM-DD, resets daily + created_at TEXT NOT NULL DEFAULT (datetime('now')), + updated_at TEXT NOT NULL DEFAULT (datetime('now')), + UNIQUE(user_id, provider_id, reset_date) +); + +CREATE INDEX idx_free_quotas_user ON free_model_quotas(user_id, reset_date); +``` + +### 3.4 Provider API Key Management + +**Critical design decision**: Free-tier providers require API keys, but these are *Storia's platform keys*, not user keys. 
This creates a shared resource that needs protection. + +**Approach — Platform Keys in Env Vars**: + +``` +# wrangler.toml (secrets, not committed) +FREE_OPENROUTER_KEY = "sk-or-v1-..." # OpenRouter free-tier key +FREE_GROQ_KEY = "gsk_..." # Groq free-tier key +FREE_CEREBRAS_KEY = "csk-..." # Cerebras free-tier key +FREE_GOOGLE_AI_KEY = "AIza..." # Google AI Studio key +``` + +**Per-user rate limiting** is essential to prevent a single user from exhausting the platform's shared quota. This is where the `free_model_quotas` D1 table comes in—each user gets their own daily allocation within the provider's total limits. + +**Allocation strategy** (conservative — start low, scale up based on actual burn rates): + +| Provider | Provider Daily Limit | Per-User Allocation (Beta) | Per-User Allocation (Post-Launch) | Platform-Wide Daily Cap | +|----------|---------------------|---------------------------|-----------------------------------|------------------------| +| OpenRouter | 50 req/day (1,000 w/ top-up) | 15 req/day | 5-8 req/day | 80% of provider limit | +| Groq (8B) | 14,400 req/day | 100 req/day | 40 req/day | 80% of provider limit | +| Groq (70B) | 1,000 req/day | 30 req/day | 15 req/day | 70% of provider limit | +| Cerebras | 14,400 req/day | 80 req/day | 30 req/day | 80% of provider limit | +| Google AI | 250 req/day (nominal) | 15 req/day | 8 req/day | 60% of provider limit | +| Workers AI | 10,000 neurons/day | Shared pool | Shared pool | N/A (edge native) | + +**Platform-wide daily cap**: Stop routing to a provider when platform-wide usage hits the cap percentage. This prevents the last few users of the day from getting 100% error rates. When cap is hit, FreeModelRouter skips that provider in the fallback chain. + +These allocations should be configurable via env vars and auto-adjusted as the user base grows. The D1 quota table tracks both per-user and platform-wide daily totals. 
+ +### 3.5 ClawRouter Integration + +ClawRouter already classifies queries by complexity (simple/medium/complex) and routes to economy/standard/premium model tiers. Free models slot into this naturally: + +```typescript +// Extension to existing ClawRouter presets +const FREE_TIER_ROUTING = { + 'max-speed': { + economy: 'groq/llama-3.1-8b-instant', // Sub-second, 14,400 RPD + standard: 'groq/llama-3.3-70b-versatile', // Fast + capable (reserve quota) + premium: 'cerebras/qwen3-235b-a22b', // Best free reasoning + }, + 'balanced': { + economy: 'groq/llama-3.1-8b-instant', // Speed workhorse + standard: 'openrouter/llama-3.3-70b-instruct:free', // Solid all-rounder + premium: 'openrouter/deepseek/deepseek-r1-0528:free', // Strong reasoning + }, + 'max-quality': { + economy: 'openrouter/llama-3.3-70b-instruct:free', + standard: 'cerebras/qwen3-235b-a22b', // Large MoE + premium: 'openrouter/deepseek/deepseek-r1-0528:free', // Best free reasoning + }, +}; +// Note: Google Gemini added to 'research' preset in Phase 1.5 only +``` + +**Fallback chain** (with redundancy — try alternative models within same provider before moving on): + +``` +Groq/8B (fastest) → Groq/70B (quality) → OpenRouter/Llama:free → OpenRouter/DeepSeek:free + → Cerebras/Llama → Cerebras/Qwen → Workers AI (edge fallback) → Quota Exhausted +``` + +Each provider gets TWO shots with different models before the chain moves on. This maximizes utilization of each provider's separate model quotas. + +### 3.6 SSRF Allowlist Updates + +Phased additions to `LLM_ALLOWED_HOSTS`: + +```typescript +// Phase 1 MVP (Groq + OpenRouter) +'openrouter.ai', +'api.groq.com', + +// Phase 1.5 (Cerebras) +'api.cerebras.ai', + +// Phase 2 (Google AI, if quotas stabilize) +'generativelanguage.googleapis.com', + +// Workers AI doesn't need SSRF allowlist (native binding) +``` + +--- + +## 4. Onboarding Funnel — The "Zero to Chat" Experience + +### 4.1 User Journey + +``` +1. User arrives at Storia → sees landing page +2. 
Signs up (email + password, no API key required) +3. Zori greets: "Hey! You can start chatting RIGHT NOW with free AI models! 🦎⚡" +4. User enters first message → routed to Llama 3.3 70B on Groq (fastest) +5. ClawRouter badge shows: "🆓 Free Tier · Groq · Llama 3.3 70B · 42/50 daily requests left" +6. After ~10 messages, Vex nudges: "You've used 10 of your 50 daily free messages. + Add your own API key for unlimited access → Settings" +7. User eventually adds BYOK keys → graduates to full platform +``` + +### 4.2 UI Components + +#### Free Model Badge (extend existing ClawRouterBadge) + +The existing `ClawRouterBadge.tsx` already shows model name, tier, and savings. Extend it with: + +- 🆓 "Free" badge when using platform-provided free models +- Remaining quota counter: "38/50 requests today" +- ⚠️ Data training warning icon for Google AI Studio models +- Upgrade CTA: "Add your API key for unlimited access" + +#### Model Selector — Free Section + +``` +┌──────────────────────────────────────────────┐ +│ Choose Model │ +│ │ +│ 🆓 FREE MODELS (no API key needed) │ +│ ├── Llama 3.1 8B [Groq] ⚡ Fastest │ +│ ├── Llama 3.3 70B [Groq] 🏆 Quality │ +│ ├── DeepSeek R1 [OpenRouter] 🧠 Smart │ +│ ├── Qwen3 Coder [OpenRouter] 💻 Code │ +│ └── + 20 more free models... │ +│ │ +│ 🔑 YOUR MODELS (BYOK) │ +│ ├── Claude 4.5 Sonnet [Anthropic] │ +│ ├── GPT-5.2 [OpenAI] │ +│ └── Add API key... │ +│ │ +│ ℹ️ Free models have daily limits. Add your │ +│ own API keys for unlimited, premium access │ +└──────────────────────────────────────────────┘ +``` + +#### Quota Dashboard (extend SavingsWidget in SitMon) + +``` +┌──────────────────────────────────────────────┐ +│ Free Tier Usage Today │ +│ │ +│ OpenRouter ████████░░░░░░░░ 8/10 requests │ +│ Groq ██████░░░░░░░░░░ 32/50 requests │ +│ Cerebras ██░░░░░░░░░░░░░░ 12/100 requests│ +│ Google AI ░░░░░░░░░░░░░░░░ 0/25 requests │ +│ │ +│ Resets in: 6h 42m │ +│ │ +│ 💡 Vex says: "Add your own Groq key ($0 - │ +│ they're free!) 
and get 14,400 req/day │ +│ instead of 50. Obviously more efficient." │ +└──────────────────────────────────────────────┘ +``` + +### 4.3 Gecko Nudge Strategy + +The geckos should naturally encourage BYOK adoption without being pushy. Nudges trigger at specific quota thresholds: + +| Trigger | Gecko | Message | +|---------|-------|---------| +| First message (free tier) | Zori | "Welcome! You're using Llama 3.3 70B for FREE! I'm so excited! 🦎" | +| 50% quota used | Kai | "You're flowing well today. Free models refresh tomorrow, or you can add your own keys in Settings for unlimited." | +| 80% quota used | Vex | "Logically, you should know: you have 10 free requests left today. Adding a Groq API key (free to create) gives you 14,400/day. The math is clear." | +| Quota exhausted | Razz | "You've hit the daily limit! 🔥 Two options: wait until tomorrow, or add your API key RIGHT NOW and keep going. I'd go with option 2." | +| After 3 days of free usage | Kai | "You've been using Storia for 3 days now. Here's a guide to getting your own API keys—many providers are free or very cheap." | + +--- + +## 5. User Archetypes & Routing Intelligence + +The free tier serves two fundamentally different user types with opposing needs. Routing them to the same models wastes quota and degrades experience for both. This section defines archetype-aware routing — the strategic layer that makes Storia's free tier feel premium despite costing $0. + +### 5.1 The Two Archetypes + +#### Archetype A: "Conversational" (~70-80% of free-tier DAU) + +The majority. They use AI for quick chat, coaching, shopping advice, brainstorming, emotional check-ins, productivity tips, language practice, casual Q&A. + +| Attribute | Value | +|-----------|-------| +| **Latency tolerance** | Very low — sub-2s mandatory, sub-1s ideal. They bounce if it feels laggy. | +| **Quality needs** | "Good enough" is fine. Templates + memory + persona deliver 80-90% of value. 
| +| **Message pattern** | Short, frequent, casual. 10-50 messages/session. Rarely exceeds 200 tokens/message. | +| **Model sweet spot** | 8B-27B class: Groq Llama 3.1 8B, Gemma 3 12B/27B, Mistral Small 3.2 | +| **Token cost per session** | ~2K-10K tokens (cheap) | +| **Conversion path** | Hits daily request quota → upgrades for unlimited chat volume | +| **Gecko fit** | Full personality shines here — Zori's energy, Kai's calm coaching. But save tokens: use pre-written persona templates, not dynamic generation. | + +#### Archetype B: "Vibe Coder / Deep Thinker" (~20-30% of free-tier DAU) + +The power users. They use AI for coding, debugging, architecture review, long document analysis, math reasoning, multi-step planning, content creation with iteration. + +| Attribute | Value | +|-----------|-------| +| **Latency tolerance** | Very high — 30s-5min acceptable. Even longer for big refactors if quality is excellent. | +| **Quality needs** | Critical. Accuracy and depth over speed. A wrong code suggestion wastes more time than waiting. | +| **Message pattern** | Long, complex, fewer per session. 5-15 messages but 500-2000+ tokens each. Code blocks, file pastes. | +| **Model sweet spot** | 70B+, MoE: DeepSeek R1, Qwen3 235B/Coder 480B, Hermes 405B, Llama 3.3 70B | +| **Token cost per session** | ~20K-200K tokens (expensive) | +| **Conversion path** | Hits daily token/quality limits → upgrades for premium models (Claude, GPT-5) + unlimited depth | +| **Gecko fit** | Vex's efficiency and Razz's action bias work here. Minimal personality overhead — they want results, not banter. | + +### 5.2 Archetype Detection — The Classifier + +The existing ClawRouter heuristic classifier (regex/keyword + token count) can be extended with archetype detection. 
This doesn't need ML — simple signals are enough: + +```typescript +// src/lib/free-router/archetype-detector.ts + +type UserArchetype = 'conversational' | 'deep-thinker' | 'unknown'; + +interface ArchetypeSignals { + messageLength: number; // Token count of current message + hasCodeBlocks: boolean; // ```...``` or indented code + hasTechnicalTerms: boolean; // regex: /refactor|debug|deploy|function|class|API|regex|SQL|.../ + hasFileReferences: boolean; // paths, filenames, extensions + sessionMessageCount: number; // How many messages so far this session + avgMessageLength: number; // Running average for this session + hasReasoningMarkers: boolean; // "step by step", "think about", "analyze", "compare" + hasCasualMarkers: boolean; // "hey", "thanks", "lol", "help me with", short questions +} + +function detectArchetype(signals: ArchetypeSignals): UserArchetype { + let deepScore = 0; + let casualScore = 0; + + // Message length is the strongest single signal + if (signals.messageLength > 300) deepScore += 3; + else if (signals.messageLength < 50) casualScore += 3; + + // Code blocks are near-definitive + if (signals.hasCodeBlocks) deepScore += 5; + + // Technical vocabulary + if (signals.hasTechnicalTerms) deepScore += 2; + if (signals.hasFileReferences) deepScore += 2; + + // Reasoning markers + if (signals.hasReasoningMarkers) deepScore += 2; + + // Casual markers + if (signals.hasCasualMarkers) casualScore += 2; + + // Session pattern: many short messages = conversational + if (signals.sessionMessageCount > 8 && signals.avgMessageLength < 80) casualScore += 2; + + // Session pattern: few long messages = deep thinker + if (signals.sessionMessageCount < 5 && signals.avgMessageLength > 200) deepScore += 2; + + if (deepScore >= 5) return 'deep-thinker'; + if (casualScore >= 4) return 'conversational'; + return 'unknown'; // Default to conversational routing (safer, faster) +} +``` + +**Key principle**: When in doubt, route conversational. It's faster and cheaper. 
A conversational user getting a fast response is happy. A deep thinker getting a fast-but-shallow response will naturally rephrase or switch to "Deep Mode" (UI toggle). + +### 5.3 Archetype-Aware Routing Tables + +This replaces the flat task-type routing from v1.1 with a two-track system: + +```typescript +// src/lib/free-router/archetype-routing.ts + +const CONVERSATIONAL_ROUTING = { + // Optimized for: speed, low token cost, high daily quota + 'max-speed': { + economy: 'groq/llama-3.1-8b-instant', // Sub-second, 14,400 RPD + standard: 'groq/llama-3.1-8b-instant', // Still fast — don't waste 70B quota on chat + premium: 'groq/llama-3.3-70b-versatile', // Only for complex conversational + }, + 'balanced': { + economy: 'groq/llama-3.1-8b-instant', + standard: 'openrouter/google/gemma-3-27b-it:free', // Good mid-range + premium: 'openrouter/llama-3.3-70b-instruct:free', + }, + 'max-quality': { + economy: 'openrouter/google/gemma-3-27b-it:free', + standard: 'openrouter/llama-3.3-70b-instruct:free', + premium: 'openrouter/llama-3.3-70b-instruct:free', // Ceiling for conversational + }, +}; + +const DEEP_THINKER_ROUTING = { + // Optimized for: quality, depth, large context windows + // Latency budget: 30s-300s acceptable + 'max-speed': { + economy: 'groq/llama-3.3-70b-versatile', // Fast but capable + standard: 'groq/qwen/qwen3-32b', // Good reasoning + premium: 'cerebras/qwen3-235b-a22b', // Best quality at speed + }, + 'balanced': { + economy: 'openrouter/llama-3.3-70b-instruct:free', + standard: 'openrouter/deepseek/deepseek-r1-0528:free', // Chain-of-thought + premium: 'openrouter/nousresearch/hermes-3-llama-3.1-405b:free', // Largest free instruct + }, + 'max-quality': { + economy: 'openrouter/deepseek/deepseek-chat-v3.1:free', + standard: 'openrouter/deepseek/deepseek-r1-0528:free', + premium: 'cerebras/qwen/qwen3-coder-480b', // Largest free model (100 RPD) + }, +}; + +// Coding-specific override (sub-archetype of deep-thinker) +const CODING_ROUTING = { + 
economy: 'openrouter/qwen/qwen3-coder:free', + standard: 'openrouter/mistralai/devstral-2:free', // Mistral's coding agent model + premium: 'cerebras/qwen/qwen3-coder-480b', +}; +``` + +### 5.4 UI: "Quick Chat" vs "Deep Mode" Toggle + +Auto-detection handles most cases, but power users should be able to explicitly choose: + +``` +┌──────────────────────────────────────────────┐ +│ [Chat input field... ] │ +│ │ +│ ⚡ Quick Chat 🧠 Deep Mode │ +│ └ Fast, conversational └ Coding, reasoning │ +│ Sub-second replies May take 30s-5min │ +│ Uses: Llama 8B-70B Uses: DeepSeek R1 │ +│ Qwen3 235B/Coder│ +│ │ +│ Current: ⚡ Auto (detecting...) │ +└──────────────────────────────────────────────┘ +``` + +**Behavior**: +- Default: "Auto" — archetype detector routes dynamically per message +- User clicks "Deep Mode" → locks all messages to deep-thinker routing for this session +- User clicks "Quick Chat" → locks to conversational routing +- Deep Mode shows a progress indicator: "🧠 Brewing deep insights..." (sets expectation for latency) + +### 5.5 The Flywheel: How Archetypes Feed Each Other + +``` +Conversational users (70-80%) Vibe coders (20-30%) + │ │ + │ High volume, low cost │ High engagement, willing to pay + │ per user (~2K-10K tokens) │ per user (~20K-200K tokens) + │ │ + ▼ ▼ + Viral word-of-mouth BYOK conversion + Pro upgrades + "Free AI that actually works" "Better than $20/mo subscriptions" + │ │ + └──────────────┬───────────────────────┘ + │ + ▼ + More users → more data on routing quality + → better archetype detection → better UX + → more word-of-mouth → more users +``` + +**Monetization alignment — three tiers, not two**: + +| Tier | Price | Target Archetype | What They Get | +|------|-------|-----------------|---------------| +| **Free** | $0 | Conversational (majority) | 20-30 free models, daily quota limits, minimal gecko personality, Quick Chat routing | +| **Deep Mode** | $3-5/mo | Vibe coders (entry) | Unlimited deep-thinker routing, higher daily token budget 
(500K+), full gecko personality, priority queue on Cerebras/OR, access to Hermes 405B + Devstral 2 via platform keys | +| **Pro (BYOK+)** | $9/mo | Power users (both archetypes) | Everything in Deep Mode + premium model access via own keys, zero markup, ClawRouter full features, SitMon Pro, Project Memory unlimited | + +**Why $3-5/mo Deep Mode matters**: Vibe coders already pay $10-20/mo for tools (Cursor, GitHub Copilot, ChatGPT Plus). A $3-5 tier that gives them unlimited access to 70B+ free models with smart routing is an instant decision — less than a coffee. It captures revenue from users who won't bother setting up BYOK keys but want more than the free tier. The margin is nearly pure profit since the models are free — we're selling routing intelligence and convenience. + +**Conversion funnel**: +``` +Free (conversational) → stays free, provides volume +Free (deep thinker) → hits token limits → Deep Mode ($3-5/mo) → power user → Pro/BYOK ($9/mo) +``` + +The casual users subsidize nothing (they're essentially free to serve). Deep Mode captures the "willing to pay a little" segment that BYOK misses. Pro captures the power users who want full control. + +### 5.6 Archetype-Aware Quota Budgeting + +Different archetypes should burn quota differently: + +```typescript +const QUOTA_WEIGHTS = { + 'conversational': { + // Each request costs 1 "quota unit" — they make many cheap requests + requestWeight: 1, + // But their total token budget per day is capped lower + dailyTokenBudget: 50_000, + }, + 'deep-thinker': { + // Each request costs 3 "quota units" — fewer but more expensive + requestWeight: 3, + // Higher token budget (they need it for code/long context) + dailyTokenBudget: 200_000, + }, +}; +``` + +This means a conversational user might get 50 requests/day at ~1K tokens each, while a deep thinker gets ~17 "equivalent requests" but with much larger token allowances per request. 
Both feel like they have enough — but the platform's actual token spend stays controlled. + +### 5.7 Provider Fallback Chains (Archetype-Aware) + +When a provider is rate-limited or down, the FreeModelRouter cascades through alternatives — but the fallback chain differs by archetype: + +```typescript +const FALLBACK_CHAINS = { + 'conversational': [ + // Priority: speed, then breadth, then edge + 'groq/llama-3.1-8b-instant', + 'groq/llama-3.3-70b-versatile', + 'openrouter/google/gemma-3-27b-it:free', + 'openrouter/llama-3.3-70b-instruct:free', + 'cloudflare/llama-3.3-70b-instruct-fp8', // Edge fallback + ], + 'deep-thinker': [ + // Priority: quality, then reasoning, then depth + 'openrouter/deepseek/deepseek-r1-0528:free', + 'openrouter/nousresearch/hermes-3-llama-3.1-405b:free', // Largest free instruct model + 'cerebras/qwen3-235b-a22b', + 'openrouter/deepseek/deepseek-chat-v3.1:free', + 'groq/llama-3.3-70b-versatile', + 'openrouter/llama-3.3-70b-instruct:free', + ], + 'coding': [ + // Priority: code quality, then depth + 'openrouter/qwen/qwen3-coder:free', + 'openrouter/mistralai/devstral-2:free', // Mistral's coding agent + 'openrouter/deepseek/deepseek-chat-v3.1:free', + 'cerebras/qwen/qwen3-coder-480b', + 'groq/qwen/qwen3-32b', + 'openrouter/llama-3.3-70b-instruct:free', // General fallback + ], +}; +``` + +Each chain gets TWO shots with different models within the same provider before moving on, maximizing per-provider quota utilization. + +### 5.8 Prompt Optimization by Archetype + +Free tiers are rate-limited, so each request must be maximally effective. 
The optimization strategy differs by archetype: + +**Conversational users**: +- Ultra-compressed system prompts (~15 tokens, no gecko personality overhead) +- Semantic caching is highly effective — repetitive coaching questions hit cache 30-60% of the time +- Pre-written persona templates make 8B models feel premium without dynamic generation +- Memory/RAG layer provides continuity across sessions cheaply (see §5.10) + +```typescript +const CONVERSATIONAL_SYSTEM = `You are a helpful AI assistant on Storia.Digital. +Respond concisely and naturally.`; // ~15 tokens +``` + +**Deep thinkers**: +- Fuller system prompts OK (they use fewer, larger requests anyway) +- Batch multi-step coding tasks into single calls when possible (plan → code → test) +- No caching — each request is unique enough that cache hits are rare +- Pre-format code context to minimize wasted tokens (strip comments, collapse whitespace) + +```typescript +const DEEP_THINKER_SYSTEM = `You are a senior developer assistant on Storia.Digital. +Think step by step. Show your reasoning. Provide complete, working code. +If the task is complex, break it into phases and implement each.`; // ~40 tokens +``` + +### 5.9 Hybrid Free + BYOK Strategy + +Users with some API keys can mix free and paid models — and archetype awareness makes this smarter: + +- **Conversational + BYOK**: Free tier handles 90% of their chat. BYOK keys only used when they explicitly pick a premium model or hit free quota. +- **Deep thinker + BYOK**: Free tier handles drafts/planning. BYOK keys used for final code generation, complex reasoning, or when they switch to Claude/GPT-5 for quality-critical work. + +Show savings in the Cockpit SavingsWidget: "You saved $0.12 by using free Llama 3.3 for drafting instead of Claude Sonnet. Final version used your Anthropic key." + +### 5.10 Memory & RAG Layer — Making Cheap Models Feel Premium + +The biggest amplifier for free-tier quality isn't a better model — it's context. 
An 8B model with good memory and relevant context outperforms a 70B model with none. This is especially true for conversational users who return daily with the same themes (fitness, habits, projects). + +**Architecture: Pinecone Free Tier + D1 hybrid** + +Pinecone's free tier (as of Feb 2026) offers: +- 1 index, 2GB storage, ~100K vectors with 1536 dimensions +- No credit card required, generous for a small-to-medium user base +- Serverless, no infrastructure to manage + +This is more than enough for Storia's free-tier memory layer. Each user's conversation summaries and key facts get embedded and stored as vectors. On each new message, query Pinecone for top-k relevant past context and inject it into the system prompt. + +```typescript +// src/lib/free-router/memory-rag.ts + +interface UserMemoryEntry { + userId: string; + embedding: number[]; // 1536-dim from a free embedding model + text: string; // Summarized conversation chunk + metadata: { + timestamp: string; + topic: string; // Auto-tagged: 'fitness', 'coding', 'shopping', etc. + archetype: string; // Which archetype was active when this was stored + }; +} + +// Embedding options (all free): +// 1. Cloudflare Workers AI: @cf/baai/bge-base-en-v1.5 (768-dim, edge-native, zero cost) +// 2. OpenRouter: free embedding models when available +// 3. 
Pinecone inference API: built-in embedding (simplest, no extra provider)
+
+async function getRelevantContext(
+  userId: string,
+  currentMessage: string,
+  topK: number = 3
+): Promise<string[]> {
+  const embedding = await generateEmbedding(currentMessage);
+  const results = await pinecone.query({
+    vector: embedding,
+    topK,
+    filter: { userId },
+    includeMetadata: true,
+  });
+  return results.matches.map(m => m.metadata.text);
+}
+
+// Inject into system prompt (adds ~100-200 tokens, huge quality boost)
+function buildContextualPrompt(
+  basePrompt: string,
+  relevantContext: string[]
+): string {
+  if (relevantContext.length === 0) return basePrompt;
+  return `${basePrompt}
+Relevant context from past conversations:
+${relevantContext.map(c => `- ${c}`).join('\n')}`;
+}
+```
+
+**Cost breakdown**:
+- Pinecone: $0/mo (free tier)
+- Embeddings: $0/mo (Workers AI or Pinecone inference)
+- D1 for metadata/index: $0/mo (free tier)
+- Quality uplift: Massive — returning users feel "remembered" even on 8B models
+
+**Per-archetype memory strategy**:
+- **Conversational**: Heavy memory usage. Store conversation summaries, user preferences, recurring topics. Cache frequent queries. This is where memory matters most — coaching and personal AI live or die on continuity.
+- **Deep thinker**: Lighter memory. Store project context, code preferences, past architectural decisions. Don't cache — their queries are too unique. Instead, offer explicit "pin this context" for repo/project details.
+
+**Fallback without Pinecone**: If Pinecone is unavailable or not yet implemented, fall back to D1 + simple keyword matching (existing Project Memory pattern). Lower quality but functional. Pinecone is a Phase 1.5 enhancement, not a Phase 1 blocker.
+
+**Future upgrade path**: When Cloudflare Vectorize leaves beta and pricing stabilizes, migrate from Pinecone to Vectorize for a fully edge-native stack. The abstraction layer in `memory-rag.ts` makes this a provider swap, not a rewrite.
+ +--- + +## 6. Data Training Transparency + +**Non-negotiable**: Storia's trust-first philosophy requires full transparency about which free providers use data for training. + +### 6.1 Provider Training Policies + +| Provider | Uses Data for Training? | Opt-Out Available? | +|----------|------------------------|--------------------| +| OpenRouter (free) | No (per provider ToS) | N/A | +| Groq | No | N/A | +| Cerebras | Unclear (no explicit policy) | Unknown | +| Google AI Studio | **Yes** (outside UK/CH/EEA/EU) | No (free tier only) | +| Mistral (Experiment) | **Yes** (opted in by default) | No (Experiment plan requires it) | +| Cloudflare Workers AI | No | N/A | +| Cohere | No (trial/production) | N/A | + +### 6.2 UI Disclosure + +Models from providers that use data for training must show a persistent warning: + +``` +⚠️ This free model may use your conversations for training. + Your data is not encrypted or private on this provider. + [Use a different free model] [Add your own key] +``` + +The warning should be: +- Shown in the model selector next to affected models +- Shown in the ClawRouter badge when an affected model is active +- Dismissable per session but re-shown on new sessions +- Linkable to a detailed explanation page + +### 6.3 Geographic Handling + +For Google AI Studio specifically, if Storia has access to user location (from ipapi integration planned in Free APIs catalog), it can auto-select: + +- EU/UK/CH users → Google AI Studio is safe (no training) +- Other users → Show warning, or prefer non-Google free models by default + +--- + +## 7. Model Playground Integration (Phase 2) + +The planned Model Playground becomes significantly more powerful with free models—users can benchmark models without spending anything. + +### 7.1 "Free Model Arena" + +``` +┌──────────────────────────────────────────────┐ +│ 🏟️ Free Model Arena │ +│ │ +│ Compare free models side-by-side. No API │ +│ keys needed. 
Find your favorite, then go BYOK │ +│ for unlimited access. │ +│ │ +│ Prompt: "Explain quantum computing simply" │ +│ │ +│ ┌─────────────────┐ ┌─────────────────┐ │ +│ │ Llama 3.3 70B │ │ Gemini 2.5 Flash│ │ +│ │ via Groq │ │ via Google AI │ │ +│ │ ⚡ 0.8s │ │ ⚡ 1.2s │ │ +│ │ │ │ │ │ +│ │ [response...] │ │ [response...] │ │ +│ │ │ │ │ │ +│ │ 👍 👎 │ │ 👍 👎 │ │ +│ └─────────────────┘ └─────────────────┘ │ +│ │ +│ 📊 Community votes: Llama wins 62% of matches│ +└──────────────────────────────────────────────┘ +``` + +### 7.2 "BYOK Savings Calculator" + +Show users exactly what they'd pay with their own keys versus what they get free: + +``` +┌──────────────────────────────────────────────┐ +│ 💰 What would today cost with BYOK? │ +│ │ +│ Your 47 free messages today would have cost: │ +│ • $0.00 with Groq (free tier, own key) │ +│ • $0.03 with DeepSeek V3 (own key) │ +│ • $0.18 with Claude Sonnet (own key) │ +│ • $0.42 with GPT-5.2 (own key) │ +│ │ +│ Tip: Many providers offer free API keys! │ +│ Groq, Google AI, Mistral—all free to start. │ +│ [Get Free API Keys Guide] │ +└──────────────────────────────────────────────┘ +``` + +--- + +## 8. 
Implementation Roadmap + +### Phase 1: MVP Free Tier — Groq + OpenRouter Only (6-8h) — Target: Beta Launch + +| Task | Effort | Owner | Priority | +|------|--------|-------|----------| +| Create `src/lib/free-router/` module (types, config, router) | 2h | Claude | 🔴 HIGH | +| Add `free_model_quotas` D1 migration + platform-wide caps | 1h | Claude | 🔴 HIGH | +| Integrate FreeModelRouter into `/api/llm-proxy/route.ts` | 2h | Claude | 🔴 HIGH | +| Add platform API keys to wrangler secrets (Groq + OR only) | 0.5h | PetrAnto | 🔴 HIGH | +| Extend ClawRouterBadge with free tier indicator + quota counter | 1h | Claude | 🔴 HIGH | +| Basic quota check endpoint `GET /api/free-tier/quota` | 0.5h | Claude | 🔴 HIGH | +| Buy OpenRouter $10 lifetime top-up (50 → 1,000 RPD) | $10 | PetrAnto | 🔴 HIGH | +| **FreeModelWatcher MVP**: cron probe + D1 logging + emergency core | 4h | Claude | 🔴 HIGH | +| **Graceful 404/429 auto-disable** in FreeModelRouter | 1h | Claude | 🔴 HIGH | + +**MVP outcome**: New users chat immediately. Quota tracking prevents abuse. **Watcher auto-disables broken models and falls back silently.** PetrAnto doesn't need to monitor anything day-to-day. 
+ +### Phase 1.5: Expand Providers + Watcher Intelligence + Memory (8-12h) — Target: 2-4 weeks after beta + +| Task | Effort | Owner | Priority | +|------|--------|-------|----------| +| Add Cerebras to FreeModelRouter (OpenAI-compatible) | 2h | Claude | 🟡 MEDIUM | +| **Full confidence scoring engine** (§10.4) | 3h | Claude | 🟡 MEDIUM | +| **Discovery auto-fetch** from provider /models APIs (§10.2) | 2h | Claude | 🟡 MEDIUM | +| **Moltbot alert integration** (§10.7) | 1h | Claude | 🟡 MEDIUM | +| **Pinecone free-tier integration** for memory/RAG (§5.10) | 3h | Claude | 🟡 MEDIUM | +| **Archetype detector** — classifier + "Quick Chat" / "Deep Mode" toggle (§5.2, §5.4) | 2h | Claude | 🟡 MEDIUM | +| Quota display widget in Cockpit | 1.5h | Codex | 🟡 MEDIUM | +| cheahjs repo RSS feed → SitMon (§10.10) | 0.5h | Claude | 🟢 LOW | + +**Phase 1.5 outcome**: System auto-discovers new free models, scores them, promotes/demotes without human intervention. Memory layer makes 8B models feel premium for returning users. Archetype-aware routing gives conversational users sub-second speed and vibe coders deep reasoning. 
+ +### Phase 2: Full Experience + Deep Mode Tier + Admin (16-22h) — Target: Post-Beta + +| Task | Effort | Owner | Priority | +|------|--------|-------|----------| +| **Deep Mode tier** ($3-5/mo) — Stripe integration, tier-based routing/quotas (§5.5) | 4h | Claude | 🟡 MEDIUM | +| Google AI Studio adapter (if quotas stabilize) | 8-10h | Claude | 🟡 MEDIUM | +| Free Model Arena in Model Playground | 6h | Claude + Codex | 🟡 MEDIUM | +| Gecko nudge system (quota-based triggers) | 3h | Claude | 🟡 MEDIUM | +| BYOK Savings Calculator widget | 2h | Codex | 🟡 MEDIUM | +| Data training transparency warnings (full UI) | 2h | Claude | 🟡 MEDIUM | +| "Get Free API Keys" guide page | 2h | Codex | 🟡 MEDIUM | +| **Admin: Watcher dashboard** (model list, scores, probe history, events) | 4h | Claude | 🟡 MEDIUM | +| **Admin: Manual override UI** (force-enable/disable, edit known issues) | 2h | Claude | 🟢 LOW | + +### Phase 3: Advanced Optimization (12-18h) — Target: Post-Launch + +| Task | Effort | Owner | Priority | +|------|--------|-------|----------| +| Semantic response caching (D1 + Pinecone embeddings) | 4h | Claude | 🟢 LOW | +| Community model voting/ratings | 4h | Claude + Codex | 🟢 LOW | +| Auto-scale per-user quotas based on total user count | 2h | Claude | 🟢 LOW | +| Migrate Pinecone → Cloudflare Vectorize (if pricing stabilizes) | 3h | Claude | 🟢 LOW | +| Archetype ML classifier (replace regex with lightweight model) | 4h | Claude | 🟢 LOW | + +--- + +## 9. Monitoring & Abuse Prevention + +### 9.1 Platform Key Protection + +Platform-provided API keys are a shared resource. 
Abuse vectors: + +| Threat | Mitigation | +|--------|------------| +| Single user exhausting daily quota | Per-user D1 quota tracking with hard limits | +| Platform-wide quota burn | Platform-wide daily caps per provider (§3.4) — stop routing at 70-80% utilization | +| Scripted/automated abuse | Cloudflare rate limiting (already deployed) + **CAPTCHA on signup** (Turnstile, free) | +| Bulk account creation | Email verification + optional phone verify for elevated free-tier limits | +| API key extraction via client | Keys stay server-side only—never sent to frontend | +| Free tier cost spiral | Env var caps per provider; PagerDuty/email alert on 80% platform-wide usage | +| Anonymous session abuse | Signed cookie + IP fingerprint; max 3-5 req/session before forced signup | + +### 9.2 Monitoring Dashboard (for PetrAnto) + +Track via existing SitMon or separate admin panel: + +**Critical metrics (check daily during beta)**: +- Per-provider utilization % (are we hitting platform-wide caps?) +- Provider error rates, 429s, and latency (early warning for quota cuts) +- Per-user usage distribution (is anyone dominating?) +- **Conversion rate: free tier → BYOK** (the key business metric) + +**Secondary metrics (check weekly)**: +- Total free-tier requests/day (all users combined) +- Model-level usage distribution (which free models are most popular?) +- Fallback chain trigger frequency (how often does primary provider fail?) +- Average requests before BYOK conversion (how many free messages until users add keys?) + +**Alerts** (automated): +- Provider utilization > 70%: Warning to PetrAnto +- Provider utilization > 90%: Auto-reduce per-user allocations by 20% +- Provider returning > 10% error rate: Flag for investigation +- New user conversion rate < 5%: Review onboarding funnel + +### 9.3 Cost Projections + +Free tier costs to Storia: **$10 one-time + $0/month ongoing** for API calls. 
+ +| Cost Item | Amount | Frequency | ROI | +|-----------|--------|-----------|-----| +| OpenRouter $10 lifetime top-up | $10 | **One-time (do in Phase 1)** | 20x daily limit (50 → 1,000 RPD) | +| Groq API key | $0 | Free | 14,400 RPD on 8B models | +| Cerebras API key | $0 | Free | 14,400 RPD, 1M tokens/day | +| D1 storage for quotas | $0 | Free tier covers it | Negligible rows | +| Workers compute for routing | $0 | Already in existing proxy | No incremental cost | + +The $10 OpenRouter top-up is the single best investment in the entire spec. Do it before beta launch. Total platform cost for free tier: **$10 forever.** + +--- + +## 10. Automated Maintenance & Self-Healing + +**Design goal**: PetrAnto spends **zero hours per week** on free-tier maintenance once the system is tuned. The platform discovers, validates, activates, and deactivates free models autonomously, with alerts only for decisions that require human judgment (privacy policy changes, major provider shutdowns). + +### 10.1 Architecture — The FreeModelWatcher + +A Cloudflare Workers Cron Trigger (free tier supports 5 cron triggers) runs every 6 hours, performing three jobs: Discovery, Health Probing, and Self-Healing. + +``` +┌─────────────────────────────────────────────────────────────┐ +│ FreeModelWatcher (Cron Trigger — every 6h) │ +│ │ +│ ┌──────────────┐ ┌──────────────┐ ┌───────────────────┐ │ +│ │ 1. Discovery │ │ 2. Probing │ │ 3. 
Self-Healing │ │ +│ │ │ │ │ │ │ │ +│ │ Fetch model │→│ Send test │→│ Score + activate/ │ │ +│ │ lists from │ │ prompt to │ │ deactivate models │ │ +│ │ provider APIs │ │ each model │ │ + alert on drift │ │ +│ └──────────────┘ └──────────────┘ └───────────────────┘ │ +│ │ │ │ +│ ▼ ▼ │ +│ ┌────────────┐ ┌──────────────────┐ │ +│ │ D1: probes │ │ D1: model_registry│ │ +│ │ (history) │ │ (active/staged) │ │ +│ └────────────┘ └──────────────────┘ │ +└─────────────────────────────────────────────────────────────┘ + │ + ▼ + ┌──────────────────────┐ + │ Alerts (only on │ + │ human-needed events) │ + │ • Telegram bot │ + │ • SitMon dashboard │ + └──────────────────────┘ +``` + +**Location**: `src/lib/free-router/watcher.ts` + `src/workers/free-model-watcher.ts` (Cron Trigger) + +### 10.2 Job 1: Discovery — Fetching Available Free Models + +Every 6 hours, the watcher queries provider APIs for currently available free models. + +```typescript +// src/lib/free-router/discovery.ts + +interface DiscoveredModel { + providerId: string; + modelId: string; + displayName: string; + contextWindow: number; + pricing: { prompt: number; completion: number }; // $0 = free + capabilities: string[]; + lastSeen: string; // ISO date +} + +const DISCOVERY_SOURCES = { + openrouter: { + // OpenRouter exposes all models with pricing via API + url: 'https://openrouter.ai/api/v1/models', + filter: (model: any) => { + const promptPrice = parseFloat(model.pricing?.prompt ?? '1'); + const completionPrice = parseFloat(model.pricing?.completion ?? 
'1');
+      return promptPrice === 0 && completionPrice === 0;
+    },
+    // Also check: openrouter.ai/api/v1/models?supported_parameters=tools
+    // for tool-calling support filtering
+  },
+  groq: {
+    // Groq exposes models via OpenAI-compatible endpoint
+    url: 'https://api.groq.com/openai/v1/models',
+    // All Groq models are currently free — filter by active status
+    filter: (model: any) => model.active !== false,
+  },
+  cerebras: {
+    url: 'https://api.cerebras.ai/v1/models',
+    filter: (model: any) => true, // All currently free
+  },
+};
+
+async function discoverFreeModels(): Promise<DiscoveredModel[]> {
+  const discovered: DiscoveredModel[] = [];
+  for (const [providerId, source] of Object.entries(DISCOVERY_SOURCES)) {
+    try {
+      const res = await fetch(source.url, {
+        headers: { Authorization: `Bearer ${getProviderKey(providerId)}` },
+      });
+      if (!res.ok) continue;
+      const data = await res.json();
+      const models = (data.data || data).filter(source.filter);
+      discovered.push(...models.map(m => normalize(providerId, m)));
+    } catch (e) {
+      // Log failure, don't crash — other providers still run
+      logDiscoveryError(providerId, e);
+    }
+  }
+  return discovered;
+}
+```
+
+**OpenRouter special case**: OpenRouter also provides a meta-route `openrouter/auto` that auto-selects the best free model. The watcher should track which model `auto` resolves to, as this reflects OpenRouter's own quality ranking.
+
+### 10.3 Job 2: Health Probing — Validating Models Actually Work
+
+Discovery tells us what *should* be available. Probing tells us what *actually works right now*.
+
+```typescript
+// src/lib/free-router/prober.ts
+
+interface ProbeResult {
+  modelId: string;
+  providerId: string;
+  success: boolean;
+  latencyMs: number;
+  errorCode?: number; // 404, 429, 403, 500, etc.
+  errorMessage?: string;
+  respondedModelId?: string; // What model actually responded (detect aliases)
+  timestamp: string;
+}
+
+const PROBE_PROMPT = {
+  model: '', // set per-probe
+  messages: [{ role: 'user', content: 'Respond with exactly one word: OK' }],
+  max_tokens: 5,
+  temperature: 0,
+};
+
+async function probeModel(
+  providerId: string,
+  modelId: string
+): Promise<ProbeResult> {
+  const start = Date.now();
+  try {
+    const res = await fetch(getEndpoint(providerId), {
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        Authorization: `Bearer ${getProviderKey(providerId)}`,
+      },
+      body: JSON.stringify({ ...PROBE_PROMPT, model: modelId }),
+    });
+
+    const latencyMs = Date.now() - start;
+
+    if (!res.ok) {
+      return {
+        modelId, providerId, success: false,
+        latencyMs, errorCode: res.status,
+        errorMessage: await res.text().catch(() => ''),
+        timestamp: new Date().toISOString(),
+      };
+    }
+
+    const data = await res.json();
+    return {
+      modelId, providerId, success: true,
+      latencyMs,
+      respondedModelId: data.model, // Detect silent model swaps
+      timestamp: new Date().toISOString(),
+    };
+  } catch (e) {
+    return {
+      modelId, providerId, success: false,
+      latencyMs: Date.now() - start,
+      errorMessage: e instanceof Error ? e.message : 'Unknown',
+      timestamp: new Date().toISOString(),
+    };
+  }
+}
+```
+
+**Probe budget**: Each probe costs 1 free-tier request. With ~30 models across 3 providers, that's ~30 requests/probe cycle × 4 cycles/day = ~120 requests/day on the platform keys. Use the lowest-limit key (OpenRouter) sparingly — probe only the top 5-8 OpenRouter models, not all 30+. Groq and Cerebras have enough headroom to probe all models.
+
+### 10.4 Job 3: Self-Healing — Confidence Scoring & Auto-Management
+
+Each model gets a **confidence score** (0-100) that determines its routing status.
+ +```typescript +// src/lib/free-router/scorer.ts + +function calculateScore( + model: DiscoveredModel, + recentProbes: ProbeResult[], // Last 24h of probes + knownIssues: KnownIssue[], // Privacy, deprecation flags +): { score: number; status: 'active'|'staged'|'disabled'; reasons: string[] } { + + let score = 50; // Base score for any discovered model + const reasons: string[] = []; + + // === Positive signals === + const successRate = recentProbes.filter(p => p.success).length + / Math.max(recentProbes.length, 1); + if (successRate >= 1.0) { score += 25; reasons.push('+25: 100% probe success (24h)'); } + else if (successRate >= 0.75) { score += 15; reasons.push('+15: 75%+ probe success'); } + else if (successRate < 0.5) { score -= 20; reasons.push('-20: <50% probe success'); } + + // Latency (median of successful probes) + const latencies = recentProbes.filter(p => p.success).map(p => p.latencyMs); + const med = median(latencies); + if (med && med < 2000) { score += 10; reasons.push('+10: fast (<2s)'); } + else if (med && med > 10000) { score -= 10; reasons.push('-10: slow (>10s)'); } + + // Provider stability bonus + if (['groq', 'openrouter'].includes(model.providerId)) { + score += 10; reasons.push('+10: stable provider'); + } + + // Feature support bonuses + if (model.capabilities?.includes('tool_use')) { score += 5; } + if (model.capabilities?.includes('vision')) { score += 5; } + + // === Negative signals === + const privacyIssue = knownIssues.find(i => + i.type === 'data-training' && matchesModel(i, model)); + if (privacyIssue) { score -= 30; reasons.push('-30: data used for training'); } + + const deprecation = knownIssues.find(i => + i.type === 'deprecation' && matchesModel(i, model)); + if (deprecation) { score -= 50; reasons.push('-50: deprecated'); } + + // Consecutive failures + if (countConsecutiveFailures(recentProbes) >= 3) { + score -= 30; reasons.push('-30: 3+ consecutive failures'); + } + + // Hard disable on 404 "model not found" + const 
notFound = recentProbes.some(p => + p.errorCode === 404 || p.errorMessage?.includes('not found')); + if (notFound) { score = 0; reasons.push('=0: model not found (404)'); } + + // === Status determination === + const status = score >= 85 ? 'active' : score >= 60 ? 'staged' : 'disabled'; + return { score, status, reasons }; +} +``` + +**Status transitions**: + +| From | To | Condition | Speed | +|------|----|-----------|-------| +| staged → active | Score ≥ 85 for **2 consecutive cycles** | Slow (12h minimum) — prevents flickering | +| active → disabled | 404 or 3+ consecutive failures | **Immediate** — fail fast | +| active → staged | Score drops below 85 | Next cycle | +| disabled → staged | Score recovers above 60 | Next cycle | + +**Key rule**: Promote slowly, demote instantly. Users never see a model that just started working 6 hours ago — it needs to prove itself over 12h. But a broken model is pulled within one cycle. + +### 10.5 Emergency Core — The Unhackable Fallback + +These models are **always available** and cannot be auto-disabled. They are hardcoded and only changed by code deploy. + +```typescript +const EMERGENCY_CORE = [ + { provider: 'groq', model: 'llama-3.1-8b-instant' }, + { provider: 'openrouter', model: 'meta-llama/llama-3.3-70b-instruct:free' }, + { provider: 'cloudflare', model: '@cf/meta/llama-3.3-70b-instruct-fp8' }, +]; +``` + +If the entire dynamic model list degrades, routing falls to emergency core. Users always get *something*. 
+ +### 10.6 D1 Schema for Maintenance Data + +```sql +-- Model registry with dynamic status +CREATE TABLE IF NOT EXISTS free_model_registry ( + id TEXT PRIMARY KEY, + provider_id TEXT NOT NULL, + model_id TEXT NOT NULL, + display_name TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'staged', + confidence_score INTEGER NOT NULL DEFAULT 50, + score_reasons TEXT, -- JSON array + capabilities TEXT, -- JSON array + context_window INTEGER, + data_training_risk TEXT DEFAULT 'unknown', + first_seen_at TEXT NOT NULL DEFAULT (datetime('now')), + last_seen_at TEXT NOT NULL DEFAULT (datetime('now')), + last_probe_at TEXT, + last_status_change TEXT NOT NULL DEFAULT (datetime('now')), + disabled_reason TEXT, + UNIQUE(provider_id, model_id) +); + +-- Probe history (rolling 7 days, older rows purged weekly) +CREATE TABLE IF NOT EXISTS free_model_probes ( + id TEXT PRIMARY KEY DEFAULT (lower(hex(randomblob(16)))), + provider_id TEXT NOT NULL, + model_id TEXT NOT NULL, + success INTEGER NOT NULL, + latency_ms INTEGER, + error_code INTEGER, + error_message TEXT, + responded_model_id TEXT, + probed_at TEXT NOT NULL DEFAULT (datetime('now')) +); +CREATE INDEX idx_probes_model ON free_model_probes(provider_id, model_id, probed_at); + +-- Audit trail (never purged) +CREATE TABLE IF NOT EXISTS free_model_events ( + id TEXT PRIMARY KEY DEFAULT (lower(hex(randomblob(16)))), + event_type TEXT NOT NULL, + provider_id TEXT, + model_id TEXT, + old_status TEXT, + new_status TEXT, + old_score INTEGER, + new_score INTEGER, + details TEXT, + created_at TEXT NOT NULL DEFAULT (datetime('now')) +); +CREATE INDEX idx_events_time ON free_model_events(created_at); +``` + +### 10.7 Alerting — Only When Humans Are Needed + +The system handles 80-90% of changes silently. Alerts fire via **moltbot Telegram** (already built) + SitMon. + +| Event | Level | Auto-Action | Human Needed? 
| +|-------|-------|-------------|---------------| +| New model discovered | ℹ️ | Staged (probes begin) | No | +| Model auto-activated (85+, 2 cycles) | ℹ️ | Added to routing pool | No | +| Model auto-disabled (404 / 3+ failures) | ⚠️ | Removed, fallback takes over | No (unless top model) | +| Provider > 70% utilization | ⚠️ | Auto-reduce allocation 10% | Review if persistent | +| Provider > 90% utilization | 🔴 | Auto-reduce 30%, skip in routing | Check for limit cuts | +| **All models from a provider failing** | 🔴 | Emergency core activates | **YES** — investigate | +| **Data-training policy change detected** | 🔴 | Flagged, NOT auto-disabled | **YES** — decide | +| Free model count drops > 30% in 24h | ⚠️ | No auto-action | **YES** — policy change? | +| Silent model swap detected | ⚠️ | Flag, keep routing | Review | + +**Alert format** (via moltbot): + +``` +🦎 Vex [FreeModelWatcher] +━━━━━━━━━━━━━━━━━━━━━ +⚠️ Model auto-disabled + +Provider: OpenRouter +Model: stepfun/stepfun-3.5-flash:free +Reason: 3 consecutive probe failures (404) +Score: 85 → 0 +Action: Removed from routing. Fallback active. +Human action: None needed. +━━━━━━━━━━━━━━━━━━━━━ +``` + +### 10.8 Graceful UI Degradation + +| Scenario | UI Behavior | +|----------|------------| +| 1-2 models disabled | Invisible — fallback chain handles silently | +| > 20% disabled | Subtle banner: "Some free models temporarily unavailable" | +| Provider fully down | Badge: "🆓 Free · [fallback provider]" | +| All free models down | Emergency core only. Banner: "Limited mode — add your API key" (conversion moment) | +| Model renamed/aliased | Watcher detects `responded_model_id ≠ requested`, auto-updates display | + +### 10.9 Known Issues Database — The Manual Override Layer + +Some things can't be auto-detected: ToS changes, privacy policy shifts, geo-restrictions. These live in a config file updated via code deploy. This is the **only part that needs occasional human attention** — quarterly deep audits. 
+ +```typescript +// src/lib/free-router/known-issues.ts +const KNOWN_ISSUES: KnownIssue[] = [ + { + type: 'data-training', + providerId: 'google-ai', + severity: 'warning', + description: 'Uses data for training outside UK/CH/EEA/EU.', + addedAt: '2026-02-11', + }, + { + type: 'data-training', + providerId: 'mistral-experiment', + severity: 'critical', + description: 'Experiment plan requires opt-in to data training.', + addedAt: '2026-02-11', + }, + { + type: 'deprecation', + modelId: 'llama-guard-3-8b', + providerId: 'groq', + severity: 'warning', + description: 'Scheduled removal. See console.groq.com/docs/deprecations.', + addedAt: '2026-02-11', + expiresAt: '2026-04-01', // Auto-removes after date + }, +]; +``` + +### 10.10 Community Intelligence — cheahjs Repo Sync + +The `cheahjs/free-llm-api-resources` repo (6.6k ★) is the best community source for free LLM changes. Rather than parsing its markdown (fragile), feed its commit RSS into the existing Situation Monitor: + +```typescript +// Add to SitMon RSS feeds +const FREE_LLM_WATCH = { + url: 'https://github.com/cheahjs/free-llm-api-resources/commits/main.atom', + category: 'free-models', + checkInterval: '24h', +}; +``` + +When a new commit is detected, it appears in the SitMon feed. PetrAnto sees it passively alongside other news — no separate checking needed. + +### 10.11 Cron Configuration + +```toml +# wrangler.toml +[triggers] +crons = [ + "0 */6 * * *", # Every 6h: discovery + probe + score + "0 3 * * 0", # Weekly Sun 3AM: purge probe rows >7 days +] +``` + +**Resource cost**: ~30-40 HTTP requests/cycle, ~50 D1 rows/cycle. Well within free tier. + +--- + +## 11. Competitive Positioning + +### 11.1 How This Differentiates Storia + +| Platform | Free Access? | BYOK? | Model Routing? 
| +|----------|-------------|-------|----------------| +| ChatGPT Free | Yes (GPT-4o mini) | No | No | +| Claude Free | Yes (Sonnet, limited) | No | No | +| Gemini Free | Yes (Flash) | No | No | +| Poe | Yes (limited) | No | No | +| **Storia** | **Yes (20-30+ models)** | **Yes** | **Yes (ClawRouter)** | + +No other platform offers free access to 20-30 models across multiple providers with automatic routing AND the option to bring your own keys for unlimited access. This is Storia's unique position: **try everything free, then own your AI experience with BYOK.** + +### 11.2 Marketing Angle + +> "Start chatting with 20+ free AI models instantly. When you're ready, bring your own API keys for unlimited, zero-markup access. No subscription required." + +This message hits three pain points: cost (free), choice (20+ models across providers), and control (BYOK). + +### 11.3 Savings Calculator Caveat (per Grok review) + +Be careful with the savings calculator — many "free" own-key providers (Groq, Google, Mistral) already offer generous free tiers individually. The savings comparison should focus on premium models (Claude, GPT-5, Grok) rather than implying all BYOK usage costs money. Frame it as: "Here's what this conversation would cost on premium models → but you got it free." + +--- + +## 12. Open Questions — With Recommendations + +1. **Should free tier require login?** → **YES** (both Claude and Grok agree). Quota tracking requires user identity. Anonymous access complicates abuse prevention massively. However, consider a **session-only anonymous tier** with very low limits (3-5 req/session) to let visitors test before even creating an account → forces signup for serious use → better quota control and conversion tracking. + +2. **OpenRouter $10 top-up**: → **YES, before beta ends** (both agree). It 20x's the daily limit from 50 to 1,000. For $10 one-time this is the highest-ROI investment in the entire spec. Do it in Phase 1. + +3. 
**Workers AI vs external providers**: → **Reserve for max-speed/edge fallback only** (both agree). Quantized models are noticeably lower quality. Don't default to it for quality-critical paths. Use as the last resort in the fallback chain. + +4. **Per-user quota generosity during beta**: → Start with the "Beta" column allocations in §3.4. Monitor actual burn rates for 2-4 weeks. Tune down to "Post-Launch" allocations only when user count exceeds ~50 and provider utilization consistently hits 60%+. + +5. **Gecko personality on free tier**: → **Minimal on free, full on BYOK** (strong consensus). This is a natural conversion lever. Free tier gets helpful but plain responses. BYOK unlocks Zori/Kai/Vex/Razz personalities. After quota nudge, offer a "preview" of gecko personality to show what they're missing. + +6. **Anonymous session tier** (new — per Grok): → Consider allowing 3-5 free requests per browser session WITHOUT login. This lowers the barrier to "aha moment" even further. Session tracking via signed cookie (no D1 row needed). After 3-5 messages: "Create a free account to keep chatting!" This is a proven SaaS funnel pattern. + +--- + +## 13. 
Quick Reference — Free Model Recommendations by Use Case + +| Use Case | Best Free Model | Provider | Phase | Why | +|----------|----------------|----------|-------|-----| +| General chat | Llama 3.1 8B Instant | Groq | 1 | Fastest, massive quota (14,400 RPD) | +| Quality chat | Llama 3.3 70B Instruct | Groq / OpenRouter | 1 | Solid all-rounder (GPT-4o-mini class) | +| Coding | Devstral 2 / Qwen3 Coder | OpenRouter | 1 | Mistral's agentic coder + Qwen specialist | +| Coding (heavy) | Qwen3 Coder 480B | Cerebras | 1.5 | Largest free coding model (100 RPD) | +| Reasoning/math | DeepSeek R1-0528 | OpenRouter | 1 | Purpose-built CoT | +| Deep reasoning | Hermes 3 Llama 405B | OpenRouter | 1 | Largest free instruct, rivals frontier | +| Heavy analysis | Qwen3 235B A22B | Cerebras | 1.5 | Largest free MoE (contention risk) | +| Creative writing | Llama 3.3 70B | OpenRouter | 1 | Best creative output among free | +| Translation | Mistral Small 3.2 | OpenRouter | 1 | 80+ languages | +| Research (long docs) | Gemini 2.5 Flash | Google AI | 2 | 1M token context (quota volatile) | +| Quick drafts | Llama 3.1 8B Instant | Groq | 1 | Sub-second responses | +| Multimodal (images) | Gemini 2.5 Flash | Google AI | 2 | Best free vision (EU users preferred) | +| Edge/fallback | Llama 3.3 70B FP8 | Workers AI | 1 | Zero external latency | + +**⚠️ Model availability changes frequently. The FreeModelWatcher (§10) handles this automatically — this table is a snapshot for initial routing configuration only.** + +--- + +## 14. 
Changelog + +| Version | Date | Changes | +|---------|------|---------| +| 1.0 | 2026-02-11 | Initial spec (Claude Opus 4.6) | +| 1.1 | 2026-02-11 | Grok review incorporated: conservative quotas, Google AI demoted to Phase 1.5, Groq 8B emphasized for speed, platform-wide daily caps, anonymous session tier, model count corrected (20-30 not 40+), implementation phased as MVP(Groq+OR) → 1.5(Cerebras) → 2(Google+Arena) | +| 1.2 | 2026-02-11 | Added §10 Automated Maintenance & Self-Healing: FreeModelWatcher cron (discovery, probing, confidence scoring), emergency core fallback, D1 schema for model registry/probes/events, moltbot alerting, graceful UI degradation, known issues database, cheahjs repo RSS sync. Maintenance added to Phase 1/1.5/2 roadmaps. Target: zero weekly manual maintenance. | +| 1.3 | 2026-02-11 | Rewrote §5 as User Archetypes & Routing Intelligence (per Grok segmentation analysis): Conversational (70-80% DAU, sub-2s latency, 8B-27B models) vs Vibe Coder/Deep Thinker (20-30% DAU, 30s-5min latency OK, 70B+ models). Added archetype detector classifier, dual routing tables, "Quick Chat" vs "Deep Mode" UI toggle, flywheel economics, archetype-aware quota budgeting (token-weighted), per-archetype fallback chains, per-archetype prompt optimization, hybrid BYOK strategy per archetype. | +| 1.4 | 2026-02-11 | Added: (1) Deep Mode tier at $3-5/mo between Free and Pro — captures vibe coders willing to pay a little without full BYOK setup; (2) Hermes 3 Llama 405B (largest free instruct) and Devstral 2 (Mistral's agentic coder) added to routing tables, fallback chains, and provider catalog; (3) §5.10 Memory & RAG layer using Pinecone free tier (100K vectors, $0/mo) + Workers AI embeddings — makes 8B models feel premium for returning users via context injection. Roadmap updated: Pinecone + archetype detector in Phase 1.5, Deep Mode Stripe integration in Phase 2, Pinecone → Vectorize migration in Phase 3. | + +--- + +*End of spec. 
This document should be added to `claude-share/brainstorming/` and referenced from GLOBAL_ROADMAP.md under a new "Free Tier Integration" section.* diff --git a/claude-share/core/next_prompt.md b/claude-share/core/next_prompt.md new file mode 100644 index 000000000..49bc6e7c3 --- /dev/null +++ b/claude-share/core/next_prompt.md @@ -0,0 +1,62 @@ +# Next Task for AI Session + +> Copy-paste this prompt to start the next AI session. +> After completing, update this file to point to the next task. + +**Last Updated:** 2026-02-20 (Phase 5.5 complete — web_search tool added) + +--- + +## Current Task: Phase 4.3 — Tool Result Caching + +### Goal + +Cache identical tool call results (same function + arguments) within a task session to avoid redundant API calls. For example, if `get_weather` is called twice with the same lat/lon, return the cached result on the second call. + +### Context + +- Phase 4.2 complete: real tokenizer integrated +- Phase 2.4 complete: Acontext dashboard in admin UI +- Tool execution happens in `src/durable-objects/task-processor.ts` and `src/openrouter/tools.ts` +- 15 tools total (including web_search), 12 are read-only (safe to cache), 3 are mutation tools (should not cache) +- `PARALLEL_SAFE_TOOLS` whitelist already identifies which tools are read-only +- This is a Codex-assigned task + +### Files to Modify + +| File | What to change | +|------|---------------| +| `src/durable-objects/task-processor.ts` | Add in-memory cache keyed by tool name + arguments hash | +| `src/openrouter/tools.ts` | Consider cache-hit path in tool execution | +| Tests | Add tests for cache hit, cache miss, mutation tool bypass | + +### Queue After This Task + +| Priority | Task | Effort | Notes | +|----------|------|--------|-------| +| Current | 4.3: Tool result caching | Medium | Cache identical tool calls (Codex) | +| Next | 4.4: Cross-session context continuity | Medium | Resume tasks days later (Claude) | +| Then | Audit Phase 2: P2 guardrails | Medium | Multi-agent 
review, tool result validation | + +--- + +## Recently Completed + +| Date | Task | AI | Session | +|------|------|----|---------| +| 2026-02-20 | Phase 5.5: Web search tool (Brave Search API, cache, key plumbing, tests) | Codex (GPT-5.2-Codex) | codex-phase-5-5-web-search-001 | +| 2026-02-20 | Phase 4.2: Real tokenizer (gpt-tokenizer cl100k_base, heuristic fallback) | Claude Opus 4.6 | session_01SE5WrUuc6LWTmZC8WBXKY4 | +| 2026-02-20 | Sprint 48h: Phase budget circuit breakers (plan=8s, work=18s, review=3s) | Claude Opus 4.6 | session_01AtnWsZSprM6Gjr9vjTm1xp | +| 2026-02-20 | Sprint 48h: Parallel tools allSettled + PARALLEL_SAFE_TOOLS whitelist | Claude Opus 4.6 | session_01AtnWsZSprM6Gjr9vjTm1xp | +| 2026-02-19 | Phase 4.1 Audit: context-budget hardening + edge-case tests | Codex (GPT-5.2-Codex) | codex-phase-4-1-audit-001 | +| 2026-02-18 | Phase 4.1: Token-budgeted context retrieval | Claude Opus 4.6 | 018M5goT7Vhaymuo8AxXhUCg | +| 2026-02-18 | Phase 2.5.9: Holiday awareness (Nager.Date) | Claude Opus 4.6 | 01SE5WrUuc6LWTmZC8WBXKY4 | +| 2026-02-18 | Phase 2.3: Acontext observability (REST client + /sessions) | Claude Opus 4.6 | 01SE5WrUuc6LWTmZC8WBXKY4 | +| 2026-02-18 | P1 guardrails + /learnings command (Phase 3.3 + audit P1) | Claude Opus 4.6 | 01SE5WrUuc6LWTmZC8WBXKY4 | +| 2026-02-11 | Phase 3.2: Structured task phases (Plan → Work → Review) | Claude Opus 4.6 | 019jH8X9pJabGwP2untYhuYE | +| 2026-02-11 | UX fixes: /start redesign, bot menu, briefing location, news links, crypto fix, Acontext key | Claude Opus 4.6 | 018gmCDcuBJqs9ffrrDHHBBd | +| 2026-02-10 | Fix auto-resume counter + revert GLM free tool flag | Claude Opus 4.6 | 018gmCDcuBJqs9ffrrDHHBBd | +| 2026-02-10 | 6 bot improvements: GLM tools, 402 handling, cross-task ctx, time cap, tool-intent, parallel prompt | Claude Opus 4.6 | 018gmCDcuBJqs9ffrrDHHBBd | +| 2026-02-10 | Phase 3.1+3.4: Compound learning loop + prompt injection | Claude Opus 4.6 | 018gmCDcuBJqs9ffrrDHHBBd | +| 2026-02-09 | Phase 1.5: 
Structured output support (json: prefix) | Claude Opus 4.6 | 013wvC2kun5Mbr3J81KUPn99 | +| 2026-02-09 | Phase 1.4: Vision + tools unified + /help update | Claude Opus 4.6 | 013wvC2kun5Mbr3J81KUPn99 | diff --git a/claude-share/core/prompts/orchestrator.md b/claude-share/core/prompts/orchestrator.md new file mode 100644 index 000000000..c3d0c472c --- /dev/null +++ b/claude-share/core/prompts/orchestrator.md @@ -0,0 +1,44 @@ +# Session Start Prompt + +> Paste this into a NEW Claude Code conversation on **moltworker** to pick up development. + +**Last Updated:** 2026-02-08 + +--- + +## Prompt to copy: + +``` +You are a dev session bot for the Moltworker project (public repo: PetrAnto/moltworker). + +### Your job: + +1. Read ALL of these files to understand current state: + - claude-share/core/GLOBAL_ROADMAP.md — project roadmap + changelog + - claude-share/core/WORK_STATUS.md — current sprint state + priorities + - claude-share/core/next_prompt.md — the NEXT task to work on + - claude-share/core/SPECIFICATION.md — feature specifications + - claude-share/core/SYNC_CHECKLIST.md — post-task checklist (MUST follow) + - claude-share/core/claude-log.md — session history for context + - claude-share/core/AI_CODE_STANDARDS.md — coding standards + - claude-share/core/storia-free-apis-catalog.md — free APIs catalog + - CLAUDE.md — project rules and commands + +2. Read the task defined in next_prompt.md and execute it: + - Create a feature branch: claude/- + - Implement the task following CLAUDE.md rules + - Run `npm test` and `npm run typecheck` + - Follow SYNC_CHECKLIST.md after completion (update logs, roadmap, status, next_prompt) + - Commit with proper format: (): + - Push to your feature branch (never to main) + +3. After task completion, update next_prompt.md to point to the next task in the queue. 
+ +### Rules: +- All work is on the public repo — no private repos, no secrets in docs +- Follow SYNC_CHECKLIST.md after EVERY task — no exceptions +- Run tests before pushing — broken tests = blocked PR +- One logical change per commit +- Update ALL relevant core docs before finishing +- If the task is too large for one session, complete what you can, update docs with progress, and set next_prompt.md to continue the remaining work +``` diff --git a/claude-share/core/prompts/session-handoff.md b/claude-share/core/prompts/session-handoff.md new file mode 100644 index 000000000..5105cf111 --- /dev/null +++ b/claude-share/core/prompts/session-handoff.md @@ -0,0 +1,41 @@ +# Session Handoff Prompt + +> Paste this into a NEW Claude Code conversation when the previous session ran out of context or got stuck. + +**Last Updated:** 2026-02-08 + +--- + +## Prompt to copy: + +``` +The previous Claude Code session on Moltworker got stuck or ran out of context. +Your job is to pick up where it left off. + +### Steps: + +1. Read these files to understand what was in progress: + - claude-share/core/WORK_STATUS.md — current sprint + what's in_progress + - claude-share/core/next_prompt.md — task that was being worked on + - claude-share/core/claude-log.md — last session entry for context + +2. Check git state: + - `git branch -a` — find the in-progress feature branch + - `git log --oneline -10` — see recent commits + - `git status` — check for uncommitted work + - `git diff` — check for unstaged changes + +3. Resume the task: + - Switch to the existing feature branch (or create one if none exists) + - Continue from where the last session stopped + - If changes were staged but not committed, commit them first + - Complete the remaining work + +4. After completion, follow claude-share/core/SYNC_CHECKLIST.md to update all docs. 
+ +### Rules: +- Do NOT start over — build on what was already done +- Check for uncommitted work before making new changes +- If the previous session left partial code, review and fix before continuing +- Push to the existing feature branch, never to main +``` diff --git a/claude-share/core/storia-free-apis-catalog.md b/claude-share/core/storia-free-apis-catalog.md new file mode 100644 index 000000000..2f8406606 --- /dev/null +++ b/claude-share/core/storia-free-apis-catalog.md @@ -0,0 +1,295 @@ +# Free Public APIs for Storia — Integration Catalog + +**Date**: 2026-02-08 +**Source**: public-apis/public-apis (383k ⭐), mixedanalytics.com, publicapis.io, and targeted research +**Filter**: Free tier or no auth required · REST/JSON · Cloudflare Workers compatible + +--- + +## How to Read This + +Each API is mapped to a **Storia feature** with effort estimate and priority. +🟢 = No auth needed (call from browser) +🔑 = Free API key required (call from server) +✅ = Already using + +--- + +## 1. Situation Monitor — News & Data Feeds + +The Situation Monitor already has RSS + CoinGecko. These APIs would make it significantly richer. 
+ +### Crypto & DeFi (Expand beyond CoinGecko) + +| API | Auth | What It Adds | URL | +|-----|------|-------------|-----| +| ✅ CoinGecko | 🟢 | Already integrated — prices, market cap | `api.coingecko.com/api/v3/` | +| CoinCap | 🟢 | Real-time prices via WebSocket + REST, 2000+ assets | `api.coincap.io/v2/assets` | +| CoinPaprika | 🟢 | Coin details, exchanges, historical, people behind projects | `api.coinpaprika.com/v1/coins/btc-bitcoin` | +| CoinLore | 🟢 | Simple ticker data, global stats | `api.coinlore.net/api/tickers/` | +| DEX Screener | 🟢 | On-chain DEX pair data across all chains | `api.dexscreener.com/latest/dex/search?q=WBNB` | +| GeckoTerminal | 🟢 | DEX pool data (by CoinGecko team) | `api.geckoterminal.com/api/v2/networks` | +| Binance (public) | 🟢 | 24h ticker, order book, trades | `api4.binance.com/api/v3/ticker/24hr` | +| Gemini | 🟢 | BTC/ETH market data | `api.gemini.com/v2/ticker/btcusd` | +| Kraken | 🟢 | Trades, OHLC, order book | `api.kraken.com/0/public/Trades?pair=ltcusd` | +| KuCoin | 🟢 | Market stats per symbol | `api.kucoin.com/api/v1/market/stats?symbol=BTC-USDT` | +| OKX | 🟢 | Spot tickers, all instruments | `okx.com/api/v5/market/tickers?instType=SPOT` | +| 0x | 🟢 | Token/pool stats across DEX liquidity | `0x.org` | +| 1inch | 🟢 | DEX aggregator data | `1inch.io` | +| DIA | 🟢 | 3,000+ token prices via GraphQL + REST | `diadata.org` | +| Blockchain.com | 🟢 | Bitcoin network stats, exchange rates | `blockchain.info/stats` | + +**Recommendation**: Add **CoinCap** (WebSocket for live prices), **DEX Screener** (DeFi pairs), and **CoinPaprika** (richer metadata than CoinGecko alone). These three + existing CoinGecko = comprehensive Web3 coverage. + +**Effort**: 4h to add 3 new providers to Situation Monitor data sources. 
+
+### Currency & Forex
+
+| API | Auth | What It Adds | URL |
+|-----|------|-------------|-----|
+| ExchangeRate-API | 🟢 | 150+ currencies, no key needed | `open.er-api.com/v6/latest/USD` |
+| Currency-api (fawazahmed0) | 🟢 | 150+ currencies via CDN, no rate limits | `cdn.jsdelivr.net/npm/@fawazahmed0/currency-api@latest/v1/currencies.json` |
+| Coinbase currencies | 🟢 | Fiat currency codes + names | `api.coinbase.com/v2/currencies` |
+| NBP Web (Poland) | 🟢 | Exchange rates + gold prices | `api.nbp.pl/api/cenyzlota/last/30/?format=json` |
+
+**Recommendation**: Add **ExchangeRate-API** — one call, 150 currencies, zero auth. Perfect for Web3 Life Manager fiat conversion.
+
+**Effort**: 1h.
+
+### News & Content
+
+| API | Auth | What It Adds | URL |
+|-----|------|-------------|-----|
+| HackerNews | 🟢 | Top/new/best stories, real-time | `hacker-news.firebaseio.com/v0/topstories.json` |
+| Reddit (public JSON) | 🟢 | Any subreddit's top posts (append `.json`) | `reddit.com/r/cryptocurrency/top.json?limit=10` |
+| Reddit Stocks (Tradestie) | 🟢 | WallStreetBets trending tickers | `tradestie.com/api/v1/apps/reddit` |
+| WordPress (any site) | 🟢 | Posts from any WP site | `techcrunch.com/wp-json/wp/v2/posts?per_page=10` |
+| Wikipedia pageviews | 🟢 | Trending topics by pageview stats | `wikimedia.org/api/rest_v1/metrics/pageviews/...` |
+| Crossref | 🟢 | Academic/scholarly metadata | `api.crossref.org/journals?query=artificial+intelligence` |
+| arXiv | 🟢 | AI/ML research papers | `export.arxiv.org/api/query?search_query=all:LLM` |
+
+**Recommendation**: Add **HackerNews** + **Reddit public JSON** + **arXiv** to Situation Monitor. These three give you tech pulse, crypto sentiment, and AI research in one sweep. No API keys needed.
+
+**Effort**: 3h (add as data sources alongside existing RSS feeds).
+
+---
+
+## 2. Gecko Personality Enrichment
+
+APIs that make gecko conversations more alive and contextual.
+
+### Quotes & Inspiration
+
+| API | Auth | What It Adds | URL |
+|-----|------|-------------|-----|
+| Quotable | 🟢 | 75K+ quotes, searchable by tag/author | `api.quotable.io/quotes/random` |
+| Advice Slip | 🟢 | Random advice ("Kai says...") | `api.adviceslip.com/advice` |
+| icanhazdadjoke | 🟢 | Dad jokes (Razz energy) | `icanhazdadjoke.com/` (Accept: application/json) |
+| JokeAPI | 🟢 | Jokes by category, safe-mode filter | `v2.jokeapi.dev/joke/Any?safe-mode` |
+| Affirmations | 🟢 | Positive affirmations (Zori vibes) | `affirmations.dev/` |
+
+**Recommendation**: Add **Quotable** for Kai's wisdom moments and **Advice Slip** for gecko personality flair. These cost nothing and add charm to empty states, daily briefings, and loading screens.
+
+**Effort**: 2h (utility function + gecko personality injection).
+
+### Calendar & Events
+
+| API | Auth | What It Adds | URL |
+|-----|------|-------------|-----|
+| Nager.Date | 🟢 | Public holidays for 100+ countries | `date.nager.at/api/v2/publicholidays/2026/US` |
+| UK Bank Holidays | 🟢 | UK specific | `gov.uk/bank-holidays.json` |
+
+**Recommendation**: Add **Nager.Date** — geckos can wish you happy holidays, adjust briefing tone on weekends/holidays.
+
+**Effort**: 1h.
+
+### Weather
+
+| API | Auth | What It Adds | URL |
+|-----|------|-------------|-----|
+| Open-Meteo | 🟢 | Full weather forecast, no key, no limits | `api.open-meteo.com/v1/forecast?latitude=52.52&longitude=13.41&current_weather=true` |
+| 7Timer | 🟢 | Simple weather icons/data | `7timer.info` |
+| OpenWeatherMap | 🔑 | 1000 calls/day free, more data | `api.openweathermap.org` |
+
+**Recommendation**: **Open-Meteo** is the winner — completely free, no auth, no rate limits, high resolution. Gecko daily briefings: "Zori says: grab an umbrella! 🌧️"
+
+**Effort**: 2h.
+
+---
+
+## 3. 
Content Creator (Phase 3A) + +### Images & Media + +| API | Auth | What It Adds | URL | +|-----|------|-------------|-----| +| Lorem Picsum | 🟢 | Random high-quality placeholder images | `picsum.photos/800/400` | +| DiceBear | 🟢 | SVG avatar generation from any seed | `api.dicebear.com/6.x/pixel-art/svg` | +| RoboHash | 🟢 | Unique robot/alien images from text | `robohash.org/yourtext.png` | +| Art Institute of Chicago | 🟢 | Museum artwork (public domain) | `api.artic.edu/api/v1/artworks/search?q=landscape` | +| Metropolitan Museum | 🟢 | 490K+ artworks, many public domain | `collectionapi.metmuseum.org/public/collection/v1/objects/100` | +| ReSmush | 🟢 | Image compression/optimization | `api.resmush.it` | + +**Recommendation**: **DiceBear** for user avatars (gecko-themed styles!), **Lorem Picsum** for content placeholders, **ReSmush** for image optimization in blog posts. + +**Effort**: 3h. + +### Text & Language Tools + +| API | Auth | What It Adds | URL | +|-----|------|-------------|-----| +| Free Dictionary | 🟢 | Definitions, phonetics, audio | `api.dictionaryapi.dev/api/v2/entries/en/digital` | +| Datamuse | 🟢 | Word associations, rhymes, synonyms | `api.datamuse.com/words?ml=ringing+in+the+ears` | +| PurgoMalum | 🟢 | Profanity filter | `purgomalum.com/service/json?text=...` | +| Lingva Translate | 🟢 | Free translation (Google Translate alternative) | Self-hosted or public instances | + +**Recommendation**: **PurgoMalum** for content moderation, **Datamuse** for gecko writing suggestions ("Kai suggests a better word..."). + +**Effort**: 2h. + +--- + +## 4. 
Web3 Life Manager (Phase 3B) + +### Blockchain Data + +| API | Auth | What It Adds | URL | +|-----|------|-------------|-----| +| Blockchain.com | 🟢 | BTC stats, exchange rates, block info | `blockchain.info/stats` | +| 0x | 🟢 | Token/pool stats across DEXs | `0x.org` | +| 1inch | 🟢 | DEX aggregator quotes | `1inch.io` | +| DEX Screener | 🟢 | Multi-chain DEX pair screener | `api.dexscreener.com` | +| Etherscan | 🔑 | Ethereum address balances, tx history, contracts | `api.etherscan.io` | +| Alchemy | 🔑 | Multi-chain node access, NFT data | `alchemy.com` | +| Moralis | 🔑 | Wallet, token, NFT, DeFi data across EVM chains | `moralis.io` | +| CoinMap | 🟢 | Physical locations accepting crypto | `coinmap.org/api/v1/venues/` | + +**Recommendation**: **DEX Screener** (no auth, real-time DeFi), **Etherscan** (free key, essential for wallet tracking), **Moralis** (free tier, NFT metadata for gecko NFT integration). + +**Effort**: 8h (wallet tracking + portfolio display). + +--- + +## 5. Developer & Utility Tools + +### Geolocation & IP + +| API | Auth | What It Adds | URL | +|-----|------|-------------|-----| +| IPify | 🟢 | Get user's public IP | `api.ipify.org?format=json` | +| ipapi | 🟢 | Geo from IP (city, country, timezone) | `ipapi.co/json/` | +| GeoJS | 🟢 | IP geolocation | `get.geojs.io/v1/ip/geo.json` | +| Country.is | 🟢 | Country from IP | `api.country.is/9.9.9.9` | +| Nominatim (OSM) | 🟢 | Forward/reverse geocoding | `nominatim.openstreetmap.org/search.php?city=tokyo&format=jsonv2` | +| Zippopotamus | 🟢 | Zip code → city/state for 60 countries | `api.zippopotam.us/us/90210` | + +**Recommendation**: **ipapi** for auto-detecting user timezone/location (improves Situation Monitor regional relevance). **Nominatim** for any geocoding needs. + +**Effort**: 1h. 
+ +### QR Code & URL Tools + +| API | Auth | What It Adds | URL | +|-----|------|-------------|-----| +| goQR | 🟢 | Generate QR codes | `api.qrserver.com/v1/create-qr-code/?data=hello&size=200x200` | +| is.gd | 🟢 | URL shortener | `is.gd/create.php?format=simple&url=example.com` | +| Microlink | 🟢 | URL metadata + screenshots | `api.microlink.io/?url=https://github.com` | +| Wayback Machine | 🟢 | Check if URL was archived | `archive.org/wayback/available?url=google.com` | +| URLhaus | 🟢 | Malware URL database | `urlhaus-api.abuse.ch/v1/urls/recent/` | + +**Recommendation**: **Microlink** is gold — extracts title, description, image, author from any URL. Perfect for link previews in chat and Situation Monitor. **goQR** for sharing/payments. + +**Effort**: 2h. + +### Charts & Visualization + +| API | Auth | What It Adds | URL | +|-----|------|-------------|-----| +| QuickChart | 🟢 | Chart.js charts as images via URL | `quickchart.io/chart?c={type:'bar',...}` | +| Image-Charts | 🟢 | Google Charts-style image API | `image-charts.com/chart?cht=p3&...` | + +**Recommendation**: **QuickChart** — generate chart images for Telegram bot `/brief` command and Discord digests without client-side rendering. + +**Effort**: 2h (especially useful for moltworker). + +--- + +## 6. Gecko Daily Briefing Concept + +Combine multiple free APIs into a single gecko-delivered morning briefing: + +``` +🦎 Zori's Morning Briefing — Feb 8, 2026 + +☀️ Weather: 12°C, partly cloudy (Open-Meteo) +📈 BTC: $97,432 (+2.3%) · ETH: $3,891 (+1.1%) (CoinCap) +🔥 HN Top: "Claude 4.5 released" (HackerNews API) +💬 Reddit: $NVDA trending on WSB (Reddit Stocks) +📰 AI News: New paper on multi-agent systems (arXiv) +🎉 Today: No holidays (Nager.Date) +💡 Kai says: "The best time to plant a tree was 20 years ago. + The second best time is now." (Quotable) + +Total API cost: $0.00 | Zero auth keys needed +``` + +**Effort**: 6h to build the aggregator + gecko personality formatting. + +--- + +## 7. 
Open Data & Research (Phase 3+) + +| API | Auth | What It Adds | For | +|-----|------|-------------|-----| +| Open Library | 🟢 | Book data, covers, search | Content Creator | +| Wikipedia/Mediawiki | 🟢 | Article content, page data | Research skill | +| NASA | 🔑 (DEMO_KEY) | APOD, Mars photos, asteroids | Fun gecko content | +| Archive.org | 🟢 | Wayback Machine, digital archive | Research skill | +| FBI Wanted | 🟢 | Wanted persons data | Fun/trivia | +| USAspending | 🟢 | Federal spending data | Finance analysis | +| Open Food Facts | 🟢 | Food product database | Health/nutrition skill | +| House Stock Watcher | 🟢 | US Congress stock trades | Finance signals | +| Data USA | 🟢 | US demographics, economy | Research | + +**Recommendation**: **House Stock Watcher** is fascinating for finance — Congress members' trades as a sentiment signal. **Open Library** for a future book recommendation skill. + +--- + +## Priority Summary + +### Immediate (Slot into current sprint — 0 cost, high impact) + +| Bundle | APIs | Effort | Impact | +|--------|------|--------|--------| +| **Situation Monitor v2** | HackerNews + Reddit JSON + arXiv | 3h | HIGH — 3 new data feeds, zero auth | +| **Crypto expansion** | CoinCap + DEX Screener + CoinPaprika | 4h | HIGH — DeFi + richer metadata | +| **Currency conversion** | ExchangeRate-API | 1h | MEDIUM — 150 currencies, no auth | +| **URL previews** | Microlink | 1h | MEDIUM — rich link cards in chat | + +### Quick Wins (2–4h each) + +| Bundle | APIs | Effort | Impact | +|--------|------|--------|--------| +| **Gecko personality** | Quotable + Advice Slip + Open-Meteo + Nager.Date | 4h | MEDIUM — personality enrichment | +| **Daily briefing** | All above combined | 6h | HIGH — killer feature demo | +| **Chart images** | QuickChart | 2h | MEDIUM — moltbot `/brief` charts | +| **Avatars** | DiceBear | 1h | LOW — user profile avatars | + +### Phase 3+ Backlog + +| Bundle | APIs | Effort | Impact | +|--------|------|--------|--------| +| **Web3 
wallet tracking** | Etherscan + Moralis + DEX Screener | 8h | HIGH | +| **Content Creator tools** | PurgoMalum + Datamuse + ReSmush | 3h | MEDIUM | +| **Research skill** | Wikipedia + Open Library + arXiv | 6h | MEDIUM | +| **Finance signals** | House Stock Watcher + Reddit Stocks | 4h | MEDIUM | + +### Total: ~43h of work = 25+ free API integrations at $0/month cost + +--- + +## Reference + +- **public-apis/public-apis**: https://github.com/public-apis/public-apis (383k ⭐) +- **No-auth list**: https://mixedanalytics.com/blog/list-actually-free-open-no-auth-needed-apis/ +- **Public APIs directory**: https://publicapis.io/ diff --git a/docs/task-processor-spec.md b/docs/task-processor-spec.md new file mode 100644 index 000000000..cfa97cc99 --- /dev/null +++ b/docs/task-processor-spec.md @@ -0,0 +1,508 @@ +# TaskProcessor Durable Object — Specification & Troubleshooting Log + +**Last Updated:** 2026-02-17 + +This document captures the current specification of the TaskProcessor Durable Object, the recent troubleshooting process, and known issues. It is intended for review by AI assistants or developers to identify potential improvements. + +--- + +## 1. Architecture Overview + +``` +User (Telegram) → Worker (Hono) → Durable Object (TaskProcessor) + ↓ + AI Provider APIs + ┌──────────┼──────────┐ + OpenRouter Moonshot DashScope DeepSeek + (streaming) (fetch) (fetch) (fetch) + ↓ + Tool Execution + ┌────┬────┬─────┬──────┬────────┐ + fetch github github github github + _url _read _list _api _create + _file _files _pr + ↓ + R2 Checkpoints + Telegram Updates +``` + +The TaskProcessor is a Cloudflare Durable Object that handles long-running AI tasks that exceed the 10-second Worker timeout. It maintains persistent state, manages tool-calling loops, and sends progress/results back via Telegram. + +--- + +## 2. 
Key Constants + +| Constant | Value | Purpose | +|----------|-------|---------| +| `MAX_TOOL_RESULT_LENGTH` | 8,000 chars | Truncation limit per tool result in conversation | +| `COMPRESS_AFTER_TOOLS` | 6 | Compress context every N tool calls | +| `MAX_CONTEXT_TOKENS` | 60,000 | Force compression threshold (estimated) | +| `WATCHDOG_INTERVAL_MS` | 90s | Alarm fires every 90s to check for stuck tasks | +| `STUCK_THRESHOLD_FREE_MS` | 60s | Time without update before free model task is considered stuck | +| `STUCK_THRESHOLD_PAID_MS` | 180s | Time without update before paid model task is considered stuck | +| `CHECKPOINT_EVERY_N_TOOLS` | 3 | Save R2 checkpoint every N tool calls | +| `MAX_AUTO_RESUMES_DEFAULT` | 10 | Max auto-resumes for paid models | +| `MAX_AUTO_RESUMES_FREE` | 15 | Max auto-resumes for free models | +| `MAX_ELAPSED_FREE_MS` | 15 min | Time cap for free models | +| `MAX_ELAPSED_PAID_MS` | 30 min | Time cap for paid models | +| `MAX_NO_PROGRESS_RESUMES` | 3 | Max consecutive resumes with 0 new tool calls | +| `MAX_STALL_ITERATIONS` | 5 | Max consecutive iterations with no tool calls | +| `MAX_SAME_TOOL_REPEATS` | 3 | Max identical tool calls before loop nudge | +| `maxIterations` | 100 | Max iterations per DO invocation | + +--- + +## 3. Task Lifecycle + +### 3.1 Phases + +Each task goes through three phases: + +1. **Plan** — Model outlines approach (injected prompt: "outline your approach in 2-3 bullet points") +2. **Work** — Model executes tools iteratively +3. **Review** — Model verifies its own work before delivering final answer + +Phase transitions: +- `plan → work`: After first model response (iteration 1) +- `work → review`: When model produces final text content after using tools +- Orchestra tasks get a stricter review prompt (verify PR URL, check ROADMAP.md updates) + +### 3.2 Main Loop + +``` +while (iterations < 100): + 1. Check cancellation + 2. Select provider + API key based on modelAlias + 3. 
Call AI API (with retry loop, max 3 attempts) + 4. If API fails → try model rotation (free models only) + 5. If response has tool_calls → execute tools in parallel → loop + 6. If response has no tool_calls: + a. Check stall counter + b. If in 'work' phase → transition to 'review', loop once more + c. Otherwise → deliver final response +``` + +### 3.3 Checkpoints & Resume + +- Checkpoints saved to R2 every 3 tool calls (`CHECKPOINT_EVERY_N_TOOLS`) +- On watchdog-triggered auto-resume: loads latest checkpoint, injects resume instruction +- Resume instruction tells model: "Do NOT re-read rules. Continue where you left off." +- Iteration counter resets to 0 on resume (fresh budget of 100 iterations) + +--- + +## 4. Failure Detection & Recovery + +### 4.1 Watchdog Alarm + +The watchdog fires every 90 seconds: +1. If `timeSinceUpdate < 60s` → task is still active, reschedule +2. If `timeSinceUpdate >= 60s` → task appears stuck +3. Check elapsed time cap (15min free / 30min paid) +4. Check auto-resume limit (10 paid / 15 free) +5. Check stall detection (no-progress resumes) +6. If all checks pass → auto-resume from checkpoint + +### 4.2 Stall Detection (3 layers) + +| Layer | What it detects | Threshold | Action | +|-------|----------------|-----------|--------| +| **No-tool stall** | Model generates text without calling any tools | 5 consecutive iterations (10 if tools were used earlier) | Force complete with whatever content exists | +| **Same-tool loop** | Model calls the exact same tool with identical arguments | 3 identical calls | Inject nudge: "Try a DIFFERENT tool or approach" | +| **No-progress resumes** | Auto-resume fires but model made zero new tool calls | 3 consecutive resumes | Fail with "Task stalled" message | + +### 4.3 API Error Handling + +| Error | Retry? | Rotation? 
| Notes | +|-------|--------|-----------|-------| +| 429 Rate limit | Yes (3x, 2s delay) | Yes | Standard rate limiting | +| 503 Overloaded | Yes (3x, 2s delay) | Yes | Server overloaded | +| 402 Quota exceeded | **No** (fast-fail) | Yes | Payment required | +| 404 Model gone | Yes (3x) | Yes | Model removed/renamed | +| 400 Content filter | **No** (fast-fail) | Yes | DashScope `data_inspection_failed` | +| Timeout (2 min) | No | No | AbortController kills connection | +| Other errors | Yes (3x) | **No** | Throws to outer handler | + +### 4.4 Model Rotation + +When a free model fails, the system rotates through alternatives: +1. **Preferred models** — match task category (coding/reasoning/general) +2. **Fallback models** — other free tool-capable models +3. **Emergency core** — hardcoded reliable models (`qwencoderfree`, `gptoss`, `devstral`) + +Rotation is also triggered for: +- Empty responses (model can't handle context size) +- Content filter rejections (different providers = different filters) + +--- + +## 5. 
Tool Specifications + +### 5.1 Available Tools (in Durable Object) + +| Tool | Purpose | Truncation | +|------|---------|------------| +| `fetch_url` | Fetch URL content (HTML stripped) | 20KB at tool level, 8KB in conversation | +| `github_read_file` | Read file from GitHub repo | 50KB at tool level, 8KB in conversation | +| `github_list_files` | List directory contents | No tool-level truncation, 8KB in conversation | +| `github_api` | Generic GitHub API calls | 50KB at tool level, 8KB in conversation | +| `github_create_pr` | Create PR with file changes | No tool-level truncation | +| `url_metadata` | Get URL title/description | Small responses | + +**Not available in DO** (require browser/sandbox bindings): +- `browse_url` — Browser Rendering API +- `sandbox_exec` — Sandbox container execution + +### 5.2 Tool Result Truncation (2-tier) + +``` +Tool execution → Tool-level truncation (20-50KB) → task-processor truncation (8KB) + ↑ tools.ts ↑ task-processor.ts +``` + +The task-processor truncation uses head+tail strategy: keeps first ~3.9KB and last ~3.9KB with a `[TRUNCATED X chars]` marker in between. + +### 5.3 fetch_url HTML Stripping + +When `contentType` includes `text/html` or content starts with `` and `