askscio · ken-cavanagh-glean · May 17, 2026 · May 16, 2026 · May 17, 2026
diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml
@@ -0,0 +1,29 @@
+# CI checks: runs typecheck, lint, and test as independent matrix jobs on every pull request and on pushes to main.
+name: check
+
+on:
+  pull_request:
+  push:
+    branches: [main]
+
+jobs:
+  checks:
+    runs-on: ubuntu-latest
+    strategy:
+      # Let every task report its own result instead of cancelling sibling jobs on the first failure.
+      fail-fast: false
+      matrix:
+        task: [typecheck, lint, test]
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: oven-sh/setup-bun@v2
+        with:
+          # NOTE(review): `latest` floats between runs; consider pinning a Bun version for reproducible CI.
+          bun-version: latest
+
+      # Fail the job if bun.lock is out of sync with package.json instead of silently re-resolving dependencies.
+      - run: bun install --frozen-lockfile
+
+      - name: Run ${{ matrix.task }}
+        run: bun run ${{ matrix.task }}
diff --git a/AGENTS.md b/AGENTS.md
@@ -0,0 +1,73 @@
+# Seer — Agent Evaluation Framework
+
+Evaluates Glean agents using an LLM-as-judge approach with a seven-call architecture, a multi-judge ensemble, and categorical scoring.
+
+## Commands
+
+```bash
+bun run check          # typecheck + lint + test (run before every PR)
+bun run typecheck      # tsc --noEmit
+bun run lint           # biome check src/
+bun run lint:fix       # biome auto-fix
+bun run test           # bun test (67 tests, <100ms)
+bun run dev            # CLI: bun run src/cli.ts
+cd web && bun run dev  # Web UI: Next.js on port 3000
+```
+
+## Repository Map
+
+```
+src/
+  cli.ts                   CLI commands (Commander.js) — composition root
+  types.ts                 Core domain types (AgentResult, JudgeScore, EvalSetMode)
+  criteria/defaults.ts     10 default eval dimensions with rubrics + scales
+  db/schema.ts             Drizzle SQLite schema (7 tables)
+  db/index.ts              DB init + idempotent migrations
+  data/glean.ts            Agent runner (workflow + autonomous + multi-turn)
+  lib/judge.ts             Seven-call judge pipeline + ensemble aggregation
+  lib/judge-prompts.ts     Extracted prompt builders (pure functions, snapshot-tested)
+  lib/score.ts             Weighted average score calculation
+  lib/retry.ts             fetchWithRetry — exponential backoff + jitter
+  lib/token-ledger.ts      SQLite-backed token usage tracking
+  lib/csv.ts               CSV parsing utility
+  lib/config.ts            Settings loader (settings.json → .env → error)
+  lib/simulator.ts         Multi-turn simulated user (COMPLETE/CONTINUE)
+  lib/fetch-agent.ts       Agent info + capabilities
+  lib/fetch-docs.ts        Source doc fetch for faithfulness judge
+  lib/generate-agent.ts    Smart eval set generation
+web/                       Next.js web UI (shared SQLite with CLI)
+```
+
+## Architecture Layers
+
+Layer boundaries are enforced by `src/__tests__/architecture.test.ts` — an import from the wrong layer fails the test suite.
+
+```
+0: Types     (types.ts)           → imports nothing from src/
+1: Config    (lib/config.ts, criteria/*)  → only Types
+2: DB        (db/*)               → Types + Config
+3: Data      (data/*, lib/fetch-*, lib/retry.ts, lib/simulator.ts)
+4: Engine    (lib/judge.ts, lib/score.ts, lib/generate-agent.ts)
+5: CLI       (cli.ts)             → anything (composition root)
+```
+
+## Quality Gates
+
+- **biome.json** — linting + formatting rules
+- **Prompt snapshots** — `src/lib/__tests__/judge-prompts.test.ts` locks all judge prompt text
+- **Architecture test** — import boundaries enforced mechanically
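+- **CI** — `.github/workflows/check.yml` runs `typecheck`, `lint`, and `test` as separate jobs on every PR and on pushes to `main` (`fail-fast: false`, so one failing job does not cancel the others)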
+
+## Updating Snapshots
+
+When you intentionally change a judge prompt or criteria definition:
+```bash
+bun test --update-snapshots
+```
+
+Review the resulting snapshot diff to confirm that it contains only the changes you expected.
+
+## Deep Context
+
+- [CLAUDE.md](CLAUDE.md) — full architecture, design decisions, research foundation
+- [docs/](docs/) — evaluation framework spec, judge best practices, API docs
diff --git a/biome.json b/biome.json
@@ -0,0 +1,55 @@
+{
+  "$schema": "https://biomejs.dev/schemas/2.4.15/schema.json",
+  "vcs": {
+    "enabled": true,
+    "clientKind": "git",
+    "useIgnoreFile": true
+  },
+  "files": {
+    "ignoreUnknown": true,
+    "includes": ["src/**"]
+  },
+  "formatter": {
+    "enabled": true,
+    "indentStyle": "space",
+    "indentWidth": 2,
+    "lineWidth": 120
+  },
+  "javascript": {
+    "formatter": {
+      "quoteStyle": "single",
+      "semicolons": "asNeeded",
+      "trailingCommas": "all",
+      "arrowParentheses": "always"
+    }
+  },
+  "linter": {
+    "enabled": true,
+    "rules": {
+      "recommended": true,
+      "suspicious": {
+        "noConsole": "warn",
+        "noExplicitAny": "warn"
+      },
+      "complexity": {
+        "noForEach": "off"
+      },
+      "style": {
+        "noNonNullAssertion": "off",
+        "useNodejsImportProtocol": "off"
+      }
+    }
+  },
+  "overrides": [
+    {
+      "includes": ["src/cli.ts", "src/db/**", "src/data/**", "src/lib/retry.ts", "src/lib/fetch-docs.ts", "src/lib/fetch-agent.ts", "src/lib/generate-agent.ts", "src/**/__tests__/**"],
+      "linter": {
+        "rules": {
+          "suspicious": {
+            "noConsole": "off"
+          }
+        }
+      }
+    }
+  ]
+}
diff --git a/bun.lock b/bun.lock