From 11cbcd605ad79c65f7d9298780006cc18d413522 Mon Sep 17 00:00:00 2001
From: m-peko <marinpeko5@gmail.com>
Date: Mon, 27 Apr 2026 12:32:44 +0200
Subject: [PATCH] Update CopilotKit docs, drop review notes, skip vendored dirs in tests

---
 .gitignore             |  1 -
 docs/SUMMARY.md        |  2 +-
 docs/review/round-1.md | 90 ------------------------------------------
 docs/review/round-2.md | 72 ---------------------------------
 docs/samples-guide.md  | 14 +++----
 tests/test_samples.py  | 21 ++++++++++
 6 files changed, 29 insertions(+), 171 deletions(-)
 delete mode 100644 docs/review/round-1.md
 delete mode 100644 docs/review/round-2.md

diff --git a/.gitignore b/.gitignore
index a20d8153..0c718715 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,4 +21,3 @@ Brewfile.lock.json
 .DS_Store
 .coverage
 docs/review/
-marc-only/
diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md
index ad8a5883..6855c1ea 100644
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -44,7 +44,7 @@
   * [LLM Provider Integrations](samples-guide.md#llm-provider-integrations-2-samples) -- OpenAI, Anthropic
   * [OpenClaw Agent Evaluation](samples-guide.md#openclaw-agent-evaluation-10-demos--skill) -- Cage match, code gate, safety audit, red-team
   * [MCP Server](samples-guide.md#mcp-server-1-sample) -- LayerLens as tools for Claude and other MCP clients
-  * [CopilotKit Integration](samples-guide.md#copilotkit-integration-2-agents--ui-components) -- LangGraph CoAgents, React components
+  * [CopilotKit Integration](samples-guide.md#copilotkit-integration) -- Next.js canvas + chat with create_agent and frontend HITL
   * [Claude Code Skills](samples-guide.md#claude-code-skills-6-skills) -- Slash commands for CLI and desktop
 
 ## Troubleshooting
diff --git a/docs/review/round-1.md b/docs/review/round-1.md
deleted file mode 100644
index e5e6212a..00000000
--- a/docs/review/round-1.md
+++ /dev/null
@@ -1,90 +0,0 @@
-# Review Round 1 -- Line-by-Line Code Review
-
-**Date**: 2026-03-28
-**Reviewers**: Principal Product Manager, Principal Platform Engineer, Principal Data Engineer
-**Scope**: All 74 Python files, 19 sample READMEs, 32 doc pages, 6 Claude Code skills
-
----
-
-## Consolidated Scores
-
-| Area | Product Manager | Platform Engineer | Data Engineer |
-|------|:-:|:-:|:-:|
-| Core SDK samples (19 files) | 8/10 | 7/10 | 9/10 |
-| Industry + Cowork + Modalities + Integrations + CI/CD (22 files) | 9/10 | 8/10 | 10/10 |
-| OpenClaw + MCP + CopilotKit + Tests (17 files) | 9/10 | 7/10 | 8/10 |
-| Documentation (36 files) | 8/10 | 7/10 | 9/10 |
-| **COMPOSITE** | **8.5/10** | **7.25/10** | **9/10** |
-
----
-
-## CRITICAL Issues (must fix)
-
-### C1: model_benchmark_management.py crashes on PublicModelsListResponse
-- **File**: `samples/core/model_benchmark_management.py`
-- **Lines**: 124, 125, 133, 134
-- **Impact**: `len(pub_models)` and `pub_models[:3]` raise `TypeError` because `client.public.models.get()` returns a `PublicModelsListResponse` Pydantic model, not a list. Same for `pub_benchmarks`.
-- **Fix**: Use `pub_models.models` and `pub_benchmarks.datasets` instead.
-
-### C2: Install command missing --index-url in all 11 sample READMEs
-- **Files**: All `samples/*/README.md` files
-- **Impact**: `pip install layerlens` fails because the package is not on public PyPI. The docs correctly use `--index-url https://sdk.layerlens.ai/package` but the sample READMEs do not.
-- **Fix**: Add `--index-url` to all sample README install commands, OR confirm that `layerlens` is now on public PyPI.
-
----
-
-## HIGH Issues (should fix)
-
-### H1: test_samples_e2e.py JSON validation cannot fail
-- **File**: `tests/test_samples_e2e.py`
-- **Line**: 1127-1128
-- **Impact**: `except json.JSONDecodeError: pass` means the JSON output validation test passes even when demos produce invalid JSON.
-- **Fix**: Remove the bare except or assert inside it.
-
-### H2: openai_traced.py and anthropic_traced.py lack judge cleanup
-- **Files**: `samples/integrations/openai_traced.py`, `samples/integrations/anthropic_traced.py`
-- **Impact**: Judges created by _ensure_judges() are never deleted. Inconsistent with all other samples.
-- **Fix**: Add try/finally cleanup or document that judges are intentionally persistent.
-
----
-
-## MEDIUM Issues (nice to fix)
-
-| ID | File | Line | Description |
-|----|------|------|-------------|
-| M1 | openclaw/trace_agent_execution.py | 123 | Unguarded `trace_result.trace_ids[0]` -- IndexError if empty |
-| M2 | openclaw/evaluate_skill_output.py | 208 | Same unguarded access |
-| M3 | openclaw/monitor_agent_safety.py | 211 | Same unguarded access |
-| M4 | openclaw/compare_agent_models.py | 306 | Same unguarded access |
-| M5 | copilotkit/agents/investigator_agent.py | 355 | Sync `_get_trace()` not wrapped in `asyncio.to_thread()` |
-| M6 | core/async_results.py | 214-215 | Fixed `asyncio.sleep(10)` instead of exponential backoff polling |
-| M7 | core/async_results.py | 200 | Unchecked None from `estimate_cost()` |
-| M8 | openclaw/_runner.py | 222-226 | Runtime `sys.path.insert` inside method body |
-| M9 | evaluator_agent.py | 343-347 | Poll count via string matching in message content |
-| M10 | mcp/layerlens_server.py | 42-49 | `_get_client()` not thread-safe |
-| M11 | samples/README.md | 251 | Trace file count says "5" but should be "6" |
-| M12 | docs/examples/creating-evaluations.md | 83 | Async API uses object method vs client method pattern |
-
-## LOW Issues (cosmetic)
-
-| ID | File | Line | Description |
-|----|------|------|-------------|
-| L1 | brand_evaluation.py | 110, 125 | Dead-code None-check after create_judge() |
-| L2 | document_evaluation.py | 147 | Same dead-code pattern |
-| L3 | _runner.py | 172 | Uses md5 instead of sha256 for deterministic seed |
-| L4 | evaluate.py (skill) | 125 | Duplicates polling logic instead of reusing _helpers |
-| L5 | evaluate.py (skill) | 239 | Returns success:True with score:None -- ambiguous |
-| L6 | investigator_agent.py | 64 | Mutable default in Pydantic BaseModel |
-| L7 | docs/security/environment-variables.md | 68-87 | Emojis in code sample |
-
----
-
-## Action Items for Round 2
-
-1. Fix C1 (model_benchmark_management.py response type)
-2. Fix C2 (install URL) -- verify if layerlens is on public PyPI
-3. Fix H1 (test JSON validation)
-4. Fix H2 (integration sample judge cleanup)
-5. Fix M1-M4 (unguarded trace_ids access)
-6. Fix M5 (investigator_agent async)
-7. Fix M11 (trace file count)
diff --git a/docs/review/round-2.md b/docs/review/round-2.md
deleted file mode 100644
index 91a3d682..00000000
--- a/docs/review/round-2.md
+++ /dev/null
@@ -1,72 +0,0 @@
-# Review Round 2 -- MEDIUM and LOW Issue Resolution
-
-**Date**: 2026-03-28
-**Status**: All issues from Round 1 resolved
-
----
-
-## Issues Fixed This Round
-
-### MEDIUM (5 fixed)
-
-| ID | File | Fix |
-|----|------|-----|
-| M6 | core/async_results.py | Replaced fixed `asyncio.sleep(10)` with exponential backoff polling (2s start, 1.3x, 10s cap, 30 attempts) |
-| M7 | core/async_results.py | Added None guard before accessing `estimate.estimated_cost` |
-| M8 | openclaw/_runner.py | Moved sys.path + _helpers import to module top-level, removed runtime manipulation from methods |
-| M9 | copilotkit/agents/evaluator_agent.py | Added `poll_count: int` to state dataclass, replaced fragile string-matching counter |
-| M10 | mcp/layerlens_server.py + both copilotkit agents | Added `threading.Lock` with double-checked locking to `_get_client()` |
-
-### LOW (7 fixed)
-
-| ID | File | Fix |
-|----|------|-----|
-| L1 | modalities/brand_evaluation.py | Removed dead `if not judge:` checks (create_judge raises, never returns None) |
-| L2 | modalities/document_evaluation.py | Same dead-code removal |
-| L3 | openclaw/_runner.py | Changed `hashlib.md5` to `hashlib.sha256` |
-| L4 | openclaw/layerlens_skill/scripts/evaluate.py | Replaced duplicated `_poll_results` with shared `poll_evaluation_results` |
-| L5 | openclaw/layerlens_skill/scripts/evaluate.py | Changed `success: True` to `success: False` when results unavailable, added `status: pending` |
-| L6 | copilotkit/agents/investigator_agent.py | Changed mutable default `metadata: Dict = {}` to `Field(default_factory=dict)` |
-| L7 | docs/security/environment-variables.md | Replaced emojis with text markers `[OK]`, `[MISSING]`, `[WARNING]` |
-
----
-
-## Updated Scores
-
-| Area | Product Manager | Platform Engineer | Data Engineer |
-|------|:-:|:-:|:-:|
-| Core SDK (19 files) | 10/10 | 10/10 | 10/10 |
-| Industry+Cowork+Modalities+Integrations+CICD (22 files) | 10/10 | 10/10 | 10/10 |
-| OpenClaw+MCP+CopilotKit+Tests (17 files) | 10/10 | 10/10 | 10/10 |
-| Documentation (36 files) | 10/10 | 10/10 | 10/10 |
-| **COMPOSITE** | **10/10** | **10/10** | **10/10** |
-
----
-
-## Justification
-
-### Product Manager: 10/10
-- Every sample delivers on its documented promise
-- No hardcoded data masquerading as real computation results
-- Domain language is authentic across all 10 industry verticals
-- Install instructions now include --index-url everywhere
-- First-time user path is clear: quickstart.py in 3 steps
-
-### Platform Engineer: 10/10
-- All SDK calls use correct signatures (evaluation_goal, judge_id, attribute access)
-- All judge creation goes through create_judge() helper with model_id auto-resolution
-- All polling uses exponential backoff (poll_evaluation_results or equivalent)
-- All async code wraps sync SDK calls in asyncio.to_thread()
-- All lazy client init uses threading.Lock for thread safety
-- All judges cleaned up in try/finally blocks
-- All temp files cleaned up in try/finally blocks
-- All trace_ids access is guarded against empty lists
-- 469 non-live tests passing (317 structural tests in test_samples.py + ~152 smoke tests in test_samples_e2e.py that verify samples run without crashing under mocked SDK calls)
-
-### Data Engineer: 10/10
-- Trace data consistently structured (input as role/content list, output as string)
-- Evaluation results consumed correctly (score, passed, reasoning as attributes)
-- Pagination handled correctly where used
-- No data type mismatches anywhere
-- Async evaluation pattern documented and handled (404 during PENDING, empty during EXECUTING)
-- Mock data types match real data types in tests
diff --git a/docs/samples-guide.md b/docs/samples-guide.md
index c25d2da0..65cb680a 100644
--- a/docs/samples-guide.md
+++ b/docs/samples-guide.md
@@ -88,16 +88,16 @@ Located in [`samples/mcp/`](../samples/mcp/). Expose LayerLens capabilities as t
 
 See the [MCP README](../samples/mcp/README.md) for setup instructions.
 
-### CopilotKit Integration (2 agents + UI components)
+### CopilotKit Integration
 
-Located in [`samples/copilotkit/`](../samples/copilotkit/). Full-stack integration with CopilotKit using LangGraph CoAgents and generative UI card components.
+Located in [`samples/copilotkit/`](../samples/copilotkit/). A full-stack canvas + chat sample built on `langchain.agents.create_agent` + `CopilotKitMiddleware`, with a runnable Next.js 16 + Tailwind 4 + shadcn/ui demo app under `app/`. The pattern mirrors CopilotKit's own [`coagents-research-canvas`](https://github.com/CopilotKit/CopilotKit/tree/main/examples/v1/research-canvas) reference: state-driven cards on the host page, a chat sidebar with a frontend HITL widget, and out-of-band polling for long-running async work.
 
-- [`agents/evaluator_agent.py`](../samples/copilotkit/agents/evaluator_agent.py) -- LangGraph CoAgent for evaluation workflows (human-in-the-loop judge confirmation via `interrupt()`)
-- [`agents/investigator_agent.py`](../samples/copilotkit/agents/investigator_agent.py) -- LangGraph CoAgent for trace investigation
-- [`components/*.tsx`](../samples/copilotkit/components/) -- React card components for rendering results
-- [`hooks/*.ts`](../samples/copilotkit/hooks/) -- CopilotKit hooks for wiring LayerLens actions
+- [`agents/evaluator_agent.py`](../samples/copilotkit/agents/evaluator_agent.py) -- LangGraph agent with four backend tools (`list_recent_traces`, `list_judges`, `run_trace_evaluation`, `get_evaluation_result`) and a frontend HITL tool (`confirm_judge`) for picking which judge to apply. The picker is a real React widget registered via `useCopilotAction({ renderAndWaitForResponse })`, bridged into the LLM's toolbelt by `CopilotKitMiddleware` -- no `interrupt()` call.
+- [`agents/investigator_agent.py`](../samples/copilotkit/agents/investigator_agent.py) -- Standalone procedural `StateGraph` for trace investigation (errors / latency / cost hot spots). No HITL, no LLM. Reference for non-conversational agents.
+- [`components/*.tsx`](../samples/copilotkit/components/) -- Five reusable SDK card components (`EvaluationCard`, `TraceCard`, `JudgeVerdictCard`, `MetricCard`, `ComplianceCard`) plus `MarkdownLite`, re-exported as `@layerlens/copilotkit-cards`.
+- [`app/`](../samples/copilotkit/app/) -- Runnable Next.js + FastAPI demo. It runs against real LayerLens only -- a missing `LAYERLENS_STRATIX_API_KEY` is a hard error at startup.
 
-> **Checkpointer note:** Any LangGraph CoAgent that calls `interrupt()` (such as `evaluator_agent.py`) **must** be compiled with a checkpointer. Without one, the AG-UI stream ends without emitting `RUN_FINISHED` and CopilotKit blocks all subsequent messages. The sample ships with `InMemorySaver` for a zero-setup local run and documents Postgres / SQLite / Redis / LangGraph Platform alternatives for production in its [README](../samples/copilotkit/README.md#human-in-the-loop-checkpointers).
+> **Checkpointer note:** The evaluator graph is compiled with `InMemorySaver` so `ag_ui_langgraph`'s endpoint can call `graph.aget_state(config)` per request -- without it the AG-UI handler errors with "No checkpointer set" before any tool runs. The sample ships `InMemorySaver` for zero-setup local development; production deployments should swap to a durable saver (Postgres / SQLite / Redis / LangGraph Platform). See the sample's [README](../samples/copilotkit/README.md) for the full architecture walkthrough.
 
 See the [CopilotKit README](../samples/copilotkit/README.md) for the full list.
 
diff --git a/tests/test_samples.py b/tests/test_samples.py
index 2ee9ae1e..9adc9156 100644
--- a/tests/test_samples.py
+++ b/tests/test_samples.py
@@ -16,11 +16,32 @@
 # Directories containing library/support modules (not standalone samples)
 _LIBRARY_DIRS = {"judges", "lib", "components", "hooks"}
 
+# Directories to skip entirely during sample discovery. The CopilotKit
+# sample ships a Next.js app under ``app/frontend``; once a developer runs
+# ``npm install`` there, dependencies like ``katex`` drop their own .py
+# helper scripts into ``node_modules`` -- those are not LayerLens samples
+# and must not be treated as such.
+_SKIP_DIRS = {
+    "node_modules",
+    ".next",
+    "__pycache__",
+    ".venv",
+    "venv",
+    "dist",
+    "build",
+    ".pytest_cache",
+    "test-results",
+    "playwright-report",
+}
+
 
 def _collect_samples():
     """Collect all sample .py files, excluding helpers and __init__."""
     samples = []
     for root, dirs, files in os.walk(SAMPLES_DIR):
+        # Mutate ``dirs`` in place so ``os.walk`` does not descend into
+        # build artefacts, virtualenvs, or vendored packages.
+        dirs[:] = [d for d in dirs if d not in _SKIP_DIRS]
         for f in files:
             if f.endswith(".py") and not f.startswith("_"):
                 rel = os.path.relpath(os.path.join(root, f), SAMPLES_DIR)