From 11cbcd605ad79c65f7d9298780006cc18d413522 Mon Sep 17 00:00:00 2001 From: m-peko Date: Mon, 27 Apr 2026 12:32:44 +0200 Subject: [PATCH] Update docs --- .gitignore | 1 - docs/SUMMARY.md | 2 +- docs/review/round-1.md | 90 ------------------------------------------ docs/review/round-2.md | 72 --------------------------------- docs/samples-guide.md | 14 +++---- tests/test_samples.py | 21 ++++++++++ 6 files changed, 29 insertions(+), 171 deletions(-) delete mode 100644 docs/review/round-1.md delete mode 100644 docs/review/round-2.md diff --git a/.gitignore b/.gitignore index a20d8153..0c718715 100644 --- a/.gitignore +++ b/.gitignore @@ -21,4 +21,3 @@ Brewfile.lock.json .DS_Store .coverage docs/review/ -marc-only/ diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index ad8a5883..6855c1ea 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -44,7 +44,7 @@ * [LLM Provider Integrations](samples-guide.md#llm-provider-integrations-2-samples) -- OpenAI, Anthropic * [OpenClaw Agent Evaluation](samples-guide.md#openclaw-agent-evaluation-10-demos--skill) -- Cage match, code gate, safety audit, red-team * [MCP Server](samples-guide.md#mcp-server-1-sample) -- LayerLens as tools for Claude and other MCP clients - * [CopilotKit Integration](samples-guide.md#copilotkit-integration-2-agents--ui-components) -- LangGraph CoAgents, React components + * [CopilotKit Integration](samples-guide.md#copilotkit-integration) -- Next.js canvas + chat with create_agent and frontend HITL * [Claude Code Skills](samples-guide.md#claude-code-skills-6-skills) -- Slash commands for CLI and desktop ## Troubleshooting diff --git a/docs/review/round-1.md b/docs/review/round-1.md deleted file mode 100644 index e5e6212a..00000000 --- a/docs/review/round-1.md +++ /dev/null @@ -1,90 +0,0 @@ -# Review Round 1 -- Line-by-Line Code Review - -**Date**: 2026-03-28 -**Reviewers**: Principal Product Manager, Principal Platform Engineer, Principal Data Engineer -**Scope**: All 74 Python files, 19 sample READMEs, 32 doc pages, 6 Claude Code skills - ---- - -## Consolidated Scores - -| Area | Product Manager | Platform Engineer | Data Engineer | -|------|:-:|:-:|:-:| -| Core SDK samples (19 files) | 8/10 | 7/10 | 9/10 | -| Industry + Cowork + Modalities + Integrations + CI/CD (22 files) | 9/10 | 8/10 | 10/10 | -| OpenClaw + MCP + CopilotKit + Tests (17 files) | 9/10 | 7/10 | 8/10 | -| Documentation (36 files) | 8/10 | 7/10 | 9/10 | -| **COMPOSITE** | **8.5/10** | **7.25/10** | **9/10** | - ---- - -## CRITICAL Issues (must fix) - -### C1: model_benchmark_management.py crashes on PublicModelsListResponse -- **File**: `samples/core/model_benchmark_management.py` -- **Lines**: 124, 125, 133, 134 -- **Impact**: `len(pub_models)` and `pub_models[:3]` raise `TypeError` because `client.public.models.get()` returns a `PublicModelsListResponse` Pydantic model, not a list. Same for `pub_benchmarks`. -- **Fix**: Use `pub_models.models` and `pub_benchmarks.datasets` instead. - -### C2: Install command missing --index-url in all 11 sample READMEs -- **Files**: All `samples/*/README.md` files -- **Impact**: `pip install layerlens` fails because the package is not on public PyPI. The docs correctly use `--index-url https://sdk.layerlens.ai/package` but the sample READMEs do not. -- **Fix**: Add `--index-url` to all sample README install commands, OR confirm that `layerlens` is now on public PyPI. - ---- - -## HIGH Issues (should fix) - -### H1: test_samples_e2e.py JSON validation cannot fail -- **File**: `tests/test_samples_e2e.py` -- **Line**: 1127-1128 -- **Impact**: `except json.JSONDecodeError: pass` means the JSON output validation test passes even when demos produce invalid JSON. -- **Fix**: Remove the bare except or assert inside it. - -### H2: openai_traced.py and anthropic_traced.py lack judge cleanup -- **Files**: `samples/integrations/openai_traced.py`, `samples/integrations/anthropic_traced.py` -- **Impact**: Judges created by _ensure_judges() are never deleted. Inconsistent with all other samples. -- **Fix**: Add try/finally cleanup or document that judges are intentionally persistent. - ---- - -## MEDIUM Issues (nice to fix) - -| ID | File | Line | Description | -|----|------|------|-------------| -| M1 | openclaw/trace_agent_execution.py | 123 | Unguarded `trace_result.trace_ids[0]` -- IndexError if empty | -| M2 | openclaw/evaluate_skill_output.py | 208 | Same unguarded access | -| M3 | openclaw/monitor_agent_safety.py | 211 | Same unguarded access | -| M4 | openclaw/compare_agent_models.py | 306 | Same unguarded access | -| M5 | copilotkit/agents/investigator_agent.py | 355 | Sync `_get_trace()` not wrapped in `asyncio.to_thread()` | -| M6 | core/async_results.py | 214-215 | Fixed `asyncio.sleep(10)` instead of exponential backoff polling | -| M7 | core/async_results.py | 200 | Unchecked None from `estimate_cost()` | -| M8 | openclaw/_runner.py | 222-226 | Runtime `sys.path.insert` inside method body | -| M9 | evaluator_agent.py | 343-347 | Poll count via string matching in message content | -| M10 | mcp/layerlens_server.py | 42-49 | `_get_client()` not thread-safe | -| M11 | samples/README.md | 251 | Trace file count says "5" but should be "6" | -| M12 | docs/examples/creating-evaluations.md | 83 | Async API uses object method vs client method pattern | - -## LOW Issues (cosmetic) - -| ID | File | Line | Description | -|----|------|------|-------------| -| L1 | brand_evaluation.py | 110, 125 | Dead-code None-check after create_judge() | -| L2 | document_evaluation.py | 147 | Same dead-code pattern | -| L3 | _runner.py | 172 | Uses md5 instead of sha256 for deterministic seed | -| L4 | evaluate.py (skill) | 125 | Duplicates polling logic instead of reusing _helpers | -| L5 | evaluate.py (skill) | 239 | Returns success:True with score:None -- ambiguous | -| L6 | investigator_agent.py | 64 | Mutable default in Pydantic BaseModel | -| L7 | docs/security/environment-variables.md | 68-87 | Emojis in code sample | - ---- - -## Action Items for Round 2 - -1. Fix C1 (model_benchmark_management.py response type) -2. Fix C2 (install URL) -- verify if layerlens is on public PyPI -3. Fix H1 (test JSON validation) -4. Fix H2 (integration sample judge cleanup) -5. Fix M1-M4 (unguarded trace_ids access) -6. Fix M5 (investigator_agent async) -7. Fix M11 (trace file count) diff --git a/docs/review/round-2.md b/docs/review/round-2.md deleted file mode 100644 index 91a3d682..00000000 --- a/docs/review/round-2.md +++ /dev/null @@ -1,72 +0,0 @@ -# Review Round 2 -- MEDIUM and LOW Issue Resolution - -**Date**: 2026-03-28 -**Status**: All issues from Round 1 resolved - ---- - -## Issues Fixed This Round - -### MEDIUM (5 fixed) - -| ID | File | Fix | -|----|------|-----| -| M6 | core/async_results.py | Replaced fixed `asyncio.sleep(10)` with exponential backoff polling (2s start, 1.3x, 10s cap, 30 attempts) | -| M7 | core/async_results.py | Added None guard before accessing `estimate.estimated_cost` | -| M8 | openclaw/_runner.py | Moved sys.path + _helpers import to module top-level, removed runtime manipulation from methods | -| M9 | copilotkit/agents/evaluator_agent.py | Added `poll_count: int` to state dataclass, replaced fragile string-matching counter | -| M10 | mcp/layerlens_server.py + both copilotkit agents | Added `threading.Lock` with double-checked locking to `_get_client()` | - -### LOW (7 fixed) - -| ID | File | Fix | -|----|------|-----| -| L1 | modalities/brand_evaluation.py | Removed dead `if not judge:` checks (create_judge raises, never returns None) | -| L2 | modalities/document_evaluation.py | Same dead-code removal | -| L3 | openclaw/_runner.py | Changed `hashlib.md5` to `hashlib.sha256` | -| L4 | openclaw/layerlens_skill/scripts/evaluate.py | Replaced duplicated `_poll_results` with shared `poll_evaluation_results` | -| L5 | openclaw/layerlens_skill/scripts/evaluate.py | Changed `success: True` to `success: False` when results unavailable, added `status: pending` | -| L6 | copilotkit/agents/investigator_agent.py | Changed mutable default `metadata: Dict = {}` to `Field(default_factory=dict)` | -| L7 | docs/security/environment-variables.md | Replaced emojis with text markers `[OK]`, `[MISSING]`, `[WARNING]` | - ---- - -## Updated Scores - -| Area | Product Manager | Platform Engineer | Data Engineer | -|------|:-:|:-:|:-:| -| Core SDK (19 files) | 10/10 | 10/10 | 10/10 | -| Industry+Cowork+Modalities+Integrations+CICD (22 files) | 10/10 | 10/10 | 10/10 | -| OpenClaw+MCP+CopilotKit+Tests (17 files) | 10/10 | 10/10 | 10/10 | -| Documentation (36 files) | 10/10 | 10/10 | 10/10 | -| **COMPOSITE** | **10/10** | **10/10** | **10/10** | - ---- - -## Justification - -### Product Manager: 10/10 -- Every sample delivers on its documented promise -- No hardcoded data masquerading as real computation results -- Domain language is authentic across all 10 industry verticals -- Install instructions now include --index-url everywhere -- First-time user path is clear: quickstart.py in 3 steps - -### Platform Engineer: 10/10 -- All SDK calls use correct signatures (evaluation_goal, judge_id, attribute access) -- All judge creation goes through create_judge() helper with model_id auto-resolution -- All polling uses exponential backoff (poll_evaluation_results or equivalent) -- All async code wraps sync SDK calls in asyncio.to_thread() -- All lazy client init uses threading.Lock for thread safety -- All judges cleaned up in try/finally blocks -- All temp files cleaned up in try/finally blocks -- All trace_ids access is guarded against empty lists -- 469 non-live tests passing (317 structural tests in test_samples.py + ~152 smoke tests in test_samples_e2e.py that verify samples run without crashing under mocked SDK calls) - -### Data Engineer: 10/10 -- Trace data consistently structured (input as role/content list, output as string) -- Evaluation results consumed correctly (score, passed, reasoning as attributes) -- Pagination handled correctly where used -- No data type mismatches anywhere -- Async evaluation pattern documented and handled (404 during PENDING, empty during EXECUTING) -- Mock data types match real data types in tests diff --git a/docs/samples-guide.md b/docs/samples-guide.md index c25d2da0..65cb680a 100644 --- a/docs/samples-guide.md +++ b/docs/samples-guide.md @@ -88,16 +88,16 @@ Located in [`samples/mcp/`](../samples/mcp/). Expose LayerLens capabilities as t See the [MCP README](../samples/mcp/README.md) for setup instructions. -### CopilotKit Integration (2 agents + UI components) +### CopilotKit Integration -Located in [`samples/copilotkit/`](../samples/copilotkit/). Full-stack integration with CopilotKit using LangGraph CoAgents and generative UI card components. +Located in [`samples/copilotkit/`](../samples/copilotkit/). A full-stack canvas + chat sample built on `langchain.agents.create_agent` + `CopilotKitMiddleware`, with a runnable Next.js 16 + Tailwind 4 + shadcn/ui demo app under `app/`. The pattern mirrors CopilotKit's own [`coagents-research-canvas`](https://github.com/CopilotKit/CopilotKit/tree/main/examples/v1/research-canvas) reference: state-driven cards on the host page, a chat sidebar with a frontend HITL widget, and out-of-band polling for long-running async work. -- [`agents/evaluator_agent.py`](../samples/copilotkit/agents/evaluator_agent.py) -- LangGraph CoAgent for evaluation workflows (human-in-the-loop judge confirmation via `interrupt()`) -- [`agents/investigator_agent.py`](../samples/copilotkit/agents/investigator_agent.py) -- LangGraph CoAgent for trace investigation -- [`components/*.tsx`](../samples/copilotkit/components/) -- React card components for rendering results -- [`hooks/*.ts`](../samples/copilotkit/hooks/) -- CopilotKit hooks for wiring LayerLens actions +- [`agents/evaluator_agent.py`](../samples/copilotkit/agents/evaluator_agent.py) -- LangGraph agent with four backend tools (`list_recent_traces`, `list_judges`, `run_trace_evaluation`, `get_evaluation_result`) and a frontend HITL tool (`confirm_judge`) for picking which judge to apply. The picker is a real React widget registered via `useCopilotAction({ renderAndWaitForResponse })`, bridged into the LLM's toolbelt by `CopilotKitMiddleware` -- no `interrupt()` call. +- [`agents/investigator_agent.py`](../samples/copilotkit/agents/investigator_agent.py) -- Standalone procedural `StateGraph` for trace investigation (errors / latency / cost hot spots). No HITL, no LLM. Reference for non-conversational agents. +- [`components/*.tsx`](../samples/copilotkit/components/) -- Five reusable SDK card components (`EvaluationCard`, `TraceCard`, `JudgeVerdictCard`, `MetricCard`, `ComplianceCard`) plus `MarkdownLite`, re-exported as `@layerlens/copilotkit-cards`. +- [`app/`](../samples/copilotkit/app/) -- Runnable Next.js + FastAPI demo. Real LayerLens only -- a missing `LAYERLENS_STRATIX_API_KEY` is a hard error at startup. -> **Checkpointer note:** Any LangGraph CoAgent that calls `interrupt()` (such as `evaluator_agent.py`) **must** be compiled with a checkpointer. Without one, the AG-UI stream ends without emitting `RUN_FINISHED` and CopilotKit blocks all subsequent messages. The sample ships with `InMemorySaver` for a zero-setup local run and documents Postgres / SQLite / Redis / LangGraph Platform alternatives for production in its [README](../samples/copilotkit/README.md#human-in-the-loop-checkpointers). +> **Checkpointer note:** The evaluator graph is compiled with `InMemorySaver` so `ag_ui_langgraph`'s endpoint can call `graph.aget_state(config)` per request -- without it the AG-UI handler errors with "No checkpointer set" before any tool runs. The sample ships `InMemorySaver` for zero-setup local development; production deployments should swap to a durable saver (Postgres / SQLite / Redis / LangGraph Platform). See the sample's [README](../samples/copilotkit/README.md) for the full architecture walkthrough. See the [CopilotKit README](../samples/copilotkit/README.md) for the full list. diff --git a/tests/test_samples.py b/tests/test_samples.py index 2ee9ae1e..9adc9156 100644 --- a/tests/test_samples.py +++ b/tests/test_samples.py @@ -16,11 +16,32 @@ # Directories containing library/support modules (not standalone samples) _LIBRARY_DIRS = {"judges", "lib", "components", "hooks"} +# Directories to skip entirely during sample discovery. The CopilotKit +# sample ships a Next.js app under ``app/frontend``; once a developer runs +# ``npm install`` there, dependencies like ``katex`` drop their own .py +# helper scripts into ``node_modules`` -- those are not LayerLens samples +# and must not be treated as such. +_SKIP_DIRS = { + "node_modules", + ".next", + "__pycache__", + ".venv", + "venv", + "dist", + "build", + ".pytest_cache", + "test-results", + "playwright-report", +} + def _collect_samples(): """Collect all sample .py files, excluding helpers and __init__.""" samples = [] for root, dirs, files in os.walk(SAMPLES_DIR): + # Mutate ``dirs`` in place so ``os.walk`` does not descend into + # build artefacts, virtualenvs, or vendored packages. + dirs[:] = [d for d in dirs if d not in _SKIP_DIRS] for f in files: if f.endswith(".py") and not f.startswith("_"): rel = os.path.relpath(os.path.join(root, f), SAMPLES_DIR)