diff --git a/README.md b/README.md index cb6698af..a7c06f9f 100644 --- a/README.md +++ b/README.md @@ -169,6 +169,17 @@ Stratix powers evaluation workflows at LayerLens and across teams building produ If your team uses Stratix, [open a PR](https://github.com/LayerLens/stratix-python/pulls) to add your logo here. +## Join the Community + +The LayerLens Discord is the best place to: +- Get help with the SDK and trace evaluations +- Share your custom judges and agent workflows +- Access free Stratix Premium Credits for active contributors +- Join weekly Eval Office Hours & model comparison discussions +- Influence the roadmap + +[Join the LayerLens Discord!](https://discord.gg/layerlens) + ## Documentation Full documentation is available at [layerlens.gitbook.io/stratix-python-sdk](https://layerlens.gitbook.io/stratix-python-sdk). diff --git a/docs/api-reference/models-benchmarks.md b/docs/api-reference/models-benchmarks.md index bef109ac..aeffd32e 100644 --- a/docs/api-reference/models-benchmarks.md +++ b/docs/api-reference/models-benchmarks.md @@ -101,7 +101,7 @@ Returns an `Optional[Model]` - a single `Model` object if found, or `None` if th ### `add(*model_ids, timeout=None)` -Adds public models to the project by their IDs. +Adds models (public or custom) to the project by their IDs. #### Parameters @@ -123,7 +123,7 @@ success = client.models.add("model-id-1", "model-id-2") ### `remove(*model_ids, timeout=None)` -Removes models from the project by their IDs. +Removes models (public or custom) from the project's model list. The underlying records are not deleted — use `delete_custom` to fully tear down a custom model. #### Parameters @@ -187,6 +187,58 @@ if result: print(f"Created model: {result.model_id}") ``` +### `update_custom(model_id, *, api_url=None, api_key=None, max_tokens=None, timeout=None)` + +Updates a custom model's mutable fields. At least one of `api_url`, `api_key`, or `max_tokens` must be provided. 
Primary use case: repointing `api_url` for ephemeral vLLM endpoints behind cloudflared tunnels whose URL changes between sessions. + +#### Parameters + +| Parameter | Type | Required | Description | +| ------------ | -------------------------------- | -------- | -------------------------------------------------------- | +| `model_id` | `str` | Yes | ID of the custom model to update | +| `api_url` | `str \| None` | No | New base URL for the OpenAI-compatible API endpoint | +| `api_key` | `str \| None` | No | New API key for the model provider | +| `max_tokens` | `int \| None` | No | New maximum tokens value | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +#### Returns + +Returns `bool` — `True` on success, `False` otherwise. + +#### Example + +```python +client = Stratix() + +# Repoint the api_url without re-creating the model +client.models.update_custom( + "model-id-from-create-custom", + api_url="https://my-new-endpoint.example.com/v1", +) +``` + +### `delete_custom(model_id, *, timeout=None)` + +Disables a custom model and removes it from `Project.Models`. The backend tears down the model's S3 yaml artifacts and AWS secret, and marks the record as disabled (preserving any evaluation references). Public models cannot be deleted via the SDK. + +#### Parameters + +| Parameter | Type | Required | Description | +| ---------- | -------------------------------- | -------- | --------------------------------- | +| `model_id` | `str` | Yes | ID of the custom model to delete | +| `timeout` | `float \| httpx.Timeout \| None` | No | Override request timeout | + +#### Returns + +Returns `bool` — `True` on success, `False` otherwise. 
+ +#### Example + +```python +client = Stratix() +client.models.delete_custom("model-id-from-create-custom") +``` + ## Benchmarks ### `get(type=None, name=None, key=None, categories=None, languages=None, timeout=None)` diff --git a/docs/examples/models-and-benchmarks.md b/docs/examples/models-and-benchmarks.md index 67d1d7fe..e517344f 100644 --- a/docs/examples/models-and-benchmarks.md +++ b/docs/examples/models-and-benchmarks.md @@ -133,6 +133,69 @@ def main(): print(f" - {m.name} (id={m.id}, key={m.key})") +if __name__ == "__main__": + main() +``` + +## Repointing a Custom Model's `api_url` + +Use this when your model's endpoint URL changes — for example, when serving a vLLM instance behind a cloudflared tunnel that rotates its hostname between sessions. + +```python +from layerlens import Stratix + + +def main(): + client = Stratix() + + result = client.models.create_custom( + name="My Tunnel-backed Model", + key="my-org/tunnel-model-v1", + description="vLLM served behind a cloudflared tunnel", + api_url="https://tunnel-1.example.com/v1", + api_key="my-provider-api-key", + max_tokens=4096, + ) + assert result is not None + + # Later, when the tunnel URL changes: + client.models.update_custom( + result.model_id, + api_url="https://tunnel-2.example.com/v1", + ) + + # Run evaluations as usual — the model now points at the new endpoint. + + +if __name__ == "__main__": + main() +``` + +## Replacing a Custom Model + +`delete_custom` releases the model's name so it can be reused. This is useful for replacing a misconfigured model without picking a new name. 
+ +```python +from layerlens import Stratix + + +def main(): + client = Stratix() + + # Tear down the old version + client.models.delete_custom("old-model-id") + + # Recreate with the same name (now free) + client.models.create_custom( + name="My Custom Model", + key="my-org/custom-model-v2", + description="Replacement after schema migration", + api_url="https://my-endpoint.example.com/v1", + api_key="my-provider-api-key", + max_tokens=4096, + ) + + if __name__ == "__main__": main() ``` diff --git a/samples/core/compound_failure_calculator.py b/samples/core/compound_failure_calculator.py new file mode 100644 index 00000000..9f05a4b3 --- /dev/null +++ b/samples/core/compound_failure_calculator.py @@ -0,0 +1,1087 @@ +#!/usr/bin/env python +""" +Compound Failure Calculator -- LayerLens Python SDK Sample +========================================================== + +Proves a critical insight for AI agent reliability: an agent that +passes 85% of individual steps drops to roughly 20% end-to-end +accuracy over 10 steps. The compound failure effect is the single +biggest reason multi-step agents fail silently in production. + +This tool takes multi-step agent traces, evaluates each step +independently with Stratix judges, then computes and visualizes +the compound failure probability using real evaluation data. + +The math is simple: P(all_pass) = p^n, where p is per-step +accuracy and n is the number of steps. The consequences are not. 
+ +Prerequisites +------------- +* ``pip install layerlens --index-url https://sdk.layerlens.ai/package`` +* Set ``LAYERLENS_STRATIX_API_KEY`` environment variable +* (Optional) ``pip install matplotlib`` for PNG chart export + +Usage +----- +:: + + export LAYERLENS_STRATIX_API_KEY=your-api-key + + # Simulate a 7-step agent and show compound failure curve + python compound_failure_calculator.py --simulate 7 + + # Custom per-step accuracy + python compound_failure_calculator.py --simulate 10 --per-step-accuracy 0.90 + + # Evaluate a real multi-step trace file + python compound_failure_calculator.py --trace-file ../data/traces/multi_step_agent_workflow.json + + # Show compound failure across a custom range of step counts + python compound_failure_calculator.py --simulate 7 --steps-range 1 25 + + # Export as JSON + python compound_failure_calculator.py --simulate 7 --json + + # Save visualization as PNG (requires matplotlib) + python compound_failure_calculator.py --simulate 7 --output reliability_curve.png +""" + +from __future__ import annotations + +import os +import sys +import json +import math +import time +import logging +import argparse +from typing import Any, Dict, List, Optional, Tuple + +from layerlens import Stratix + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) +from _helpers import create_judge, poll_evaluation_results, upload_trace_dict + +# --------------------------------------------------------------------------- +# Logging +# --------------------------------------------------------------------------- + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", +) +logger = logging.getLogger("layerlens.samples.compound_failure_calculator") + + +# --------------------------------------------------------------------------- +# Pre-computed compound scenario table +# --------------------------------------------------------------------------- + +COMPOUND_SCENARIOS: Dict[str, Dict[int, 
float]] = {} + +for _acc in (0.99, 0.95, 0.90, 0.85, 0.80, 0.75): + _key = f"{_acc:.2f}" + COMPOUND_SCENARIOS[_key] = {n: round(_acc ** n, 6) for n in range(1, 21)} + + +# --------------------------------------------------------------------------- +# Sample agent steps (embedded for --simulate mode) +# --------------------------------------------------------------------------- + +SAMPLE_AGENT_STEPS: List[Dict[str, str]] = [ + { + "step_name": "parse_customer_intent", + "input": ( + "Hi, I ordered a wireless keyboard (order #WK-20260508-4471) last " + "Tuesday and it arrived yesterday with a stuck spacebar. I already " + "tried the troubleshooting steps on your website and nothing fixed " + "it. I want a full refund including the expedited shipping I paid " + "for." + ), + "output": ( + "Intent parsed. Customer requests a full refund for order " + "#WK-20260508-4471 (wireless keyboard). Reason: hardware defect " + "(stuck spacebar). Customer has already attempted self-service " + "troubleshooting. Refund scope requested: product cost plus " + "expedited shipping fee. Sentiment: frustrated but cooperative." + ), + "evaluation_goal": ( + "Evaluate whether the agent correctly identified the customer " + "intent, extracted all relevant order details, and classified " + "the sentiment accurately." + ), + }, + { + "step_name": "order_lookup", + "input": ( + "Look up order #WK-20260508-4471 in the database and retrieve " + "all order details including product, pricing, shipping, tax, " + "delivery date, and return eligibility window." + ), + "output": ( + "Order retrieved. Product: TechFlow Pro Wireless Keyboard " + "(SKU: TF-KB-PRO-BLK). Ordered: 2026-05-08. Delivered: " + "2026-05-11. Product price: $89.99. Expedited shipping: $12.99. " + "Tax (CA 9.5%): $9.78. Order total: $112.76. Return window: " + "30 days from delivery (expires 2026-06-10). No prior returns " + "on this account." 
+ ), + "evaluation_goal": ( + "Evaluate whether the agent retrieved complete and accurate " + "order information including all pricing components, dates, " + "and return eligibility status." + ), + }, + { + "step_name": "check_refund_eligibility", + "input": ( + "Check whether order #WK-20260508-4471 qualifies for a refund " + "under current return policies. The product has a hardware " + "defect (stuck spacebar) and the customer attempted " + "troubleshooting." + ), + "output": ( + "Eligibility verified against refund policy RP-2024-003. " + "Return window: ELIGIBLE (within 30 days). Defect category: " + "ELIGIBLE (hardware manufacturing defect, Section 4.2). " + "Troubleshooting requirement: SATISFIED. Shipping refund: " + "ELIGIBLE (defective returns, Section 6.1). Restocking fee: " + "WAIVED (defective items exempt). All six checks passed." + ), + "evaluation_goal": ( + "Evaluate whether the agent correctly applied the refund " + "policy rules, checked all required eligibility criteria, " + "and reached the correct eligibility determination." + ), + }, + { + "step_name": "calculate_refund_amount", + "input": ( + "Calculate the exact refund amount for order #WK-20260508-4471. " + "Product: $89.99. Shipping: $12.99. Original tax: $9.78. " + "Apply California tax rules for the refund calculation." + ), + "output": ( + "Refund calculated. Product $89.99 plus shipping $12.99, " + "subtotal $102.98. California sales tax at 9.5% on product " + "only (shipping is tax-exempt in CA per Revenue and Taxation " + "Code Section 6012): $8.55. Original tax of $9.78 included " + "tax on shipping in error. Total refund: $111.53." + ), + "evaluation_goal": ( + "Evaluate the mathematical accuracy of the refund calculation, " + "including correct tax treatment under California tax law and " + "proper handling of the tax overcharge on the original order." 
+ ), + }, + { + "step_name": "draft_customer_response", + "input": ( + "Draft a customer-facing response for ticket TKT-2026-0512-3347 " + "communicating the approved refund of $111.53, return " + "instructions, and timeline. Tone: empathetic and professional." + ), + "output": ( + "Hi there, thank you for reaching out about your TechFlow Pro " + "Wireless Keyboard. I am sorry the spacebar is giving you " + "trouble. Your order qualifies for a full refund of $111.53 " + "(product $89.99 + shipping $12.99 + tax $8.55). The refund " + "will go to your Visa ending in 3892 within 5 to 7 business " + "days. I am generating a prepaid UPS return label now. Just " + "drop the keyboard at any UPS location. Would you prefer a " + "replacement instead?" + ), + "evaluation_goal": ( + "Evaluate the customer response for accuracy, empathetic tone, " + "completeness of information (amount, timeline, return " + "instructions), and appropriate offering of alternatives." + ), + }, + { + "step_name": "verify_tone_and_compliance", + "input": ( + "Review the drafted customer response for PII safety, refund " + "amount accuracy, timeline accuracy, tone appropriateness, " + "legal compliance, and absence of inappropriate content." + ), + "output": ( + "Compliance review complete. PII handling: PASS (only last " + "four card digits shown). Amount accuracy: PASS ($111.53 " + "matches calculation). Timeline: PASS (5-7 business days " + "within Visa policy). Return label: PASS (generation " + "triggered). Tone: empathetic and professional, no defensive " + "language. Legal: PASS. Competitor mentions: PASS (none). " + "Upsell pressure: PASS (replacement offer is optional). " + "All 8 checks passed." + ), + "evaluation_goal": ( + "Evaluate whether the compliance review was thorough, checked " + "all required categories, and correctly identified any issues " + "or confirmed compliance across all dimensions." 
def compute_compound_reliability(
    per_step_accuracy: float,
    max_steps: int,
) -> List[Dict[str, Any]]:
    """Compute compound reliability for each step count from 1 to max_steps.

    The compound reliability of an ``n``-step workflow is
    ``per_step_accuracy ** n``; the failure probability is its complement.
    Both values are rounded to six decimal places.

    Args:
        per_step_accuracy: Probability of any single step passing (0.0 to 1.0).
        max_steps: Maximum number of steps to compute. A value below 1
            produces an empty list.

    Returns:
        List of dicts with keys: steps, compound_reliability, failure_probability.
    """
    results: List[Dict[str, Any]] = []
    for n in range(1, max_steps + 1):
        compound = per_step_accuracy ** n
        results.append({
            "steps": n,
            "compound_reliability": round(compound, 6),
            "failure_probability": round(1 - compound, 6),
        })
    return results


def find_reliability_cliff(
    per_step_accuracy: float,
    threshold: float = 0.50,
) -> int:
    """Find the step count where compound reliability drops below a threshold.

    Returns the smallest ``n >= 1`` such that
    ``per_step_accuracy ** n < threshold`` (strictly below).

    Args:
        per_step_accuracy: Per-step pass rate.
        threshold: Reliability threshold (default 0.50). Must be greater
            than 0 when ``per_step_accuracy`` lies strictly between 0 and 1.

    Returns:
        Step count where reliability first drops below the threshold.
        For a per-step accuracy outside the open interval (0, 1) the
        function returns 1. NOTE: for an accuracy of 1.0 or more the
        reliability never actually decays; 1 is kept as the historical
        return value for that degenerate input.

    Raises:
        ValueError: If ``threshold`` is not positive, because a positive
            reliability can never drop below it.
    """
    if per_step_accuracy <= 0 or per_step_accuracy >= 1:
        return 1
    if threshold <= 0:
        raise ValueError("threshold must be greater than 0")
    if threshold > per_step_accuracy:
        # Already below the threshold after a single step (this also covers
        # thresholds >= 1, for which the logarithm ratio would be <= 0).
        return 1

    # Closed-form estimate; it can be off by one because of floating-point
    # rounding or when p**n lands exactly on the threshold.
    estimate = max(
        1,
        math.ceil(math.log(threshold) / math.log(per_step_accuracy)),
    )
    # Correct the estimate against the definition itself.
    while estimate > 1 and per_step_accuracy ** (estimate - 1) < threshold:
        estimate -= 1
    while per_step_accuracy ** estimate >= threshold:
        estimate += 1
    return estimate


def expected_steps_before_failure(per_step_accuracy: float) -> float:
    """Compute the expected step count at which the first failure occurs.

    Formula: E[steps] = 1 / (1 - p) where p is per-step accuracy. This is
    the mean of a geometric distribution, i.e. the count includes the
    failing step itself.

    Args:
        per_step_accuracy: Per-step pass rate.

    Returns:
        Expected step count, rounded to two decimals, or ``inf`` when the
        per-step failure rate ``1 - p`` is zero or negative.
    """
    q = 1.0 - per_step_accuracy
    if q <= 0:
        return float("inf")
    return round(1.0 / q, 2)
def _unscored_step_result(
    step_name: str,
    step_number: int,
    reasoning: str,
    trace_id: Optional[str] = None,
    judge_id: Optional[str] = None,
) -> Dict[str, Any]:
    """Build the result record for a step that produced no score."""
    return {
        "step_name": step_name,
        "step_number": step_number,
        "score": None,
        "passed": None,
        "reasoning": reasoning,
        "trace_id": trace_id,
        "judge_id": judge_id,
    }


def evaluate_steps_with_stratix(
    client: Stratix,
    steps: List[Dict[str, str]],
    skip_cleanup: bool = False,
) -> List[Dict[str, Any]]:
    """Upload each step as a trace, create a judge, evaluate, and collect results.

    For every step the function uploads a trace, creates a dedicated judge,
    starts a trace evaluation and polls for its result. A failure at any of
    those stages is recorded as an unscored result (``score`` and ``passed``
    set to ``None``) and processing continues with the next step.

    Args:
        client: An initialized Stratix client.
        steps: List of step dicts from parse_trace_steps or SAMPLE_AGENT_STEPS.
        skip_cleanup: If True, keep created resources after evaluation.

    Returns:
        List of per-step result dicts with keys: step_name, step_number,
        score, passed, reasoning, trace_id, judge_id.

    Raises:
        SystemExit: If no public model is available to back the judges.
    """
    created_trace_ids: List[str] = []
    created_judge_ids: List[str] = []
    step_results: List[Dict[str, Any]] = []

    try:
        models = client.models.get(type="public")
        if not models:
            logger.error("No public models available for judge creation.")
            sys.exit(1)
        model_id = models[0].id

        for i, step in enumerate(steps, 1):
            step_name = step["step_name"]
            logger.info(
                "Step %d/%d: Evaluating '%s'",
                i, len(steps), step_name,
            )

            # Upload the step as a trace
            result = upload_trace_dict(
                client,
                input_text=step["input"],
                output_text=step["output"],
                metadata={
                    "source": "compound-failure-calculator",
                    "step_name": step_name,
                    "step_number": i,
                },
            )
            if not result or not result.trace_ids:
                logger.error(" Failed to upload trace for step '%s'", step_name)
                step_results.append(
                    _unscored_step_result(step_name, i, "Trace upload failed")
                )
                continue

            trace_id = result.trace_ids[0]
            created_trace_ids.append(trace_id)

            # Create a judge for this step type
            judge = create_judge(
                client,
                name=f"Compound Calc Step Judge {step_name} {int(time.time())}",
                evaluation_goal=step["evaluation_goal"],
                model_id=model_id,
            )
            if not judge:
                logger.error(" Failed to create judge for step '%s'", step_name)
                step_results.append(
                    _unscored_step_result(
                        step_name, i, "Judge creation failed", trace_id,
                    )
                )
                continue

            created_judge_ids.append(judge.id)

            # Run the evaluation
            trace_eval = client.trace_evaluations.create(
                trace_id=trace_id,
                judge_id=judge.id,
            )
            if not trace_eval:
                logger.error(" Failed to create evaluation for step '%s'", step_name)
                step_results.append(
                    _unscored_step_result(
                        step_name, i, "Evaluation creation failed",
                        trace_id, judge.id,
                    )
                )
                continue

            # Poll for results
            eval_results = poll_evaluation_results(client, trace_eval.id)
            if not eval_results:
                # The helper returned nothing; this sample records that as
                # a timeout.
                logger.warning(" No results returned for step '%s'", step_name)
                step_results.append(
                    _unscored_step_result(
                        step_name, i, "Evaluation timed out",
                        trace_id, judge.id,
                    )
                )
                continue

            r = eval_results[0]
            # Keep stored reasoning short; the log line is shortened further.
            reasoning_text = (r.reasoning or "")[:200]
            step_results.append({
                "step_name": step_name,
                "step_number": i,
                "score": r.score,
                "passed": r.passed,
                "reasoning": reasoning_text,
                "trace_id": trace_id,
                "judge_id": judge.id,
            })
            status = "PASS" if r.passed else "FAIL"
            logger.info(
                " Result: %s (score=%s) %s",
                status, r.score, reasoning_text[:80],
            )

    finally:
        if not skip_cleanup:
            logger.info("Cleaning up %d traces and %d judges...",
                        len(created_trace_ids), len(created_judge_ids))
            # Cleanup stays best-effort, but failures are reported so that
            # leaked resources do not go unnoticed.
            for jid in created_judge_ids:
                try:
                    client.judges.delete(jid)
                except Exception as exc:
                    logger.warning("Failed to delete judge %s: %s", jid, exc)
            for tid in created_trace_ids:
                try:
                    client.traces.delete(tid)
                except Exception as exc:
                    logger.warning("Failed to delete trace %s: %s", tid, exc)

    return step_results
+ """ + start, end = steps_range + chart_width = 60 + chart_height = 20 + lines: List[str] = [] + + lines.append("") + lines.append(" COMPOUND RELIABILITY DECAY") + lines.append( + f" Per-step accuracy: {per_step_accuracy:.1%}" + f" Steps: {start} to {end}" + ) + lines.append("") + + # Compute values for each step count + values = [] + for n in range(start, end + 1): + values.append((n, per_step_accuracy ** n)) + + # Build actual results lookup + actual_map: Dict[int, bool] = {} + if actual_results: + for r in actual_results: + if r.get("passed") is not None: + actual_map[r["step_number"]] = r["passed"] + + # Y-axis: 0% to 100% + for row in range(chart_height, -1, -1): + y_val = row / chart_height + if row % 5 == 0: + label = f"{y_val:5.0%} |" + else: + label = " |" + + bar = [] + for n, compound in values: + col_y = compound + # Normalize to row position + if abs(col_y - y_val) < (0.5 / chart_height): + # Check if we have actual data for this step + if n in actual_map: + bar.append("@" if actual_map[n] else "X") + else: + bar.append("*") + elif row == 0: + bar.append("_") + else: + # Mark threshold lines + if abs(y_val - 0.50) < (0.5 / chart_height): + bar.append("-") + elif abs(y_val - 0.20) < (0.5 / chart_height): + bar.append(".") + else: + bar.append(" ") + + lines.append(label + "".join(f" {c} " for c in bar)) + + # X-axis + x_labels = " " + for n in range(start, end + 1): + x_labels += f"{n:>3}" + lines.append(x_labels) + lines.append(" " + " " * (len(values) * 3)) + lines.append(" " + "Steps in agent workflow".center(len(values) * 3)) + + # Legend + lines.append("") + lines.append(" Legend:") + lines.append(" * Theoretical compound reliability (p^n)") + if actual_results: + lines.append(" @ Actual step evaluation: PASS") + lines.append(" X Actual step evaluation: FAIL") + lines.append(" - 50% reliability threshold") + lines.append(" . 
def render_scenario_table() -> str:
    """Render the pre-computed compound scenarios as a text table.

    One row is produced per accuracy key in ``COMPOUND_SCENARIOS`` (highest
    key first) and one column per selected step count.

    Returns:
        Multi-line string with the scenario comparison table.
    """
    step_counts = [1, 3, 5, 7, 10, 15, 20]

    # Header row; the rule underneath is sized from the header width.
    header = " Accuracy |" + "".join(f" {n:>5} |" for n in step_counts)
    table: List[str] = [
        "",
        " COMPOUND RELIABILITY TABLE",
        " Compound success probability (p^n) at selected step counts",
        "",
        header,
        " " + "-" * (len(header) - 2),
    ]

    for acc_key in sorted(COMPOUND_SCENARIOS, reverse=True):
        by_steps = COMPOUND_SCENARIOS[acc_key]
        cells = "".join(f" {by_steps[n]:5.1%} |" for n in step_counts)
        table.append(f" {acc_key} |" + cells)

    table.extend([
        "",
        " Reading: at 85% per-step accuracy over 10 steps,",
        " compound reliability is only 19.7%. Over 20 steps: 3.9%.",
    ])
    return "\n".join(table)
+ steps_range: (start, end) range of step counts. + output_path: File path for the saved PNG. + actual_results: Optional per-step results to overlay. + """ + try: + import matplotlib.pyplot as plt + import matplotlib.ticker as ticker + except ImportError: + logger.error( + "matplotlib is required for PNG output. " + "Install with: pip install matplotlib" + ) + sys.exit(1) + + start, end = steps_range + steps = list(range(start, end + 1)) + compound = [per_step_accuracy ** n for n in steps] + + fig, ax = plt.subplots(figsize=(12, 7)) + ax.plot(steps, compound, "b-o", linewidth=2, markersize=6, + label=f"Compound reliability (p={per_step_accuracy:.0%})") + + # Overlay actual results if available + if actual_results: + pass_steps = [ + r["step_number"] for r in actual_results + if r.get("passed") is True + ] + fail_steps = [ + r["step_number"] for r in actual_results + if r.get("passed") is False + ] + if pass_steps: + pass_y = [per_step_accuracy ** n for n in pass_steps] + ax.scatter(pass_steps, pass_y, c="green", s=120, zorder=5, + label="Actual: PASS", marker="^") + if fail_steps: + fail_y = [per_step_accuracy ** n for n in fail_steps] + ax.scatter(fail_steps, fail_y, c="red", s=120, zorder=5, + label="Actual: FAIL", marker="v") + + # Threshold lines + ax.axhline(y=0.50, color="orange", linestyle="--", alpha=0.7, + label="50% reliability") + ax.axhline(y=0.20, color="red", linestyle="--", alpha=0.7, + label="20% reliability") + + # Annotations + cliff_50 = find_reliability_cliff(per_step_accuracy, 0.50) + cliff_20 = find_reliability_cliff(per_step_accuracy, 0.20) + if start <= cliff_50 <= end: + ax.annotate( + f"50% cliff at step {cliff_50}", + xy=(cliff_50, 0.50), xytext=(cliff_50 + 1, 0.60), + arrowprops=dict(arrowstyle="->", color="orange"), + fontsize=10, color="orange", + ) + if start <= cliff_20 <= end: + ax.annotate( + f"20% cliff at step {cliff_20}", + xy=(cliff_20, 0.20), xytext=(cliff_20 + 1, 0.30), + arrowprops=dict(arrowstyle="->", color="red"), + 
def build_summary(
    per_step_accuracy: float,
    num_steps: int,
    step_results: Optional[List[Dict[str, Any]]] = None,
) -> Dict[str, Any]:
    """Build a summary report of compound failure analysis.

    Args:
        per_step_accuracy: Per-step accuracy used for computation.
        num_steps: Number of steps in the workflow.
        step_results: Optional list of actual evaluation results. A step
            counts as evaluated when its ``passed`` field is ``True`` or
            ``False``; steps whose ``passed`` is ``None`` are ignored in
            the pass-rate statistics.

    Returns:
        Dict containing all summary statistics and per-step details. The
        ``actual_results`` key is present only when ``step_results`` is
        non-empty.
    """
    compound = per_step_accuracy ** num_steps
    cliff_50 = find_reliability_cliff(per_step_accuracy, 0.50)
    cliff_20 = find_reliability_cliff(per_step_accuracy, 0.20)
    expected = expected_steps_before_failure(per_step_accuracy)

    summary: Dict[str, Any] = {
        "per_step_accuracy": per_step_accuracy,
        "num_steps": num_steps,
        "compound_reliability": round(compound, 6),
        "compound_failure_rate": round(1 - compound, 6),
        "reliability_cliff_50pct": cliff_50,
        "reliability_cliff_20pct": cliff_20,
        "expected_steps_before_failure": expected,
        # The curve always covers at least 15 step counts.
        "reliability_curve": compute_compound_reliability(
            per_step_accuracy, max(num_steps, 15),
        ),
    }

    if step_results:
        passed = [r for r in step_results if r.get("passed") is True]
        failed = [r for r in step_results if r.get("passed") is False]
        # Use the pass/fail verdict for both numerator and denominator so
        # the rate always stays within [0, 1], even when a result carries
        # a verdict without a score (or a score without a verdict).
        evaluated = len(passed) + len(failed)

        actual_pass_rate = len(passed) / evaluated if evaluated else 0.0
        actual_compound = actual_pass_rate ** num_steps if evaluated else 0.0

        summary["actual_results"] = {
            "steps_evaluated": evaluated,
            "steps_passed": len(passed),
            "steps_failed": len(failed),
            "actual_per_step_pass_rate": round(actual_pass_rate, 4),
            "actual_compound_reliability": round(actual_compound, 6),
            "per_step_details": step_results,
        }

    return summary


def print_summary(summary: Dict[str, Any]) -> None:
    """Print a formatted summary to the terminal.

    Args:
        summary: Summary dict from build_summary. The per-step breakdown is
            printed only when the dict contains ``actual_results``.
    """
    p = summary["per_step_accuracy"]
    n = summary["num_steps"]
    compound = summary["compound_reliability"]

    print("\n" + "=" * 64)
    print(" COMPOUND FAILURE ANALYSIS")
    print("=" * 64)
    print()
    print(" Configuration:")
    print(f" Per-step accuracy: {p:.1%}")
    print(f" Number of steps: {n}")
    print()
    print(f" Compound reliability: {compound:.1%}")
    print(f" Compound failure rate: {summary['compound_failure_rate']:.1%}")
    print()
    print(f" 50% reliability cliff: step {summary['reliability_cliff_50pct']}")
    print(f" 20% reliability cliff: step {summary['reliability_cliff_20pct']}")
    print(
        f" Expected steps before first failure: "
        f"{summary['expected_steps_before_failure']}"
    )

    if "actual_results" in summary:
        actual = summary["actual_results"]
        print()
        print(" Actual Stratix evaluation results:")
        print(f" Steps evaluated: {actual['steps_evaluated']}")
        print(f" Steps passed: {actual['steps_passed']}")
        print(f" Steps failed: {actual['steps_failed']}")
        print(
            f" Actual pass rate: "
            f"{actual['actual_per_step_pass_rate']:.1%}"
        )
        print(
            f" Actual compound: "
            f"{actual['actual_compound_reliability']:.1%}"
        )
        print()
        print(" Per-step breakdown:")
        for detail in actual["per_step_details"]:
            verdict = detail.get("passed")
            if verdict is None:
                status = "N/A "
            else:
                status = "PASS" if verdict else "FAIL"
            # The "score" key is usually present with a None value for
            # unscored steps, so a dict.get default alone would print "None".
            score = detail.get("score")
            if score is None:
                score = "n/a"
            name = detail["step_name"]
            reasoning = (detail.get("reasoning") or "")[:60]
            print(f" [{status}] Step {detail['step_number']}: {name}")
            print(f" Score: {score} {reasoning}")

    print()
    print("=" * 64)
Evaluates multi-step AI agent " + "traces with Stratix judges and computes compound failure " + "probability." + ), + ) + + input_group = parser.add_mutually_exclusive_group(required=True) + input_group.add_argument( + "--trace-file", + type=str, + metavar="PATH", + help="Path to a multi-step agent trace JSON file.", + ) + input_group.add_argument( + "--simulate", + type=int, + metavar="N", + help=( + "Simulate an N-step agent workflow using embedded sample data " + "and compute compound failure mathematically." + ), + ) + + parser.add_argument( + "--per-step-accuracy", + type=float, + default=0.85, + metavar="FLOAT", + help="Per-step pass rate for simulation mode (default: 0.85).", + ) + parser.add_argument( + "--steps-range", + type=int, + nargs=2, + default=[1, 15], + metavar=("START", "END"), + help=( + "Range of step counts for the visualization " + "(default: 1 15)." + ), + ) + parser.add_argument( + "--output", + type=str, + metavar="PATH", + help=( + "Save visualization to a PNG file (requires matplotlib). " + "If omitted, prints an ASCII chart to the terminal." + ), + ) + parser.add_argument( + "--json", + action="store_true", + default=False, + help="Output results as JSON instead of formatted text.", + ) + parser.add_argument( + "--skip-cleanup", + action="store_true", + default=False, + help="Keep created Stratix resources after evaluation.", + ) + parser.add_argument( + "--skip-evaluation", + action="store_true", + default=False, + help=( + "Skip Stratix evaluation and only compute the mathematical " + "compound failure curve. Useful for quick visualization." 
+ ), + ) + + return parser + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +def main() -> None: + parser = build_parser() + args = parser.parse_args() + + steps_range = (args.steps_range[0], args.steps_range[1]) + per_step_accuracy = args.per_step_accuracy + step_results: Optional[List[Dict[str, Any]]] = None + + if args.trace_file: + # --- Real trace mode --- + logger.info("Loading trace from: %s", args.trace_file) + steps = parse_trace_steps(args.trace_file) + if not steps: + logger.error("No evaluable steps found in trace file.") + sys.exit(1) + logger.info("Found %d evaluable steps.", len(steps)) + num_steps = len(steps) + + if not args.skip_evaluation: + try: + client = Stratix() + except Exception as exc: + logger.error("Failed to initialize Stratix client: %s", exc) + sys.exit(1) + logger.info( + "Connected to LayerLens (org=%s, project=%s)", + client.organization_id, client.project_id, + ) + step_results = evaluate_steps_with_stratix( + client, steps, skip_cleanup=args.skip_cleanup, + ) + + # Compute actual pass rate from results + scored = [r for r in step_results if r.get("passed") is not None] + if scored: + actual_rate = sum( + 1 for r in scored if r["passed"] + ) / len(scored) + per_step_accuracy = actual_rate + logger.info( + "Actual per-step pass rate from evaluation: %.1f%%", + actual_rate * 100, + ) + else: + logger.info("Skipping Stratix evaluation (math-only mode).") + + elif args.simulate is not None: + # --- Simulation mode --- + num_steps = args.simulate + logger.info( + "Simulating %d-step agent at %.1f%% per-step accuracy.", + num_steps, per_step_accuracy * 100, + ) + + # Use embedded sample steps (up to num_steps) + steps = SAMPLE_AGENT_STEPS[:num_steps] + if len(steps) < num_steps: + logger.info( + "Sample data has %d steps; using those for evaluation, " + "computing compound curve up to step %d 
mathematically.", + len(steps), num_steps, + ) + + if not args.skip_evaluation: + try: + client = Stratix() + except Exception as exc: + logger.error("Failed to initialize Stratix client: %s", exc) + sys.exit(1) + logger.info( + "Connected to LayerLens (org=%s, project=%s)", + client.organization_id, client.project_id, + ) + step_results = evaluate_steps_with_stratix( + client, steps, skip_cleanup=args.skip_cleanup, + ) + else: + logger.info("Skipping Stratix evaluation (math-only mode).") + + else: + parser.print_help() + sys.exit(1) + + # --- Build summary --- + summary = build_summary(per_step_accuracy, num_steps, step_results) + + # --- Output --- + if args.json: + print(json.dumps(summary, indent=2, default=str)) + else: + print_summary(summary) + print(render_ascii_chart( + per_step_accuracy, steps_range, step_results, + )) + print(render_scenario_table()) + + # --- Save chart if requested --- + if args.output: + save_matplotlib_chart( + per_step_accuracy, steps_range, args.output, step_results, + ) + + logger.info("Compound failure analysis complete.") + + +if __name__ == "__main__": + main() diff --git a/samples/core/custom_model.py b/samples/core/custom_model.py index 327e6d64..e660fb04 100644 --- a/samples/core/custom_model.py +++ b/samples/core/custom_model.py @@ -58,6 +58,32 @@ def main() -> None: else: print("\nNo custom models found in project") + if not result: + return + + # ── Update mutable fields (e.g. repoint api_url) ────────────────── + # + # Use this when your endpoint URL changes -- common for vLLM + # instances served behind cloudflared tunnels whose hostname + # rotates between sessions. 
+ + updated = client.models.update_custom( + result.model_id, + api_url="https://my-new-endpoint.example.com/v1", + ) + if updated: + print(f"\nCustom model {result.model_id} api_url updated") + + # ── Full teardown ───────────────────────────────────────────────── + # + # ``delete_custom`` disables the model, removes it from + # ``Project.Models``, and releases its name for reuse. Evaluation + # references to the disabled record are preserved. + + deleted = client.models.delete_custom(result.model_id) + if deleted: + print(f"Custom model {result.model_id} deleted") + if __name__ == "__main__": main() diff --git a/samples/data/traces/browser_agent/extraction_failure.json b/samples/data/traces/browser_agent/extraction_failure.json new file mode 100644 index 00000000..a93ffd38 --- /dev/null +++ b/samples/data/traces/browser_agent/extraction_failure.json @@ -0,0 +1,165 @@ +{ + "trace_id": "tr-browser-extract-fail-001", + "agent_name": "browser-automation-agent", + "framework": "browser-use", + "status": "completed", + "description": "Browser agent attempts to extract AAPL stock price from Yahoo Finance but returns a hallucinated value. 
The agent found the page but misread the DOM, extracting a 52-week high instead of the current price.", + "event_count": 11, + "metadata": { + "synthetic": true, + "agents": ["browser-automation-agent"], + "scenario_type": "browser_data_extraction", + "task_category": "data_extraction", + "task_description": "Get the current price of AAPL from finance.yahoo.com", + "evaluation_dimensions": [ + "task_completion", + "data_accuracy", + "navigation_accuracy" + ] + }, + "start_time": "2026-05-12T14:05:00Z", + "end_time": "2026-05-12T14:05:18.750Z", + "duration_ms": 18750, + "events": [ + { + "type": "environment.config", + "dt": 0, + "payload": { + "agent_name": "browser-automation-agent", + "config": { + "session_type": "browser-automation", + "framework": "browser-use", + "browser": "chromium", + "headless": true, + "viewport": {"width": 1280, "height": 720}, + "timeout_ms": 30000, + "max_steps": 20 + } + } + }, + { + "type": "agent.input", + "dt": 2, + "payload": { + "agent_name": "browser-automation-agent", + "input": "Get the current price of AAPL from finance.yahoo.com", + "task_id": "extract-001", + "task_category": "data_extraction" + } + }, + { + "type": "browser.action", + "dt": 1400, + "payload": { + "agent_name": "browser-automation-agent", + "action": "navigate", + "url": "https://finance.yahoo.com", + "status_code": 200, + "page_title": "Yahoo Finance", + "step": 1 + } + }, + { + "type": "browser.action", + "dt": 800, + "payload": { + "agent_name": "browser-automation-agent", + "action": "click", + "selector": "#yfin-usr-qry", + "element_text": "", + "step": 2, + "note": "Clicked on the search input field" + } + }, + { + "type": "browser.action", + "dt": 600, + "payload": { + "agent_name": "browser-automation-agent", + "action": "type", + "text": "AAPL", + "selector": "#yfin-usr-qry", + "step": 3 + } + }, + { + "type": "browser.action", + "dt": 1200, + "payload": { + "agent_name": "browser-automation-agent", + "action": "click", + "selector": 
"ul[role='listbox'] li:first-child", + "element_text": "AAPL - Apple Inc.", + "step": 4, + "note": "Selected AAPL from autocomplete dropdown" + } + }, + { + "type": "browser.action", + "dt": 2800, + "payload": { + "agent_name": "browser-automation-agent", + "action": "wait_for_navigation", + "url": "https://finance.yahoo.com/quote/AAPL/", + "status_code": 200, + "page_title": "Apple Inc. (AAPL) Stock Price, News, Quote & History", + "step": 5 + } + }, + { + "type": "model.invoke", + "dt": 2200, + "payload": { + "agent_name": "browser-automation-agent", + "model": "gpt-4o", + "provider": "openai", + "prompt_tokens": 4850, + "completion_tokens": 120, + "duration_ms": 2200, + "purpose": "extract current stock price from page DOM" + } + }, + { + "type": "browser.action", + "dt": 400, + "payload": { + "agent_name": "browser-automation-agent", + "action": "extract_text", + "selector": "td[data-test='FIFTY_TWO_WK_RANGE-value']", + "extracted_value": "164.08 - 260.10", + "step": 6, + "note": "ERROR: Agent extracted the 52-week range instead of the current price. The correct selector would have been fin-streamer[data-field='regularMarketPrice']" + } + }, + { + "type": "agent.output", + "dt": 500, + "payload": { + "agent_name": "browser-automation-agent", + "output": "The current price of AAPL (Apple Inc.) is $260.10.", + "final_url": "https://finance.yahoo.com/quote/AAPL/", + "task_completed": true, + "steps_taken": 6, + "screenshots_captured": 3, + "extracted_data": { + "ticker": "AAPL", + "reported_price": 260.10, + "actual_current_price": 198.36, + "error_type": "wrong_element_selected", + "error_detail": "Agent extracted the 52-week high ($260.10) from the range field instead of the current market price ($198.36). The DOM contained both values but the agent selected the wrong element." 
+ } + } + }, + { + "type": "cost.record", + "dt": 2, + "payload": { + "total_tokens": 4970, + "total_cost_usd": 0.0512, + "model": "gpt-4o", + "agents_invoked": ["browser-automation-agent"], + "resolution": "Agent navigated to Yahoo Finance, searched for AAPL, and reached the correct quote page. However, it extracted the wrong data point: the 52-week high instead of the current market price. The task technically failed due to data inaccuracy despite successful navigation." + } + } + ] +} diff --git a/samples/data/traces/browser_agent/multistep_partial.json b/samples/data/traces/browser_agent/multistep_partial.json new file mode 100644 index 00000000..1ed8c266 --- /dev/null +++ b/samples/data/traces/browser_agent/multistep_partial.json @@ -0,0 +1,218 @@ +{ + "trace_id": "tr-browser-multistep-partial-001", + "agent_name": "browser-automation-agent", + "framework": "browser-use", + "status": "partial", + "description": "Browser agent attempts a multi-step workflow: search GitHub for 'stratix', navigate to the first result, check the README, and report the star count. 
Agent completes steps 1-3 but fails on step 4 when the star count element is behind a lazy-loaded component.", + "event_count": 15, + "metadata": { + "synthetic": true, + "agents": ["browser-automation-agent"], + "scenario_type": "browser_multistep_workflow", + "task_category": "multi_step", + "task_description": "Search for 'stratix' on GitHub and navigate to the first result", + "evaluation_dimensions": [ + "task_completion", + "navigation_accuracy", + "efficiency", + "error_recovery" + ] + }, + "start_time": "2026-05-12T14:10:00Z", + "end_time": "2026-05-12T14:10:32.180Z", + "duration_ms": 32180, + "events": [ + { + "type": "environment.config", + "dt": 0, + "payload": { + "agent_name": "browser-automation-agent", + "config": { + "session_type": "browser-automation", + "framework": "browser-use", + "browser": "chromium", + "headless": true, + "viewport": {"width": 1280, "height": 720}, + "timeout_ms": 30000, + "max_steps": 20 + } + } + }, + { + "type": "agent.input", + "dt": 2, + "payload": { + "agent_name": "browser-automation-agent", + "input": "Search for 'stratix' on GitHub, navigate to the first result, read the README, and report the repository star count.", + "task_id": "multi-001", + "task_category": "multi_step", + "expected_steps": 4 + } + }, + { + "type": "browser.action", + "dt": 1100, + "payload": { + "agent_name": "browser-automation-agent", + "action": "navigate", + "url": "https://github.com", + "status_code": 200, + "page_title": "GitHub: Let's build from here", + "step": 1, + "workflow_step": "1/4 - Navigate to GitHub" + } + }, + { + "type": "model.invoke", + "dt": 1500, + "payload": { + "agent_name": "browser-automation-agent", + "model": "gpt-4o", + "provider": "openai", + "prompt_tokens": 2100, + "completion_tokens": 65, + "duration_ms": 1500, + "purpose": "identify search input and plan search action" + } + }, + { + "type": "browser.action", + "dt": 400, + "payload": { + "agent_name": "browser-automation-agent", + "action": "click", + 
"selector": "input[name='q']", + "element_text": "", + "step": 2, + "workflow_step": "2/4 - Enter search query" + } + }, + { + "type": "browser.action", + "dt": 300, + "payload": { + "agent_name": "browser-automation-agent", + "action": "type", + "text": "stratix", + "selector": "input[name='q']", + "step": 3 + } + }, + { + "type": "browser.action", + "dt": 200, + "payload": { + "agent_name": "browser-automation-agent", + "action": "key_press", + "key": "Enter", + "step": 4 + } + }, + { + "type": "browser.action", + "dt": 2400, + "payload": { + "agent_name": "browser-automation-agent", + "action": "wait_for_navigation", + "url": "https://github.com/search?q=stratix&type=repositories", + "status_code": 200, + "page_title": "Repository search results", + "step": 5, + "workflow_step": "2/4 - Search results loaded" + } + }, + { + "type": "model.invoke", + "dt": 1800, + "payload": { + "agent_name": "browser-automation-agent", + "model": "gpt-4o", + "provider": "openai", + "prompt_tokens": 3800, + "completion_tokens": 90, + "duration_ms": 1800, + "purpose": "identify first search result and extract repository link" + } + }, + { + "type": "browser.action", + "dt": 350, + "payload": { + "agent_name": "browser-automation-agent", + "action": "click", + "selector": "div.search-title a:first-child", + "element_text": "layerlens/stratix", + "step": 6, + "workflow_step": "3/4 - Click first result" + } + }, + { + "type": "browser.action", + "dt": 2800, + "payload": { + "agent_name": "browser-automation-agent", + "action": "wait_for_navigation", + "url": "https://github.com/layerlens/stratix", + "status_code": 200, + "page_title": "layerlens/stratix: Continuous Evaluation Infrastructure for AI", + "step": 7, + "workflow_step": "3/4 - Repository page loaded" + } + }, + { + "type": "browser.action", + "dt": 1200, + "payload": { + "agent_name": "browser-automation-agent", + "action": "scroll", + "direction": "down", + "pixels": 800, + "step": 8, + "workflow_step": "3/4 - Scrolling to 
README content" + } + }, + { + "type": "model.invoke", + "dt": 2400, + "payload": { + "agent_name": "browser-automation-agent", + "model": "gpt-4o", + "provider": "openai", + "prompt_tokens": 6200, + "completion_tokens": 45, + "duration_ms": 2400, + "purpose": "attempt to locate star count element in DOM" + } + }, + { + "type": "browser.action", + "dt": 5000, + "payload": { + "agent_name": "browser-automation-agent", + "action": "extract_text", + "selector": "#repo-stars-counter-star", + "extracted_value": null, + "step": 9, + "workflow_step": "4/4 - FAILED: star count element not found", + "error": "ElementNotFoundError: Selector '#repo-stars-counter-star' did not match any element within 5000ms timeout. The star counter uses a client-side rendered component that had not hydrated in headless mode." + } + }, + { + "type": "agent.output", + "dt": 500, + "payload": { + "agent_name": "browser-automation-agent", + "output": "Partially completed. Successfully searched GitHub for 'stratix', navigated to the first result (layerlens/stratix), and read the README content. The repository is described as 'Continuous Evaluation Infrastructure for AI'. However, I was unable to extract the star count because the counter element did not render in headless browser mode.", + "final_url": "https://github.com/layerlens/stratix", + "task_completed": false, + "partial_completion": true, + "steps_completed": 3, + "steps_total": 4, + "steps_taken": 9, + "screenshots_captured": 4, + "failure_reason": "Star count element uses client-side rendering that did not hydrate in headless Chromium. 
The agent did not attempt a fallback strategy such as using the GitHub API or waiting for hydration.", + "readme_excerpt_found": true + } + } + ] +} diff --git a/samples/data/traces/browser_agent/navigation_success.json b/samples/data/traces/browser_agent/navigation_success.json new file mode 100644 index 00000000..580b170b --- /dev/null +++ b/samples/data/traces/browser_agent/navigation_success.json @@ -0,0 +1,122 @@ +{ + "trace_id": "tr-browser-nav-001", + "agent_name": "browser-automation-agent", + "framework": "browser-use", + "status": "completed", + "description": "Browser agent successfully navigating to Stripe pricing page. Demonstrates clean navigation with no unnecessary detours.", + "event_count": 8, + "metadata": { + "synthetic": true, + "agents": ["browser-automation-agent"], + "scenario_type": "browser_navigation", + "task_category": "navigation", + "task_description": "Find the pricing page on stripe.com", + "evaluation_dimensions": [ + "task_completion", + "navigation_accuracy", + "efficiency" + ] + }, + "start_time": "2026-05-12T14:00:00Z", + "end_time": "2026-05-12T14:00:08.420Z", + "duration_ms": 8420, + "events": [ + { + "type": "environment.config", + "dt": 0, + "payload": { + "agent_name": "browser-automation-agent", + "config": { + "session_type": "browser-automation", + "framework": "browser-use", + "browser": "chromium", + "headless": true, + "viewport": {"width": 1280, "height": 720}, + "timeout_ms": 30000, + "max_steps": 20 + } + } + }, + { + "type": "agent.input", + "dt": 2, + "payload": { + "agent_name": "browser-automation-agent", + "input": "Find the pricing page on stripe.com", + "task_id": "nav-001", + "task_category": "navigation" + } + }, + { + "type": "browser.action", + "dt": 1200, + "payload": { + "agent_name": "browser-automation-agent", + "action": "navigate", + "url": "https://stripe.com", + "status_code": 200, + "page_title": "Stripe | Financial Infrastructure for the Internet", + "step": 1 + } + }, + { + "type": 
"model.invoke", + "dt": 1800, + "payload": { + "agent_name": "browser-automation-agent", + "model": "gpt-4o", + "provider": "openai", + "prompt_tokens": 1420, + "completion_tokens": 85, + "duration_ms": 1800, + "purpose": "analyze page DOM and identify pricing link" + } + }, + { + "type": "browser.action", + "dt": 350, + "payload": { + "agent_name": "browser-automation-agent", + "action": "click", + "selector": "a[href='/pricing']", + "element_text": "Pricing", + "step": 2 + } + }, + { + "type": "browser.action", + "dt": 2100, + "payload": { + "agent_name": "browser-automation-agent", + "action": "wait_for_navigation", + "url": "https://stripe.com/pricing", + "status_code": 200, + "page_title": "Pricing & Fees | Stripe", + "step": 3 + } + }, + { + "type": "agent.output", + "dt": 500, + "payload": { + "agent_name": "browser-automation-agent", + "output": "Successfully navigated to Stripe's pricing page at https://stripe.com/pricing. The page displays integrated pricing for payments (2.9% + 30 cents per successful card charge), billing, invoicing, and other financial products.", + "final_url": "https://stripe.com/pricing", + "task_completed": true, + "steps_taken": 3, + "screenshots_captured": 2 + } + }, + { + "type": "cost.record", + "dt": 2, + "payload": { + "total_tokens": 1505, + "total_cost_usd": 0.0152, + "model": "gpt-4o", + "agents_invoked": ["browser-automation-agent"], + "resolution": "Agent loaded stripe.com, identified the Pricing link in the navigation bar, clicked it, and confirmed arrival at the pricing page. Completed in 3 browser actions with no detours." 
+ } + } + ] +} diff --git a/samples/data/traces/multi_step_agent_workflow.json b/samples/data/traces/multi_step_agent_workflow.json new file mode 100644 index 00000000..b6bb624a --- /dev/null +++ b/samples/data/traces/multi_step_agent_workflow.json @@ -0,0 +1,286 @@ +{ + "trace_id": "tr-scn-multistep-refund-001", + "agent_name": "refund-processor", + "framework": "langchain", + "status": "completed", + "description": "Seven-step customer support agent handling a refund request: intent parsing, order lookup, eligibility check, amount calculation, response drafting, compliance review, and payment submission", + "event_count": 16, + "metadata": { + "synthetic": true, + "agents": [ + "refund-processor" + ], + "scenario_type": "multi_step_sequential", + "evaluation_dimensions": [ + "factual_accuracy", + "instruction_following", + "tool_usage", + "safety", + "helpfulness" + ] + }, + "start_time": "2026-05-12T14:22:00Z", + "end_time": "2026-05-12T14:22:12.460Z", + "duration_ms": 12460, + "events": [ + { + "type": "environment.config", + "dt": 0, + "payload": { + "agent_name": "orchestrator", + "config": { + "session_type": "multi-step-sequential", + "framework": "langchain", + "model_default": "gpt-4o", + "tools_available": [ + "order_lookup", + "refund_policy_db", + "tax_calculator", + "payment_gateway", + "tone_classifier" + ], + "agents_available": [ + "refund-processor" + ], + "max_steps": 10, + "timeout_seconds": 30 + } + } + }, + { + "type": "agent.input", + "dt": 3, + "payload": { + "agent_name": "refund-processor", + "input": "Hi, I ordered a wireless keyboard (order #WK-20260508-4471) last Tuesday and it arrived yesterday with a stuck spacebar. I already tried the troubleshooting steps on your website and nothing fixed it. I want a full refund including the expedited shipping I paid for. 
My original payment was a Visa ending in 3892.", + "customer_id": "cust-ret-9921", + "ticket_id": "TKT-2026-0512-3347", + "channel": "support_chat", + "priority": "P2" + } + }, + { + "type": "model.invoke", + "dt": 2, + "payload": { + "agent_name": "refund-processor", + "model": "gpt-4o", + "provider": "openai", + "prompt_tokens": 186, + "completion_tokens": 94, + "duration_ms": 820, + "purpose": "step_1_parse_customer_intent" + } + }, + { + "type": "agent.output", + "dt": 1, + "payload": { + "agent_name": "refund-processor", + "step": 1, + "step_name": "parse_customer_intent", + "output": "Intent parsed. Customer requests a full refund for order #WK-20260508-4471 (wireless keyboard). Reason: hardware defect (stuck spacebar). Customer has already attempted self-service troubleshooting. Refund scope requested: product cost plus expedited shipping fee. Payment method on file: Visa ending 3892. Sentiment: frustrated but cooperative. Priority: standard defective-product return flow.", + "parsed_fields": { + "intent": "full_refund", + "order_id": "WK-20260508-4471", + "product": "wireless keyboard", + "defect": "stuck spacebar", + "troubleshooting_attempted": true, + "shipping_refund_requested": true, + "payment_method": "visa_3892" + } + } + }, + { + "type": "model.invoke", + "dt": 2, + "payload": { + "agent_name": "refund-processor", + "model": "gpt-4o", + "provider": "openai", + "prompt_tokens": 312, + "completion_tokens": 148, + "duration_ms": 1040, + "purpose": "step_2_order_lookup" + } + }, + { + "type": "agent.output", + "dt": 1, + "payload": { + "agent_name": "refund-processor", + "step": 2, + "step_name": "order_lookup", + "output": "Order retrieved from database. Order #WK-20260508-4471 confirmed. Product: TechFlow Pro Wireless Keyboard (SKU: TF-KB-PRO-BLK). Ordered: 2026-05-08. Shipped: 2026-05-09 via expedited (2-day). Delivered: 2026-05-11. Product price: $89.99. Expedited shipping: $12.99. Tax (CA 9.5%): $9.78. Order total: $112.76. 
Payment: Visa ending 3892, charged 2026-05-08. Return window: 30 days from delivery (expires 2026-06-10). No prior returns on this account in the last 12 months.", + "order_data": { + "order_id": "WK-20260508-4471", + "product_name": "TechFlow Pro Wireless Keyboard", + "sku": "TF-KB-PRO-BLK", + "product_price": 89.99, + "shipping_cost": 12.99, + "tax": 9.78, + "total": 112.76, + "order_date": "2026-05-08", + "delivery_date": "2026-05-11", + "return_window_end": "2026-06-10", + "prior_returns_12mo": 0 + } + } + }, + { + "type": "model.invoke", + "dt": 2, + "payload": { + "agent_name": "refund-processor", + "model": "gpt-4o", + "provider": "openai", + "prompt_tokens": 488, + "completion_tokens": 196, + "duration_ms": 1380, + "purpose": "step_3_check_refund_eligibility" + } + }, + { + "type": "agent.output", + "dt": 1, + "payload": { + "agent_name": "refund-processor", + "step": 3, + "step_name": "check_refund_eligibility", + "output": "Eligibility verified against refund policy database. Results: (1) Return window: ELIGIBLE, item delivered 2026-05-11, current date 2026-05-12, within 30-day window. (2) Defect category: ELIGIBLE, hardware manufacturing defect qualifies for full refund under Policy RP-2024-003 Section 4.2 (defective merchandise). (3) Troubleshooting requirement: SATISFIED, customer confirms self-service steps attempted per support article KB-4401. (4) Shipping refund: ELIGIBLE, defective product returns qualify for full shipping refund including expedited surcharge per Policy RP-2024-003 Section 6.1. (5) Return shipping: WAIVED, pre-paid return label will be generated. (6) Restocking fee: WAIVED, defective items exempt per Section 4.2(c). All six eligibility checks passed. 
Approved for full refund.", + "eligibility_result": { + "eligible": true, + "policy_reference": "RP-2024-003", + "checks_passed": 6, + "checks_failed": 0, + "shipping_refund_approved": true, + "restocking_fee_waived": true + } + } + }, + { + "type": "model.invoke", + "dt": 2, + "payload": { + "agent_name": "refund-processor", + "model": "gpt-4o", + "provider": "openai", + "prompt_tokens": 624, + "completion_tokens": 164, + "duration_ms": 1160, + "purpose": "step_4_calculate_refund_amount" + } + }, + { + "type": "agent.output", + "dt": 1, + "payload": { + "agent_name": "refund-processor", + "step": 4, + "step_name": "calculate_refund_amount", + "output": "Refund amount calculated. Breakdown: Product price $89.99, expedited shipping $12.99, subtotal $102.98. California sales tax at 9.5% on taxable amount ($89.99 product only; shipping is tax-exempt in CA per Revenue and Taxation Code Section 6012): $8.55. Note: original order shows tax of $9.78 which was computed on product plus shipping. Tax on shipping was collected in error; the corrected tax on product-only is $8.55. Total refund: $102.98 + $8.55 = $111.53. The $1.23 tax overcharge on original order ($9.78 collected vs $8.55 correct) is included in the refund automatically. 
Refund to Visa ending 3892.", + "refund_breakdown": { + "product_price": 89.99, + "shipping_refund": 12.99, + "subtotal": 102.98, + "tax_refund": 8.55, + "tax_correction_note": "Original tax $9.78 included tax on shipping; CA exempts shipping from sales tax", + "total_refund": 111.53, + "refund_method": "visa_3892" + } + } + }, + { + "type": "model.invoke", + "dt": 2, + "payload": { + "agent_name": "refund-processor", + "model": "gpt-4o", + "provider": "openai", + "prompt_tokens": 842, + "completion_tokens": 286, + "duration_ms": 1920, + "purpose": "step_5_draft_customer_response" + } + }, + { + "type": "agent.output", + "dt": 1, + "payload": { + "agent_name": "refund-processor", + "step": 5, + "step_name": "draft_customer_response", + "output": "Draft response composed:\n\nHi there, thank you for reaching out about your TechFlow Pro Wireless Keyboard. I am sorry the spacebar is giving you trouble, especially after you already tried our troubleshooting steps.\n\nGreat news: your order qualifies for a full refund. Here is what to expect:\n\nRefund amount: $111.53\n - Product: $89.99\n - Shipping: $12.99\n - Tax: $8.55\n\nThe refund will go back to your Visa ending in 3892. You should see it within 5 to 7 business days after we receive the return.\n\nFor the return itself, I am generating a prepaid return label right now. You will receive it at the email on file within the next few minutes. Just drop the keyboard in any UPS location using the label, no need to find the original packaging.\n\nOnce our warehouse scans the return, the refund processes automatically. 
If you would like a replacement keyboard instead, just let me know and I can set that up.\n\nIs there anything else I can help with today?", + "draft_metadata": { + "word_count": 168, + "tone": "empathetic_professional", + "includes_amount_breakdown": true, + "includes_timeline": true, + "includes_return_instructions": true, + "offers_alternative": true + } + } + }, + { + "type": "model.invoke", + "dt": 2, + "payload": { + "agent_name": "refund-processor", + "model": "gpt-4o", + "provider": "openai", + "prompt_tokens": 1124, + "completion_tokens": 178, + "duration_ms": 1260, + "purpose": "step_6_verify_tone_and_compliance" + } + }, + { + "type": "agent.output", + "dt": 1, + "payload": { + "agent_name": "refund-processor", + "step": 6, + "step_name": "verify_tone_and_compliance", + "output": "Compliance review complete. Checklist results: (1) PII handling: PASS, response references only last four digits of card, no full card number exposed. (2) Refund amount accuracy: PASS, $111.53 matches calculated amount. (3) Timeline accuracy: PASS, 5-7 business days is within policy guidelines for Visa refunds. (4) Return label promise: PASS, system confirms prepaid label generation is triggered. (5) Tone classification: empathetic and professional, no defensive language, no blame attribution. (6) Legal disclaimers: PASS, no warranty claims or liability language that could create issues. (7) Competitor mentions: PASS, none present. (8) Upsell pressure: PASS, replacement offer is optional, not pushy. All 8 compliance checks passed. 
Response approved for sending.", + "compliance_result": { + "checks_passed": 8, + "checks_failed": 0, + "tone_score": "empathetic_professional", + "pii_safe": true, + "approved": true + } + } + }, + { + "type": "model.invoke", + "dt": 2, + "payload": { + "agent_name": "refund-processor", + "model": "gpt-4o", + "provider": "openai", + "prompt_tokens": 1342, + "completion_tokens": 112, + "duration_ms": 880, + "purpose": "step_7_submit_refund" + } + }, + { + "type": "agent.output", + "dt": 1, + "payload": { + "agent_name": "refund-processor", + "step": 7, + "step_name": "submit_refund", + "output": "Refund submitted to payment gateway. Transaction details: Refund ID RF-2026-0512-8834. Amount: $111.53 to Visa ending 3892. Gateway response: APPROVED (auth code: VR-449821). Estimated processing: 5-7 business days. Return label generated: UPS tracking 1Z999AA10123456784, emailed to customer at registered address. Ticket TKT-2026-0512-3347 updated to status RESOLVED. Customer response sent via support_chat channel. All steps complete.", + "submission_result": { + "refund_id": "RF-2026-0512-8834", + "amount": 111.53, + "gateway_status": "APPROVED", + "auth_code": "VR-449821", + "return_label_tracking": "1Z999AA10123456784", + "ticket_status": "RESOLVED" + } + } + } + ] +} diff --git a/samples/integrations/browser_agent_evaluator.py b/samples/integrations/browser_agent_evaluator.py new file mode 100644 index 00000000..e1689111 --- /dev/null +++ b/samples/integrations/browser_agent_evaluator.py @@ -0,0 +1,893 @@ +#!/usr/bin/env python3 +"""Browser Agent Evaluator -- LayerLens Python SDK Sample. + +Evaluates browser automation agents (such as Browser Use) against a +suite of 20 real-world tasks across 5 categories. Captures traces with +the Stratix SDK, runs specialized judges, and produces a reliability +report including compound failure analysis. 
+ +Works in three modes: + - simulated (default): generates synthetic traces for demo purposes + - recorded: loads pre-recorded trace files from data/traces/browser_agent/ + - live: runs Browser Use against the task suite (requires browser-use) + +Prerequisites: + pip install layerlens --extra-index-url https://sdk.layerlens.ai/package + export LAYERLENS_STRATIX_API_KEY=your-api-key + +Usage: + python browser_agent_evaluator.py + python browser_agent_evaluator.py --mode recorded + python browser_agent_evaluator.py --mode live --tasks navigation + python browser_agent_evaluator.py --json --output report.json +""" + +from __future__ import annotations + +import argparse +import json +import logging +import os +import random +import sys +import time +from typing import Any, Optional + +from layerlens import Stratix +from layerlens import PublicClient + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) +from _helpers import create_judge, poll_evaluation_results, upload_trace_dict + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# ANSI helpers +# --------------------------------------------------------------------------- +_GREEN = "\033[92m" +_RED = "\033[91m" +_YELLOW = "\033[93m" +_CYAN = "\033[96m" +_BOLD = "\033[1m" +_DIM = "\033[2m" +_RESET = "\033[0m" + +# --------------------------------------------------------------------------- +# Task suite: 20 tasks across 5 categories +# --------------------------------------------------------------------------- + +TASK_CATEGORIES: dict[str, list[dict[str, str]]] = { + "navigation": [ + { + "id": "nav-001", + "description": "Find the pricing page on stripe.com", + "target_url": "https://stripe.com/pricing", + "expected_outcome": "Agent lands on Stripe pricing page showing per-transaction fees.", + }, + { + "id": "nav-002", + "description": "Navigate to the docs section of github.com", + "target_url": "https://docs.github.com", + 
"expected_outcome": "Agent reaches GitHub documentation landing page.", + }, + { + "id": "nav-003", + "description": "Find the careers page on anthropic.com", + "target_url": "https://www.anthropic.com/careers", + "expected_outcome": "Agent lands on Anthropic careers page with open positions.", + }, + { + "id": "nav-004", + "description": "Locate the API reference on openai.com", + "target_url": "https://platform.openai.com/docs/api-reference", + "expected_outcome": "Agent reaches the OpenAI API reference documentation.", + }, + ], + "data_extraction": [ + { + "id": "extract-001", + "description": "Get the current price of AAPL from finance.yahoo.com", + "target_url": "https://finance.yahoo.com/quote/AAPL/", + "expected_outcome": "Agent returns the current market price for AAPL as a number.", + }, + { + "id": "extract-002", + "description": "Extract the top 5 headlines from news.ycombinator.com", + "target_url": "https://news.ycombinator.com", + "expected_outcome": "Agent returns a list of 5 headline strings from the front page.", + }, + { + "id": "extract-003", + "description": "Get the weather for Los Angeles from weather.gov", + "target_url": "https://forecast.weather.gov/MapClick.php?lat=34.0522&lon=-118.2437", + "expected_outcome": "Agent returns current temperature and conditions for Los Angeles.", + }, + { + "id": "extract-004", + "description": "Find the latest Python version from python.org", + "target_url": "https://www.python.org/downloads/", + "expected_outcome": "Agent returns the latest stable Python release version number.", + }, + ], + "form_interaction": [ + { + "id": "form-001", + "description": "Fill out a search form on Google with 'AI evaluation'", + "target_url": "https://www.google.com/search?q=AI+evaluation", + "expected_outcome": "Agent types query into Google search and submits the form.", + }, + { + "id": "form-002", + "description": "Use the search filter on GitHub to find Python repos", + "target_url": 
"https://github.com/search?q=language:python&type=repositories", + "expected_outcome": "Agent applies language filter and sees Python repository results.", + }, + { + "id": "form-003", + "description": "Enter a query in the Stack Overflow search bar", + "target_url": "https://stackoverflow.com/search?q=browser+automation", + "expected_outcome": "Agent enters query into Stack Overflow search and sees results.", + }, + { + "id": "form-004", + "description": "Use the date picker on booking.com", + "target_url": "https://www.booking.com", + "expected_outcome": "Agent selects check-in and check-out dates using the date picker widget.", + }, + ], + "multi_step": [ + { + "id": "multi-001", + "description": "Search for 'stratix' on GitHub and navigate to the first result", + "target_url": "https://github.com/layerlens/stratix", + "expected_outcome": "Agent searches GitHub, clicks first result, lands on repo page.", + }, + { + "id": "multi-002", + "description": "Find a product on Amazon and check if it is Prime eligible", + "target_url": "https://www.amazon.com", + "expected_outcome": "Agent searches a product, opens listing, reports Prime eligibility.", + }, + { + "id": "multi-003", + "description": "Search for a restaurant on Yelp and check its hours", + "target_url": "https://www.yelp.com", + "expected_outcome": "Agent searches for restaurant, opens listing, extracts business hours.", + }, + { + "id": "multi-004", + "description": "Find a flight on Google Flights from LAX to SFO", + "target_url": "https://www.google.com/travel/flights", + "expected_outcome": "Agent enters origin, destination, and sees flight results.", + }, + ], + "error_recovery": [ + { + "id": "error-001", + "description": "Handle a 404 page on a broken link", + "target_url": "https://example.com/nonexistent-page-12345", + "expected_outcome": "Agent detects the 404 error and reports it gracefully.", + }, + { + "id": "error-002", + "description": "Deal with a CAPTCHA or cookie consent popup", + 
"target_url": "https://www.google.com", + "expected_outcome": "Agent handles or reports the blocking popup appropriately.", + }, + { + "id": "error-003", + "description": "Navigate back after reaching a dead end", + "target_url": "https://httpstat.us/500", + "expected_outcome": "Agent detects the server error and navigates back or retries.", + }, + { + "id": "error-004", + "description": "Handle a page that requires login", + "target_url": "https://github.com/settings/profile", + "expected_outcome": "Agent detects the login requirement and reports it instead of hanging.", + }, + ], +} + +# --------------------------------------------------------------------------- +# Judge definitions +# --------------------------------------------------------------------------- + +JUDGE_DEFINITIONS: list[tuple[str, str]] = [ + ( + "Browser Task Completion", + "Evaluate whether the browser agent successfully completed the assigned task. " + "Check if the correct page was reached, correct data was extracted, or correct " + "form was filled. Score 1.0 for full completion, 0.5 for partial, 0.0 for failure.", + ), + ( + "Browser Navigation Accuracy", + "Evaluate whether the agent navigated to the correct URL and page section " + "without unnecessary detours or wrong clicks. Penalize excessive steps, " + "wrong intermediate pages, or failure to reach the target URL.", + ), + ( + "Browser Data Accuracy", + "Evaluate whether the data extracted by the agent is factually correct and " + "complete compared to what is actually on the page. Penalize hallucinated " + "values, wrong elements selected, or incomplete extractions.", + ), + ( + "Browser Error Recovery", + "Evaluate how well the agent handled unexpected situations like popups, " + "errors, CAPTCHAs, or missing elements. 
def _generate_simulated_trace(
    task: dict[str, str], category: str, seed: Optional[int] = None
) -> dict[str, Any]:
    """Generate a synthetic browser agent trace for a given task.

    Args:
        task: Task definition; the ``id``, ``description`` and ``target_url``
            keys are read.
        category: Key into ``SIMULATED_PASS_RATES`` selecting the probability
            that the simulated run succeeds.
        seed: Seed for the private random generator. When ``None``, the task
            ID string is used as the seed so every task still gets its own
            reproducible trace.

    Returns:
        A trace dict carrying the fields the evaluator reads (``input``,
        ``output``, ``metadata``) plus simulated bookkeeping such as
        ``status``, ``steps_taken``, ``tokens_used``, ``succeeded`` and
        ``partial``.

    Raises:
        KeyError: If ``category`` is not in ``SIMULATED_PASS_RATES`` or
            ``task`` lacks one of the keys listed above.
    """
    # ``random.Random`` derives a deterministic state from a ``str`` seed,
    # whereas the builtin ``hash()`` of a str is salted per interpreter
    # process and therefore changes from run to run. The explicit ``is None``
    # test also keeps ``seed=0`` usable as a real seed.
    rng = random.Random(task["id"] if seed is None else seed)
    pass_rate = SIMULATED_PASS_RATES[category]
    succeeded = rng.random() < pass_rate
    # Only failed runs draw a second number to decide partial vs. full failure.
    partial = not succeeded and rng.random() < 0.4

    # Failed runs take more steps than successful ones.
    steps = rng.randint(2, 5) if succeeded else rng.randint(4, 12)
    duration_ms = steps * rng.randint(800, 3500)
    tokens = steps * rng.randint(400, 1800)

    status = "completed" if succeeded else ("partial" if partial else "failed")

    if succeeded:
        output_text = (
            f"Successfully completed: {task['description']}. "
            f"Reached {task['target_url']} and verified the expected outcome."
        )
    elif partial:
        output_text = (
            f"Partially completed: {task['description']}. "
            f"Navigated to the correct domain but could not complete the final step. "
            f"The target element was not found or did not render."
        )
    else:
        output_text = (
            f"Failed to complete: {task['description']}. "
            f"The agent encountered an obstacle and could not recover. "
            f"Final URL did not match expected target."
        )

    trace = {
        "trace_id": f"tr-sim-{task['id']}",
        "agent_name": "browser-automation-agent",
        "framework": "browser-use",
        "status": status,
        "description": f"Simulated trace for task: {task['description']}",
        "metadata": {
            "synthetic": True,
            "task_id": task["id"],
            "task_category": category,
            "task_description": task["description"],
        },
        "start_time": "2026-05-13T10:00:00Z",
        "duration_ms": duration_ms,
        "input": task["description"],
        "output": output_text,
        "steps_taken": steps,
        "tokens_used": tokens,
        "succeeded": succeeded,
        "partial": partial,
    }
    return trace
def _evaluate_trace(
    client: Stratix,
    trace_data: dict[str, Any],
    judge_names: list[str],
    judge_map: dict[str, str],
) -> dict[str, dict[str, Any]]:
    """Upload a trace and evaluate it with the specified judges.

    Args:
        client: Stratix client used for the upload and the evaluations.
        trace_data: Trace dict; only ``input``, ``output`` and ``metadata``
            are uploaded.
        judge_names: Names of the judges to run against the trace.
        judge_map: Mapping of judge name to judge ID.

    Returns:
        A dict mapping judge_name to {score, passed, verdict}. Besides
        ``pass``/``fail``, the verdict can be ``upload_failed``,
        ``judge_not_found``, ``creation_failed``, ``timeout`` or
        ``error: <exception>``; all of those carry score 0.0 and
        ``passed=False``.
    """

    def _failure(verdict: str) -> dict[str, Any]:
        # Uniform shape for every non-scored outcome.
        return {"score": 0.0, "passed": False, "verdict": verdict}

    trace_result = upload_trace_dict(
        client,
        input_text=trace_data.get("input", ""),
        output_text=trace_data.get("output", ""),
        metadata=trace_data.get("metadata", {}),
    )
    if not (trace_result and trace_result.trace_ids):
        # Without a real trace ID there is nothing to evaluate. Report the
        # upload failure directly instead of creating evaluations against a
        # made-up ID, which would hide the real cause behind per-judge errors.
        logger.warning(
            "Trace upload returned no trace ID for %s; skipping evaluation",
            trace_data.get("trace_id", "unknown"),
        )
        return {judge_name: _failure("upload_failed") for judge_name in judge_names}
    trace_id = trace_result.trace_ids[0]

    results: dict[str, dict[str, Any]] = {}
    for judge_name in judge_names:
        judge_id = judge_map.get(judge_name)
        if not judge_id:
            results[judge_name] = _failure("judge_not_found")
            continue

        try:
            te = client.trace_evaluations.create(
                trace_id=trace_id,
                judge_id=judge_id,
            )
            if te is None:
                results[judge_name] = _failure("creation_failed")
                continue

            eval_results = poll_evaluation_results(client, te.id)
            if eval_results:
                # Only the first result is used for the judge's verdict.
                r = eval_results[0]
                results[judge_name] = {
                    "score": r.score if r.score is not None else 0.0,
                    "passed": bool(r.passed),
                    "verdict": "pass" if r.passed else "fail",
                }
            else:
                results[judge_name] = _failure("timeout")
        except Exception as exc:
            # One failing judge must not abort the remaining judges.
            logger.warning("Evaluation failed for %s: %s", judge_name, exc)
            results[judge_name] = _failure(f"error: {exc}")

    return results
def _render_bar(value: float, width: int = 20) -> str:
    """Render a simple ASCII progress bar.

    Args:
        value: Fraction to display, nominally in ``[0, 1]``.
        width: Number of cells inside the brackets.

    Returns:
        A string such as ``[#####.....] 50.0%``. The drawn cells are clamped
        to ``0..width`` so the bar keeps a constant length, while the
        percentage label always shows the real, unclamped value.
    """
    width = max(width, 0)
    # Clamp only what is drawn: an out-of-range value would otherwise give a
    # bar longer than ``width`` or a negative repeat count.
    filled = min(max(int(value * width), 0), width)
    return f"[{'#' * filled}{'.' * (width - filled)}] {value * 100:5.1f}%"
task_passed_all, + "judges": judge_details, + } + ) + + category_scores[category] = cat_scores + category_pass_counts[category] = (cat_passed, cat_total) + + overall_score = sum(all_scores) / max(len(all_scores), 1) + total_passed = sum(p for p, _ in category_pass_counts.values()) + total_tasks = sum(t for _, t in category_pass_counts.values()) + + # Compound failure analysis + compound_3 = _compound_reliability(overall_score, 3) + compound_5 = _compound_reliability(overall_score, 5) + compound_10 = _compound_reliability(overall_score, 10) + + # Build recommendations + strong_categories = [ + c + for c, scores in category_scores.items() + if scores and (sum(scores) / len(scores)) >= 0.85 + ] + weak_categories = [ + c + for c, scores in category_scores.items() + if scores and (sum(scores) / len(scores)) < 0.60 + ] + + # -- JSON output -- + if as_json: + report_data = { + "overall_reliability": round(overall_score, 4), + "tasks_passed": total_passed, + "tasks_total": total_tasks, + "human_baseline": HUMAN_BASELINE, + "gap_vs_human": round(HUMAN_BASELINE - overall_score, 4), + "category_breakdown": { + cat: { + "avg_score": round( + sum(scores) / max(len(scores), 1), 4 + ), + "passed": category_pass_counts[cat][0], + "total": category_pass_counts[cat][1], + } + for cat, scores in category_scores.items() + }, + "compound_reliability": { + "3_step_chain": round(compound_3, 4), + "5_step_chain": round(compound_5, 4), + "10_step_chain": round(compound_10, 4), + }, + "suitable_for": strong_categories, + "not_recommended_for": weak_categories, + "task_details": task_details, + } + json_str = json.dumps(report_data, indent=2) + if output_path: + with open(output_path, "w") as f: + f.write(json_str) + print(f"\nJSON report saved to: {output_path}") + else: + print(json_str) + return + + # -- ASCII report -- + lines: list[str] = [] + w = 72 + lines.append("") + lines.append(f"{_BOLD}{'=' * w}{_RESET}") + lines.append(f"{_BOLD} BROWSER AGENT RELIABILITY REPORT{_RESET}") + 
lines.append(f"{_BOLD} Powered by Stratix (LayerLens){_RESET}") + lines.append(f"{_BOLD}{'=' * w}{_RESET}") + lines.append("") + + # Overall + color = _GREEN if overall_score >= 0.80 else (_YELLOW if overall_score >= 0.60 else _RED) + lines.append(f" Overall Reliability: {color}{_BOLD}{overall_score * 100:.1f}%{_RESET}") + lines.append(f" Tasks Passed: {total_passed}/{total_tasks}") + lines.append(f" Human Baseline: {HUMAN_BASELINE * 100:.0f}%") + gap = HUMAN_BASELINE - overall_score + lines.append(f" Gap vs Human: {gap * 100:.1f} percentage points") + lines.append("") + + # Category breakdown + lines.append(f"{_BOLD} CATEGORY BREAKDOWN{_RESET}") + lines.append(f" {'-' * (w - 4)}") + for cat in TASK_CATEGORIES: + if cat not in category_scores: + continue + scores = category_scores[cat] + avg = sum(scores) / max(len(scores), 1) + passed, total = category_pass_counts[cat] + cat_color = _GREEN if avg >= 0.80 else (_YELLOW if avg >= 0.60 else _RED) + label = cat.replace("_", " ").title() + bar = _render_bar(avg) + lines.append(f" {label:20s} {cat_color}{bar}{_RESET} ({passed}/{total} passed)") + lines.append("") + + # Compound failure analysis + lines.append(f"{_BOLD} COMPOUND FAILURE ANALYSIS{_RESET}") + lines.append(f" {'-' * (w - 4)}") + lines.append(f" Per-step reliability: {overall_score * 100:.1f}%") + lines.append(f" 3-step chain: {_render_bar(compound_3)}") + lines.append(f" 5-step chain: {_render_bar(compound_5)}") + lines.append(f" 10-step chain: {_render_bar(compound_10)}") + lines.append("") + lines.append( + f" {_DIM}Formula: P(all N steps succeed) = (per_step_rate) ^ N{_RESET}" + ) + lines.append( + f" {_DIM}At {overall_score * 100:.0f}% per step, a 10-step workflow " + f"succeeds only {compound_10 * 100:.1f}% of the time.{_RESET}" + ) + lines.append("") + + # Per-task detail + lines.append(f"{_BOLD} TASK DETAILS{_RESET}") + lines.append(f" {'-' * (w - 4)}") + for detail in task_details: + status_icon = f"{_GREEN}PASS{_RESET}" if detail["all_passed"] else 
def main() -> None:
    """Run the browser agent evaluation framework.

    Parses the command line, makes sure the judges exist, evaluates one trace
    per task (recorded when available, otherwise simulated), renders the
    reliability report, and finally deletes the judges this run created.
    Exits with status 1 when the Stratix client cannot be initialised or when
    ``--mode live`` is requested without ``browser-use`` installed.
    """
    import zlib  # local import: used only to derive stable per-task seeds

    parser = argparse.ArgumentParser(
        description="Evaluate browser automation agents with Stratix judges."
    )
    parser.add_argument(
        "--mode",
        choices=["simulated", "recorded", "live"],
        default="simulated",
        help="Evaluation mode (default: simulated)",
    )
    parser.add_argument(
        "--tasks",
        choices=list(TASK_CATEGORIES.keys()),
        default=None,
        help="Run only tasks from a specific category",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=None,
        help="Save the reliability report to a file",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Output report as JSON",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for simulated mode (default: 42)",
    )
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )

    print(f"\n{_BOLD}=== Browser Agent Evaluator ==={_RESET}")
    print(f"Mode: {args.mode}")

    # Select categories
    if args.tasks:
        categories = {args.tasks: TASK_CATEGORIES[args.tasks]}
    else:
        categories = TASK_CATEGORIES

    task_count = sum(len(tasks) for tasks in categories.values())
    print(f"Tasks: {task_count} across {len(categories)} categories")

    # Initialize Stratix client
    try:
        client = Stratix()
    except Exception as exc:
        print(f"\n{_RED}ERROR: Failed to initialize Stratix client: {exc}{_RESET}")
        print("Set LAYERLENS_STRATIX_API_KEY and try again.")
        sys.exit(1)

    # Load recorded traces if needed
    recorded_traces: dict[str, dict[str, Any]] = {}
    if args.mode == "recorded":
        recorded_traces = _load_recorded_traces()
        if not recorded_traces:
            print(f"\n{_YELLOW}WARNING: No recorded traces found in {TRACES_DIR}{_RESET}")
            print("Falling back to simulated mode for tasks without recordings.")

    # Check for Browser Use in live mode
    if args.mode == "live":
        try:
            import browser_use  # type: ignore[import-untyped]  # noqa: F401

            print(f"{_GREEN}Browser Use detected. Running live evaluation.{_RESET}")
        except ImportError:
            print(f"\n{_RED}ERROR: browser-use not installed.{_RESET}")
            print("Install with: pip install browser-use")
            print("Or use --mode simulated for demo purposes.")
            sys.exit(1)

    # Snapshot the judges that exist BEFORE any are created. Taking this
    # snapshot after _ensure_judges() would include the judges it just
    # created, leaving nothing to clean up at the end of the run.
    existing_resp = client.judges.get_many()
    pre_existing_ids: set[str] = set()
    if existing_resp and existing_resp.judges:
        pre_existing_ids = {j.id for j in existing_resp.judges}

    # Create judges
    print(f"\n{_CYAN}Setting up {len(JUDGE_DEFINITIONS)} evaluation judges...{_RESET}")
    judge_map = _ensure_judges(client)
    print(f" Judges ready: {len(judge_map)}/{len(JUDGE_DEFINITIONS)}")

    # Judges created by this run are the ones missing from the snapshot.
    created_judge_ids = [
        jid for jid in judge_map.values() if jid not in pre_existing_ids
    ]

    # Run evaluations
    all_results: dict[str, dict[str, dict[str, dict[str, Any]]]] = {}

    try:
        for category, tasks in categories.items():
            cat_label = category.replace("_", " ").title()
            print(f"\n{_BOLD}Evaluating: {cat_label} ({len(tasks)} tasks){_RESET}")
            all_results[category] = {}
            applicable_judges = CATEGORY_JUDGES.get(category, [])

            for i, task in enumerate(tasks, 1):
                task_id = task["id"]
                print(
                    f" [{i}/{len(tasks)}] {task['description'][:55]}...",
                    end="",
                    flush=True,
                )

                # Get or generate trace
                if args.mode == "recorded" and task_id in recorded_traces:
                    trace_data = recorded_traces[task_id]
                else:
                    # Simulated mode, recorded mode without a recording, and
                    # live mode all land here. Live mode is a placeholder:
                    # it would invoke Browser Use, but for now it generates
                    # a simulated trace.
                    # crc32 is stable across interpreter runs, unlike the
                    # salted builtin hash() of a str, so --seed reproduces
                    # the same traces every time.
                    task_seed = args.seed + zlib.crc32(task_id.encode("utf-8"))
                    trace_data = _generate_simulated_trace(
                        task, category, seed=task_seed
                    )

                # Evaluate with applicable judges
                judge_results = _evaluate_trace(
                    client, trace_data, applicable_judges, judge_map
                )
                all_results[category][task_id] = judge_results

                # Print inline status
                passed_count = sum(
                    1 for r in judge_results.values() if r["passed"]
                )
                total_judges = len(judge_results)
                if passed_count == total_judges:
                    print(f" {_GREEN}PASS{_RESET} ({passed_count}/{total_judges})")
                else:
                    print(f" {_RED}FAIL{_RESET} ({passed_count}/{total_judges})")

        # Render report
        print(f"\n{_CYAN}Generating reliability report...{_RESET}")
        _render_report(
            all_results,
            output_path=args.output,
            as_json=args.json,
        )

    finally:
        # Clean up judges that were created during this run
        if created_judge_ids:
            print(f"\n{_DIM}Cleaning up {len(created_judge_ids)} created judges...{_RESET}")
            for jid in created_judge_ids:
                try:
                    client.judges.delete(jid)
                except Exception as exc:
                    # Best effort: keep deleting the rest, but leave a record
                    # so leftover judges can be removed by hand.
                    logger.warning("Failed to delete judge %s: %s", jid, exc)
def update_custom(
    self,
    model_id: str,
    *,
    api_url: Optional[str] = None,
    api_key: Optional[str] = None,
    max_tokens: Optional[int] = None,
    timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
) -> bool:
    """Update a custom model's mutable fields.

    At least one of ``api_url``, ``api_key``, or ``max_tokens`` must be
    provided. Returns ``True`` on success.

    Primary use case: repointing ``api_url`` for ephemeral vLLM endpoints
    behind cloudflared tunnels whose URL changes between sessions.

    Args:
        model_id: ID of the custom model to update.
        api_url: New base URL for the OpenAI-compatible API endpoint.
        api_key: New API key for the model provider.
        max_tokens: New maximum tokens value.
        timeout: Request timeout override.

    Returns:
        ``True`` when the response is a dict containing a ``data`` key,
        ``False`` otherwise.

    Raises:
        ValueError: If none of ``api_url``, ``api_key``, or ``max_tokens``
            is given, since the request would carry no change.
    """
    candidates = {"api_url": api_url, "api_key": api_key, "max_tokens": max_tokens}
    # Only fields the caller actually supplied are sent.
    body: Dict[str, Any] = {
        field: value for field, value in candidates.items() if value is not None
    }
    if not body:
        # Enforce the documented precondition before issuing an empty PATCH.
        raise ValueError(
            "update_custom requires at least one of api_url, api_key, or max_tokens"
        )
    url = (
        f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}/custom-models/{model_id}"
    )
    resp = self._patch(url, body=body, timeout=timeout, cast_to=dict)
    return isinstance(resp, dict) and "data" in resp
+ current = await self.get(timeout=timeout) or [] current_ids = [str(m.id) for m in current] new_ids = list(dict.fromkeys(current_ids + list(model_ids))) return await self._patch_project_models(new_ids, timeout) @@ -381,10 +439,13 @@ async def remove( *model_ids: str, timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, ) -> bool: - """Remove models from the project by their IDs.""" - # Only fetch public (platform) models — custom models are managed - # separately and must not be included in the project patch payload. - current = await self.get(timeout=timeout, type="public") or [] + """Remove models (public or custom) from the project's model list. + + Note: this only detaches the models from the project. The underlying + records are not deleted — use ``delete_custom`` to fully tear down a + custom model. + """ + current = await self.get(timeout=timeout) or [] remove_set = set(model_ids) new_ids = [str(m.id) for m in current if str(m.id) not in remove_set] return await self._patch_project_models(new_ids, timeout) @@ -454,3 +515,43 @@ async def create_custom( if isinstance(resp, dict) and "model_id" in resp: return CreateModelResponse(**resp) return None + + async def update_custom( + self, + model_id: str, + *, + api_url: Optional[str] = None, + api_key: Optional[str] = None, + max_tokens: Optional[int] = None, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> bool: + """Update a custom model's mutable fields. + + At least one of ``api_url``, ``api_key``, or ``max_tokens`` must be + provided. Returns ``True`` on success. 
+ """ + url = ( + f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}/custom-models/{model_id}" + ) + body: Dict[str, Any] = {} + if api_url is not None: + body["api_url"] = api_url + if api_key is not None: + body["api_key"] = api_key + if max_tokens is not None: + body["max_tokens"] = max_tokens + resp = await self._patch(url, body=body, timeout=timeout, cast_to=dict) + return isinstance(resp, dict) and "data" in resp + + async def delete_custom( + self, + model_id: str, + *, + timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT, + ) -> bool: + """Disable a custom model and detach it from the project.""" + url = ( + f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}/custom-models/{model_id}" + ) + resp = await self._delete(url, timeout=timeout, cast_to=dict) + return isinstance(resp, dict) and "data" in resp diff --git a/tests/resources/test_models_resource.py b/tests/resources/test_models_resource.py index 085177b5..6fbd4895 100644 --- a/tests/resources/test_models_resource.py +++ b/tests/resources/test_models_resource.py @@ -1082,3 +1082,181 @@ def test_filter_excludes_all_when_no_match( result = models_resource.get(regions=["ap-southeast-1"]) assert result == [] + + +class TestModelsAddRemoveIncludesCustoms: + """Regression tests for the PR #1916 fix: add()/remove() now operate on + the full project model list (public + custom), not public only. 
The old + behavior silently wiped any custom model out of Project.Models on every + add/remove because the PATCH payload omitted them.""" + + @pytest.fixture + def mock_client(self): + client = Mock() + client.organization_id = "org-123" + client.project_id = "proj-456" + client.get_cast = Mock() + client.patch_cast = Mock() + return client + + @pytest.fixture + def models_resource(self, mock_client): + return Models(mock_client) + + def test_add_includes_customs_in_patch_payload(self, models_resource): + """add() must preserve already-attached custom models in the PATCH body.""" + public_m = PublicModel(id="pub-1", key="pub-1", name="Pub1", description="") + custom_m = CustomModel( + id="cust-1", + key="cust-1", + name="Cust1", + description="", + max_tokens=2048, + api_url="https://x.example.com", + disabled=False, + ) + models_resource.get = Mock(return_value=[public_m, custom_m]) + models_resource._patch.return_value = {"id": "proj-456"} + + result = models_resource.add("new-id") + + assert result is True + models_resource.get.assert_called_once() + # Critical: the get call should NOT filter by type="public" anymore. + assert models_resource.get.call_args.kwargs.get("type") is None + call_body = models_resource._patch.call_args.kwargs["body"] + assert call_body == {"models": ["pub-1", "cust-1", "new-id"]} + + def test_remove_preserves_unrelated_customs(self, models_resource): + """remove() of a public ID must leave attached customs untouched.""" + public_m = PublicModel(id="pub-1", key="pub-1", name="Pub1", description="") + custom_m = CustomModel( + id="cust-1", + key="cust-1", + name="Cust1", + description="", + max_tokens=2048, + api_url="https://x.example.com", + disabled=False, + ) + models_resource.get = Mock(return_value=[public_m, custom_m]) + models_resource._patch.return_value = {"id": "proj-456"} + + models_resource.remove("pub-1") + + # Custom model must survive. 
+ call_body = models_resource._patch.call_args.kwargs["body"] + assert call_body == {"models": ["cust-1"]} + assert models_resource.get.call_args.kwargs.get("type") is None + + +class TestModelsUpdateCustom: + """Test Models.update_custom() method.""" + + @pytest.fixture + def mock_client(self): + client = Mock() + client.organization_id = "org-123" + client.project_id = "proj-456" + client.patch_cast = Mock() + return client + + @pytest.fixture + def models_resource(self, mock_client): + return Models(mock_client) + + def test_update_custom_api_url_only(self, models_resource): + """update_custom() sends only api_url in body when that's all that's provided.""" + models_resource._patch.return_value = {"data": {"id": "model-1"}} + + result = models_resource.update_custom("model-1", api_url="https://new.example.com/v1") + + assert result is True + models_resource._patch.assert_called_once_with( + "/organizations/org-123/projects/proj-456/custom-models/model-1", + body={"api_url": "https://new.example.com/v1"}, + timeout=DEFAULT_TIMEOUT, + cast_to=dict, + ) + + def test_update_custom_all_three_fields(self, models_resource): + """update_custom() sends all provided fields together.""" + models_resource._patch.return_value = {"data": {"id": "model-1"}} + + result = models_resource.update_custom( + "model-1", + api_url="https://x.io", + api_key="sk-new", + max_tokens=1024, + ) + + assert result is True + body = models_resource._patch.call_args.kwargs["body"] + assert body == { + "api_url": "https://x.io", + "api_key": "sk-new", + "max_tokens": 1024, + } + + def test_update_custom_max_tokens_only(self, models_resource): + """update_custom() supports max_tokens-only updates.""" + models_resource._patch.return_value = {"data": {"id": "model-1"}} + + result = models_resource.update_custom("model-1", max_tokens=8192) + + assert result is True + body = models_resource._patch.call_args.kwargs["body"] + assert body == {"max_tokens": 8192} + + def 
test_update_custom_returns_false_on_error_envelope(self, models_resource): + """update_custom() returns False when response has no data field.""" + models_resource._patch.return_value = {"code": "NOT_FOUND", "message": "missing"} + + result = models_resource.update_custom("model-1", api_url="https://x.io") + + assert result is False + + def test_update_custom_returns_false_when_response_not_dict(self, models_resource): + """update_custom() returns False when response isn't a dict.""" + models_resource._patch.return_value = httpx.Response(404) + + result = models_resource.update_custom("model-1", api_url="https://x.io") + + assert result is False + + +class TestModelsDeleteCustom: + """Test Models.delete_custom() method.""" + + @pytest.fixture + def mock_client(self): + client = Mock() + client.organization_id = "org-123" + client.project_id = "proj-456" + client.delete_cast = Mock() + return client + + @pytest.fixture + def models_resource(self, mock_client): + return Models(mock_client) + + def test_delete_custom_happy_path(self, models_resource): + """delete_custom() hits the right URL and returns True on success.""" + models_resource._delete.return_value = {"data": {"id": "model-1"}} + + result = models_resource.delete_custom("model-1") + + assert result is True + models_resource._delete.assert_called_once_with( + "/organizations/org-123/projects/proj-456/custom-models/model-1", + timeout=DEFAULT_TIMEOUT, + cast_to=dict, + ) + + def test_delete_custom_returns_false_on_error(self, models_resource): + """delete_custom() returns False when response has no data field.""" + models_resource._delete.return_value = {"code": "NOT_FOUND"} + + result = models_resource.delete_custom("model-1") + + assert result is False diff --git a/tests/test_models_custom_live.py b/tests/test_models_custom_live.py new file mode 100644 index 00000000..30702c21 --- /dev/null +++ b/tests/test_models_custom_live.py @@ -0,0 +1,72 @@ +"""Live end-to-end test for the custom-model lifecycle. 
+ +Exercises the customer's exact workflow against a real LayerLens API: +create_custom → update_custom (repoint api_url) → delete_custom → verify gone. + +Skipped unless ``LAYERLENS_STRATIX_API_KEY`` is set. Run with:: + + pytest tests/test_models_custom_live.py -m live +""" + +from __future__ import annotations + +import os +import time +import uuid + +import pytest + +from layerlens import Stratix + + +@pytest.mark.live +def test_custom_model_lifecycle_live() -> None: + if not os.environ.get("LAYERLENS_STRATIX_API_KEY"): + pytest.skip("LAYERLENS_STRATIX_API_KEY not set") + + client = Stratix() + + # Use a unique key per run so the test can re-run cleanly. + suffix = uuid.uuid4().hex[:8] + name = f"sdk-live-custom-{suffix}" + key = f"sdk-live/custom-{suffix}" + + created = client.models.create_custom( + name=name, + key=key, + description="ephemeral live-test custom model", + api_url="https://tunnel-1.example.com/v1", + api_key="sk-live-test", + max_tokens=2048, + ) + assert created is not None, "create_custom returned None" + model_id = created.model_id + assert model_id + + try: + # Repointing api_url is the customer's primary workflow (cloudflared + # tunnels whose URL changes between sessions). + updated = client.models.update_custom( + model_id, + api_url="https://tunnel-2.example.com/v1", + ) + assert updated, "update_custom returned False" + + # Allow a brief moment for the backend to persist (S3 yaml regen + + # Mongo write) — defensive, not strictly required. + time.sleep(0.5) + + # Tear it down completely. + deleted = client.models.delete_custom(model_id) + assert deleted, "delete_custom returned False" + + remaining = client.models.get(type="custom") or [] + assert all(m.id != model_id for m in remaining), f"deleted custom model {model_id} still visible in models.get" + except Exception: + # Best-effort teardown on any assertion / API failure mid-test so a + # broken run doesn't leak project-scoped resources. 
+ try: + client.models.delete_custom(model_id) + except Exception: # noqa: BLE001 + pass + raise diff --git a/tests/test_samples_e2e.py b/tests/test_samples_e2e.py index e49acb12..c9a300bf 100644 --- a/tests/test_samples_e2e.py +++ b/tests/test_samples_e2e.py @@ -36,6 +36,7 @@ "basic_trace", "benchmark_evaluation", "compare_evaluations", + "compound_failure_calculator", "create_judge", "custom_benchmark", "custom_model", @@ -82,6 +83,7 @@ INTEGRATION_SAMPLES = [ "anthropic_traced", + "browser_agent_evaluator", "langchain_instrumented", "openai_instrumented", "openai_traced", @@ -173,6 +175,12 @@ # Samples that need special argv or patches _SPECIAL_ARGV: dict[tuple[str, str], list[str]] = { ("cicd", "quality_gate"): ["test", "--threshold", "0.0"], + ("core", "compound_failure_calculator"): [ + "test", + "--simulate", + "7", + "--skip-evaluation", + ], ("openclaw/layerlens_skill/scripts", "evaluate"): [ "test", "--input", @@ -356,6 +364,8 @@ def mock_stratix(): client.models.add.return_value = True client.models.remove.return_value = True client.models.create_custom.return_value = MagicMock(model_id="model-custom-001") + client.models.update_custom.return_value = True + client.models.delete_custom.return_value = True # --- benchmarks --- benchmark = MagicMock() @@ -564,6 +574,8 @@ def mock_async_stratix(mock_stratix): client.models.add.return_value = True client.models.remove.return_value = True client.models.create_custom.return_value = MagicMock(model_id="model-custom-001") + client.models.update_custom.return_value = True + client.models.delete_custom.return_value = True # --- benchmarks (async) --- benchmark = MagicMock() @@ -2218,9 +2230,9 @@ def test_openclaw_skill_script_exists(self): assert os.path.isfile(script), f"Missing: {script}" def test_all_54_samples_covered(self): - """Verify ALL_SAMPLE_PATHS contains exactly 58 entries.""" - assert len(ALL_SAMPLE_PATHS) == 58, ( - f"Expected 58 samples, got {len(ALL_SAMPLE_PATHS)}.\nPaths: {ALL_SAMPLE_PATHS}" + 
"""Verify ALL_SAMPLE_PATHS contains exactly 60 entries.""" + assert len(ALL_SAMPLE_PATHS) == 60, ( + f"Expected 60 samples, got {len(ALL_SAMPLE_PATHS)}.\nPaths: {ALL_SAMPLE_PATHS}" ) def test_all_sample_paths_exist(self): @@ -2233,7 +2245,7 @@ def test_all_sample_paths_exist(self): assert not missing, f"Sample files not found: {missing}" def test_mocked_samples_cover_all(self): - """ALL_MOCKED_SAMPLES should produce exactly 58 entries.""" - assert len(ALL_MOCKED_SAMPLES) == 58, ( - f"Expected 58 mocked entries, got {len(ALL_MOCKED_SAMPLES)}.\nEntries: {ALL_MOCKED_SAMPLES}" + """ALL_MOCKED_SAMPLES should produce exactly 60 entries.""" + assert len(ALL_MOCKED_SAMPLES) == 60, ( + f"Expected 60 mocked entries, got {len(ALL_MOCKED_SAMPLES)}.\nEntries: {ALL_MOCKED_SAMPLES}" )