From b7a79ee01c1b8d75072fed5457fbc02adf8d403d Mon Sep 17 00:00:00 2001 From: Jacob Ellis Date: Thu, 30 Apr 2026 10:35:19 +0930 Subject: [PATCH] feat: add Codex adapter, judge personas, and Claude routing updates --- python/cube.yaml | 9 +- python/cube/automation/judge_panel.py | 47 ++++- python/cube/automation/single_writer.py | 6 +- python/cube/cli.py | 3 +- python/cube/commands/orchestrate/handlers.py | 8 +- python/cube/commands/orchestrate/phases.py | 6 +- python/cube/commands/orchestrate/pr.py | 8 +- python/cube/commands/orchestrate/prompts.py | 6 +- python/cube/commands/pr_fix.py | 61 ++++++- python/cube/core/adapters/__init__.py | 2 + python/cube/core/adapters/claude.py | 20 ++- python/cube/core/adapters/codex.py | 80 +++++++++ python/cube/core/adapters/registry.py | 2 + python/cube/core/judge_personas.py | 40 +++++ python/cube/core/parsers/__init__.py | 2 + python/cube/core/parsers/claude.py | 21 ++- python/cube/core/parsers/codex.py | 91 ++++++++++ python/cube/core/parsers/registry.py | 2 + python/cube/core/single_layout.py | 9 +- python/cube/core/user_config.py | 4 + python/cube/models/types.py | 1 + tests/cli/test_adapters.py | 179 ++++++++++++++++++- tests/cli/test_pr_fix.py | 39 ++++ tests/core/test_judge_panel_retry.py | 48 ++++- tests/core/test_orchestrate_pr.py | 51 ++++++ tests/core/test_single_layout.py | 23 +++ tests/core/test_user_config.py | 30 +++- 27 files changed, 747 insertions(+), 51 deletions(-) create mode 100644 python/cube/core/adapters/codex.py create mode 100644 python/cube/core/judge_personas.py create mode 100644 python/cube/core/parsers/codex.py create mode 100644 tests/core/test_orchestrate_pr.py create mode 100644 tests/core/test_single_layout.py diff --git a/python/cube.yaml b/python/cube.yaml index bd96dd23..1faf2657 100644 --- a/python/cube.yaml +++ b/python/cube.yaml @@ -24,10 +24,12 @@ model_aliases: cli_tools: sonnet-4.5-thinking: cursor-agent sonnet-4.6-thinking: cursor-agent - claude-opus-4-7-thinking-max: cursor-agent + 
claude-opus-4-7-thinking-max: claude gpt-5.3-codex-high: cursor-agent gpt-5.5-high: cursor-agent gpt-5.5-extra-high: cursor-agent + codex-gpt-5.5: codex + gpt-5.5-codex: codex grok-4-20-thinking: cursor-agent grok: cursor-agent gemini-2.5-pro: gemini @@ -128,9 +130,10 @@ judges: color: "yellow" judge_3: - model: "qwen" - label: "Judge Qwen" + model: "gpt" + label: "Judge Security" color: "magenta" + persona: "security-pentest" # Paths paths: diff --git a/python/cube/automation/judge_panel.py b/python/cube/automation/judge_panel.py index 8747e028..c79a674e 100644 --- a/python/cube/automation/judge_panel.py +++ b/python/cube/automation/judge_panel.py @@ -11,7 +11,7 @@ from ..core.git import branch_exists, fetch_branches, get_commit_hash, sync_worktree from ..core.output import console, print_error, print_info, print_success from ..core.parsers.registry import get_parser -from ..core.session import load_session, save_session +from ..core.session import load_session, load_session_metadata, save_session from ..core.user_config import get_judge_configs, get_writer_by_key, get_writer_by_key_or_metadata, load_config from ..models.types import JudgeInfo from .stream import format_stream_message @@ -101,13 +101,37 @@ def _get_cli_review_worktrees(task_id: str, winner: str = None) -> dict: ] -def _load_matching_judge_session(jconfig, task_id: str, review_type: str) -> str | None: - """Load a judge session whenever one exists. +def _judge_session_metadata(jconfig, cli_name: str) -> str: + """Build metadata used to avoid resuming sessions across CLI backends.""" + return f"{jconfig.label} ({jconfig.model}) | cli={cli_name}" - Resume should be attempted by default. If a CLI rejects or ignores an old - session, the retry/new-session handling will update the saved session. 
- """ - return load_session(jconfig.key.upper(), f"{task_id}_{review_type}") + +def _apply_judge_persona(prompt: str, judge_info: JudgeInfo) -> str: + """Prepend judge-specific persona instructions when configured.""" + if not judge_info.persona: + return prompt + + return f"""# JUDGE PERSONA — {judge_info.label} + +{judge_info.persona} + +Apply this persona as your review lens, but do not ignore the shared review rules below. + +--- + +{prompt}""" + + +def _load_matching_judge_session(jconfig, task_id: str, review_type: str) -> str | None: + """Load a judge session only when it matches the current model and CLI backend.""" + config = load_config() + cli_name = "cli-review" if jconfig.type == "cli-review" else config.cli_tools.get(jconfig.model, "cursor-agent") + session_task_key = f"{task_id}_{review_type}" + metadata = load_session_metadata(jconfig.key.upper(), session_task_key) + expected = _judge_session_metadata(jconfig, cli_name) + if metadata and metadata != expected: + return None + return load_session(jconfig.key.upper(), session_task_key) async def _wait_for_valid_decision_file(decision_file: Path, timeout_seconds: float = 5.0) -> None: @@ -163,6 +187,7 @@ async def run_judge( session_id = judge_info.session_id if resume else None run_dir = WORKTREE_BASE.parent if cli_name == "gemini" else PROJECT_ROOT judge_specific_prompt = prompt.replace("{{judge_key}}", judge_info.key).replace("{judge_key}", judge_info.key) + judge_specific_prompt = _apply_judge_persona(judge_specific_prompt, judge_info) decision_type = "peer-review" if judge_info.review_type == "peer-review" else "decision" from ..core.decision_parser import get_decision_file_path, parse_single_decision_file @@ -209,7 +234,10 @@ async def run_judge( suffix=judge_info.review_type, session_key=judge_info.key.upper(), session_task_key=f"{judge_info.task_id}_{judge_info.review_type}", - metadata=f"{judge_info.label} ({judge_info.key}) - {judge_info.task_id} - {judge_info.review_type} - {datetime.now()}", 
+ metadata=( + f"{judge_info.label} ({judge_info.key}) - {judge_info.task_id} - " + f"{judge_info.review_type} - {cli_name} - {datetime.now()}" + ), ) as logger: async for line in stream: # type: ignore[attr-defined] logger.write_line(line) @@ -224,7 +252,7 @@ async def run_judge( judge_info.key.upper(), f"{judge_info.task_id}_{judge_info.review_type}", msg.session_id, - f"{judge_info.label} ({judge_info.model})", + _judge_session_metadata(judge_info, cli_name), ) formatted = format_stream_message(msg, judge_info.label, judge_info.color) @@ -599,6 +627,7 @@ async def launch_judge_panel( label=jconfig.label, task_id=task_id, review_type=review_type, + persona=jconfig.persona, session_id=session_id, adapter_config={"type": jconfig.type, "cmd": jconfig.cmd, "name": jconfig.label} if jconfig.type == "cli-review" diff --git a/python/cube/automation/single_writer.py b/python/cube/automation/single_writer.py index 0e9e53d3..880dd6b1 100644 --- a/python/cube/automation/single_writer.py +++ b/python/cube/automation/single_writer.py @@ -37,7 +37,7 @@ async def run_single_writer(writer_info: WriterInfo, prompt: str, resume: bool) parser = get_parser(cli_name) layout = SingleAgentLayout - layout.initialize(writer_info.label) + layout.initialize(writer_info.label, task_name=writer_info.task_id) layout.start() session_id = writer_info.session_id if resume else None @@ -158,7 +158,7 @@ async def launch_single_writer( wconfig = get_writer_config(writer_key) layout = SingleAgentLayout - layout.initialize(f"Writer: {wconfig.label}") + layout.initialize(f"Writer: {wconfig.label}", task_name=task_id) from ..core.writer_metadata import WriterMetadata, save_writer_metadata @@ -231,7 +231,7 @@ async def launch_single_writer( current_prompt = interrupt.message is_resuming = True # Re-initialize layout for the resume - layout.initialize(f"Writer: {wconfig.label}") + layout.initialize(f"Writer: {wconfig.label}", task_name=task_id) continue except Exception as e: print_error(f"Writer 
{writer_info.label} failed: {e}") diff --git a/python/cube/cli.py b/python/cube/cli.py index 99562775..f1a3626c 100644 --- a/python/cube/cli.py +++ b/python/cube/cli.py @@ -504,7 +504,8 @@ def pr( console.print() try: - asyncio.run(create_pr(resolved_task_id, winner)) + state = load_state(resolved_task_id) + asyncio.run(create_pr(resolved_task_id, winner, single_mode=bool(state and state.mode == "single"))) except Exception as e: _print_error(e) sys.exit(1) diff --git a/python/cube/commands/orchestrate/handlers.py b/python/cube/commands/orchestrate/handlers.py index 01fcd612..a44802e3 100644 --- a/python/cube/commands/orchestrate/handlers.py +++ b/python/cube/commands/orchestrate/handlers.py @@ -206,7 +206,7 @@ async def synthesis_final_decision(ctx: WorkflowContext) -> PhaseResult: return PhaseResult(exit=True) if final_result["approved"] and not final_result["remaining_issues"]: - await create_pr(ctx.task_id, ctx.result["winner"]) + await create_pr(ctx.task_id, ctx.result["winner"], single_mode=is_single_mode(ctx)) return PhaseResult(exit=True) # If no issues to fix, proceed even if not fully approved (UNKNOWN judges etc) @@ -284,7 +284,7 @@ async def synthesis_final_peer_review(ctx: WorkflowContext) -> PhaseResult: phase9_data = ctx.result.get("phase_9_data", {}) if phase9_data.get("fixes_skipped"): print_info("No fixes were made - skipping re-review") - await create_pr(ctx.task_id, ctx.result["winner"]) + await create_pr(ctx.task_id, ctx.result["winner"], single_mode=is_single_mode(ctx)) return PhaseResult(exit=True) # Resume judges by default (they have context), use --fresh-judges for fresh start @@ -295,7 +295,7 @@ async def synthesis_final_peer_review(ctx: WorkflowContext) -> PhaseResult: final_check = run_decide_peer_review(ctx.task_id) if final_check["approved"] and not final_check["remaining_issues"]: - await create_pr(ctx.task_id, ctx.result["winner"]) + await create_pr(ctx.task_id, ctx.result["winner"], single_mode=is_single_mode(ctx)) elif 
final_check["approved"]: print_warning(f"Approved but still has {len(final_check['remaining_issues'])} issue(s) after minor fixes") console.print() @@ -304,7 +304,7 @@ async def synthesis_final_peer_review(ctx: WorkflowContext) -> PhaseResult: console.print(f" • {_normalize_issue(issue)}") console.print() console.print("Creating PR anyway (all judges approved)...") - await create_pr(ctx.task_id, ctx.result["winner"]) + await create_pr(ctx.task_id, ctx.result["winner"], single_mode=is_single_mode(ctx)) else: MAX_MINOR_FIX_LOOPS = 3 if ctx.minor_fix_count == 0: diff --git a/python/cube/commands/orchestrate/phases.py b/python/cube/commands/orchestrate/phases.py index 71ffd6a5..1801b4e1 100644 --- a/python/cube/commands/orchestrate/phases.py +++ b/python/cube/commands/orchestrate/phases.py @@ -184,7 +184,7 @@ async def run_synthesis(task_id: str, result: dict, prompts_dir: Path, resume_pr Save to: `.prompts/synthesis-{task_id}.md`""" layout = SingleAgentLayout - layout.initialize("Prompter") + layout.initialize("Prompter", task_name=task_id) layout.start() try: @@ -255,7 +255,7 @@ async def run_peer_review( Include the worktree location and git commands for reviewing.""" layout = SingleAgentLayout - layout.initialize("Prompter") + layout.initialize("Prompter", task_name=task_id) layout.start() try: @@ -344,7 +344,7 @@ async def run_minor_fixes( from ...core.single_layout import SingleAgentLayout layout = SingleAgentLayout - layout.initialize("Prompter") + layout.initialize("Prompter", task_name=task_id) layout.start() try: diff --git a/python/cube/commands/orchestrate/pr.py b/python/cube/commands/orchestrate/pr.py index 663b1af2..50e3fa8b 100644 --- a/python/cube/commands/orchestrate/pr.py +++ b/python/cube/commands/orchestrate/pr.py @@ -4,14 +4,14 @@ from ...core.config import PROJECT_ROOT from ...core.output import console, print_success, print_warning +from ...core.user_config import get_writer_by_key_or_metadata -async def create_pr(task_id: str, winner: str): 
+async def create_pr(task_id: str, winner: str, single_mode: bool = False): """Create PR automatically.""" - from ...core.user_config import get_writer_by_key_or_metadata - winner_cfg = get_writer_by_key_or_metadata(winner, task_id) branch = f"writer-{winner_cfg.name}/{task_id}" + writer_line = f"Writer: {winner_cfg.label} ({winner_cfg.key})" if single_mode else f"Winner: {winner_cfg.label}" console.print(f"[green]✅ Creating PR from: {branch}[/green]") console.print() @@ -29,7 +29,7 @@ async def create_pr(task_id: str, winner: str): "--title", f"feat: {task_id}", "--body", - f"Autonomous implementation via Agent Cube\n\nWinner: Writer {winner}\nBranch: {branch}\n\nReview decisions in `.prompts/decisions/{task_id}-*.json`", + f"Autonomous implementation via Agent Cube\n\n{writer_line}\nBranch: {branch}\n\nReview decisions in `.prompts/decisions/{task_id}-*.json`", ], cwd=PROJECT_ROOT, capture_output=True, diff --git a/python/cube/commands/orchestrate/prompts.py b/python/cube/commands/orchestrate/prompts.py index 4444fa67..2c710ffb 100644 --- a/python/cube/commands/orchestrate/prompts.py +++ b/python/cube/commands/orchestrate/prompts.py @@ -71,7 +71,7 @@ async def generate_writer_prompt(task_id: str, task_content: str, prompts_dir: P ### Last Step: Commit and push when verification passes!""" - layout = SingleAgentLayout.initialize("Prompter") + layout = SingleAgentLayout.initialize("Prompter", task_name=task_id) layout.start() # Resume prompter session if exists, otherwise capture new session ID @@ -125,7 +125,7 @@ async def generate_panel_prompt(task_id: str, prompts_dir: Path) -> Path: Include evaluation criteria, scoring rubric, and decision JSON format.""" - layout = SingleAgentLayout.initialize("Prompter") + layout = SingleAgentLayout.initialize("Prompter", task_name=task_id) layout.start() # Resume prompter session if exists, otherwise capture new session ID @@ -263,7 +263,7 @@ def capture_session(sid: str) -> None: if len(entries) == 1: entry = entries[0] - 
layout = SingleAgentLayout.initialize(entry["label"]) + layout = SingleAgentLayout.initialize(entry["label"], task_name=task_id) layout.start() try: await run_agent_with_layout( diff --git a/python/cube/commands/pr_fix.py b/python/cube/commands/pr_fix.py index 0b0df1b1..1dd1c478 100644 --- a/python/cube/commands/pr_fix.py +++ b/python/cube/commands/pr_fix.py @@ -259,6 +259,43 @@ def _delete_reply_plan(path: Path) -> None: print_warning(f"Could not delete reply plan: {path}") +def _is_non_fast_forward_push_error(stderr: str) -> bool: + """Return True when git push failed because the remote branch moved.""" + error = stderr.lower() + return "non-fast-forward" in error or "fetch first" in error or "tip of your current branch is behind" in error + + +def _recover_non_fast_forward_push(worktree: Path, branch_name: str) -> bool: + """Rebase local fix commit(s) onto the latest remote branch before retrying push.""" + if not branch_name or branch_name == "HEAD": + return False + + fetch = subprocess.run( + ["git", "fetch", "origin", branch_name], + cwd=worktree, + capture_output=True, + text=True, + timeout=60, + ) + if fetch.returncode != 0: + print_warning(f"Fetch before push retry failed: {fetch.stderr.strip()}") + return False + + rebase = subprocess.run( + ["git", "rebase", f"origin/{branch_name}"], + cwd=worktree, + capture_output=True, + text=True, + timeout=120, + ) + if rebase.returncode == 0: + return True + + subprocess.run(["git", "rebase", "--abort"], cwd=worktree, capture_output=True, text=True, timeout=30) + print_warning(f"Rebase before push retry failed: {rebase.stderr.strip() or rebase.stdout.strip()}") + return False + + def _reply_plan_entries(reply_plan: dict[str, Any]) -> dict[int, dict[str, Any]]: """Return reply plan entries keyed by 1-based comment index.""" entries: dict[int, dict[str, Any]] = {} @@ -451,7 +488,7 @@ def _run_fix_agent( head_before = result.stdout.strip() if result.returncode == 0 else None layout = SingleAgentLayout - 
layout.initialize(wconfig.label) + layout.initialize(wconfig.label, task_name=f"PR #{pr_number}") layout.start() # Track session ID from stream @@ -608,6 +645,28 @@ async def run_fix(): timeout=60, ) if result.returncode != 0: + if _is_non_fast_forward_push_error(result.stderr): + print_warning("Push was rejected because the remote branch moved; rebasing local fix and retrying") + if _recover_non_fast_forward_push(worktree, branch_name): + result = subprocess.run( + ["git", "push", "origin", push_ref], + cwd=worktree, + capture_output=True, + text=True, + timeout=60, + ) + if result.returncode == 0: + refreshed_sha = subprocess.run( + ["git", "rev-parse", "--short", "HEAD"], + cwd=worktree, + capture_output=True, + text=True, + timeout=10, + ) + if refreshed_sha.returncode == 0 and refreshed_sha.stdout.strip(): + commit_sha = refreshed_sha.stdout.strip() + return FixAgentResult(commit_sha=commit_sha or "", reply_plan=reply_plan) + print_error(f"Push failed: {result.stderr}") return None diff --git a/python/cube/core/adapters/__init__.py b/python/cube/core/adapters/__init__.py index 9de5501f..0c635646 100644 --- a/python/cube/core/adapters/__init__.py +++ b/python/cube/core/adapters/__init__.py @@ -3,6 +3,7 @@ from .base import CLIAdapter, read_stream_with_buffer, run_subprocess_streaming from .claude import ClaudeAdapter from .cli_review import CLIReviewAdapter +from .codex import CodexAdapter from .cursor import CursorAdapter from .gemini import GeminiAdapter from .generic_cli import GenericCLIAdapter @@ -11,6 +12,7 @@ "CLIAdapter", "ClaudeAdapter", "CLIReviewAdapter", + "CodexAdapter", "CursorAdapter", "GeminiAdapter", "GenericCLIAdapter", diff --git a/python/cube/core/adapters/claude.py b/python/cube/core/adapters/claude.py index ab3d63c2..1063455b 100644 --- a/python/cube/core/adapters/claude.py +++ b/python/cube/core/adapters/claude.py @@ -7,13 +7,15 @@ from .base import CLIAdapter, run_subprocess_streaming -_MODEL_ALIASES = { - "claude-opus": "opus", - 
"claude-sonnet": "sonnet", - "claude-code-opus": "opus", - "claude-code-sonnet": "sonnet", - "claude-4-opus": "opus", - "claude-4-sonnet": "sonnet", +_MODEL_ALIASES: dict[str, tuple[str, str | None]] = { + "claude-opus": ("opus", "max"), + "claude-code-opus": ("opus", "max"), + "claude-code-opus-max": ("opus", "max"), + "claude-opus-4-7-thinking-max": ("opus", "max"), + "claude-4-opus": ("opus", "max"), + "claude-sonnet": ("sonnet", None), + "claude-code-sonnet": ("sonnet", None), + "claude-4-sonnet": ("sonnet", None), } @@ -36,7 +38,7 @@ async def run( env = os.environ.copy() - cli_model = _MODEL_ALIASES.get(model, model) + cli_model, effort = _MODEL_ALIASES.get(model, (model, None)) # Build command cmd = [ @@ -51,6 +53,8 @@ async def run( "--model", cli_model, ] + if effort: + cmd.extend(["--effort", effort]) if resume and session_id: cmd.extend(["--resume", session_id]) diff --git a/python/cube/core/adapters/codex.py b/python/cube/core/adapters/codex.py new file mode 100644 index 00000000..10cdda4a --- /dev/null +++ b/python/cube/core/adapters/codex.py @@ -0,0 +1,80 @@ +"""OpenAI Codex CLI adapter.""" + +import os +import shutil +from pathlib import Path +from typing import AsyncGenerator, Optional + +from .base import CLIAdapter, run_subprocess_streaming + + +class CodexAdapter(CLIAdapter): + """Adapter for OpenAI Codex CLI.""" + + async def run( + self, worktree: Path, model: str, prompt: str, session_id: Optional[str] = None, resume: bool = False + ) -> AsyncGenerator[str, None]: + """Run codex exec in non-interactive JSON mode.""" + if not self.check_installed(): + raise RuntimeError("codex is not installed. 
" + self.get_install_instructions()) + + env = os.environ.copy() + env["PATH"] = f"{Path.home() / '.local' / 'bin'}:{env.get('PATH', '')}" + + cmd = [ + "codex", + "exec", + "--json", + "--model", + model, + "--full-auto", + "--sandbox", + "workspace-write", + "--cd", + str(worktree), + ] + + if resume and session_id: + cmd.extend(["resume", session_id, prompt]) + else: + cmd.append(prompt) + + last_error = None + line_count = 0 + + from ..master_log import get_master_log + + try: + async for line in run_subprocess_streaming(cmd, worktree, "codex", env, stdin_data=""): + line_count += 1 + + master_log = get_master_log() + if master_log: + master_log.write_raw_line(f"codex-{model}", line) + + if line.startswith('{"type":"error"') or line.startswith("Error:"): + last_error = line[:200] + + yield line + + except RuntimeError: + if last_error: + raise RuntimeError(f"codex failed: {last_error}") + raise + + if line_count == 0: + raise RuntimeError("codex produced no output (is it authenticated?)") + + def check_installed(self) -> bool: + """Check if codex CLI is installed.""" + return shutil.which("codex") is not None + + def get_install_instructions(self) -> str: + """Get installation instructions.""" + return """Install OpenAI Codex CLI: + npm install -g @openai/codex + +After installation, authenticate with: + codex login + +For CI/headless use, set CODEX_API_KEY.""" diff --git a/python/cube/core/adapters/registry.py b/python/cube/core/adapters/registry.py index ae59e37f..1b39a722 100644 --- a/python/cube/core/adapters/registry.py +++ b/python/cube/core/adapters/registry.py @@ -5,6 +5,7 @@ from .base import CLIAdapter from .claude import ClaudeAdapter from .cli_review import CLIReviewAdapter +from .codex import CodexAdapter from .cursor import CursorAdapter from .gemini import GeminiAdapter from .generic_cli import GenericCLIAdapter @@ -12,6 +13,7 @@ _ADAPTERS: Dict[str, Type[CLIAdapter]] = { "cursor-agent": CursorAdapter, "claude": ClaudeAdapter, + "codex": 
CodexAdapter, "gemini": GeminiAdapter, "cli-review": CLIReviewAdapter, } diff --git a/python/cube/core/judge_personas.py b/python/cube/core/judge_personas.py new file mode 100644 index 00000000..57fcf45d --- /dev/null +++ b/python/cube/core/judge_personas.py @@ -0,0 +1,40 @@ +"""Built-in judge personas.""" + +BUILTIN_JUDGE_PERSONAS: dict[str, str] = { + "security-pentest": """You are the security and abuse-case reviewer for this panel. + +Your job is to find realistic abuse paths, not generic code-quality nits. Think like a pragmatic penetration tester reviewing a production multi-tenant app: + +Primary focus areas: +- Tenant isolation: `org_id`, brand, account, workspace, project, or customer scope must come from trusted server-side auth/context, not request body/query/client headers. +- Authorization: verify role/permission checks are made at the operation boundary and cannot be bypassed through alternate routes, background jobs, webhooks, tool calls, or admin/helper APIs. +- RLS/data boundaries: check composite keys, scoped DB helpers, cross-tenant reads/writes/deletes, system/shared rows, and any migration that changes privileges. +- Secret handling: API keys, tokens, OAuth refresh tokens, JWTs, provider credentials, signed URLs, and webhook secrets must not leak to logs, browser-visible data, redirects, error messages, or analytics. +- Injection and SSRF: inspect URL fetches, webhooks, provider callbacks, tool/integration inputs, SQL fragments, shell commands, HTML/Markdown rendering, and file/path handling. +- Replay and idempotency: webhooks, payment/session flows, async jobs, queues, imports, and retries must be safe against duplicate delivery, stale messages, and out-of-order events. +- Trust boundaries: treat LLM/tool arguments, third-party callbacks, browser state, and queue metadata as attacker-controlled unless proven otherwise. 
+- Failure defaults: missing config, missing claims, unavailable verification, unknown provider state, and partial writes should fail closed with useful logging, not silently continue. +- Concurrency and durability: flag module-level maps/caches/timers for anything security-, financial-, credential-, idempotency-, or cross-request-sensitive. + +Severity bar: +- Request changes only for issues that create plausible exploitation, cross-tenant leakage, privilege escalation, credential exposure, data corruption, unsafe deletion, or a security-relevant correctness failure. +- Do not block on theoretical CWE matching, style preferences, or "defense in depth" rewrites unless the current code has a concrete abuse path. +- If another judge already raised the same generic issue, only repeat it when you can add the security impact, exploit path, or safer minimal fix. + +Fix style: +- Prefer small, direct, fail-closed fixes. +- Preserve KISS/minimalism; do not recommend broad security architecture when a narrow guard, scoped query, validation rule, or regression test solves the actual risk. 
+- Include the attacker-controlled input, the trust boundary crossed, the affected asset, and the minimal fix in each security finding.""", +} + + +def resolve_judge_persona(persona: str | None) -> str | None: + """Resolve a built-in persona name or return inline persona text.""" + if not persona: + return None + + key = persona.strip() + if key.startswith("builtin:"): + key = key.split(":", 1)[1].strip() + + return BUILTIN_JUDGE_PERSONAS.get(key, persona) diff --git a/python/cube/core/parsers/__init__.py b/python/cube/core/parsers/__init__.py index 68db6668..844fcbc5 100644 --- a/python/cube/core/parsers/__init__.py +++ b/python/cube/core/parsers/__init__.py @@ -3,6 +3,7 @@ from .base import ParserAdapter from .claude import ClaudeParser from .cli_review import CLIReviewParser +from .codex import CodexParser from .cursor import CursorParser from .gemini import GeminiParser from .kimi import KimiParser @@ -11,6 +12,7 @@ __all__ = [ "CLIReviewParser", "ClaudeParser", + "CodexParser", "CursorParser", "GeminiParser", "KimiParser", diff --git a/python/cube/core/parsers/claude.py b/python/cube/core/parsers/claude.py index dad69570..843e9664 100644 --- a/python/cube/core/parsers/claude.py +++ b/python/cube/core/parsers/claude.py @@ -35,6 +35,13 @@ def parse(self, line: str) -> Optional[StreamMessage]: msg.model = data.get("model", "claude") return msg + if msg.type == "thinking": + text = data.get("text") or data.get("thinking") + if text: + msg.content = text + return msg + return None + # Handle stream_event - extract content deltas if msg.type == "stream_event": event = data.get("event", {}) @@ -118,9 +125,12 @@ def parse(self, line: str) -> Optional[StreamMessage]: return msg # Thinking block elif content_block.get("type") == "thinking": - msg.type = "thinking" - msg.content = content_block.get("thinking", "") - return msg + thinking = content_block.get("thinking", "") + if thinking: + msg.type = "thinking" + msg.content = thinking + return msg + return None # Tool 
class CodexParser(ParserAdapter):
    """Parser for `codex exec --json` events."""

    def parse(self, line: str) -> Optional[StreamMessage]:
        """Parse a Codex JSONL event.

        Args:
            line: One raw line of `codex exec --json` output.

        Returns:
            A ``StreamMessage`` for events this parser maps, an ``unknown``
            message (truncated to 500 characters) when the line is not a JSON
            object, or ``None`` for blank lines and unmapped events.
        """
        clean = line.strip()
        if not clean:
            return None

        try:
            data = json.loads(clean)
        except json.JSONDecodeError:
            return StreamMessage(type="unknown", content=clean[:500])

        if not isinstance(data, dict):
            return StreamMessage(type="unknown", content=clean[:500])

        event_type = data.get("type")

        if event_type == "thread.started":
            return StreamMessage(type="system", subtype="init", session_id=data.get("thread_id"))

        if event_type == "turn.started":
            return StreamMessage(type="system", subtype="turn_started")

        if event_type == "turn.completed":
            return StreamMessage(type="result", duration_ms=data.get("duration_ms", 0))

        if event_type == "turn.failed":
            return StreamMessage(type="error", content=_error_text(data.get("error"), "Codex turn failed"))

        if event_type == "error":
            # Fall back to the whole event so an error is never shown blank.
            return StreamMessage(
                type="error",
                content=_error_text(data.get("message") or data.get("error"), str(data)),
            )

        if isinstance(event_type, str) and event_type.startswith("item."):
            return _parse_item_event(event_type, data.get("item"))

        return None

    def supports_resume(self) -> bool:
        """Codex exec supports explicit session resume."""
        return True


def _error_text(error: Any, fallback: str) -> str:
    """Return a display string (at most 200 characters) for an error payload.

    An error object carrying a ``message`` is reduced to that message so the
    UI shows readable text instead of a dict repr; any other truthy payload
    is stringified as-is, and a missing payload yields ``fallback``.
    """
    if isinstance(error, dict) and error.get("message"):
        error = error["message"]
    return str(error or fallback)[:200]


def _parse_item_event(event_type: str, item: Any) -> Optional[StreamMessage]:
    """Parse Codex item events.

    Args:
        event_type: The full event name, e.g. ``item.started``.
        item: The event's ``item`` payload; anything but a dict is ignored.

    Returns:
        An ``assistant``/``thinking`` message for non-empty text items, a
        ``tool_call`` message for command, file, MCP and web-search items,
        or ``None`` for item types this parser does not map.
    """
    if not isinstance(item, dict):
        return None

    item_type = item.get("type")
    # Use the event's own phase ("started", "updated", "completed") so an
    # in-progress update is not reported as a completion.
    subtype = event_type.partition(".")[2] or "completed"

    if item_type == "agent_message":
        text = item.get("text") or item.get("message") or item.get("content")
        if text:
            return StreamMessage(type="assistant", content=str(text))
        return None

    if item_type == "reasoning":
        text = item.get("text") or item.get("summary") or item.get("content")
        if text:
            return StreamMessage(type="thinking", content=str(text))
        return None

    if item_type == "command_execution":
        command = item.get("command") or item.get("cmd") or ""
        return StreamMessage(type="tool_call", subtype=subtype, tool_name="shell", tool_args={"command": command})

    if item_type == "file_change":
        path = item.get("path") or item.get("file") or ""
        if not path:
            # Codex file_change items list their edits under `changes`
            # (entries with a `path`); show every touched path.
            changes = item.get("changes")
            if isinstance(changes, list):
                path = ", ".join(
                    str(change["path"]) for change in changes if isinstance(change, dict) and change.get("path")
                )
        return StreamMessage(type="tool_call", subtype=subtype, tool_name="edit", tool_args={"path": path})

    if item_type == "mcp_tool_call":
        # `tool` is the field Codex uses for the MCP tool's name; the other
        # keys are kept as tolerant fallbacks.
        tool_name = item.get("name") or item.get("tool_name") or item.get("tool") or "mcp"
        args = item.get("arguments") or item.get("args") or {}
        return StreamMessage(type="tool_call", subtype=subtype, tool_name=str(tool_name), tool_args=args)

    if item_type == "web_search":
        query = item.get("query") or ""
        return StreamMessage(type="tool_call", subtype=subtype, tool_name="web_search", tool_args={"query": query})

    return None
@classmethod
def initialize(cls, title: str = "Agent", task_name: str | None = None) -> "SingleAgentLayout":
    """Replace the shared layout instance with a fresh one and return it.

    Args:
        title: Label for the single ``agent`` box.
        task_name: Optional task name. When given, the box title becomes
            ``"<title> ▶ <task_name>"`` and the name is also passed to the
            constructor.

    Returns:
        The newly created instance, which is also stored on ``cls._instance``.
    """
    with cls._lock:
        # Close any previous layout before replacing it.
        if cls._instance:
            cls._instance.close()
        display_title = f"{title} ▶ {task_name}" if task_name else title
        cls._instance = cls({"agent": display_title}, lines_per_box=3, task_name=task_name)
        return cls._instance

def __init__(self, boxes: Dict[str, str], lines_per_box: int = 3, task_name: str | None = None) -> None:
    """Forward the box mapping, line count and task name to the base layout."""
    super().__init__(boxes, lines_per_box, task_name=task_name)
load_config() -> CubeConfig: model=_resolve_model_alias(j.get("model", "sonnet-4.5-thinking"), model_aliases), label=j.get("label", key), color=j.get("color", "green"), + persona=resolve_judge_persona(j.get("persona")), type=j.get("type", "llm"), cmd=j.get("cmd"), peer_review_only=j.get("peer_review_only", False), diff --git a/python/cube/models/types.py b/python/cube/models/types.py index 36d3f640..06790ff6 100644 --- a/python/cube/models/types.py +++ b/python/cube/models/types.py @@ -30,6 +30,7 @@ class JudgeInfo: label: str task_id: str review_type: str + persona: Optional[str] = None session_id: Optional[str] = None adapter_config: Optional[dict] = None diff --git a/tests/cli/test_adapters.py b/tests/cli/test_adapters.py index 9978c71e..21085441 100644 --- a/tests/cli/test_adapters.py +++ b/tests/cli/test_adapters.py @@ -2,8 +2,9 @@ from unittest.mock import AsyncMock, Mock, call, patch import pytest -from cube.core.adapters import ClaudeAdapter, CursorAdapter, GenericCLIAdapter +from cube.core.adapters import ClaudeAdapter, CodexAdapter, CursorAdapter, GenericCLIAdapter from cube.core.adapters.registry import get_adapter +from cube.core.parsers.codex import CodexParser from cube.core.parsers.kimi import KimiParser from cube.core.parsers.qwen import QwenParser from cube.core.parsers.registry import get_parser @@ -88,12 +89,188 @@ async def test_claude_adapter_uses_latest_headless_flags_and_alias(tmp_path): "bypassPermissions", "--model", "opus", + "--effort", + "max", "prompt", ] assert results == ['{"type":"result","is_error":false,"result":"ok"}'] assert mock_exec.call_args.kwargs["stdin"] is not None +@pytest.mark.asyncio +async def test_claude_adapter_maps_explicit_opus_47_thinking_max(tmp_path): + adapter = ClaudeAdapter() + mock_process = make_mock_process(b'{"type":"result","is_error":false,"result":"ok"}\n') + + with ( + patch("cube.core.adapters.claude.shutil.which", return_value="/usr/local/bin/claude"), + patch("asyncio.create_subprocess_exec", 
@pytest.mark.asyncio
async def test_claude_adapter_does_not_force_effort_for_sonnet(tmp_path):
    """A Sonnet run selects the `sonnet` model and passes no `--effort` flag."""
    claude = ClaudeAdapter()
    fake_proc = make_mock_process(b'{"type":"result","is_error":false,"result":"ok"}\n')

    which_patch = patch("cube.core.adapters.claude.shutil.which", return_value="/usr/local/bin/claude")
    spawn_patch = patch("asyncio.create_subprocess_exec", return_value=fake_proc)

    with which_patch, spawn_patch as spawn:
        # Drain the stream; only the spawned command line matters here.
        async for _line in claude.run(tmp_path, "claude-sonnet", "prompt"):
            pass

    argv = collect_exec_args(spawn)
    model_flag_at = argv.index("--model")
    assert argv[model_flag_at + 1] == "sonnet"
    assert "--effort" not in argv
@pytest.mark.asyncio
async def test_codex_adapter_resume_command(tmp_path):
    """Resuming appends `resume <session-id>` ahead of the follow-up prompt."""
    codex = CodexAdapter()
    fake_proc = make_mock_process(b'{"type":"turn.completed"}\n')

    which_patch = patch("cube.core.adapters.codex.shutil.which", return_value="/usr/local/bin/codex")
    spawn_patch = patch("asyncio.create_subprocess_exec", return_value=fake_proc)

    with which_patch, spawn_patch as spawn:
        stream = codex.run(tmp_path, "gpt-5.5-codex", "follow up", session_id="session-1", resume=True)
        async for _line in stream:
            pass

    base_command = ["codex", "exec", "--json", "--model", "gpt-5.5-codex"]
    sandbox_flags = ["--full-auto", "--sandbox", "workspace-write", "--cd", str(tmp_path)]
    resume_tail = ["resume", "session-1", "follow up"]

    assert collect_exec_args(spawn) == base_command + sandbox_flags + resume_tail
def test_codex_registry_lookup():
    """The `codex` key resolves to the Codex adapter and parser classes."""
    adapter = get_adapter("codex")
    parser = get_parser("codex")

    assert isinstance(adapter, CodexAdapter)
    assert isinstance(parser, CodexParser)


def test_claude_parser_ignores_rate_limit_events():
    """Rate-limit telemetry lines produce no message."""
    event_line = '{"type":"rate_limit_event","rate_limit_info":{"status":"allowed","resetsAt":1777000000000}}'

    parsed = get_parser("claude").parse(event_line)

    assert parsed is None


def test_claude_parser_routes_top_level_thinking_to_thinking_box():
    """A top-level `thinking` event keeps its type, subtype and text."""
    event_line = '{"type":"thinking","subtype":"delta","text":"Inspecting the diff"}'

    parsed = get_parser("claude").parse(event_line)

    assert parsed is not None
    assert parsed.type == "thinking"
    assert parsed.subtype == "delta"
    assert parsed.content == "Inspecting the diff"


def test_claude_parser_ignores_empty_thinking_blocks():
    """A thinking content block with empty text produces no message."""
    event_line = (
        '{"type":"stream_event","event":{"type":"content_block_start",'
        '"content_block":{"type":"thinking","thinking":"","signature":""}}}'
    )

    parsed = get_parser("claude").parse(event_line)

    assert parsed is None
def test_recover_non_fast_forward_fetches_and_rebases(self, monkeypatch, tmp_path):
    """Recovery fetches the branch, rebases onto it, and reports success."""
    branch = "writer-codex/package-consolidation"
    seen = []

    def fake_run(cmd, **kwargs):
        # Every git invocation succeeds.
        seen.append(tuple(cmd))
        return subprocess.CompletedProcess(cmd, 0, "", "")

    monkeypatch.setattr(subprocess, "run", fake_run)

    recovered = _recover_non_fast_forward_push(tmp_path, branch)

    assert recovered is True
    expected_fetch = ("git", "fetch", "origin", "writer-codex/package-consolidation")
    expected_rebase = ("git", "rebase", "origin/writer-codex/package-consolidation")
    assert seen == [expected_fetch, expected_rebase]

def test_recover_non_fast_forward_aborts_failed_rebase(self, monkeypatch, tmp_path):
    """A failing rebase is aborted and recovery reports failure."""
    seen = []

    def fake_run(cmd, **kwargs):
        seen.append(tuple(cmd))
        is_plain_rebase = cmd[:2] == ["git", "rebase"] and "--abort" not in cmd
        # Only the rebase itself fails; fetch and `rebase --abort` succeed.
        if is_plain_rebase:
            return subprocess.CompletedProcess(cmd, 1, "", "conflict")
        return subprocess.CompletedProcess(cmd, 0, "", "")

    monkeypatch.setattr(subprocess, "run", fake_run)

    recovered = _recover_non_fast_forward_push(tmp_path, "writer-codex/package-consolidation")

    assert recovered is False
    assert ("git", "rebase", "--abort") in seen
def test_load_matching_judge_session_requires_matching_model_and_cli(monkeypatch):
    """Stored metadata naming a different CLI means the session is not reused."""
    panel = "cube.automation.judge_panel"
    stale_metadata = "Judge Opus (opus) | cli=cursor-agent"

    monkeypatch.setattr(f"{panel}.load_session", lambda session_type, task_id: "session-id")
    monkeypatch.setattr(f"{panel}.load_session_metadata", lambda session_type, task_id: stale_metadata)
    monkeypatch.setattr(f"{panel}.load_config", lambda: MagicMock(cli_tools={"opus": "claude"}))

    judge = MagicMock(key="judge_1", model="opus", label="Judge Opus", type="llm")

    found = _load_matching_judge_session(judge, "task", "peer-review")
    assert found is None


def test_load_matching_judge_session_allows_matching_metadata(monkeypatch):
    """Metadata generated for the same judge and CLI lets the session be reused."""
    panel = "cube.automation.judge_panel"
    judge = MagicMock(key="judge_1", model="opus", label="Judge Opus", type="llm")

    monkeypatch.setattr(f"{panel}.load_session", lambda session_type, task_id: "session-id")
    monkeypatch.setattr(
        f"{panel}.load_session_metadata",
        lambda session_type, task_id: _judge_session_metadata(judge, "claude"),
    )
    monkeypatch.setattr(f"{panel}.load_config", lambda: MagicMock(cli_tools={"opus": "claude"}))

    found = _load_matching_judge_session(judge, "task", "peer-review")
    assert found == "session-id"
async def _create_pr_and_get_body(monkeypatch, *, single_mode):
    """Run create_pr with stubbed lookups and return the `--body` argument.

    The body is read from the first recorded `subprocess.run` command.
    """
    recorded = []

    class Writer:
        name = "codex"
        key = "writer_a"
        label = "Writer GPT"

    def fake_run(cmd, **kwargs):
        recorded.append(cmd)
        return subprocess.CompletedProcess(cmd, 0, "https://example.test/pr/1\n", "")

    monkeypatch.setattr("cube.commands.orchestrate.pr.get_writer_by_key_or_metadata", lambda winner, task_id: Writer())
    monkeypatch.setattr(subprocess, "run", fake_run)

    await create_pr("package-consolidation", "writer_a", single_mode=single_mode)

    first_command = recorded[0]
    return first_command[first_command.index("--body") + 1]


@pytest.mark.asyncio
async def test_create_pr_single_mode_uses_writer_label_not_winner(monkeypatch):
    """Single-writer PRs credit a "Writer", never a "Winner"."""
    body = await _create_pr_and_get_body(monkeypatch, single_mode=True)

    assert "Writer: Writer GPT (writer_a)" in body
    assert "Winner:" not in body


@pytest.mark.asyncio
async def test_create_pr_dual_mode_keeps_winner_label(monkeypatch):
    """Dual-writer PRs keep the "Winner" wording."""
    body = await _create_pr_and_get_body(monkeypatch, single_mode=False)

    assert "Winner: Writer GPT" in body
def test_single_layout_includes_task_name_in_box_title():
    """A task name is appended to the box title and stored on the layout."""
    task = "package-consolidation"
    created = SingleAgentLayout.initialize("Writer GPT", task_name=task)

    try:
        box_title = created.boxes["agent"]
        assert box_title == "Writer GPT ▶ package-consolidation"
        assert created.task_name == "package-consolidation"
    finally:
        # Always tear the shared layout down so later tests start clean.
        SingleAgentLayout.close()


def test_single_layout_keeps_title_without_task_name():
    """Without a task name the title is used verbatim and task_name is None."""
    created = SingleAgentLayout.initialize("Prompter")

    try:
        box_title = created.boxes["agent"]
        assert box_title == "Prompter"
        assert created.task_name is None
    finally:
        SingleAgentLayout.close()
"label": "Writer GPT", "color": "blue"}}, + "judges": { + "judge_1": { + "model": "gpt-5.5-high", + "label": "Judge Custom", + "color": "yellow", + "persona": "Prefer database correctness issues.", + } + }, + } + config_path = tmp_path / "cube.yaml" + with open(config_path, "w") as f: + yaml.dump(config_data, f) + + monkeypatch.setattr("cube.core.user_config.find_config_files", lambda: (None, None, config_path)) + + config = load_config() + assert config.judges["judge_1"].persona == "Prefer database correctness issues." + def test_get_prompter_model(self, mock_config_files): """get_prompter_model() returns model name.""" from cube.core.user_config import get_prompter_model