diff --git a/ROADMAP.md b/ROADMAP.md new file mode 100644 index 000000000..bcdc56b51 --- /dev/null +++ b/ROADMAP.md @@ -0,0 +1,14 @@ +# Roadmap + +Стратегический план развития Auto Code (5 целей, реализация волнами) — полностью в +[docs/strategy/roadmap.md](docs/strategy/roadmap.md), с задачами, путями к файлам и критериями приёмки. + +**Порядок реализации:** + +- **Волна 1** (параллельно): `P3` доделать мульти-провайдерную автономность · `P5` прозрачность стоимости · `P1` слой доверия (наш дифференциатор). +- **Волна 2**: `P2` GitHub App (issue → автономный PR с отчётом доверия). +- **Волна 3**: `P4` облако / команды (мультиарендность, роли, история запусков). + +**Стратегия:** позиционирование «автономный кодер, которому можно доверять и который запускается на своей инфраструктуре» — прицел на регулируемый self-hosted энтерпрайз. + +Подробности — в [docs/strategy/roadmap.md](docs/strategy/roadmap.md). diff --git a/apps/backend/agents/session.py b/apps/backend/agents/session.py index 2561dc21d..0a6e972bb 100644 --- a/apps/backend/agents/session.py +++ b/apps/backend/agents/session.py @@ -371,6 +371,8 @@ def load_token_stats(spec_dir: Path) -> TaskTokenStats | None: input_tokens=phase_data["input_tokens"], output_tokens=phase_data["output_tokens"], session_count=phase_data.get("session_count", 0), + model=phase_data.get("model"), + provider=phase_data.get("provider"), updated_at=datetime.fromisoformat(phase_data["updated_at"]), ) @@ -387,11 +389,31 @@ def load_token_stats(spec_dir: Path) -> TaskTokenStats | None: return None +def _resolve_active_provider() -> str | None: + """Best-effort resolve the active AI provider name (for cost attribution). + + Returns the configured provider string (e.g. "claude", "openai") or None + if it cannot be determined. Never raises — token stats must persist even + when provider config is unavailable. 
+ """ + try: + from core.providers.config import get_provider_config + + provider_config = get_provider_config() + if provider_config is not None: + return getattr(provider_config, "provider", None) + except Exception: # pragma: no cover - defensive, provider config optional + return None + return None + + def save_token_stats( spec_dir: Path, phase: PhaseType, input_tokens: int, output_tokens: int, + model: str | None = None, + provider: str | None = None, ) -> bool: """ Update token statistics for a phase and persist to token_stats.json. @@ -404,6 +426,9 @@ def save_token_stats( phase: Execution phase (planning, coding, validation) input_tokens: Number of input tokens used in this session output_tokens: Number of output tokens used in this session + model: Model used in this session (recorded for cost attribution). + provider: Provider used; when omitted, the active provider is resolved + from provider config so every caller records it without changes. Returns: True if saved successfully, False otherwise @@ -413,6 +438,10 @@ def save_token_stats( existing_stats = load_token_stats(spec_dir) now = datetime.now() + # Resolve provider once so every caller records it without changes. + if provider is None: + provider = _resolve_active_provider() + if existing_stats: phases = existing_stats.phases.copy() created_at = existing_stats.created_at @@ -427,12 +456,20 @@ def save_token_stats( phase_stats.output_tokens += output_tokens phase_stats.session_count += 1 phase_stats.updated_at = now + # Last non-empty value wins; don't clobber a known model/provider + # with None from a later session that didn't supply one. 
+ if model: + phase_stats.model = model + if provider: + phase_stats.provider = provider else: phase_stats = PhaseTokenStats( phase=phase, input_tokens=input_tokens, output_tokens=output_tokens, session_count=1, + model=model, + provider=provider, updated_at=now, ) phases[phase] = phase_stats @@ -1502,6 +1539,7 @@ async def run_agent_session( phase_type, usage_metadata["input_tokens"], usage_metadata["output_tokens"], + model=getattr(client, "model", None), ) if saved: print_status( diff --git a/apps/backend/cli/artifacts.py b/apps/backend/cli/artifacts.py index c7a328fd8..23707b05f 100644 --- a/apps/backend/cli/artifacts.py +++ b/apps/backend/cli/artifacts.py @@ -21,6 +21,11 @@ logger = logging.getLogger(__name__) +# Trust Layer verification report (P1) — see docs/strategy/roadmap.md +VERIFICATION_REPORT_FILENAME = "verification-report.json" +VERIFICATION_REPORT_SCHEMA_VERSION = 1 +_ALLOWED_VERDICTS = ("approved", "rejected", "error") + class ArtifactManager: """ @@ -212,6 +217,51 @@ def save_coverage_report( logger.warning(f"Failed to save coverage report: {e}") return None + def save_verification_report( + self, + verification_data: dict[str, Any], + ) -> Path | None: + """ + Save the Trust Layer verification report as a JSON artifact. + + Persists the structured QA verdict that the desktop UI and GitHub PR + comments surface as a "what was verified" report: verdict, confidence, + tests run, diff summary, the agent's uncertainty list, and any + out-of-scope edits. Use :func:`build_verification_report` to assemble + ``verification_data`` from the QA loop's existing signals. + + Args: + verification_data: Verification report dict (see + build_verification_report). A ``timestamp`` is added if absent. + + Returns: + Path to saved artifact file, or None if disabled. 
def build_verification_report(
    *,
    verdict: str | None,
    qa_session: int | None = None,
    iteration: int | None = None,
    confidence: float | None = None,
    tests_run: dict[str, Any] | None = None,
    diff_summary: dict[str, Any] | None = None,
    issues: list[dict[str, Any]] | None = None,
    uncertainty: list[dict[str, Any]] | None = None,
    out_of_scope_edits: list[dict[str, Any]] | None = None,
    duration_seconds: float | None = None,
    notes: str | None = None,
) -> dict[str, Any]:
    """
    Assemble a normalized Trust Layer verification report (no I/O).

    Pure helper so the schema can be unit-tested and reused by the QA reviewer
    and fixer. ``verdict`` is normalized to one of ``approved``/``rejected``/
    ``error``. ``confidence`` is clamped to ``[0, 1]``; values that are not a
    finite real number (``None``, bools, NaN, infinities, non-numeric input)
    become ``None`` rather than being reported as a real confidence. The
    ``uncertainty`` and ``out_of_scope_edits`` lists are always present in the
    result and default to empty when the caller supplies nothing.

    Args:
        verdict: QA outcome; ``None``, non-string, or unknown values map to
            ``"error"``.
        qa_session: QA session/pass index, if known.
        iteration: QA loop iteration number, if known.
        confidence: Optional 0..1 confidence signal (clamped).
        tests_run: Test/coverage summary (e.g. passed/failed/total/coverage).
            Anything that is not a dict is replaced by ``{}``.
        diff_summary: Change summary (e.g. files_changed, files). Anything
            that is not a dict is replaced by ``{}``.
        issues: Issues found (reuses the ``qa_signoff`` issue shape).
        uncertainty: Areas the agent is unsure about.
        out_of_scope_edits: Edits made outside the planned files.
        duration_seconds: Optional duration of the QA pass; dropped when it is
            not a finite number.
        notes: Free-form notes.

    Returns:
        A JSON-serializable verification report dict (no timestamp — the
        timestamp is stamped by :meth:`ArtifactManager.save_verification_report`).
    """
    import math  # local import: only needed for the finiteness checks below

    def _finite_float(value: Any) -> float | None:
        """Return ``value`` as a finite float, or None if it is not usable."""
        # bool is an int subclass; True/False are never a meaningful measurement.
        if value is None or isinstance(value, bool):
            return None
        try:
            number = float(value)
        except (TypeError, ValueError, OverflowError):
            return None
        # NaN/inf would be emitted by json.dump as NaN/Infinity, which strict
        # JSON parsers reject; NaN also defeats min/max clamping.
        return number if math.isfinite(number) else None

    def _as_dict(value: Any) -> dict[str, Any]:
        """Shallow-copy a dict; anything else becomes an empty dict."""
        return dict(value) if isinstance(value, dict) else {}

    def _as_list(value: Any) -> list[Any]:
        """Shallow-copy a list/tuple; anything else becomes an empty list."""
        return list(value) if isinstance(value, (list, tuple)) else []

    normalized_verdict = (
        verdict.strip().lower() if isinstance(verdict, str) else "error"
    )
    if normalized_verdict not in _ALLOWED_VERDICTS:
        normalized_verdict = "error"

    clamped_confidence = _finite_float(confidence)
    if clamped_confidence is not None:
        clamped_confidence = max(0.0, min(1.0, clamped_confidence))

    report: dict[str, Any] = {
        "schema_version": VERIFICATION_REPORT_SCHEMA_VERSION,
        "verdict": normalized_verdict,
        "qa_session": qa_session,
        "iteration": iteration,
        "confidence": clamped_confidence,
        "tests_run": _as_dict(tests_run),
        "diff_summary": _as_dict(diff_summary),
        "issues": _as_list(issues),
        "uncertainty": _as_list(uncertainty),
        "out_of_scope_edits": _as_list(out_of_scope_edits),
        "notes": notes,
    }

    # An unusable duration is dropped rather than failing the report.
    duration = _finite_float(duration_seconds)
    if duration is not None:
        report["duration_seconds"] = round(duration, 2)
    return report
+ """ + import json + + impl_plan: dict[str, Any] = {} + qa_signoff: dict[str, Any] = {} + qa_stats: dict[str, Any] = {} + iteration_history: list[dict[str, Any]] = [] + + impl_plan_path = spec_dir / "implementation_plan.json" + if impl_plan_path.exists(): + try: + with open(impl_plan_path, encoding="utf-8") as f: + impl_plan = json.load(f) + qa_signoff = impl_plan.get("qa_signoff") or {} + qa_stats = impl_plan.get("qa_stats") or {} + iteration_history = impl_plan.get("qa_iteration_history") or [] + except (OSError, json.JSONDecodeError) as e: + logger.debug( + "Could not read implementation plan for verification report: %s", e + ) + + # qa_approved is the QA loop's authoritative final outcome. + verdict = "approved" if qa_approved else "rejected" + + durations = [ + it.get("duration_seconds", 0) + for it in iteration_history + if it.get("duration_seconds") is not None + ] + total_duration = round(sum(durations), 2) if durations else None + + diff_summary: dict[str, Any] = {} + if changed_files: + diff_summary = { + "files_changed": len(changed_files), + "files": list(changed_files)[:50], + } + + # Flag edits the agent made outside the plan's declared files (P1.T2). + out_of_scope = _compute_out_of_scope(impl_plan, changed_files) + + return build_verification_report( + verdict=verdict, + qa_session=qa_signoff.get("qa_session"), + iteration=qa_stats.get("last_iteration"), + confidence=qa_signoff.get("confidence"), + tests_run=qa_signoff.get("test_results") or {}, + diff_summary=diff_summary, + issues=qa_signoff.get("issues_found") or [], + uncertainty=qa_signoff.get("uncertainty") or [], + out_of_scope_edits=out_of_scope, + duration_seconds=total_duration, + ) + + +def _save_verification_report( + spec_dir: Path, + qa_approved: bool, + changed_files: list[str] | None, + artifact_manager, + worktree_manager=None, +) -> None: + """ + Persist the verification report on every build (not only CI/json mode). 
+ + Reuses the build's artifact manager when present, otherwise creates one, + so the desktop UI and GitHub PR comments can always surface the report. + When the changed-file list isn't supplied, it is derived from the spec's + worktree so out-of-scope detection has data. Best-effort: failures are + logged and never interrupt the build. + """ + try: + # Derive changed files from the worktree when the caller didn't pass them. + if not changed_files and worktree_manager is not None: + try: + changed_files = [ + path + for _status, path in worktree_manager.get_changed_files( + spec_dir.name + ) + ] + except Exception as e: # noqa: BLE001 - best-effort enrichment + logger.debug("Could not list changed files for report: %s", e) + manager = artifact_manager or create_artifact_manager( + spec_dir=spec_dir, enabled=True + ) + report = _generate_verification_report_data( + spec_dir, qa_approved, changed_files + ) + manager.save_verification_report(report) + except Exception as e: # best-effort artifact; must never break the build + logger.debug("Could not save verification report: %s", e) + + # Pattern management commands are available in pattern_commands.py # Run: python apps/backend/cli/pattern_commands.py --help @@ -492,6 +614,17 @@ def handle_build_command( test_report_data = _generate_test_report_data(spec_dir, qa_approved) artifact_manager.save_test_report(test_report_data) + # Persist the Trust Layer verification report on every build + # (not only CI/json) so the desktop UI and PR comments can + # surface what was verified. Best-effort — never breaks the build. 
+ _save_verification_report( + spec_dir, + qa_approved, + changed_files, + artifact_manager, + worktree_manager, + ) + # Sync implementation plan to main project after QA # This ensures the main project has the latest status (human_review) if sync_spec_to_source(spec_dir, source_spec_dir): diff --git a/apps/backend/core/token_stats.py b/apps/backend/core/token_stats.py index 06819b2b4..1e872a300 100644 --- a/apps/backend/core/token_stats.py +++ b/apps/backend/core/token_stats.py @@ -21,6 +21,8 @@ class PhaseTokenStats: input_tokens: int = 0 output_tokens: int = 0 session_count: int = 0 # Number of agent sessions in this phase + model: str | None = None # Most recent model used in this phase + provider: str | None = None # Most recent provider used in this phase updated_at: datetime = field(default_factory=datetime.now) @property @@ -50,6 +52,8 @@ def to_dict(self) -> dict: "output_tokens": stats.output_tokens, "total_tokens": stats.total_tokens, "session_count": stats.session_count, + "model": stats.model, + "provider": stats.provider, "updated_at": stats.updated_at.isoformat(), } for name, stats in self.phases.items() diff --git a/apps/backend/prompts/qa_reviewer.md b/apps/backend/prompts/qa_reviewer.md index eb9f88626..124459634 100644 --- a/apps/backend/prompts/qa_reviewer.md +++ b/apps/backend/prompts/qa_reviewer.md @@ -773,6 +773,12 @@ For each critical/major issue, describe what the Coder Agent should do: ## PHASE 9: UPDATE IMPLEMENTATION PLAN +**Trust signals (recommended):** In your `qa_signoff`, also include `confidence` +(a number from 0.0 to 1.0 — how sure you are of this verdict) and `uncertainty` +(a list of `{ "area": ..., "reason": ... }` entries for anything you could not +fully verify). These surface in the build's verification report — be honest: +named uncertainties and a calibrated confidence are more useful than false certainty. 
+ ### If APPROVED: Update `implementation_plan.json` to record QA sign-off: @@ -789,7 +795,11 @@ Update `implementation_plan.json` to record QA sign-off: "integration": "[X/Y]", "e2e": "[X/Y]" }, - "verified_by": "qa_agent" + "verified_by": "qa_agent", + "confidence": [0.0-1.0], + "uncertainty": [ + { "area": "[what you could not fully verify]", "reason": "[why]" } + ] } } ``` @@ -868,7 +878,11 @@ Update `implementation_plan.json`: "fix_required": "[Description]" } ], - "fix_request_file": "QA_FIX_REQUEST.md" + "fix_request_file": "QA_FIX_REQUEST.md", + "confidence": [0.0-1.0], + "uncertainty": [ + { "area": "[what you could not fully verify]", "reason": "[why]" } + ] } } ``` diff --git a/apps/backend/qa/reviewer.py b/apps/backend/qa/reviewer.py index adf6325e4..74d828200 100644 --- a/apps/backend/qa/reviewer.py +++ b/apps/backend/qa/reviewer.py @@ -378,6 +378,15 @@ def merge_runtime_qa_signoff_artifact(spec_dir: Path, qa_session: int) -> bool: # coverage_results are added afterwards by update_qa_signoff_with_coverage. if isinstance(raw.get("coverage_passed"), bool): signoff["coverage_passed"] = raw["coverage_passed"] + # Trust Layer signals (P1.T3): carry the model's self-reported confidence and + # uncertainty so the verification report can surface them. Sanitize like the + # other fields — a real number in [0, 1] only, and dict uncertainty items only. + confidence = raw.get("confidence") + if isinstance(confidence, (int, float)) and not isinstance(confidence, bool): + signoff["confidence"] = max(0.0, min(1.0, float(confidence))) + uncertainty = raw.get("uncertainty") + if isinstance(uncertainty, list): + signoff["uncertainty"] = [u for u in uncertainty if isinstance(u, dict)] if status == "rejected": # Downstream consumers call issue.get(...), so keep dict items only — # a stray string would crash rejection handling and the QA report. 
@@ -441,6 +450,8 @@ def _runtime_signoff_instructions(rel_spec: str) -> str: "status": "approved", "tests_passed": {"unit": "X/Y", "integration": "X/Y", "e2e": "X/Y"}, "coverage_passed": True, + "confidence": 0.9, + "uncertainty": [], }, indent=2, ) @@ -456,6 +467,10 @@ def _runtime_signoff_instructions(rel_spec: str) -> str: } ], "coverage_passed": False, + "confidence": 0.6, + "uncertainty": [ + {"area": "", "reason": ""} + ], }, indent=2, ) diff --git a/apps/backend/qa/scope_check.py b/apps/backend/qa/scope_check.py new file mode 100644 index 000000000..96c5bf53e --- /dev/null +++ b/apps/backend/qa/scope_check.py @@ -0,0 +1,92 @@ +"""Out-of-scope edit detection for the Trust Layer verification report (P1.T2). + +Compares the files a build actually changed against the files the plan declared +it would touch (``files_to_modify`` + ``files_to_create`` per subtask) and flags +the difference, so the verification report can surface edits the agent made +outside its plan. + +Pure logic (stdlib only): the plan dict and the changed-file list are passed in, +so this is unit-testable without git or the QA package's runtime dependencies. +""" + +from __future__ import annotations + +from collections.abc import Iterable +from typing import Any + +# Framework bookkeeping (specs, artifacts, memory) lives under .auto-claude/ and +# is never a user-facing source edit, so it must not be flagged as out of scope. 
_IGNORED_PREFIXES = (".auto-claude/", ".auto-claude-")


def _normalize_path(path: str) -> str:
    """Normalize a repo-relative path for comparison (slashes, leading ./).

    Non-string values (for example a stray number or null in a plan file)
    normalize to ``""`` so callers skip them instead of raising.
    """
    if not isinstance(path, str):
        return ""
    normalized = path.strip().replace("\\", "/")
    while normalized.startswith("./"):
        normalized = normalized[2:]
    return normalized


def _path_entries(value: Any) -> list[Any]:
    """Coerce a plan's file-list value into a list of candidate entries.

    A bare string is treated as a single path (not iterated character by
    character); lists, tuples and sets are copied; anything else yields [].
    """
    if isinstance(value, str):
        return [value]
    if isinstance(value, (list, tuple, set, frozenset)):
        return list(value)
    return []


def _subtask_planned_files(subtask: Any) -> set[str]:
    """Normalized files_to_modify + files_to_create for a single subtask."""
    files: set[str] = set()
    if not isinstance(subtask, dict):
        return files
    for key in ("files_to_modify", "files_to_create"):
        for entry in _path_entries(subtask.get(key)):
            normalized = _normalize_path(entry)
            if normalized:
                files.add(normalized)
    return files


def get_planned_files(plan: dict[str, Any] | None) -> set[str]:
    """Collect the files the plan declared it would touch.

    Unions ``files_to_modify`` + ``files_to_create`` across every subtask of
    every phase, supporting both the ``subtasks`` and legacy ``chunks`` keys.
    Malformed parts of the plan (a non-dict plan or phase, a non-list
    ``phases``/``subtasks`` value) are skipped rather than raising, so one bad
    entry does not discard the rest of the plan.

    Returns:
        The set of normalized planned paths (empty when nothing is declared).
    """
    planned: set[str] = set()
    if not isinstance(plan, dict):
        return planned

    phases = plan.get("phases")
    if not isinstance(phases, (list, tuple)):
        return planned

    for phase in phases:
        if not isinstance(phase, dict):
            continue
        subtasks = phase.get("subtasks") or phase.get("chunks")
        if not isinstance(subtasks, (list, tuple)):
            continue
        for subtask in subtasks:
            planned |= _subtask_planned_files(subtask)
    return planned


def detect_out_of_scope_edits(
    planned_files: Iterable[str],
    changed_files: Iterable[str],
) -> list[dict[str, str]]:
    """Return changed files that the plan did not declare.

    Paths under framework bookkeeping (``.auto-claude/``, ``.auto-claude-*``)
    are ignored, as are empty or non-string entries. The result is
    de-duplicated and sorted by path for deterministic reports.

    Args:
        planned_files: Paths the plan declared (normalized here again, so raw
            paths are accepted).
        changed_files: Paths the build actually changed.

    Returns:
        One ``{"file": ..., "reason": ...}`` dict per out-of-scope path.
    """
    planned = {_normalize_path(f) for f in planned_files}
    flagged: set[str] = set()

    for raw in changed_files:
        normalized = _normalize_path(raw)
        if not normalized or normalized in planned:
            continue
        # str.startswith accepts a tuple of prefixes: one call covers them all.
        if normalized.startswith(_IGNORED_PREFIXES):
            continue
        flagged.add(normalized)

    return [
        {
            "file": file_path,
            "reason": "edited outside the plan's files_to_modify/files_to_create",
        }
        for file_path in sorted(flagged)
    ]
+ */ + private getAutonomyEnv(): Record { + if (process.env.AUTO_CODE_AUTONOMY) return {}; + try { + const settings = readSettingsFile(); + const level = settings?.autonomyLevel; + if (typeof level === 'string' && ['off', 'claude', 'safe', 'bold'].includes(level)) { + return { AUTO_CODE_AUTONOMY: level }; + } + } catch { + // Missing/unreadable settings just means default autonomy. + } + return {}; + } + private async setupProcessEnvironment( extraEnv: Record ): Promise { @@ -218,6 +237,7 @@ export class AgentProcessManager { return { ...augmentedEnv, + ...this.getAutonomyEnv(), ...gitBashEnv, ...claudeCliEnv, ...ghCliEnv, diff --git a/apps/frontend/src/main/ipc-handlers/task/spec-file-handlers.ts b/apps/frontend/src/main/ipc-handlers/task/spec-file-handlers.ts index 7d65d3a77..e50ae1767 100644 --- a/apps/frontend/src/main/ipc-handlers/task/spec-file-handlers.ts +++ b/apps/frontend/src/main/ipc-handlers/task/spec-file-handlers.ts @@ -4,14 +4,16 @@ import type { IPCResult, ImplementationPlan, QAEscalation, - GenericEditArtifactManifest + GenericEditArtifactManifest, + VerificationReport } from '../../../shared/types'; import { findTaskAndProject } from './shared'; import { readImplementationPlan, readQAReport, readQAEscalation, - readGenericEditArtifactManifest + readGenericEditArtifactManifest, + readVerificationReport } from './spec-file-readers'; /** @@ -162,4 +164,37 @@ export function registerSpecFileHandlers(): void { } } ); + + /** + * Get the Trust Layer verification report for a task + * @param taskId - The task ID + */ + ipcMain.handle( + IPC_CHANNELS.TASK_SPEC_VERIFICATION_REPORT_GET, + async (_, taskId: string): Promise> => { + if (!isValidTaskId(taskId)) { + return { success: false, error: 'Invalid taskId' }; + } + + try { + const { task, project } = await findTaskAndProject(taskId); + if (!task || !project) { + return { success: false, error: 'Task or project not found' }; + } + + const report = await readVerificationReport(project, task); + if 
(!report) { + return { success: false, error: 'Verification report not found' }; + } + + return { success: true, data: report }; + } catch (err) { + console.error('[IPC] TASK_SPEC_VERIFICATION_REPORT_GET error:', err); + return { + success: false, + error: err instanceof Error ? err.message : 'Failed to read verification report' + }; + } + } + ); } diff --git a/apps/frontend/src/main/ipc-handlers/task/spec-file-readers.ts b/apps/frontend/src/main/ipc-handlers/task/spec-file-readers.ts index fed1e1203..955ad1aae 100644 --- a/apps/frontend/src/main/ipc-handlers/task/spec-file-readers.ts +++ b/apps/frontend/src/main/ipc-handlers/task/spec-file-readers.ts @@ -16,6 +16,7 @@ import type { Project, Task, ImplementationPlan, + VerificationReport, QAEscalation, GenericEditArtifactManifest, GenericEditArtifactManifestEntry, @@ -1166,6 +1167,32 @@ export async function readGenericEditArtifactManifest( } } +/** + * Read the Trust Layer verification report from artifacts/verification-report.json + * + * @param project - The project containing the task + * @param task - The task to read the report for + * @returns The parsed verification report, or null if it doesn't exist + */ +export async function readVerificationReport( + project: Project, + task: Task +): Promise { + try { + const specDir = getSpecDir(project, task); + const reportPath = path.join(specDir, AUTO_BUILD_PATHS.VERIFICATION_REPORT); + + const reportContent = await fs.readFile(reportPath, 'utf-8'); + return JSON.parse(reportContent) as VerificationReport; + } catch (err) { + if (isFileNotFoundError(err)) { + return null; + } + console.error(`[spec-file-readers] Error reading verification report:`, err); + throw err; + } +} + /** * Read the QA report from qa_report.md * diff --git a/apps/frontend/src/preload/api/task-api.ts b/apps/frontend/src/preload/api/task-api.ts index 23cba7810..7d633da76 100644 --- a/apps/frontend/src/preload/api/task-api.ts +++ b/apps/frontend/src/preload/api/task-api.ts @@ -103,6 +103,7 @@ 
export interface TaskAPI { getGenericEditArtifactManifest: (taskId: string) => Promise>; getQAReport: (taskId: string) => Promise>; getQAEscalation: (taskId: string) => Promise>; + getVerificationReport: (taskId: string) => Promise>; // Merge Analytics getMergeHistory: (projectId: string, filter?: MergeAnalyticsFilter) => Promise>; @@ -364,6 +365,9 @@ export const createTaskAPI = (): TaskAPI => ({ getQAEscalation: (taskId: string): Promise> => ipcRenderer.invoke(IPC_CHANNELS.TASK_SPEC_QA_ESCALATION_GET, taskId), + getVerificationReport: (taskId: string): Promise> => + ipcRenderer.invoke(IPC_CHANNELS.TASK_SPEC_VERIFICATION_REPORT_GET, taskId), + // Merge Analytics getMergeHistory: (projectId: string, filter?: MergeAnalyticsFilter): Promise> => ipcRenderer.invoke(IPC_CHANNELS.MERGE_ANALYTICS_GET_HISTORY, projectId, filter), diff --git a/apps/frontend/src/renderer/components/settings/ProviderSettings.tsx b/apps/frontend/src/renderer/components/settings/ProviderSettings.tsx index cb16e6b58..ba592b5b0 100644 --- a/apps/frontend/src/renderer/components/settings/ProviderSettings.tsx +++ b/apps/frontend/src/renderer/components/settings/ProviderSettings.tsx @@ -39,6 +39,7 @@ export function ProviderSettings() { const settings = useSettingsStore((state) => state.settings); const selectedProviderId = settings.selectedProviderId || 'anthropic'; const selectedFallbackModelId = settings.fallbackModelId || ''; + const selectedAutonomyLevel = settings.autonomyLevel || 'claude'; const [showProviderDetails, setShowProviderDetails] = useState>({}); // Find the selected provider @@ -82,6 +83,18 @@ export function ProviderSettings() { } }; + /** + * Handle autonomy level selection (AUTO_CODE_AUTONOMY). 
+ */ + const handleSelectAutonomy = async (level: string) => { + const success = await saveSettings({ + autonomyLevel: level as 'off' | 'claude' | 'safe' | 'bold' + }); + if (!success) { + console.error('Failed to save autonomy level'); + } + }; + /** * Toggle provider details visibility */ @@ -234,6 +247,27 @@ export function ProviderSettings() { description={t('provider.description')} >
+ {/* Autonomy level (AUTO_CODE_AUTONOMY) */} +
+
+ +

+ {t('provider.autonomyDescription')} +

+
+ +
+ {/* Provider selector */}
diff --git a/apps/frontend/src/renderer/components/task-detail/TaskOverview.tsx b/apps/frontend/src/renderer/components/task-detail/TaskOverview.tsx index 7fb436785..8b108394d 100644 --- a/apps/frontend/src/renderer/components/task-detail/TaskOverview.tsx +++ b/apps/frontend/src/renderer/components/task-detail/TaskOverview.tsx @@ -23,9 +23,11 @@ import type { Phase, SubtaskStatus, QAEscalation, - GenericEditArtifactManifest + GenericEditArtifactManifest, + VerificationReport } from '../../../shared/types'; import { GenericEditArtifactsPanel } from './GenericEditArtifactsPanel'; +import { VerificationReportPanel } from './VerificationReportPanel'; interface TaskOverviewProps { task: Task; @@ -37,6 +39,7 @@ export function TaskOverview({ task }: TaskOverviewProps) { const [genericEditManifest, setGenericEditManifest] = useState(null); const [qaReport, setQAReport] = useState(null); const [qaEscalation, setQAEscalation] = useState(null); + const [verificationReport, setVerificationReport] = useState(null); const [isLoading, setIsLoading] = useState(true); const [error, setError] = useState(null); const [expandedPhases, setExpandedPhases] = useState>(new Set()); @@ -52,6 +55,7 @@ export function TaskOverview({ task }: TaskOverviewProps) { setGenericEditManifest(null); setQAReport(null); setQAEscalation(null); + setVerificationReport(null); try { // Load implementation plan const planResult = await globalThis.electronAPI.getImplementationPlan(task.id); @@ -76,6 +80,12 @@ export function TaskOverview({ task }: TaskOverviewProps) { setGenericEditManifest(manifestResult.data); } + // Load the Trust Layer verification report if present. 
+ const verificationResult = await globalThis.electronAPI.getVerificationReport(task.id); + if (verificationResult.success && verificationResult.data) { + setVerificationReport(verificationResult.data); + } + // Load QA report if available const qaResult = await globalThis.electronAPI.getQAReport(task.id); if (qaResult.success && qaResult.data) { @@ -260,6 +270,14 @@ export function TaskOverview({ task }: TaskOverviewProps) { )} + {/* Verification Report Section (Trust Layer) */} + {verificationReport && ( + <> + + + + )} + {/* QA Report Section */} {qaReport && ( <> @@ -366,7 +384,7 @@ export function TaskOverview({ task }: TaskOverviewProps) { )} {/* No Data Available */} - {!implementationPlan && !genericEditManifest && !qaReport && !qaEscalation && ( + {!implementationPlan && !genericEditManifest && !verificationReport && !qaReport && !qaEscalation && (

{t('tasks:overview.noDataAvailable')}

diff --git a/apps/frontend/src/renderer/components/task-detail/VerificationReportPanel.tsx b/apps/frontend/src/renderer/components/task-detail/VerificationReportPanel.tsx new file mode 100644 index 000000000..5a91f7dcd --- /dev/null +++ b/apps/frontend/src/renderer/components/task-detail/VerificationReportPanel.tsx @@ -0,0 +1,131 @@ +import { useTranslation } from 'react-i18next'; +import { ShieldCheck, ShieldX, ShieldQuestion, FileWarning, HelpCircle } from 'lucide-react'; +import { Badge } from '../ui/badge'; +import type { VerificationReport } from '../../../shared/types'; + +type VerificationReportPanelProps = Readonly<{ + report: VerificationReport; +}>; + +function verdictBadgeVariant( + verdict: VerificationReport['verdict'] +): 'success' | 'destructive' | 'warning' { + if (verdict === 'approved') return 'success'; + if (verdict === 'rejected') return 'destructive'; + return 'warning'; +} + +function isPrimitive(value: unknown): value is string | number | boolean { + return ( + typeof value === 'string' || typeof value === 'number' || typeof value === 'boolean' + ); +} + +function VerdictIcon({ verdict }: Readonly<{ verdict: VerificationReport['verdict'] }>) { + if (verdict === 'approved') return ; + if (verdict === 'rejected') return ; + return ; +} + +/** + * Trust Layer verification report — the structured "what was verified" summary + * (verdict, confidence, tests, out-of-scope edits, uncertainty) read from + * artifacts/verification-report.json. + */ +export function VerificationReportPanel({ report }: VerificationReportPanelProps) { + const { t } = useTranslation(['tasks', 'common']); + + const testEntries = Object.entries(report.tests_run ?? {}).filter(([, v]) => + isPrimitive(v) + ); + const filesChanged = report.diff_summary?.files_changed; + + return ( +
+
+ + {t('tasks:overview.verificationReport')} +
+ +
+ {/* Verdict + confidence + files changed */} +
+ + {t(`tasks:overview.verdict.${report.verdict}`)} + + {report.confidence !== null && report.confidence !== undefined && ( + + {t('tasks:overview.confidence')}:{' '} + + {Math.round(report.confidence * 100)}% + + + )} + {typeof filesChanged === 'number' && ( + + {t('tasks:overview.filesChanged')}:{' '} + {filesChanged} + + )} +
+ + {/* Tests run */} + {testEntries.length > 0 && ( +
+ {testEntries.map(([key, value]) => ( + + {key}:{' '} + {String(value)} + + ))} +
+ )} + + {/* Out-of-scope edits */} + {report.out_of_scope_edits.length > 0 && ( +
+

+ + {t('tasks:overview.outOfScopeEdits')} +

+
    + {report.out_of_scope_edits.map((edit) => ( +
  • + {edit.file ?? '?'} + {edit.reason ? ` — ${edit.reason}` : ''} +
  • + ))} +
+
+ )} + + {/* Uncertainty */} + {report.uncertainty.length > 0 && ( +
+

+ + {t('tasks:overview.uncertainty')} +

+
    + {report.uncertainty.map((item) => ( +
  • + {item.area && ( + {item.area} + )} + {item.area && item.reason ? ' — ' : ''} + {item.reason ?? ''} +
  • + ))} +
+
+ )} +
+
+ ); +} diff --git a/apps/frontend/src/renderer/lib/mocks/task-mock.ts b/apps/frontend/src/renderer/lib/mocks/task-mock.ts index 02f06cc28..978ec3471 100644 --- a/apps/frontend/src/renderer/lib/mocks/task-mock.ts +++ b/apps/frontend/src/renderer/lib/mocks/task-mock.ts @@ -185,6 +185,11 @@ export const taskMock = { data: null }), + getVerificationReport: async () => ({ + success: true, + data: null + }), + // Event Listeners (no-op in browser) onTaskProgress: () => () => {}, onTaskError: () => () => {}, diff --git a/apps/frontend/src/shared/constants/config.ts b/apps/frontend/src/shared/constants/config.ts index d76c918ed..4876f73ff 100644 --- a/apps/frontend/src/shared/constants/config.ts +++ b/apps/frontend/src/shared/constants/config.ts @@ -30,6 +30,7 @@ export const DEFAULT_APP_SETTINGS = { colorTheme: 'default' as const, defaultModel: 'opus', agentFramework: 'auto-claude', + autonomyLevel: 'claude' as 'off' | 'claude' | 'safe' | 'bold', pythonPath: undefined as string | undefined, gitPath: undefined as string | undefined, githubCLIPath: undefined as string | undefined, @@ -109,6 +110,7 @@ export const AUTO_BUILD_PATHS = { SPEC_FILE: 'spec.md', QA_REPORT: 'qa_report.md', GENERIC_EDIT_ARTIFACT_MANIFEST: 'artifacts/generic_edit_artifact_manifest.json', + VERIFICATION_REPORT: 'artifacts/verification-report.json', BUILD_PROGRESS: 'build-progress.txt', GENERATION_PROGRESS: 'generation_progress.json', CONTEXT: 'context.json', diff --git a/apps/frontend/src/shared/constants/ipc.ts b/apps/frontend/src/shared/constants/ipc.ts index 847bcb752..71b1d759f 100644 --- a/apps/frontend/src/shared/constants/ipc.ts +++ b/apps/frontend/src/shared/constants/ipc.ts @@ -49,6 +49,7 @@ export const IPC_CHANNELS = { TASK_SPEC_QA_REPORT_GET: 'task:spec:qaReportGet', TASK_SPEC_QA_ESCALATION_GET: 'task:spec:qaEscalationGet', TASK_SPEC_GENERIC_EDIT_ARTIFACT_MANIFEST_GET: 'task:spec:genericEditArtifactManifestGet', + TASK_SPEC_VERIFICATION_REPORT_GET: 'task:spec:verificationReportGet', // 
Workspace management (for human review) // Per-spec architecture: Each spec has its own worktree at .worktrees/{spec-name}/ diff --git a/apps/frontend/src/shared/i18n/locales/en/settings.json b/apps/frontend/src/shared/i18n/locales/en/settings.json index 07d608072..e49108dd8 100644 --- a/apps/frontend/src/shared/i18n/locales/en/settings.json +++ b/apps/frontend/src/shared/i18n/locales/en/settings.json @@ -434,6 +434,14 @@ "provider": { "title": "AI Provider", "description": "Select your AI provider and available models", + "autonomy": "Autonomy", + "autonomyDescription": "How independent agents are (AUTO_CODE_AUTONOMY). Explicit environment variables override this.", + "autonomyLevels": { + "off": "Off — analysis & patch suggestions only", + "claude": "Claude / Codex (default) — full autonomy on the SDK runtimes", + "safe": "Safe — also promote direct-API providers via the evidence gate", + "bold": "Bold — direct providers run promoted without the gate (power user)" + }, "selectProvider": "Select Provider", "selectProviderPlaceholder": "Choose an AI provider", "availableProviders": "Available Providers", diff --git a/apps/frontend/src/shared/i18n/locales/en/tasks.json b/apps/frontend/src/shared/i18n/locales/en/tasks.json index c3d161937..0630e1061 100644 --- a/apps/frontend/src/shared/i18n/locales/en/tasks.json +++ b/apps/frontend/src/shared/i18n/locales/en/tasks.json @@ -472,6 +472,16 @@ "verification": "Verification", "qaReport": "QA Report", "qaEscalation": "QA Escalation", + "verificationReport": "Verification Report", + "verdict": { + "approved": "Approved", + "rejected": "Rejected", + "error": "Error" + }, + "confidence": "Confidence", + "filesChanged": "Files changed", + "outOfScopeEdits": "Out-of-scope edits", + "uncertainty": "Uncertainty", "totalIterations": "Total Iterations", "totalIssues": "Total Issues", "uniqueIssues": "Unique Issues", diff --git a/apps/frontend/src/shared/i18n/locales/fr/settings.json 
b/apps/frontend/src/shared/i18n/locales/fr/settings.json index de52e47d6..232885908 100644 --- a/apps/frontend/src/shared/i18n/locales/fr/settings.json +++ b/apps/frontend/src/shared/i18n/locales/fr/settings.json @@ -434,6 +434,14 @@ "provider": { "title": "Fournisseur IA", "description": "Sélectionnez votre fournisseur IA et les modèles disponibles", + "autonomy": "Autonomie", + "autonomyDescription": "Niveau d'indépendance des agents (AUTO_CODE_AUTONOMY). Les variables d'environnement explicites priment.", + "autonomyLevels": { + "off": "Désactivé — analyse et suggestions de correctifs uniquement", + "claude": "Claude / Codex (par défaut) — autonomie complète sur les runtimes SDK", + "safe": "Sûr — promeut aussi les fournisseurs API directs via la barrière de preuves", + "bold": "Audacieux — les fournisseurs directs sont promus sans barrière (expert)" + }, "selectProvider": "Sélectionner le fournisseur", "selectProviderPlaceholder": "Choisir un fournisseur IA", "availableProviders": "Fournisseurs disponibles", diff --git a/apps/frontend/src/shared/i18n/locales/fr/tasks.json b/apps/frontend/src/shared/i18n/locales/fr/tasks.json index 43cfa82a1..c89c44da0 100644 --- a/apps/frontend/src/shared/i18n/locales/fr/tasks.json +++ b/apps/frontend/src/shared/i18n/locales/fr/tasks.json @@ -471,6 +471,16 @@ "verification": "Vérification", "qaReport": "Rapport QA", "qaEscalation": "Escalade QA", + "verificationReport": "Rapport de vérification", + "verdict": { + "approved": "Approuvé", + "rejected": "Rejeté", + "error": "Erreur" + }, + "confidence": "Confiance", + "filesChanged": "Fichiers modifiés", + "outOfScopeEdits": "Modifications hors périmètre", + "uncertainty": "Incertitude", "totalIterations": "Itérations totales", "totalIssues": "Problèmes totaux", "uniqueIssues": "Problèmes uniques", diff --git a/apps/frontend/src/shared/types/ipc.ts b/apps/frontend/src/shared/types/ipc.ts index 49ce5ddfd..29a8252e3 100644 --- a/apps/frontend/src/shared/types/ipc.ts +++ 
b/apps/frontend/src/shared/types/ipc.ts @@ -978,6 +978,7 @@ export interface ElectronAPI { getGenericEditArtifactManifest: (taskId: string) => Promise>; getQAReport: (taskId: string) => Promise>; getQAEscalation: (taskId: string) => Promise>; + getVerificationReport: (taskId: string) => Promise>; // Plugin management listPlugins: (options: { projectPath: string; filter?: { pluginType?: string; enabledOnly?: boolean } }) => Promise>; diff --git a/apps/frontend/src/shared/types/settings.ts b/apps/frontend/src/shared/types/settings.ts index 4e0d6c06e..e7ee9b7fe 100644 --- a/apps/frontend/src/shared/types/settings.ts +++ b/apps/frontend/src/shared/types/settings.ts @@ -267,6 +267,8 @@ export interface AppSettings { colorTheme?: ColorTheme; defaultModel: string; agentFramework: string; + // Autonomy level (AUTO_CODE_AUTONOMY) — how independent agents are. See ADR-006. + autonomyLevel?: 'off' | 'claude' | 'safe' | 'bold'; pythonPath?: string; gitPath?: string; githubCLIPath?: string; diff --git a/apps/frontend/src/shared/types/task.ts b/apps/frontend/src/shared/types/task.ts index 166ef6afb..8acdc907f 100644 --- a/apps/frontend/src/shared/types/task.ts +++ b/apps/frontend/src/shared/types/task.ts @@ -67,6 +67,24 @@ export interface QAReport { timestamp: Date; } +// Trust Layer verification report (artifacts/verification-report.json). +// Mirrors cli/artifacts.py::build_verification_report output. 
+export interface VerificationReport { + schema_version: number; + verdict: 'approved' | 'rejected' | 'error'; + qa_session: number | null; + iteration: number | null; + confidence: number | null; // 0..1, model self-assessed (calibrate against tests) + tests_run: Record; + diff_summary: { files_changed?: number; files?: string[]; [key: string]: unknown }; + issues: Array>; + uncertainty: Array<{ area?: string; reason?: string; [key: string]: unknown }>; + out_of_scope_edits: Array<{ file?: string; reason?: string; [key: string]: unknown }>; + notes: string | null; + duration_seconds?: number; + timestamp?: string; +} + export interface QAIssue { id: string; severity: 'critical' | 'major' | 'minor'; diff --git a/docs/architecture/adr/ADR-006-autonomy-levels.md b/docs/architecture/adr/ADR-006-autonomy-levels.md index a368a9532..668a49aaa 100644 --- a/docs/architecture/adr/ADR-006-autonomy-levels.md +++ b/docs/architecture/adr/ADR-006-autonomy-levels.md @@ -1,7 +1,7 @@ # ADR-006: User-facing autonomy levels **Date:** 2026-05-23 -**Status:** Proposed +**Status:** Accepted — core implemented (`core/autonomy_level.py` with `AutonomyLevel` + `resolve_autonomy_settings`; wired into `agents/coder.py` and QA phase routing; deprecated `AUTO_CODE_DIRECT_API_FULL_AUTONOMOUS` warns). Docs now lead with `AUTO_CODE_AUTONOMY` (see [CLI Usage → Autonomy](../../../guides/CLI-USAGE.md#autonomy-one-knob)). Remaining follow-ups: planner-phase gate, `--runtime-modes --json` `autonomy_level` field, and the Electron settings toggle. **Deciders:** Auto Code Core Team **Tags:** autonomy, runtimes, providers, configuration, ux diff --git a/docs/strategy/roadmap.md b/docs/strategy/roadmap.md new file mode 100644 index 000000000..635d34e12 --- /dev/null +++ b/docs/strategy/roadmap.md @@ -0,0 +1,152 @@ +# Auto Code — план развития (5 целей) + +> Источник: стратегическое исследование рынка ИИ-кодинга и конкурентов (2026-06-24) + разведка фактического состояния кода. 
+> Контекст и конкурентный анализ — см. раздел «Позиционирование» ниже. + +## Позиционирование (зачем эти цели) + +Рынок ИИ-кодинга растёт (~$12–16 млрд, CAGR 27–37 %), но главная незакрытая боль — **доверие**: лишь ~29 % разработчиков доверяют выводу ИИ, ИИ создаёт в 1,3–1,7× больше критических багов. Пространство «автономных агентов» сжимается сверху (Claude Code/Anthropic, GitHub Copilot agent) и снизу (OpenHands). + +**Наш ров — пересечение трёх вещей, которые почти никто не сочетает:** +1. spec-driven дисциплина, +2. замкнутый цикл «автономное исполнение → QA → фиксы» с изоляцией в worktree и явными ревью-гейтами, +3. open-source + self-hosted + мульти-провайдер (данные не покидают периметр). + +Позиционирование: **«автономный кодер, которому можно доверять и которого можно запустить на своей инфраструктуре»**, с прицелом на регулируемый self-hosted энтерпрайз. + +## Главный вывод разведки: что уже готово + +| Цель | Готово в коде | Объём остатка | Ценность | Волна | +|---|---|---|---|---| +| **P3. Мульти-провайдер / автономность** | **~90 %** (уровни off/claude/safe/bold, evidence-gate, runtime-слой, Ollama — есть) | **S** | высокая (снимает зависимость от Anthropic + локальные модели) | **1** | +| **P5. Прозрачность стоимости** | **~70 %** (захват токенов, прайс-база и графики во фронте — есть, не связаны) | **S–M** | высокая (тревога из-за usage-pricing) | **1** | +| **P1. Слой доверия** | бэкенд ~60 %, **UI — 0 %** | **M** | **наивысшая (ров)** | **1** | +| **P2. GitHub App (issue→PR)** | **~75 % переиспользуемо** | **M–L** | высокая (дистрибуция) | **2** | +| **P4. Облако / команды** | каркас FastAPI + auth + agent_runner есть; мультиарендности — 0 | **L** | высокая (выручка/энтерпрайз) | **3** | + +Объём: S/M/L — относительный размер работы (не время). По правилу проекта — без оценок в днях. + +## Последовательность + +- **Волна 1 (параллельно):** P3 (доделать автономность) · P5 (MVP стоимости) · P1 (слой доверия). 
+- **Волна 2:** P2 (GitHub App) — переиспользует CI/worktree/PR; в комментарий PR кладёт отчёт доверия из P1 (после P1). +- **Волна 3:** P4 (облако/команды) — самый большой объём; модель workspace аддитивна и может стартовать параллельно с Волной 1. +- **Стратегическая связка:** P3 (локальные модели) + P4 (self-hosted мультипользователь) = регулируемый энтерпрайз. + +> ⚠️ Номера строк ниже — из автоматической разведки и могут «плыть». Перед правкой подтверждайте место по имени функции (через codegraph), а не по номеру строки. + +--- + +## P1. Слой доверия (Trust Layer) — Волна 1, ров + +**Итог:** каждая сборка выдаёт `verification-report.json` и видимый в UI отчёт: вердикт + оценка уверенности + прогнанные/прошедшие тесты + объяснение диффа + список «в чём агент не уверен» + флаг правок вне scope; история итераций QA — на таймлайне. + +**Что уже есть:** `qa_signoff` в `implementation_plan.json`, история в `qa_iteration_history[]` (`apps/backend/qa/report.py`), `ArtifactManager` (`apps/backend/cli/artifacts.py`) пишет `artifacts/test-report.json`/`coverage-report.json`, парсер покрытия (`apps/backend/analysis/coverage_reporter.py`). **Нет:** уверенности/неопределённости, детекта out-of-scope и QA-экрана во фронте. 
+ +| # | Задача | Файлы | Критерий приёмки | +|---|---|---|---| +| T1 | `save_verification_report()` + сборка артефакта из уже известного (вердикт, тесты, дифф, итерация) | `cli/artifacts.py`; вызов из `run_qa_agent_session()` в `qa/reviewer.py` и `run_qa_fixer_session()` в `qa/fixer.py` | После каждой QA-сессии есть `artifacts/verification-report.json` с вердиктом и тестами | +| T2 | Детект правок вне scope: дифф изменённых файлов против `phases[].subtasks[].files_to_modify[]` | новый `qa/scope_check.py`, вызов в `qa/loop.py` | Правка файла не из плана попадает в `out_of_scope_edits` | +| T3 | Уверенность + неопределённость: маркеры `[CONFIDENCE: x]`/`[UNCERTAIN: …]` в `prompts/qa_reviewer.md` и `qa_fixer.md`, парсинг + привязка к пробелам покрытия | reviewer/fixer + схема отчёта | В отчёте есть confidence и пункты неопределённости из ответа агента и покрытия | +| T4 | **Первый QA-экран:** IPC-хендлер + React-компонент (датчик уверенности, разбивка тестов, неопределённость, out-of-scope, таймлайн итераций) + i18n (en+fr) | новый `apps/frontend/src/main/ipc-handlers/qa-handlers.ts`, новый `…/renderer/components/VerificationReportView.tsx`, локали | Открыв собранную спеку, вижу отчёт верификации | + +**🚀 Начать с:** T1 в минимальной схеме (`verdict + tests_run + timestamp`). +**Риск:** confidence — самооценка модели; калибровать против покрытия/тестов, в UI показывать как «сигнал + доказательства». + +--- + +## P5. Прозрачность стоимости — Волна 1, быстрый выигрыш + +**Итог:** на спеку видно «сколько стоила сборка» (по фазам и ролям), работает лимит бюджета на спеку, простые подзадачи можно роутить на дешёвую модель. + +**Что уже есть:** захват токенов по фазам (`apps/backend/core/token_stats.py`) + извлечение usage из SDK (`apps/backend/agents/session.py`); во фронте — прайс-база `shared/constants/model-costs.ts`, `CostComparison.tsx`, `CostBreakdownChart.tsx`, `ContextViewer.tsx`. 
Классы агрегации `apps/backend/analysis/model_usage_analytics.py` написаны, но не подключены. + +| # | Задача | Файлы | Критерий приёмки | +|---|---|---|---| +| T1 | Надёжный захват usage + запись модели/провайдера в статистику | `agents/session.py`, `core/token_stats.py` (поля `model`, `provider`) | `token_stats.json` содержит токены + модель + провайдер по фазам | +| T2 | Подключить агрегацию: per-spec и проектный `.auto-claude/model_usage_summary.json` | `analysis/model_usage_analytics.py`, вызов после `save_token_stats()` | Считается стоимость по ролям и фазам | +| T3 | Дашборд: посчитать $ через `model-costs.ts`, показать (переиспользуя графики) + строку «Cost» в `ContextViewer.tsx` | `apps/frontend/src/main/ipc-handlers/token-stats-handler.ts` + компоненты | «Сборка стоила $X — планирование/код/QA …» | +| T4 | Бюджет-гард: лимит на спеку, останов/предупреждение при превышении | проверка в `qa/loop.py` и цикле сессий | Низкий лимит останавливает прогон с понятным сообщением | +| T5 *(позже)* | Роутинг дешёвой модели на тривиальные подзадачи | `phase_config.py` (`get_model_for_agent`/`lock_agent_model`) | Простые подзадачи идут на haiku, записано в статистике | + +**🚀 Начать с:** T1. + +--- + +## P3. Доделать мульти-провайдерную автономность — Волна 1, почти бесплатно + +**Итог:** один тумблер `AUTO_CODE_AUTONOMY` (off/claude/safe/bold) реально управляет всеми агентами; не-Claude провайдеры доходят до паритета через evidence-gate; локальные модели (Ollama) — задокументированный «приватный» режим. + +**Что уже есть (≈90 %):** `apps/backend/core/autonomy_level.py` (`resolve_autonomy_settings()`), `apps/backend/core/autonomy_policy.py` + gate `apps/backend/agents/runtime/direct_api_autonomy.py`, runtime-слой (capabilities/modes/adapters), реальный Ollama-адаптер. 
**Разрыв — три «проводки» + UX/доки.** + +| # | Задача | Файлы | Критерий приёмки | +|---|---|---|---| +| T1 | ✅ **Уже реализовано.** Coder уважает уровень автономности | `agents/coder.py:1282–1296` + `agents/runtime/direct_api_autonomy.py:124` | `AUTO_CODE_AUTONOMY=claude` блокирует автономный direct-API coder; `safe` — через gate; `bold` — мимо gate (хардкод `=True` остался только в смоук-тесте `cli/provider_smoke_commands.py`) | +| T2 | Гейт для planner на не-Claude провайдерах | `agents/planner.py` | При `off` не-Claude planner блокируется | +| T3 | Фабрика принимает `autonomy_settings` и логирует уровень | `agents/runtime/adapters/__init__.py` (`create_runtime_session`) | Уровень автономности виден в логе сессии | +| T4 | Один тумблер в UI + вывод уровня в JSON | `cli/runtime_commands.py`, настройки фронта | UI показывает и задаёт один селектор автономности | +| T5 | Доки: вести с `AUTO_CODE_AUTONOMY`, 30+ переменных — в приложение | `guides/QUICK-START.md`, `docs/` | Quickstart: «поставь `safe` — готово» | +| T6 | Рецепт «данные не покидают периметр»: `safe` + `provider=ollama` | доки + проверка | Рабочий локальный рецепт задокументирован | + +**🚀 Начать с:** T1 уже сделан — оставшийся разрыв это UX/наблюдаемость: T4 (один тумблер в UI + уровень в JSON) и T5 (доки ведут с `AUTO_CODE_AUTONOMY`). + +--- + +## P2. GitHub App: issue → автономный PR — Волна 2, дистрибуция + +**Итог:** назначаешь issue на Auto Code → он прогоняет спека→план→код→QA и открывает PR с приложенным отчётом доверия (P1). + +**Что уже есть (~75 % переиспользуемо):** `runners/github/gh_client.py`, создание ветки/PR в `core/worktree.py` (`push_and_create_pr`), полный билд `handle_build_command()` в `cli/build_commands.py`, режим CI+JSON, проверка прав, каркас приёма вебхуков `integrations/webhooks/server.py`, `AutoFixProcessor` (issue→state). 
+ +| # | Задача | Файлы | Критерий приёмки | +|---|---|---|---| +| T1 | Auth GitHub App: installation-токены + проверка подписи вебхука | новый `integrations/github_app/auth.py` | Минтим installation-токен, верифицируем HMAC | +| T2 | Хендлер вебхука `issues.assigned` | новый `integrations/webhooks/handlers/github_app.py` | Назначение issue запускает обработчик | +| T3 | Issue→spec: из issue+контекста собрать спека-директорию | новый `runners/github/services/issue_to_spec.py` | Issue даёт валидные `spec.md` + план | +| T4 | Оркестрация билда (переиспользование) | `cli/build_commands.handle_build_command` + `core/worktree.push_and_create_pr` | Сквозняк issue→ветка→билд→PR | +| T5 | Комментарий-отчёт в PR (**зависит от P1**) + status check | новый `runners/github/services/pr_reporter.py` | В PR есть отчёт верификации и pass/fail-статус | +| T6 *(опц.)* | Эндпоинт-триггер билда | `apps/web-backend/api/routes/builds.py` | Внешний триггер работает | + +**🚀 Начать с:** T1+T2 (единственный реально отсутствующий примитив). + +--- + +## P4. Облако / команды — Волна 3, выручка/энтерпрайз + +**Итог:** браузерный многопользовательский доступ с изоляцией по workspace и ролями (owner/editor/viewer), историей запусков и командными процессами. + +**Что уже есть:** FastAPI `apps/web-backend/main.py` (specs/tasks/agents/auth/users/git/files/usage/websocket); реальные регистрация/логин (JWT, bcrypt), `agent_runner` реально зовёт `run_autonomous_agent`, websocket-броадкаст, ORM `User`/`GitRepository`, миграции, web-frontend. **Нет:** мультиарендности, изоляции по пользователю, персистентности запусков, использования OAuth. 
+ +| # | Задача | Файлы | Критерий приёмки | +|---|---|---|---| +| T1 | Модель Workspace + роли + проверка доступа | новый `api/models/workspace.py`, миграции 003–004, новый `core/permissions.py` | Пользователи входят в workspace с ролями | +| T2 | Изоляция исполнения по пользователю/workspace | `services/agent_runner.py`, новый `services/workspace_service.py` | Пользователь A не видит/не запускает спеки B | +| T3 | Персистентность запусков | новый `api/models/agent_execution.py` + миграция | После рестарта история сохранена | +| T4 | Спеки: ФС — источник истины + ORM как кэш/аудит | `api/models/spec.py` + миграция 005 | Правки спек аудируются, web читает через API | +| T5 | Реалтайм + терминал в браузере | `api/websocket.py` + web-компонент терминала | Живой вывод агента в браузере | +| T6 | OAuth-привязка репозиториев | `core/oauth.py` + коллбэки | Подключил репо → агент в нём работает | +| T7 | Командные процессы: инвайты/участники, общие спеки/ревью | новый `api/routes/workspaces.py` | Можно пригласить коллегу в workspace | + +**🚀 Начать с:** T1 (аддитивно, разблокирует остальное). +**Риск:** миграция спек ФС→БД; смягчение — на первом этапе ФС остаётся источником истины. + +--- + +## Прогресс ([PR #361](https://github.com/OBenner/Auto-Coding/pull/361)) + +**Волна 1 — закрыта** (бэкенд: 28 юнит-тестов зелёные; фронтенд: `tsc --noEmit` зелёный): + +- ✅ **P3·T1** — coder уважает `AUTO_CODE_AUTONOMY` (оказался уже реализован). +- ✅ **P1·T1 / T1-wire** — контракт `verification-report.json` + запись на каждой сборке (`cli/artifacts.py`, `cli/build_commands.py`). +- ✅ **P1·T2** — детект правок вне scope (`qa/scope_check.py`) → `out_of_scope_edits`. +- ✅ **P1·T3** — `confidence` + `uncertainty` через SDK- и runtime-путь (`qa/reviewer.py`, промпт, merge-санитайзинг). +- ✅ **P5·T1** — модель/провайдер по фазам в `token_stats.json` (`core/token_stats.py`, `agents/session.py`). +- ✅ **P3·T5** — доки ведут с `AUTO_CODE_AUTONOMY` (`guides/CLI-USAGE.md`, ADR-006 → Accepted). 
+- ✅ **P1·T4** — QA-экран отчёта доверия в task overview (reader + IPC + `VerificationReportPanel`, i18n en/fr). +- ✅ **P3·T4** — тумблер автономности в настройках → инжектит `AUTO_CODE_AUTONOMY` в окружение сборки (явный env приоритетнее). + +Отчёт доверия теперь несёт: **вердикт · тесты · дифф · out-of-scope · confidence · uncertainty** — и виден в UI. + +**Дальше отдельными PR:** P2 (GitHub App), P4 (облако/команды). + +Каждая задача shippable отдельно и тянет тесты (`apps/backend/.venv/bin/pytest tests/<файл>` — точечно). diff --git a/guides/CLI-USAGE.md b/guides/CLI-USAGE.md index 42edad8a3..10ba426b3 100644 --- a/guides/CLI-USAGE.md +++ b/guides/CLI-USAGE.md @@ -212,6 +212,40 @@ echo "Focus on fixing the login bug first" > specs/001-name/HUMAN_INPUT.md python validate_spec.py --spec-dir specs/001-feature --checkpoint all ``` +## Autonomy: one knob + +How independent the agents are is controlled by a single setting — `AUTO_CODE_AUTONOMY`. Set it and you're done; you do not need to touch the low-level runtime/provider/policy variables. + +```bash +# In apps/backend/.env (or the environment) +AUTO_CODE_AUTONOMY=claude # default +``` + +| Level | What it allows | +|-------|----------------| +| `off` | Analysis and patch suggestions only — agents never write your workspace. | +| `claude` *(default)* | Full autonomy on the Claude SDK / Codex CLI runtimes. Direct-API providers (OpenAI, Google, …) are refused with a clear capability error. | +| `safe` | Adds direct-API providers: they run through `generic_edit` and are promoted to full autonomy once they pass the evidence gate. Mutating parallel subagents enabled (write-scope confined). Still fail-fast on missing capability. | +| `bold` | Power-user mode: direct providers run promoted without waiting for the gate. Use for benchmarking or seeding evidence. 
| + +Optional second knob — how strict the promotion gate is: + +```bash +AUTO_CODE_AUTONOMY_PRESET=standard # strict | standard | lax +``` + +- `strict` — 10 stable runs / 3-day freshness / all required cases +- `standard` *(default)* — 3 stable runs / 7-day freshness / all required cases +- `lax` — 1 stable run / 30-day freshness / critical cases only + +**Privacy / local-model recipe:** `AUTO_CODE_AUTONOMY=safe` together with `AI_ENGINE_PROVIDER=ollama` keeps the whole loop on a local model — your code never leaves the machine. + +### Advanced overrides + +Every low-level variable still works and **takes precedence** over the level — they are normally not needed. The matrix (`AUTO_CODE_RUNTIME_MODE`, `AGENT_RUNTIME_MODE_<AGENT>`, the per-provider `AUTO_CODE_AUTONOMY_<PROVIDER>_<KNOB>` policy knobs, …) is documented in [ADR-006: Autonomy levels](../docs/architecture/adr/ADR-006-autonomy-levels.md) and [Provider runtime modes](../docs/architecture/provider-runtime-modes.md). The legacy `AUTO_CODE_DIRECT_API_FULL_AUTONOMOUS` is deprecated — use `AUTO_CODE_AUTONOMY=safe` instead. 
+ +--- + ## Environment Variables Copy `.env.example` to `.env` and configure as needed: diff --git a/guides/QUICK-START.md b/guides/QUICK-START.md index 994f72c5f..46b4c2014 100644 --- a/guides/QUICK-START.md +++ b/guides/QUICK-START.md @@ -139,6 +139,7 @@ Now that you've completed your first task, explore: - **[Spec Creation Pipeline](./SPEC-CREATION-PIPELINE.md)** -- Deep dive on how specs are created - **[Troubleshooting](./TROUBLESHOOTING.md)** -- Common issues and fixes - **[Intelligent Pattern Recognition](./INTELLIGENT-PATTERN-RECOGNITION.md)** -- How agents learn from previous builds +- **[CLI Usage — Autonomy](./CLI-USAGE.md#autonomy-one-knob)** -- Control how independent agents are with one setting (`AUTO_CODE_AUTONOMY`) --- diff --git a/tests/test_build_verification_wiring.py b/tests/test_build_verification_wiring.py new file mode 100644 index 000000000..8fa96b0ea --- /dev/null +++ b/tests/test_build_verification_wiring.py @@ -0,0 +1,101 @@ +"""Tests for wiring the Trust Layer verification report into the build (P1.T1-wire). + +Covers the build-layer helpers that read the persisted QA sign-off from +implementation_plan.json and write artifacts/verification-report.json on +every build (not only CI/json mode). 
+""" + +import json +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).parent.parent / "apps" / "backend")) + +from cli.artifacts import VERIFICATION_REPORT_FILENAME # noqa: E402 +from cli.build_commands import ( # noqa: E402 + _generate_verification_report_data, + _save_verification_report, +) + + +def _write_plan(spec_dir: Path, signoff_status: str) -> None: + plan = { + "qa_signoff": { + "status": signoff_status, + "qa_session": 2, + "issues_found": [ + {"title": "missing null check", "type": "error", "location": "a.py"} + ], + "test_results": { + "tests_passing": 41, + "tests_skipped": 1, + "coverage_overall": "85%", + }, + }, + "qa_stats": {"last_iteration": 3, "last_status": signoff_status}, + "qa_iteration_history": [ + {"iteration": 1, "status": "rejected", "issues": [{}], "duration_seconds": 10.0}, + {"iteration": 2, "status": "rejected", "issues": [], "duration_seconds": 5.5}, + ], + } + (spec_dir / "implementation_plan.json").write_text( + json.dumps(plan), encoding="utf-8" + ) + + +def test_generate_report_data_from_plan(tmp_path): + _write_plan(tmp_path, "rejected") + report = _generate_verification_report_data( + tmp_path, qa_approved=False, changed_files=["a.py", "b.py"] + ) + assert report["verdict"] == "rejected" + assert report["issues"][0]["title"] == "missing null check" + assert report["tests_run"]["tests_passing"] == 41 + assert report["diff_summary"]["files_changed"] == 2 + assert report["iteration"] == 3 + assert report["qa_session"] == 2 + assert report["duration_seconds"] == pytest.approx(15.5) + + +def test_generate_report_data_approved_overrides_status(tmp_path): + # qa_approved (the loop's outcome) is authoritative for the verdict. 
+ _write_plan(tmp_path, "rejected") + report = _generate_verification_report_data(tmp_path, qa_approved=True) + assert report["verdict"] == "approved" + + +def test_generate_report_data_missing_plan(tmp_path): + report = _generate_verification_report_data(tmp_path, qa_approved=True) + assert report["verdict"] == "approved" + assert report["issues"] == [] + assert report["tests_run"] == {} + assert report["diff_summary"] == {} + + +def test_save_writes_artifact_without_manager(tmp_path): + _write_plan(tmp_path, "approved") + _save_verification_report( + tmp_path, qa_approved=True, changed_files=None, artifact_manager=None + ) + artifact = tmp_path / "artifacts" / VERIFICATION_REPORT_FILENAME + assert artifact.exists() + data = json.loads(artifact.read_text(encoding="utf-8")) + assert data["verdict"] == "approved" + assert "timestamp" in data + + +def test_out_of_scope_edits_flagged_via_plan(tmp_path): + # Plan declares only a.py; the build also touched rogue.py. + plan = { + "phases": [{"subtasks": [{"files_to_modify": ["a.py"]}]}], + "qa_signoff": {"status": "approved"}, + } + (tmp_path / "implementation_plan.json").write_text( + json.dumps(plan), encoding="utf-8" + ) + report = _generate_verification_report_data( + tmp_path, qa_approved=True, changed_files=["a.py", "rogue.py"] + ) + assert [e["file"] for e in report["out_of_scope_edits"]] == ["rogue.py"] diff --git a/tests/test_qa_trust_signals.py b/tests/test_qa_trust_signals.py new file mode 100644 index 000000000..6f47a2d8c --- /dev/null +++ b/tests/test_qa_trust_signals.py @@ -0,0 +1,93 @@ +"""Tests for Trust Layer confidence/uncertainty signals (P1.T3). + +Covers the runtime qa_signoff merge carrying + sanitizing the model's +self-reported confidence and uncertainty, and the build reading them into the +verification report. 
+""" + +import json +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).parent.parent / "apps" / "backend")) + +from cli.build_commands import _generate_verification_report_data # noqa: E402 +from qa.reviewer import merge_runtime_qa_signoff_artifact # noqa: E402 + + +def _write_plan(spec_dir: Path, signoff=None) -> Path: + plan = spec_dir / "implementation_plan.json" + plan.write_text(json.dumps({"qa_signoff": signoff}), encoding="utf-8") + return plan + + +def _write_artifact(spec_dir: Path, payload: dict) -> None: + (spec_dir / "qa_signoff.json").write_text(json.dumps(payload), encoding="utf-8") + + +def test_merge_carries_confidence_and_uncertainty(tmp_path): + plan = _write_plan(tmp_path) + _write_artifact( + tmp_path, + { + "status": "approved", + "confidence": 0.83, + "uncertainty": [{"area": "timeouts", "reason": "no test"}], + }, + ) + assert merge_runtime_qa_signoff_artifact(tmp_path, qa_session=1) is True + signoff = json.loads(plan.read_text())["qa_signoff"] + assert signoff["confidence"] == pytest.approx(0.83) + assert signoff["uncertainty"] == [{"area": "timeouts", "reason": "no test"}] + + +def test_merge_clamps_confidence_and_filters_uncertainty(tmp_path): + plan = _write_plan(tmp_path) + _write_artifact( + tmp_path, + { + "status": "approved", + "confidence": 1.7, + "uncertainty": [{"area": "ok"}, "nope", 5], + }, + ) + assert merge_runtime_qa_signoff_artifact(tmp_path, qa_session=1) is True + signoff = json.loads(plan.read_text())["qa_signoff"] + assert signoff["confidence"] == pytest.approx(1.0) + assert signoff["uncertainty"] == [{"area": "ok"}] + + +def test_merge_drops_bool_confidence(tmp_path): + # bool is an int subclass; it must not be coerced into a 1.0 confidence. 
+ plan = _write_plan(tmp_path) + _write_artifact(tmp_path, {"status": "approved", "confidence": True}) + assert merge_runtime_qa_signoff_artifact(tmp_path, qa_session=1) is True + signoff = json.loads(plan.read_text())["qa_signoff"] + assert "confidence" not in signoff + + +def test_merge_without_signals_omits_them(tmp_path): + plan = _write_plan(tmp_path) + _write_artifact(tmp_path, {"status": "approved"}) + assert merge_runtime_qa_signoff_artifact(tmp_path, qa_session=1) is True + signoff = json.loads(plan.read_text())["qa_signoff"] + assert "confidence" not in signoff + assert "uncertainty" not in signoff + + +def test_report_reads_confidence_and_uncertainty(tmp_path): + plan = { + "qa_signoff": { + "status": "approved", + "confidence": 0.77, + "uncertainty": [{"area": "x", "reason": "y"}], + } + } + (tmp_path / "implementation_plan.json").write_text( + json.dumps(plan), encoding="utf-8" + ) + report = _generate_verification_report_data(tmp_path, qa_approved=True) + assert report["confidence"] == pytest.approx(0.77) + assert report["uncertainty"] == [{"area": "x", "reason": "y"}] diff --git a/tests/test_scope_check.py b/tests/test_scope_check.py new file mode 100644 index 000000000..3d7c40acb --- /dev/null +++ b/tests/test_scope_check.py @@ -0,0 +1,68 @@ +"""Tests for out-of-scope edit detection (P1.T2).""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / "apps" / "backend")) + +from qa.scope_check import detect_out_of_scope_edits, get_planned_files # noqa: E402 + + +def _plan(*subtasks): + """Build a plan dict from (files_to_modify, files_to_create) tuples.""" + built = [] + for mod, create in subtasks: + st = {"id": "s", "description": "d", "status": "completed"} + if mod: + st["files_to_modify"] = mod + if create: + st["files_to_create"] = create + built.append(st) + return {"phases": [{"phase": 1, "name": "p", "subtasks": built}]} + + +def test_get_planned_files_unions_modify_and_create(): + plan = 
_plan((["a.py", "b.py"], ["c.py"]), ([], ["d.py"])) + assert get_planned_files(plan) == {"a.py", "b.py", "c.py", "d.py"} + + +def test_get_planned_files_supports_chunks_alias(): + plan = {"phases": [{"chunks": [{"files_to_modify": ["x.py"]}]}]} + assert get_planned_files(plan) == {"x.py"} + + +def test_get_planned_files_empty_or_none(): + assert get_planned_files(None) == set() + assert get_planned_files({}) == set() + assert get_planned_files({"phases": []}) == set() + + +def test_get_planned_files_normalizes_paths(): + plan = _plan(([".\\a.py", "./b.py", "dir\\c.py"], [])) + assert get_planned_files(plan) == {"a.py", "b.py", "dir/c.py"} + + +def test_detect_flags_only_unplanned_sorted(): + planned = {"a.py", "src/b.py"} + changed = ["a.py", "src/b.py", "rogue.py", "src/other.py"] + result = detect_out_of_scope_edits(planned, changed) + assert [r["file"] for r in result] == ["rogue.py", "src/other.py"] + assert all(r["reason"] for r in result) + + +def test_detect_ignores_framework_paths_and_dupes(): + changed = [ + ".auto-claude/specs/001/x.json", + ".auto-claude-security.json", + "z.py", + "z.py", + ] + result = detect_out_of_scope_edits(set(), changed) + assert [r["file"] for r in result] == ["z.py"] + + +def test_detect_normalizes_before_compare(): + planned = {"src/a.py"} + changed = ["./src/a.py", "src\\b.py"] + result = detect_out_of_scope_edits(planned, changed) + assert [r["file"] for r in result] == ["src/b.py"] diff --git a/tests/test_token_stats_model_provider.py b/tests/test_token_stats_model_provider.py new file mode 100644 index 000000000..9e8478136 --- /dev/null +++ b/tests/test_token_stats_model_provider.py @@ -0,0 +1,60 @@ +"""Tests for model/provider attribution in token stats (P5.T1). + +Verifies the persisted token_stats schema carries the model and provider per +phase, that the provider is auto-resolved when a caller omits it, and that a +known model is not clobbered by a later session that supplies none. 
+""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / "apps" / "backend")) + +from agents import session as session_mod # noqa: E402 +from agents.session import load_token_stats, save_token_stats # noqa: E402 +from core.token_stats import PhaseTokenStats, TaskTokenStats # noqa: E402 + + +def test_phase_stats_to_dict_includes_model_provider(): + stats = PhaseTokenStats( + phase="coding", input_tokens=1, output_tokens=2, model="m", provider="p" + ) + task = TaskTokenStats( + phases={"coding": stats}, + total_input_tokens=1, + total_output_tokens=2, + total_tokens=3, + created_at=stats.updated_at, + updated_at=stats.updated_at, + ) + phase_dict = task.to_dict()["phases"]["coding"] + assert phase_dict["model"] == "m" + assert phase_dict["provider"] == "p" + + +def test_save_and_load_round_trips_model_provider(tmp_path): + assert save_token_stats( + tmp_path, "coding", 100, 50, model="claude-opus-4-8", provider="claude" + ) + loaded = load_token_stats(tmp_path) + assert loaded is not None + assert loaded.phases["coding"].model == "claude-opus-4-8" + assert loaded.phases["coding"].provider == "claude" + + +def test_provider_auto_resolved_when_omitted(tmp_path, monkeypatch): + monkeypatch.setattr(session_mod, "_resolve_active_provider", lambda: "openai") + save_token_stats(tmp_path, "planning", 10, 5, model="gpt-x") + loaded = load_token_stats(tmp_path) + assert loaded.phases["planning"].provider == "openai" + assert loaded.phases["planning"].model == "gpt-x" + + +def test_known_model_not_clobbered_by_later_none(tmp_path, monkeypatch): + monkeypatch.setattr(session_mod, "_resolve_active_provider", lambda: "claude") + save_token_stats(tmp_path, "coding", 10, 5, model="m1") + # A later session in the same phase that doesn't supply a model. 
+ save_token_stats(tmp_path, "coding", 7, 3, model=None) + loaded = load_token_stats(tmp_path) + assert loaded.phases["coding"].model == "m1" + assert loaded.phases["coding"].input_tokens == 17 # tokens still aggregate diff --git a/tests/test_verification_report.py b/tests/test_verification_report.py new file mode 100644 index 000000000..443b72693 --- /dev/null +++ b/tests/test_verification_report.py @@ -0,0 +1,93 @@ +"""Tests for the Trust Layer verification report artifact (P1·T1). + +Covers the pure ``build_verification_report`` builder (schema, verdict +normalization, confidence clamping, defensive copies) and the +``ArtifactManager.save_verification_report`` writer (round-trip, timestamp +stamping, disabled no-op). +""" + +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).parent.parent / "apps" / "backend")) + +from cli.artifacts import ( # noqa: E402 + VERIFICATION_REPORT_FILENAME, + VERIFICATION_REPORT_SCHEMA_VERSION, + ArtifactManager, + build_verification_report, +) + + +def test_build_minimal_has_contract_defaults(): + report = build_verification_report(verdict="approved") + + assert report["schema_version"] == VERIFICATION_REPORT_SCHEMA_VERSION + assert report["verdict"] == "approved" + assert report["confidence"] is None + assert report["tests_run"] == {} + assert report["diff_summary"] == {} + assert report["issues"] == [] + assert report["uncertainty"] == [] + assert report["out_of_scope_edits"] == [] + # No timestamp in the pure builder output — the writer stamps it. 
+ assert "timestamp" not in report + + +def test_build_normalizes_verdict(): + assert build_verification_report(verdict="APPROVED")["verdict"] == "approved" + assert build_verification_report(verdict=" Rejected ")["verdict"] == "rejected" + assert build_verification_report(verdict="weird")["verdict"] == "error" + assert build_verification_report(verdict=None)["verdict"] == "error" + + +def test_build_clamps_confidence(): + assert build_verification_report(verdict="approved", confidence=1.5)["confidence"] == pytest.approx(1.0) + assert build_verification_report(verdict="approved", confidence=-0.3)["confidence"] == pytest.approx(0.0) + assert build_verification_report(verdict="approved", confidence=0.42)["confidence"] == pytest.approx(0.42) + # Non-numeric confidence degrades to None rather than raising. + assert build_verification_report(verdict="approved", confidence="oops")["confidence"] is None + + +def test_build_does_not_mutate_caller_collections(): + issues = [{"title": "x"}] + report = build_verification_report(verdict="rejected", issues=issues) + report["issues"].append({"title": "y"}) + assert issues == [{"title": "x"}] + + +def test_build_rounds_duration(): + report = build_verification_report(verdict="approved", duration_seconds=12.3456) + assert report["duration_seconds"] == pytest.approx(12.35) + + +def test_save_and_load_round_trip(tmp_path): + manager = ArtifactManager(spec_dir=tmp_path) + report = build_verification_report( + verdict="rejected", + qa_session=1, + iteration=2, + tests_run={"passed": 10, "failed": 1, "total": 11}, + issues=[{"title": "boom", "type": "error"}], + ) + + path = manager.save_verification_report(report) + assert path is not None + assert path.name == VERIFICATION_REPORT_FILENAME + + loaded = manager.load_artifact(VERIFICATION_REPORT_FILENAME) + assert loaded is not None + assert loaded["verdict"] == "rejected" + assert loaded["iteration"] == 2 + assert loaded["tests_run"]["failed"] == 1 + assert loaded["issues"][0]["title"] 
== "boom" + # Writer stamps a timestamp when the caller omits one. + assert "timestamp" in loaded + + +def test_save_disabled_returns_none(tmp_path): + manager = ArtifactManager(spec_dir=tmp_path, enabled=False) + report = build_verification_report(verdict="approved") + assert manager.save_verification_report(report) is None