diff --git a/cli/selftune/eval/hooks-to-evals.ts b/cli/selftune/eval/hooks-to-evals.ts index 01a679b..34d1727 100644 --- a/cli/selftune/eval/hooks-to-evals.ts +++ b/cli/selftune/eval/hooks-to-evals.ts @@ -366,6 +366,7 @@ export async function cliMain(): Promise { options: { skill: { type: "string" }, output: { type: "string" }, + out: { type: "string" }, max: { type: "string", default: "50" }, seed: { type: "string", default: "42" }, "list-skills": { type: "boolean", default: false }, @@ -409,7 +410,7 @@ export async function cliMain(): Promise { modelFlag: values.model, }); - const outputPath = values.output ?? `${values.skill}_trigger_eval.json`; + const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`; writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8"); const pos = evalSet.filter((e) => e.should_trigger); @@ -475,7 +476,7 @@ export async function cliMain(): Promise { annotateTaxonomy, ); - const outputPath = values.output ?? `${values.skill}_trigger_eval.json`; + const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`; writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8"); printEvalStats(evalSet, values.skill, outputPath, skillRecords, queryRecords, annotateTaxonomy); } diff --git a/cli/selftune/evolution/evolve.ts b/cli/selftune/evolution/evolve.ts index bea9da0..1dfa000 100644 --- a/cli/selftune/evolution/evolve.ts +++ b/cli/selftune/evolution/evolve.ts @@ -101,6 +101,7 @@ export interface EvolveDeps { buildEvalSet?: typeof import("../eval/hooks-to-evals.js").buildEvalSet; updateContextAfterEvolve?: typeof import("../memory/writer.js").updateContextAfterEvolve; measureBaseline?: typeof import("../eval/baseline.js").measureBaseline; + readSkillUsageLog?: () => SkillUsageRecord[]; } // --------------------------------------------------------------------------- @@ -151,6 +152,8 @@ export async function evolve( const _buildEvalSet = _deps.buildEvalSet ?? buildEvalSet; const _updateContextAfterEvolve = _deps.updateContextAfterEvolve ?? updateContextAfterEvolve; const _measureBaseline = _deps.measureBaseline ?? measureBaseline; + const _readSkillUsageLog = + _deps.readSkillUsageLog ?? (() => readJsonl(SKILL_LOG)); const auditEntries: EvolutionAuditEntry[] = []; @@ -217,8 +220,32 @@ export async function evolve( let evalSet: EvalEntry[]; if (evalSetPath && existsSync(evalSetPath)) { - const raw = readFileSync(evalSetPath, "utf-8"); - evalSet = JSON.parse(raw) as EvalEntry[]; + try { + const raw = readFileSync(evalSetPath, "utf-8"); + evalSet = JSON.parse(raw) as EvalEntry[]; + } catch (parseErr) { + const msg = parseErr instanceof Error ? parseErr.message : String(parseErr); + tui.fail(`Failed to load eval set from ${evalSetPath}: ${msg}`); + finishTui(); + return withStats({ + proposal: null, + validation: null, + deployed: false, + auditEntries, + reason: `Failed to load eval set: ${msg}`, + }); + } + if (!Array.isArray(evalSet)) { + tui.fail(`Eval set at ${evalSetPath} is not an array`); + finishTui(); + return withStats({ + proposal: null, + validation: null, + deployed: false, + auditEntries, + reason: `Eval set at ${evalSetPath} is not a JSON array`, + }); + } } else { // Build from logs const skillRecords = readJsonl(SKILL_LOG); @@ -233,7 +260,7 @@ export async function evolve( // ----------------------------------------------------------------------- // Step 3: Load skill usage records // ----------------------------------------------------------------------- - const skillUsage = readJsonl(SKILL_LOG); + const skillUsage = _readSkillUsageLog(); // ----------------------------------------------------------------------- // Step 4: Extract failure patterns @@ -251,17 +278,38 @@ export async function evolve( ); // ----------------------------------------------------------------------- - // Step 5: Early exit if no patterns + // Step 5: Cold-start bootstrap or early exit if no patterns // ----------------------------------------------------------------------- if (failurePatterns.length === 0) { - finishTui(); - return withStats({ - proposal: null, - validation: null, - deployed: false, - auditEntries, - reason: "No failure patterns found", - }); + // Cold-start: if the eval set has positive entries that the skill should + // match but there are zero skill usage records, treat the positive eval + // entries themselves as "missed queries" — they ARE the failure signal. + const positiveEvals = evalSet.filter((e) => e.should_trigger); + const hasSkillUsageHistory = skillUsage.some((record) => record.skill_name === skillName); + if (positiveEvals.length > 0 && !hasSkillUsageHistory) { + const coldStartPattern: FailurePattern = { + pattern_id: `fp-${skillName}-coldstart`, + skill_name: skillName, + invocation_type: "implicit", + missed_queries: positiveEvals.map((e) => e.query), + frequency: positiveEvals.length, + sample_sessions: [], + extracted_at: new Date().toISOString(), + }; + failurePatterns.push(coldStartPattern); + tui.done( + `Cold-start bootstrap: ${positiveEvals.length} positive eval entries used as missed queries`, + ); + } else { + finishTui(); + return withStats({ + proposal: null, + validation: null, + deployed: false, + auditEntries, + reason: "No failure patterns found", + }); + } } // ----------------------------------------------------------------------- diff --git a/tests/blog-proof/seo-audit-evolve.test.ts b/tests/blog-proof/seo-audit-evolve.test.ts index 520440c..7f2a226 100644 --- a/tests/blog-proof/seo-audit-evolve.test.ts +++ b/tests/blog-proof/seo-audit-evolve.test.ts @@ -13,7 +13,8 @@ */ import { describe, expect, mock, test } from "bun:test"; -import { readFileSync } from "node:fs"; +import { copyFileSync, mkdtempSync, readFileSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; import { join } from "node:path"; import { type EvolveDeps, evolve } from "../../cli/selftune/evolution/evolve.js"; @@ -283,127 +284,137 @@ describe("Blog Proof: seo-audit skill evolution", () => { test("evolve pipeline runs end-to-end with seo-audit fixtures", async () => { const before = computeAccuracy(simulateOriginalTrigger); const after = computeAccuracy(simulateImprovedTrigger); + const tempDir = mkdtempSync(join(tmpdir(), "selftune-blog-proof-")); + const tempSkillPath = join(tempDir, "SKILL.md"); + copyFileSync(SKILL_PATH, tempSkillPath); + + try { + // Build realistic failure patterns from the actual missed queries + const failurePatterns: FailurePattern[] = [ + { + pattern_id: "fp-seo-audit-0", + skill_name: "seo-audit", + invocation_type: "implicit", + missed_queries: before.false_negatives + .filter((e) => e.invocation_type === "implicit") + .map((e) => e.query), + frequency: before.false_negatives.filter((e) => e.invocation_type === "implicit").length, + sample_sessions: [], + extracted_at: new Date().toISOString(), + }, + { + pattern_id: "fp-seo-audit-1", + skill_name: "seo-audit", + invocation_type: "contextual", + missed_queries: before.false_negatives + .filter((e) => e.invocation_type === "contextual") + .map((e) => e.query), + frequency: before.false_negatives.filter((e) => e.invocation_type === "contextual") + .length, + sample_sessions: [], + extracted_at: new Date().toISOString(), + }, + ].filter((p) => p.frequency > 0); - // Build realistic failure patterns from the actual missed queries - const failurePatterns: FailurePattern[] = [ - { - pattern_id: "fp-seo-audit-0", - skill_name: "seo-audit", - invocation_type: "implicit", - missed_queries: before.false_negatives - .filter((e) => e.invocation_type === "implicit") - .map((e) => e.query), - frequency: before.false_negatives.filter((e) => e.invocation_type === "implicit").length, - sample_sessions: [], - extracted_at: new Date().toISOString(), - }, - { - pattern_id: "fp-seo-audit-1", + const proposal: EvolutionProposal = { + proposal_id: "evo-seo-audit-blog-proof", skill_name: "seo-audit", - invocation_type: "contextual", - missed_queries: before.false_negatives - .filter((e) => e.invocation_type === "contextual") - .map((e) => e.query), - frequency: before.false_negatives.filter((e) => e.invocation_type === "contextual").length, - sample_sessions: [], - extracted_at: new Date().toISOString(), - }, - ].filter((p) => p.frequency > 0); - - const proposal: EvolutionProposal = { - proposal_id: "evo-seo-audit-blog-proof", - skill_name: "seo-audit", - skill_path: SKILL_PATH, - original_description: skillContent, - proposed_description: `${skillContent}\n\n`, - rationale: `Detected ${before.false_negatives.length} missed triggers across implicit and contextual invocations. Users describe symptoms (traffic drops, slow loads, indexing problems) without using "SEO audit" keywords. Expanded description to cover symptom-based queries, migration diagnostics, and standalone technical signals.`, - failure_patterns: failurePatterns.map((p) => p.pattern_id), - eval_results: { - before: { - total: before.total, - passed: before.passed, - failed: before.total - before.passed, - pass_rate: before.pass_rate, + skill_path: tempSkillPath, + original_description: skillContent, + proposed_description: `${skillContent}\n\n`, + rationale: `Detected ${before.false_negatives.length} missed triggers across implicit and contextual invocations. Users describe symptoms (traffic drops, slow loads, indexing problems) without using "SEO audit" keywords. Expanded description to cover symptom-based queries, migration diagnostics, and standalone technical signals.`, + failure_patterns: failurePatterns.map((p) => p.pattern_id), + eval_results: { + before: { + total: before.total, + passed: before.passed, + failed: before.total - before.passed, + pass_rate: before.pass_rate, + }, + after: { + total: after.total, + passed: after.passed, + failed: after.total - after.passed, + pass_rate: after.pass_rate, + }, }, - after: { - total: after.total, - passed: after.passed, - failed: after.total - after.passed, - pass_rate: after.pass_rate, + confidence: 0.85, + created_at: new Date().toISOString(), + status: "pending", + }; + + const validationResult: ValidationResult = { + proposal_id: proposal.proposal_id, + before_pass_rate: before.pass_rate, + after_pass_rate: after.pass_rate, + improved: true, + regressions: [], + new_passes: before.false_negatives.filter( + (fn) => !after.false_negatives.some((afn) => afn.query === fn.query), + ), + net_change: after.pass_rate - before.pass_rate, + }; + + // Injectable deps — deterministic, no LLM calls + const deps: EvolveDeps = { + extractFailurePatterns: mock(() => failurePatterns), + generateProposal: mock(async () => proposal), + validateProposal: mock(async () => validationResult), + appendAuditEntry: mock(() => {}), + buildEvalSet: mock(() => evalSet), + updateContextAfterEvolve: mock(() => {}), + }; + + const result = await evolve( + { + skillName: "seo-audit", + skillPath: tempSkillPath, + evalSetPath: EVAL_SET_PATH, + agent: "claude", + dryRun: false, + confidenceThreshold: 0.6, + maxIterations: 3, }, - }, - confidence: 0.85, - created_at: new Date().toISOString(), - status: "pending", - }; - - const validationResult: ValidationResult = { - proposal_id: proposal.proposal_id, - before_pass_rate: before.pass_rate, - after_pass_rate: after.pass_rate, - improved: true, - regressions: [], - new_passes: before.false_negatives.filter( - (fn) => !after.false_negatives.some((afn) => afn.query === fn.query), - ), - net_change: after.pass_rate - before.pass_rate, - }; - - // Injectable deps — deterministic, no LLM calls - const deps: EvolveDeps = { - extractFailurePatterns: mock(() => failurePatterns), - generateProposal: mock(async () => proposal), - validateProposal: mock(async () => validationResult), - appendAuditEntry: mock(() => {}), - buildEvalSet: mock(() => evalSet), - updateContextAfterEvolve: mock(() => {}), - }; - - const result = await evolve( - { - skillName: "seo-audit", - skillPath: SKILL_PATH, - evalSetPath: EVAL_SET_PATH, - agent: "claude", - dryRun: false, - confidenceThreshold: 0.6, - maxIterations: 3, - }, - deps, - ); + deps, + ); - // Pipeline completed successfully - expect(result.deployed).toBe(true); - expect(result.proposal).not.toBeNull(); - expect(result.validation).not.toBeNull(); - expect(result.reason).toBe("Evolution deployed successfully"); - - // Validation shows improvement - expect(result.validation?.improved).toBe(true); - expect(result.validation?.after_pass_rate).toBeGreaterThan(result.validation?.before_pass_rate); - expect(result.validation?.regressions.length).toBe(0); - - // Audit trail recorded - expect(result.auditEntries.length).toBeGreaterThanOrEqual(2); - expect(result.auditEntries.some((e) => e.action === "created")).toBe(true); - expect(result.auditEntries.some((e) => e.action === "validated")).toBe(true); - expect(result.auditEntries.some((e) => e.action === "deployed")).toBe(true); - - // Print the blog-ready numbers - const missedFixed = result.validation?.new_passes.length; - console.log(`\n ══════════════════════════════════════════════`); - console.log(` BLOG PROOF DATA (seo-audit skill)`); - console.log(` ══════════════════════════════════════════════`); - console.log(` Skill: seo-audit (marketingskills, 11.2k ★)`); - console.log( - ` Eval set: ${evalSet.length} queries (${positiveQueries.length} positive, ${negativeQueries.length} negative)`, - ); - console.log(` Before accuracy: ${(result.validation?.before_pass_rate * 100).toFixed(1)}%`); - console.log(` After accuracy: ${(result.validation?.after_pass_rate * 100).toFixed(1)}%`); - console.log(` Missed triggers fixed: ${missedFixed}`); - console.log(` Regressions: ${result.validation?.regressions.length}`); - console.log(` Confidence: ${result.proposal?.confidence}`); - console.log(` ══════════════════════════════════════════════`); + // Pipeline completed successfully + expect(result.deployed).toBe(true); + expect(result.proposal).not.toBeNull(); + expect(result.validation).not.toBeNull(); + expect(result.reason).toBe("Evolution deployed successfully"); + + // Validation shows improvement + expect(result.validation?.improved).toBe(true); + expect(result.validation?.after_pass_rate).toBeGreaterThan( + result.validation?.before_pass_rate, + ); + expect(result.validation?.regressions.length).toBe(0); + + // Audit trail recorded + expect(result.auditEntries.length).toBeGreaterThanOrEqual(2); + expect(result.auditEntries.some((e) => e.action === "created")).toBe(true); + expect(result.auditEntries.some((e) => e.action === "validated")).toBe(true); + expect(result.auditEntries.some((e) => e.action === "deployed")).toBe(true); + + // Print the blog-ready numbers + const missedFixed = result.validation?.new_passes.length; + console.log(`\n ══════════════════════════════════════════════`); + console.log(` BLOG PROOF DATA (seo-audit skill)`); + console.log(` ══════════════════════════════════════════════`); + console.log(` Skill: seo-audit (marketingskills, 11.2k ★)`); + console.log( + ` Eval set: ${evalSet.length} queries (${positiveQueries.length} positive, ${negativeQueries.length} negative)`, + ); + console.log(` Before accuracy: ${(result.validation?.before_pass_rate * 100).toFixed(1)}%`); + console.log(` After accuracy: ${(result.validation?.after_pass_rate * 100).toFixed(1)}%`); + console.log(` Missed triggers fixed: ${missedFixed}`); + console.log(` Regressions: ${result.validation?.regressions.length}`); + console.log(` Confidence: ${result.proposal?.confidence}`); + console.log(` ══════════════════════════════════════════════`); + } finally { + rmSync(tempDir, { recursive: true, force: true }); + } }); }); diff --git a/tests/evolution/evolve.test.ts b/tests/evolution/evolve.test.ts index a37b82d..cabd34b 100644 --- a/tests/evolution/evolve.test.ts +++ b/tests/evolution/evolve.test.ts @@ -136,6 +136,7 @@ function makeDeps(): EvolveDeps { gateValidateProposal: mockGateValidateProposal, appendAuditEntry: mockAppendAuditEntry, buildEvalSet: mockBuildEvalSet, + readSkillUsageLog: () => [], }; } @@ -249,9 +250,11 @@ describe("evolve orchestrator", () => { expect(deployedCalls.length).toBe(0); }); - // 2. No failure patterns -> early exit with clear reason + // 2. No failure patterns and no positive evals -> early exit with clear reason test("no failure patterns returns early with clear reason", async () => { mockExtractFailurePatterns.mockImplementation(() => []); + // Use an eval set with only negatives so cold-start bootstrap doesn't apply + mockBuildEvalSet.mockImplementation(() => [{ query: "unrelated", should_trigger: false }]); const opts = makeOptions(); const result = await evolve(opts, makeDeps()); @@ -265,6 +268,46 @@ describe("evolve orchestrator", () => { expect(mockGenerateProposal.mock.calls.length).toBe(0); }); + // 2b. Cold-start bootstrap: no failure patterns + no usage history + positive evals -> proposal + test("cold-start bootstrap uses positive evals as missed queries only for unused skills", async () => { + mockExtractFailurePatterns.mockImplementation(() => []); + + const opts = makeOptions({ dryRun: true }); + const result = await evolve(opts, makeDeps()); + + // Should proceed to proposal generation instead of early exit + expect(result.proposal).not.toBeNull(); + expect(result.validation).not.toBeNull(); + expect(result.deployed).toBe(false); + expect(result.reason.toLowerCase()).toContain("dry"); + }); + + test("does not cold-start bootstrap when the skill already has usage history", async () => { + mockExtractFailurePatterns.mockImplementation(() => []); + + const opts = makeOptions(); + const result = await evolve(opts, { + ...makeDeps(), + readSkillUsageLog: () => [ + { + timestamp: new Date().toISOString(), + session_id: "sess-existing", + skill_name: "test-skill", + skill_path: opts.skillPath, + query: "test query", + triggered: true, + source: "test", + }, + ], + }); + + expect(result.proposal).toBeNull(); + expect(result.validation).toBeNull(); + expect(result.deployed).toBe(false); + expect(result.reason).toBe("No failure patterns found"); + expect(mockGenerateProposal.mock.calls.length).toBe(0); + }); + // 3. Low confidence -> rejected with reason test("low confidence proposal is rejected", async () => { mockGenerateProposal.mockImplementation(async () => makeProposal({ confidence: 0.3 }));