selftune-dev · WellDunDun · Mar 9, 2026 · Mar 9, 2026 · Mar 9, 2026 · Mar 9, 2026
@@ -366,6 +366,7 @@ export async function cliMain(): Promise<void> {
     options: {
       skill: { type: "string" },
       output: { type: "string" },
+      out: { type: "string" },
       max: { type: "string", default: "50" },
       seed: { type: "string", default: "42" },
       "list-skills": { type: "boolean", default: false },
@@ -409,7 +410,7 @@ export async function cliMain(): Promise<void> {
       modelFlag: values.model,
     });
 
-    const outputPath = values.output ?? `${values.skill}_trigger_eval.json`;
+    const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
     writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
 
     const pos = evalSet.filter((e) => e.should_trigger);
@@ -475,7 +476,7 @@ export async function cliMain(): Promise<void> {
     annotateTaxonomy,
   );
 
-  const outputPath = values.output ?? `${values.skill}_trigger_eval.json`;
+  const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
   writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
   printEvalStats(evalSet, values.skill, outputPath, skillRecords, queryRecords, annotateTaxonomy);
 }

@@ -101,6 +101,7 @@ export interface EvolveDeps {
   buildEvalSet?: typeof import("../eval/hooks-to-evals.js").buildEvalSet;
   updateContextAfterEvolve?: typeof import("../memory/writer.js").updateContextAfterEvolve;
   measureBaseline?: typeof import("../eval/baseline.js").measureBaseline;
+  readSkillUsageLog?: () => SkillUsageRecord[];
 }
 
 // ---------------------------------------------------------------------------
@@ -151,6 +152,8 @@ export async function evolve(
   const _buildEvalSet = _deps.buildEvalSet ?? buildEvalSet;
   const _updateContextAfterEvolve = _deps.updateContextAfterEvolve ?? updateContextAfterEvolve;
   const _measureBaseline = _deps.measureBaseline ?? measureBaseline;
+  const _readSkillUsageLog =
+    _deps.readSkillUsageLog ?? (() => readJsonl<SkillUsageRecord>(SKILL_LOG));
 
   const auditEntries: EvolutionAuditEntry[] = [];
 
@@ -217,8 +220,32 @@ export async function evolve(
     let evalSet: EvalEntry[];
 
     if (evalSetPath && existsSync(evalSetPath)) {
-      const raw = readFileSync(evalSetPath, "utf-8");
-      evalSet = JSON.parse(raw) as EvalEntry[];
+      try {
+        const raw = readFileSync(evalSetPath, "utf-8");
+        evalSet = JSON.parse(raw) as EvalEntry[];
+      } catch (parseErr) {
+        const msg = parseErr instanceof Error ? parseErr.message : String(parseErr);
+        tui.fail(`Failed to load eval set from ${evalSetPath}: ${msg}`);
+        finishTui();
+        return withStats({
+          proposal: null,
+          validation: null,
+          deployed: false,
+          auditEntries,
+          reason: `Failed to load eval set: ${msg}`,
+        });
+      }
+      if (!Array.isArray(evalSet)) {
+        tui.fail(`Eval set at ${evalSetPath} is not an array`);
+        finishTui();
+        return withStats({
+          proposal: null,
+          validation: null,
+          deployed: false,
+          auditEntries,
+          reason: `Eval set at ${evalSetPath} is not a JSON array`,
+        });
+      }
     } else {
       // Build from logs
       const skillRecords = readJsonl<SkillUsageRecord>(SKILL_LOG);
@@ -233,7 +260,7 @@ export async function evolve(
     // -----------------------------------------------------------------------
     // Step 3: Load skill usage records
     // -----------------------------------------------------------------------
-    const skillUsage = readJsonl<SkillUsageRecord>(SKILL_LOG);
+    const skillUsage = _readSkillUsageLog();
 
     // -----------------------------------------------------------------------
     // Step 4: Extract failure patterns
@@ -251,17 +278,38 @@ export async function evolve(
     );
 
     // -----------------------------------------------------------------------
-    // Step 5: Early exit if no patterns
+    // Step 5: Cold-start bootstrap or early exit if no patterns
     // -----------------------------------------------------------------------
     if (failurePatterns.length === 0) {
-      finishTui();
-      return withStats({
-        proposal: null,
-        validation: null,
-        deployed: false,
-        auditEntries,
-        reason: "No failure patterns found",
-      });
+      // Cold-start: if the eval set has positive entries that the skill should
+      // match but there are no usage records for this skill, treat the positive
+      // eval entries themselves as "missed queries" — they ARE the failure signal.
+      const positiveEvals = evalSet.filter((e) => e.should_trigger);
+      const hasSkillUsageHistory = skillUsage.some((record) => record.skill_name === skillName);
+      if (positiveEvals.length > 0 && !hasSkillUsageHistory) {
+        const coldStartPattern: FailurePattern = {
+          pattern_id: `fp-${skillName}-coldstart`,
+          skill_name: skillName,
+          invocation_type: "implicit",
+          missed_queries: positiveEvals.map((e) => e.query),
+          frequency: positiveEvals.length,
+          sample_sessions: [],
+          extracted_at: new Date().toISOString(),
+        };
+        failurePatterns.push(coldStartPattern);
+        tui.done(
+          `Cold-start bootstrap: ${positiveEvals.length} positive eval entries used as missed queries`,
+        );
+      } else {
+        finishTui();
+        return withStats({
+          proposal: null,
+          validation: null,
+          deployed: false,
+          auditEntries,
+          reason: "No failure patterns found",
+        });
+      }
     }
 
     // -----------------------------------------------------------------------

@@ -13,7 +13,8 @@
  */
 
 import { describe, expect, mock, test } from "bun:test";
-import { readFileSync } from "node:fs";
+import { copyFileSync, mkdtempSync, readFileSync, rmSync } from "node:fs";
+import { tmpdir } from "node:os";
 import { join } from "node:path";
 
 import { type EvolveDeps, evolve } from "../../cli/selftune/evolution/evolve.js";
@@ -283,127 +284,137 @@ describe("Blog Proof: seo-audit skill evolution", () => {
   test("evolve pipeline runs end-to-end with seo-audit fixtures", async () => {
     const before = computeAccuracy(simulateOriginalTrigger);
     const after = computeAccuracy(simulateImprovedTrigger);
+    const tempDir = mkdtempSync(join(tmpdir(), "selftune-blog-proof-"));
+    const tempSkillPath = join(tempDir, "SKILL.md");
+    copyFileSync(SKILL_PATH, tempSkillPath);
+
+    try {
+      // Build realistic failure patterns from the actual missed queries
+      const failurePatterns: FailurePattern[] = [
+        {
+          pattern_id: "fp-seo-audit-0",
+          skill_name: "seo-audit",
+          invocation_type: "implicit",
+          missed_queries: before.false_negatives
+            .filter((e) => e.invocation_type === "implicit")
+            .map((e) => e.query),
+          frequency: before.false_negatives.filter((e) => e.invocation_type === "implicit").length,
+          sample_sessions: [],
+          extracted_at: new Date().toISOString(),
+        },
+        {
+          pattern_id: "fp-seo-audit-1",
+          skill_name: "seo-audit",
+          invocation_type: "contextual",
+          missed_queries: before.false_negatives
+            .filter((e) => e.invocation_type === "contextual")
+            .map((e) => e.query),
+          frequency: before.false_negatives.filter((e) => e.invocation_type === "contextual")
+            .length,
+          sample_sessions: [],
+          extracted_at: new Date().toISOString(),
+        },
+      ].filter((p) => p.frequency > 0);
 
-    // Build realistic failure patterns from the actual missed queries
-    const failurePatterns: FailurePattern[] = [
-      {
-        pattern_id: "fp-seo-audit-0",
-        skill_name: "seo-audit",
-        invocation_type: "implicit",
-        missed_queries: before.false_negatives
-          .filter((e) => e.invocation_type === "implicit")
-          .map((e) => e.query),
-        frequency: before.false_negatives.filter((e) => e.invocation_type === "implicit").length,
-        sample_sessions: [],
-        extracted_at: new Date().toISOString(),
-      },
-      {
-        pattern_id: "fp-seo-audit-1",
+      const proposal: EvolutionProposal = {
+        proposal_id: "evo-seo-audit-blog-proof",
         skill_name: "seo-audit",
-        invocation_type: "contextual",
-        missed_queries: before.false_negatives
-          .filter((e) => e.invocation_type === "contextual")
-          .map((e) => e.query),
-        frequency: before.false_negatives.filter((e) => e.invocation_type === "contextual").length,
-        sample_sessions: [],
-        extracted_at: new Date().toISOString(),
-      },
-    ].filter((p) => p.frequency > 0);
-
-    const proposal: EvolutionProposal = {
-      proposal_id: "evo-seo-audit-blog-proof",
-      skill_name: "seo-audit",
-      skill_path: SKILL_PATH,
-      original_description: skillContent,
-      proposed_description: `${skillContent}\n\n<!-- selftune: expanded trigger coverage for symptom-based queries, migration diagnostics, and standalone technical signals -->`,
-      rationale: `Detected ${before.false_negatives.length} missed triggers across implicit and contextual invocations. Users describe symptoms (traffic drops, slow loads, indexing problems) without using "SEO audit" keywords. Expanded description to cover symptom-based queries, migration diagnostics, and standalone technical signals.`,
-      failure_patterns: failurePatterns.map((p) => p.pattern_id),
-      eval_results: {
-        before: {
-          total: before.total,
-          passed: before.passed,
-          failed: before.total - before.passed,
-          pass_rate: before.pass_rate,
+        skill_path: tempSkillPath,
+        original_description: skillContent,
+        proposed_description: `${skillContent}\n\n<!-- selftune: expanded trigger coverage for symptom-based queries, migration diagnostics, and standalone technical signals -->`,
+        rationale: `Detected ${before.false_negatives.length} missed triggers across implicit and contextual invocations. Users describe symptoms (traffic drops, slow loads, indexing problems) without using "SEO audit" keywords. Expanded description to cover symptom-based queries, migration diagnostics, and standalone technical signals.`,
+        failure_patterns: failurePatterns.map((p) => p.pattern_id),
+        eval_results: {
+          before: {
+            total: before.total,
+            passed: before.passed,
+            failed: before.total - before.passed,
+            pass_rate: before.pass_rate,
+          },
+          after: {
+            total: after.total,
+            passed: after.passed,
+            failed: after.total - after.passed,
+            pass_rate: after.pass_rate,
+          },
         },
-        after: {
-          total: after.total,
-          passed: after.passed,
-          failed: after.total - after.passed,
-          pass_rate: after.pass_rate,
+        confidence: 0.85,
+        created_at: new Date().toISOString(),
+        status: "pending",
+      };
+
+      const validationResult: ValidationResult = {
+        proposal_id: proposal.proposal_id,
+        before_pass_rate: before.pass_rate,
+        after_pass_rate: after.pass_rate,
+        improved: true,
+        regressions: [],
+        new_passes: before.false_negatives.filter(
+          (fn) => !after.false_negatives.some((afn) => afn.query === fn.query),
+        ),
+        net_change: after.pass_rate - before.pass_rate,
+      };
+
+      // Injectable deps — deterministic, no LLM calls
+      const deps: EvolveDeps = {
+        extractFailurePatterns: mock(() => failurePatterns),
+        generateProposal: mock(async () => proposal),
+        validateProposal: mock(async () => validationResult),
+        appendAuditEntry: mock(() => {}),
+        buildEvalSet: mock(() => evalSet),
+        updateContextAfterEvolve: mock(() => {}),
+      };
+
+      const result = await evolve(
+        {
+          skillName: "seo-audit",
+          skillPath: tempSkillPath,
+          evalSetPath: EVAL_SET_PATH,
+          agent: "claude",
+          dryRun: false,
+          confidenceThreshold: 0.6,
+          maxIterations: 3,
         },
-      },
-      confidence: 0.85,
-      created_at: new Date().toISOString(),
-      status: "pending",
-    };
-
-    const validationResult: ValidationResult = {
-      proposal_id: proposal.proposal_id,
-      before_pass_rate: before.pass_rate,
-      after_pass_rate: after.pass_rate,
-      improved: true,
-      regressions: [],
-      new_passes: before.false_negatives.filter(
-        (fn) => !after.false_negatives.some((afn) => afn.query === fn.query),
-      ),
-      net_change: after.pass_rate - before.pass_rate,
-    };
-
-    // Injectable deps — deterministic, no LLM calls
-    const deps: EvolveDeps = {
-      extractFailurePatterns: mock(() => failurePatterns),
-      generateProposal: mock(async () => proposal),
-      validateProposal: mock(async () => validationResult),
-      appendAuditEntry: mock(() => {}),
-      buildEvalSet: mock(() => evalSet),
-      updateContextAfterEvolve: mock(() => {}),
-    };
-
-    const result = await evolve(
-      {
-        skillName: "seo-audit",
-        skillPath: SKILL_PATH,
-        evalSetPath: EVAL_SET_PATH,
-        agent: "claude",
-        dryRun: false,
-        confidenceThreshold: 0.6,
-        maxIterations: 3,
-      },
-      deps,
-    );
+        deps,
+      );
 
-    // Pipeline completed successfully
-    expect(result.deployed).toBe(true);
-    expect(result.proposal).not.toBeNull();
-    expect(result.validation).not.toBeNull();
-    expect(result.reason).toBe("Evolution deployed successfully");
-
-    // Validation shows improvement
-    expect(result.validation?.improved).toBe(true);
-    expect(result.validation?.after_pass_rate).toBeGreaterThan(result.validation?.before_pass_rate);
-    expect(result.validation?.regressions.length).toBe(0);
-
-    // Audit trail recorded
-    expect(result.auditEntries.length).toBeGreaterThanOrEqual(2);
-    expect(result.auditEntries.some((e) => e.action === "created")).toBe(true);
-    expect(result.auditEntries.some((e) => e.action === "validated")).toBe(true);
-    expect(result.auditEntries.some((e) => e.action === "deployed")).toBe(true);
-
-    // Print the blog-ready numbers
-    const missedFixed = result.validation?.new_passes.length;
-    console.log(`\n  ══════════════════════════════════════════════`);
-    console.log(`  BLOG PROOF DATA (seo-audit skill)`);
-    console.log(`  ══════════════════════════════════════════════`);
-    console.log(`  Skill:            seo-audit (marketingskills, 11.2k ★)`);
-    console.log(
-      `  Eval set:         ${evalSet.length} queries (${positiveQueries.length} positive, ${negativeQueries.length} negative)`,
-    );
-    console.log(`  Before accuracy:  ${(result.validation?.before_pass_rate * 100).toFixed(1)}%`);
-    console.log(`  After accuracy:   ${(result.validation?.after_pass_rate * 100).toFixed(1)}%`);
-    console.log(`  Missed triggers fixed: ${missedFixed}`);
-    console.log(`  Regressions:      ${result.validation?.regressions.length}`);
-    console.log(`  Confidence:       ${result.proposal?.confidence}`);
-    console.log(`  ══════════════════════════════════════════════`);
+      // Pipeline completed successfully
+      expect(result.deployed).toBe(true);
+      expect(result.proposal).not.toBeNull();
+      expect(result.validation).not.toBeNull();
+      expect(result.reason).toBe("Evolution deployed successfully");
+
+      // Validation shows improvement
+      expect(result.validation?.improved).toBe(true);
+      expect(result.validation?.after_pass_rate).toBeGreaterThan(
+        result.validation?.before_pass_rate,
+      );
+      expect(result.validation?.regressions.length).toBe(0);
+
+      // Audit trail recorded
+      expect(result.auditEntries.length).toBeGreaterThanOrEqual(2);
+      expect(result.auditEntries.some((e) => e.action === "created")).toBe(true);
+      expect(result.auditEntries.some((e) => e.action === "validated")).toBe(true);
+      expect(result.auditEntries.some((e) => e.action === "deployed")).toBe(true);
+
+      // Print the blog-ready numbers
+      const missedFixed = result.validation?.new_passes.length;
+      console.log(`\n  ══════════════════════════════════════════════`);
+      console.log(`  BLOG PROOF DATA (seo-audit skill)`);
+      console.log(`  ══════════════════════════════════════════════`);
+      console.log(`  Skill:            seo-audit (marketingskills, 11.2k ★)`);
+      console.log(
+        `  Eval set:         ${evalSet.length} queries (${positiveQueries.length} positive, ${negativeQueries.length} negative)`,
+      );
+      console.log(`  Before accuracy:  ${(result.validation?.before_pass_rate * 100).toFixed(1)}%`);
+      console.log(`  After accuracy:   ${(result.validation?.after_pass_rate * 100).toFixed(1)}%`);
+      console.log(`  Missed triggers fixed: ${missedFixed}`);
+      console.log(`  Regressions:      ${result.validation?.regressions.length}`);
+      console.log(`  Confidence:       ${result.proposal?.confidence}`);
+      console.log(`  ══════════════════════════════════════════════`);
+    } finally {
+      rmSync(tempDir, { recursive: true, force: true });
+    }
   });
 });