Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions cli/selftune/eval/hooks-to-evals.ts
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,7 @@ export async function cliMain(): Promise<void> {
options: {
skill: { type: "string" },
output: { type: "string" },
out: { type: "string" },
max: { type: "string", default: "50" },
seed: { type: "string", default: "42" },
"list-skills": { type: "boolean", default: false },
Expand Down Expand Up @@ -409,7 +410,7 @@ export async function cliMain(): Promise<void> {
modelFlag: values.model,
});

const outputPath = values.output ?? `${values.skill}_trigger_eval.json`;
const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");

const pos = evalSet.filter((e) => e.should_trigger);
Expand Down Expand Up @@ -475,7 +476,7 @@ export async function cliMain(): Promise<void> {
annotateTaxonomy,
);

const outputPath = values.output ?? `${values.skill}_trigger_eval.json`;
const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
printEvalStats(evalSet, values.skill, outputPath, skillRecords, queryRecords, annotateTaxonomy);
}
Expand Down
72 changes: 60 additions & 12 deletions cli/selftune/evolution/evolve.ts
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ export interface EvolveDeps {
buildEvalSet?: typeof import("../eval/hooks-to-evals.js").buildEvalSet;
updateContextAfterEvolve?: typeof import("../memory/writer.js").updateContextAfterEvolve;
measureBaseline?: typeof import("../eval/baseline.js").measureBaseline;
readSkillUsageLog?: () => SkillUsageRecord[];
}

// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -151,6 +152,8 @@ export async function evolve(
const _buildEvalSet = _deps.buildEvalSet ?? buildEvalSet;
const _updateContextAfterEvolve = _deps.updateContextAfterEvolve ?? updateContextAfterEvolve;
const _measureBaseline = _deps.measureBaseline ?? measureBaseline;
const _readSkillUsageLog =
_deps.readSkillUsageLog ?? (() => readJsonl<SkillUsageRecord>(SKILL_LOG));

const auditEntries: EvolutionAuditEntry[] = [];

Expand Down Expand Up @@ -217,8 +220,32 @@ export async function evolve(
let evalSet: EvalEntry[];

if (evalSetPath && existsSync(evalSetPath)) {
const raw = readFileSync(evalSetPath, "utf-8");
evalSet = JSON.parse(raw) as EvalEntry[];
try {
const raw = readFileSync(evalSetPath, "utf-8");
evalSet = JSON.parse(raw) as EvalEntry[];
} catch (parseErr) {
const msg = parseErr instanceof Error ? parseErr.message : String(parseErr);
tui.fail(`Failed to load eval set from ${evalSetPath}: ${msg}`);
finishTui();
return withStats({
proposal: null,
validation: null,
deployed: false,
auditEntries,
reason: `Failed to load eval set: ${msg}`,
});
}
if (!Array.isArray(evalSet)) {
tui.fail(`Eval set at ${evalSetPath} is not an array`);
finishTui();
return withStats({
proposal: null,
validation: null,
deployed: false,
auditEntries,
reason: `Eval set at ${evalSetPath} is not a JSON array`,
});
}
} else {
// Build from logs
const skillRecords = readJsonl<SkillUsageRecord>(SKILL_LOG);
Expand All @@ -233,7 +260,7 @@ export async function evolve(
// -----------------------------------------------------------------------
// Step 3: Load skill usage records
// -----------------------------------------------------------------------
const skillUsage = readJsonl<SkillUsageRecord>(SKILL_LOG);
const skillUsage = _readSkillUsageLog();

// -----------------------------------------------------------------------
// Step 4: Extract failure patterns
Expand All @@ -251,17 +278,38 @@ export async function evolve(
);

// -----------------------------------------------------------------------
// Step 5: Early exit if no patterns
// Step 5: Cold-start bootstrap or early exit if no patterns
// -----------------------------------------------------------------------
if (failurePatterns.length === 0) {
finishTui();
return withStats({
proposal: null,
validation: null,
deployed: false,
auditEntries,
reason: "No failure patterns found",
});
// Cold-start: if the eval set has positive entries that the skill should
// match but there are zero skill usage records, treat the positive eval
// entries themselves as "missed queries" — they ARE the failure signal.
const positiveEvals = evalSet.filter((e) => e.should_trigger);
const hasSkillUsageHistory = skillUsage.some((record) => record.skill_name === skillName);
if (positiveEvals.length > 0 && !hasSkillUsageHistory) {
const coldStartPattern: FailurePattern = {
pattern_id: `fp-${skillName}-coldstart`,
skill_name: skillName,
invocation_type: "implicit",
missed_queries: positiveEvals.map((e) => e.query),
frequency: positiveEvals.length,
sample_sessions: [],
extracted_at: new Date().toISOString(),
};
failurePatterns.push(coldStartPattern);
tui.done(
`Cold-start bootstrap: ${positiveEvals.length} positive eval entries used as missed queries`,
);
} else {
finishTui();
return withStats({
proposal: null,
validation: null,
deployed: false,
auditEntries,
reason: "No failure patterns found",
});
}
}

// -----------------------------------------------------------------------
Expand Down
245 changes: 128 additions & 117 deletions tests/blog-proof/seo-audit-evolve.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
*/

import { describe, expect, mock, test } from "bun:test";
import { readFileSync } from "node:fs";
import { copyFileSync, mkdtempSync, readFileSync, rmSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";

import { type EvolveDeps, evolve } from "../../cli/selftune/evolution/evolve.js";
Expand Down Expand Up @@ -283,127 +284,137 @@ describe("Blog Proof: seo-audit skill evolution", () => {
test("evolve pipeline runs end-to-end with seo-audit fixtures", async () => {
const before = computeAccuracy(simulateOriginalTrigger);
const after = computeAccuracy(simulateImprovedTrigger);
const tempDir = mkdtempSync(join(tmpdir(), "selftune-blog-proof-"));
const tempSkillPath = join(tempDir, "SKILL.md");
copyFileSync(SKILL_PATH, tempSkillPath);

try {
// Build realistic failure patterns from the actual missed queries
const failurePatterns: FailurePattern[] = [
{
pattern_id: "fp-seo-audit-0",
skill_name: "seo-audit",
invocation_type: "implicit",
missed_queries: before.false_negatives
.filter((e) => e.invocation_type === "implicit")
.map((e) => e.query),
frequency: before.false_negatives.filter((e) => e.invocation_type === "implicit").length,
sample_sessions: [],
extracted_at: new Date().toISOString(),
},
{
pattern_id: "fp-seo-audit-1",
skill_name: "seo-audit",
invocation_type: "contextual",
missed_queries: before.false_negatives
.filter((e) => e.invocation_type === "contextual")
.map((e) => e.query),
frequency: before.false_negatives.filter((e) => e.invocation_type === "contextual")
.length,
sample_sessions: [],
extracted_at: new Date().toISOString(),
},
].filter((p) => p.frequency > 0);

// Build realistic failure patterns from the actual missed queries
const failurePatterns: FailurePattern[] = [
{
pattern_id: "fp-seo-audit-0",
skill_name: "seo-audit",
invocation_type: "implicit",
missed_queries: before.false_negatives
.filter((e) => e.invocation_type === "implicit")
.map((e) => e.query),
frequency: before.false_negatives.filter((e) => e.invocation_type === "implicit").length,
sample_sessions: [],
extracted_at: new Date().toISOString(),
},
{
pattern_id: "fp-seo-audit-1",
const proposal: EvolutionProposal = {
proposal_id: "evo-seo-audit-blog-proof",
skill_name: "seo-audit",
invocation_type: "contextual",
missed_queries: before.false_negatives
.filter((e) => e.invocation_type === "contextual")
.map((e) => e.query),
frequency: before.false_negatives.filter((e) => e.invocation_type === "contextual").length,
sample_sessions: [],
extracted_at: new Date().toISOString(),
},
].filter((p) => p.frequency > 0);

const proposal: EvolutionProposal = {
proposal_id: "evo-seo-audit-blog-proof",
skill_name: "seo-audit",
skill_path: SKILL_PATH,
original_description: skillContent,
proposed_description: `${skillContent}\n\n<!-- selftune: expanded trigger coverage for symptom-based queries, migration diagnostics, and standalone technical signals -->`,
rationale: `Detected ${before.false_negatives.length} missed triggers across implicit and contextual invocations. Users describe symptoms (traffic drops, slow loads, indexing problems) without using "SEO audit" keywords. Expanded description to cover symptom-based queries, migration diagnostics, and standalone technical signals.`,
failure_patterns: failurePatterns.map((p) => p.pattern_id),
eval_results: {
before: {
total: before.total,
passed: before.passed,
failed: before.total - before.passed,
pass_rate: before.pass_rate,
skill_path: tempSkillPath,
original_description: skillContent,
proposed_description: `${skillContent}\n\n<!-- selftune: expanded trigger coverage for symptom-based queries, migration diagnostics, and standalone technical signals -->`,
rationale: `Detected ${before.false_negatives.length} missed triggers across implicit and contextual invocations. Users describe symptoms (traffic drops, slow loads, indexing problems) without using "SEO audit" keywords. Expanded description to cover symptom-based queries, migration diagnostics, and standalone technical signals.`,
failure_patterns: failurePatterns.map((p) => p.pattern_id),
eval_results: {
before: {
total: before.total,
passed: before.passed,
failed: before.total - before.passed,
pass_rate: before.pass_rate,
},
after: {
total: after.total,
passed: after.passed,
failed: after.total - after.passed,
pass_rate: after.pass_rate,
},
},
after: {
total: after.total,
passed: after.passed,
failed: after.total - after.passed,
pass_rate: after.pass_rate,
confidence: 0.85,
created_at: new Date().toISOString(),
status: "pending",
};

const validationResult: ValidationResult = {
proposal_id: proposal.proposal_id,
before_pass_rate: before.pass_rate,
after_pass_rate: after.pass_rate,
improved: true,
regressions: [],
new_passes: before.false_negatives.filter(
(fn) => !after.false_negatives.some((afn) => afn.query === fn.query),
),
net_change: after.pass_rate - before.pass_rate,
};

// Injectable deps — deterministic, no LLM calls
const deps: EvolveDeps = {
extractFailurePatterns: mock(() => failurePatterns),
generateProposal: mock(async () => proposal),
validateProposal: mock(async () => validationResult),
appendAuditEntry: mock(() => {}),
buildEvalSet: mock(() => evalSet),
updateContextAfterEvolve: mock(() => {}),
};

const result = await evolve(
{
skillName: "seo-audit",
skillPath: tempSkillPath,
evalSetPath: EVAL_SET_PATH,
agent: "claude",
dryRun: false,
confidenceThreshold: 0.6,
maxIterations: 3,
},
},
confidence: 0.85,
created_at: new Date().toISOString(),
status: "pending",
};

const validationResult: ValidationResult = {
proposal_id: proposal.proposal_id,
before_pass_rate: before.pass_rate,
after_pass_rate: after.pass_rate,
improved: true,
regressions: [],
new_passes: before.false_negatives.filter(
(fn) => !after.false_negatives.some((afn) => afn.query === fn.query),
),
net_change: after.pass_rate - before.pass_rate,
};

// Injectable deps — deterministic, no LLM calls
const deps: EvolveDeps = {
extractFailurePatterns: mock(() => failurePatterns),
generateProposal: mock(async () => proposal),
validateProposal: mock(async () => validationResult),
appendAuditEntry: mock(() => {}),
buildEvalSet: mock(() => evalSet),
updateContextAfterEvolve: mock(() => {}),
};

const result = await evolve(
{
skillName: "seo-audit",
skillPath: SKILL_PATH,
evalSetPath: EVAL_SET_PATH,
agent: "claude",
dryRun: false,
confidenceThreshold: 0.6,
maxIterations: 3,
},
deps,
);
deps,
);

// Pipeline completed successfully
expect(result.deployed).toBe(true);
expect(result.proposal).not.toBeNull();
expect(result.validation).not.toBeNull();
expect(result.reason).toBe("Evolution deployed successfully");

// Validation shows improvement
expect(result.validation?.improved).toBe(true);
expect(result.validation?.after_pass_rate).toBeGreaterThan(result.validation?.before_pass_rate);
expect(result.validation?.regressions.length).toBe(0);

// Audit trail recorded
expect(result.auditEntries.length).toBeGreaterThanOrEqual(2);
expect(result.auditEntries.some((e) => e.action === "created")).toBe(true);
expect(result.auditEntries.some((e) => e.action === "validated")).toBe(true);
expect(result.auditEntries.some((e) => e.action === "deployed")).toBe(true);

// Print the blog-ready numbers
const missedFixed = result.validation?.new_passes.length;
console.log(`\n ══════════════════════════════════════════════`);
console.log(` BLOG PROOF DATA (seo-audit skill)`);
console.log(` ══════════════════════════════════════════════`);
console.log(` Skill: seo-audit (marketingskills, 11.2k ★)`);
console.log(
` Eval set: ${evalSet.length} queries (${positiveQueries.length} positive, ${negativeQueries.length} negative)`,
);
console.log(` Before accuracy: ${(result.validation?.before_pass_rate * 100).toFixed(1)}%`);
console.log(` After accuracy: ${(result.validation?.after_pass_rate * 100).toFixed(1)}%`);
console.log(` Missed triggers fixed: ${missedFixed}`);
console.log(` Regressions: ${result.validation?.regressions.length}`);
console.log(` Confidence: ${result.proposal?.confidence}`);
console.log(` ══════════════════════════════════════════════`);
// Pipeline completed successfully
expect(result.deployed).toBe(true);
expect(result.proposal).not.toBeNull();
expect(result.validation).not.toBeNull();
expect(result.reason).toBe("Evolution deployed successfully");

// Validation shows improvement
expect(result.validation?.improved).toBe(true);
expect(result.validation?.after_pass_rate).toBeGreaterThan(
result.validation?.before_pass_rate,
);
expect(result.validation?.regressions.length).toBe(0);

// Audit trail recorded
expect(result.auditEntries.length).toBeGreaterThanOrEqual(2);
expect(result.auditEntries.some((e) => e.action === "created")).toBe(true);
expect(result.auditEntries.some((e) => e.action === "validated")).toBe(true);
expect(result.auditEntries.some((e) => e.action === "deployed")).toBe(true);

// Print the blog-ready numbers
const missedFixed = result.validation?.new_passes.length;
console.log(`\n ══════════════════════════════════════════════`);
console.log(` BLOG PROOF DATA (seo-audit skill)`);
console.log(` ══════════════════════════════════════════════`);
console.log(` Skill: seo-audit (marketingskills, 11.2k ★)`);
console.log(
` Eval set: ${evalSet.length} queries (${positiveQueries.length} positive, ${negativeQueries.length} negative)`,
);
console.log(` Before accuracy: ${(result.validation?.before_pass_rate * 100).toFixed(1)}%`);
console.log(` After accuracy: ${(result.validation?.after_pass_rate * 100).toFixed(1)}%`);
console.log(` Missed triggers fixed: ${missedFixed}`);
console.log(` Regressions: ${result.validation?.regressions.length}`);
console.log(` Confidence: ${result.proposal?.confidence}`);
console.log(` ══════════════════════════════════════════════`);
} finally {
rmSync(tempDir, { recursive: true, force: true });
}
});
});

Expand Down
Loading