diff --git a/evolution/core/fitness.py b/evolution/core/fitness.py index 04f2c78..3f2f5a8 100644 --- a/evolution/core/fitness.py +++ b/evolution/core/fitness.py @@ -104,11 +104,21 @@ def score( ) -def skill_fitness_metric(example: dspy.Example, prediction: dspy.Prediction, trace=None) -> float: +def skill_fitness_metric( + example: dspy.Example, + prediction: dspy.Prediction, + trace=None, + pred_name: Optional[str] = None, + pred_trace=None, +) -> float | dict: """DSPy-compatible metric function for skill optimization. - This is what gets passed to dspy.GEPA(metric=...). - Returns a float 0-1 score. + DSPy GEPA 3.x requires metrics to accept five arguments: + ``(gold, pred, trace, pred_name, pred_trace)``. Normal evaluation and + MIPROv2 still call this as a simple score function, so the extra GEPA + arguments stay optional. When GEPA asks for predictor-level feedback + (``pred_name`` is set), return score+feedback so reflection has something + actionable to optimize against. """ # The prediction should have an 'output' field with the agent's response agent_output = getattr(prediction, "output", "") or "" @@ -116,7 +126,9 @@ def skill_fitness_metric(example: dspy.Example, prediction: dspy.Prediction, tra task = getattr(example, "task_input", "") or "" if not agent_output.strip(): - return 0.0 + score = 0.0 + feedback = "Output was empty; produce a concrete response for the task." 
def build_gepa_kwargs(iterations: int, optimizer_model: Optional[str] = None) -> dict:
    """Assemble ``dspy.GEPA`` constructor kwargs that fit the installed DSPy.

    The constructor signature is inspected at call time so that only
    parameters it actually declares are passed. This project originally
    passed ``max_steps``; DSPy 3.x replaced it with budget-style knobs such
    as ``max_full_evals`` / ``max_metric_calls``. Adapting here keeps GEPA
    usable without forcing a MIPROv2 fallback.

    Args:
        iterations: Requested optimization budget. Coerced with ``int`` and
            clamped to a minimum of 1.
        optimizer_model: Optional model identifier. When truthy and the
            constructor declares ``reflection_lm``, it is wrapped in
            ``dspy.LM`` and passed as the reflection model.

    Returns:
        A dict that always contains ``metric``, plus at most one budget key
        and optionally ``reflection_lm``.
    """
    accepted = inspect.signature(dspy.GEPA).parameters
    gepa_kwargs = {"metric": skill_fitness_metric}
    step_budget = max(1, int(iterations))

    # Precedence matters: the legacy knob wins, then full evals, then raw
    # metric calls. Only the first declared name receives the budget.
    budget_knobs = ("max_steps", "max_full_evals", "max_metric_calls")
    chosen_knob = next((knob for knob in budget_knobs if knob in accepted), None)
    if chosen_knob is not None:
        gepa_kwargs[chosen_knob] = step_budget

    wants_reflection_lm = bool(optimizer_model) and "reflection_lm" in accepted
    if wants_reflection_lm:
        gepa_kwargs["reflection_lm"] = dspy.LM(optimizer_model)

    return gepa_kwargs
class SkillModule(dspy.Module):
    """DSPy module whose optimizable parameter is a skill body.

    The skill body is stored as the instructions of the inner predictor's
    signature instead of being passed as a runtime input field. Optimizers
    such as GEPA mutate predictor instructions, so keeping the text there
    lets the evolved SKILL.md body be read back through ``skill_text`` after
    compilation.
    """

    class TaskWithSkill(dspy.Signature):
        """Complete a task following the skill instructions."""

        task_input: str = dspy.InputField(desc="The task to complete")
        output: str = dspy.OutputField(desc="Your response following the skill instructions")

    def __init__(self, skill_text: str):
        super().__init__()
        # Seed the signature instructions with the skill body; the class
        # docstring above is only the placeholder that gets replaced here.
        seeded_signature = self.TaskWithSkill.with_instructions(skill_text)
        self.predictor = dspy.ChainOfThought(seeded_signature)

    @property
    def skill_text(self) -> str:
        """Current skill instructions held by the inner predictor.

        Reads the live signature, so the value reflects any instructions an
        optimizer has written there rather than the constructor argument.
        """
        inner_predict = self.predictor.predict
        return inner_predict.signature.instructions

    @skill_text.setter
    def skill_text(self, value: str) -> None:
        """Swap the inner predictor's signature for one carrying ``value``."""
        inner_predict = self.predictor.predict
        inner_predict.signature = inner_predict.signature.with_instructions(value)

    def forward(self, task_input: str) -> dspy.Prediction:
        """Run the predictor on ``task_input`` and return only its ``output``."""
        completion = self.predictor(task_input=task_input)
        return dspy.Prediction(output=completion.output)
def test_gepa_metric_accepts_reflective_feedback_signature():
    """The metric works with both the 2-arg and the 5-arg GEPA call shapes.

    Called without ``pred_name`` it must return a bare float. Called with
    ``pred_name`` set it must return a mapping whose ``score`` equals that
    float and which also carries a ``feedback`` entry.
    """
    gold = dspy.Example(
        task_input="Summarize a source",
        expected_behavior="mention source URL and capture durable notes",
    )
    pred = dspy.Prediction(output="Capture notes and mention source URL.")

    # Plain call first, then the GEPA-style call with predictor context.
    score_only = skill_fitness_metric(gold, pred)
    gepa_kwargs = {"trace": [], "pred_name": "predictor", "pred_trace": []}
    with_feedback = skill_fitness_metric(gold, pred, **gepa_kwargs)

    assert isinstance(score_only, float)
    assert with_feedback["score"] == score_only
    assert "feedback" in with_feedback
def test_skill_module_reads_mutated_signature_instructions():
    """Assigning ``skill_text`` updates the predictor signature instructions.

    The property and the underlying signature must agree after the
    assignment, which is how an optimizer-mutated skill body is recovered.
    """
    replacement = "mutated instructions"
    skill_module = SkillModule("initial instructions")

    skill_module.skill_text = replacement

    live_signature = skill_module.predictor.predict.signature
    assert skill_module.skill_text == replacement
    assert live_signature.instructions == replacement