Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 32 additions & 5 deletions evolution/core/fitness.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,19 +104,31 @@ def score(
)


def skill_fitness_metric(example: dspy.Example, prediction: dspy.Prediction, trace=None) -> float:
def skill_fitness_metric(
example: dspy.Example,
prediction: dspy.Prediction,
trace=None,
pred_name: Optional[str] = None,
pred_trace=None,
) -> float | dict:
"""DSPy-compatible metric function for skill optimization.

This is what gets passed to dspy.GEPA(metric=...).
Returns a float 0-1 score.
DSPy GEPA 3.x requires metrics to accept five arguments:
``(gold, pred, trace, pred_name, pred_trace)``. Normal evaluation and
MIPROv2 still call this as a simple score function, so the extra GEPA
arguments stay optional. When GEPA asks for predictor-level feedback
(``pred_name`` is set), return score+feedback so reflection has something
actionable to optimize against.
"""
# The prediction should have an 'output' field with the agent's response
agent_output = getattr(prediction, "output", "") or ""
expected = getattr(example, "expected_behavior", "") or ""
task = getattr(example, "task_input", "") or ""

if not agent_output.strip():
return 0.0
score = 0.0
feedback = "Output was empty; produce a concrete response for the task."
return {"score": score, "feedback": feedback} if pred_name is not None else score

# Quick heuristic scoring (for speed during optimization)
# Full LLM-as-judge scoring is expensive — use it selectively
Expand All @@ -129,11 +141,26 @@ def skill_fitness_metric(example: dspy.Example, prediction: dspy.Prediction, tra
# Simple keyword overlap as a fast proxy
expected_words = set(expected_lower.split())
output_words = set(output_lower.split())
overlap = 0.0
if expected_words:
overlap = len(expected_words & output_words) / len(expected_words)
score = 0.3 + (0.7 * overlap)

return min(1.0, max(0.0, score))
score = min(1.0, max(0.0, score))

if pred_name is not None:
missing_terms = sorted(expected_words - output_words)[:8]
feedback_parts = [
f"Score: {score:.2f} for task: {task[:160]}",
f"Expected behavior: {expected[:240]}",
]
if missing_terms:
feedback_parts.append("Missing or underrepresented expected terms: " + ", ".join(missing_terms))
else:
feedback_parts.append("Output covered the expected keyword signal; preserve this behavior while improving clarity and procedure adherence.")
return {"score": score, "feedback": "\n".join(feedback_parts)}

return score


def _parse_score(value) -> float:
Expand Down
56 changes: 49 additions & 7 deletions evolution/skills/evolve_skill.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
python -m evolution.skills.evolve_skill --skill arxiv --eval-source golden --dataset datasets/skills/arxiv/
"""

import inspect
import json
import sys
import time
Expand Down Expand Up @@ -33,6 +34,51 @@
console = Console()


def build_gepa_kwargs(iterations: int, optimizer_model: Optional[str] = None) -> dict:
    """Assemble ``dspy.GEPA`` constructor kwargs across DSPy API versions.

    hermes-agent-self-evolution originally used ``max_steps``. DSPy 3.x
    removed that parameter in favor of budget-style knobs such as
    ``max_metric_calls`` / ``max_full_evals``. Keep compatibility without
    forcing a MIPROv2 fallback.

    Args:
        iterations: Requested optimization budget (clamped to at least 1).
        optimizer_model: Optional model id for GEPA's reflection LM, applied
            only when the installed GEPA accepts ``reflection_lm``.

    Returns:
        Keyword arguments safe to splat into ``dspy.GEPA(...)``.
    """
    accepted = inspect.signature(dspy.GEPA).parameters
    budget = max(1, int(iterations))
    kwargs: dict = {"metric": skill_fitness_metric}

    # Probe the installed GEPA for a budget knob, preferring the legacy
    # name first, then the newer budget-style alternatives. At most one
    # knob is set; an unknown signature simply gets no budget argument.
    for knob in ("max_steps", "max_full_evals", "max_metric_calls"):
        if knob in accepted:
            kwargs[knob] = budget
            break

    if optimizer_model and "reflection_lm" in accepted:
        kwargs["reflection_lm"] = dspy.LM(optimizer_model)

    return kwargs


def validate_evolved_skill(
    validator: ConstraintValidator,
    skill: dict,
    evolved_body: str,
) -> tuple[str, list]:
    """Reassemble and validate a complete SKILL.md candidate.

    The optimizer mutates only the Markdown body. Constraint validation for a
    skill, however, expects full YAML frontmatter plus body. Validate the full
    candidate and compare growth against the full baseline to avoid false
    failures for body-only text.

    Args:
        validator: Constraint validator configured for this run.
        skill: Loaded skill dict providing ``frontmatter`` and ``raw`` keys.
        evolved_body: Optimizer-produced Markdown body.

    Returns:
        The reassembled full skill text and the list of validation results.
    """
    candidate = reassemble_skill(skill["frontmatter"], evolved_body)
    results = validator.validate_all(candidate, "skill", baseline_text=skill["raw"])
    return candidate, results


def evolve(
skill_name: str,
iterations: int = 10,
Expand Down Expand Up @@ -119,7 +165,7 @@ def evolve(
# ── 3. Validate constraints on baseline ─────────────────────────────
console.print(f"\n[bold]Validating baseline constraints[/bold]")
validator = ConstraintValidator(config)
baseline_constraints = validator.validate_all(skill["body"], "skill")
baseline_constraints = validator.validate_all(skill["raw"], "skill")
all_pass = True
for c in baseline_constraints:
icon = "✓" if c.passed else "✗"
Expand Down Expand Up @@ -154,10 +200,7 @@ def evolve(
start_time = time.time()

try:
optimizer = dspy.GEPA(
metric=skill_fitness_metric,
max_steps=iterations,
)
optimizer = dspy.GEPA(**build_gepa_kwargs(iterations, optimizer_model=optimizer_model))

optimized_module = optimizer.compile(
baseline_module,
Expand All @@ -182,11 +225,10 @@ def evolve(
# ── 6. Extract evolved skill text ───────────────────────────────────
# The optimized module's instructions contain the evolved skill text
evolved_body = optimized_module.skill_text
evolved_full = reassemble_skill(skill["frontmatter"], evolved_body)
evolved_full, evolved_constraints = validate_evolved_skill(validator, skill, evolved_body)

# ── 7. Validate evolved skill ───────────────────────────────────────
console.print(f"\n[bold]Validating evolved skill[/bold]")
evolved_constraints = validator.validate_all(evolved_body, "skill", baseline_text=skill["body"])
all_pass = True
for c in evolved_constraints:
icon = "✓" if c.passed else "✗"
Expand Down
39 changes: 23 additions & 16 deletions evolution/skills/skill_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,33 +84,40 @@ def find_skill(skill_name: str, hermes_agent_path: Path) -> Optional[Path]:
class SkillModule(dspy.Module):
"""A DSPy module that wraps a skill file for optimization.

The skill text (body) is the parameter that GEPA optimizes.
On each forward pass, the module:
1. Uses the skill text as instructions
2. Processes the task input
3. Returns the agent's response
DSPy optimizers such as GEPA mutate predictor instructions, not arbitrary
runtime input values. The skill body therefore lives in the predictor
signature instructions so optimizer output can be recovered as the evolved
SKILL.md body.
"""

class TaskWithSkill(dspy.Signature):
"""Complete a task following the provided skill instructions.
"""Complete a task following the skill instructions."""

You are an AI agent following specific skill instructions to complete a task.
Read the skill instructions carefully and follow the procedure described.
"""
skill_instructions: str = dspy.InputField(desc="The skill instructions to follow")
task_input: str = dspy.InputField(desc="The task to complete")
output: str = dspy.OutputField(desc="Your response following the skill instructions")

def __init__(self, skill_text: str):
super().__init__()
self.skill_text = skill_text
self.predictor = dspy.ChainOfThought(self.TaskWithSkill)
signature = self.TaskWithSkill.with_instructions(skill_text)
self.predictor = dspy.ChainOfThought(signature)

@property
def skill_text(self) -> str:
"""Return the current optimizable skill instructions.

After GEPA/MIPRO compiles the module, this reflects any mutated
predictor signature instructions rather than the original constructor
argument.
"""
return self.predictor.predict.signature.instructions

@skill_text.setter
def skill_text(self, value: str) -> None:
"""Replace the optimizable skill instructions in-place."""
self.predictor.predict.signature = self.predictor.predict.signature.with_instructions(value)

def forward(self, task_input: str) -> dspy.Prediction:
result = self.predictor(
skill_instructions=self.skill_text,
task_input=task_input,
)
result = self.predictor(task_input=task_input)
return dspy.Prediction(output=result.output)


Expand Down
94 changes: 94 additions & 0 deletions tests/skills/test_evolution_pipeline_regressions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""Regression tests for the skill evolution pipeline."""

import dspy

from evolution.core.config import EvolutionConfig
from evolution.core.constraints import ConstraintValidator
from evolution.core.fitness import skill_fitness_metric
from evolution.skills.evolve_skill import build_gepa_kwargs, validate_evolved_skill
from evolution.skills.skill_module import SkillModule, reassemble_skill


# Minimal skill fixture shared by the regression tests below.
SAMPLE_FRONTMATTER = "name: regression-skill\ndescription: Regression test skill"
SAMPLE_BODY = "# Regression Skill\n\nFollow the documented procedure."
# Mirrors the keys the pipeline reads from a loaded skill: "frontmatter",
# "body", and the fully reassembled "raw" SKILL.md text.
SAMPLE_SKILL = {
    "frontmatter": SAMPLE_FRONTMATTER,
    "body": SAMPLE_BODY,
    "raw": reassemble_skill(SAMPLE_FRONTMATTER, SAMPLE_BODY),
}


def test_build_gepa_kwargs_prefers_full_eval_budget(monkeypatch):
    """GEPA without ``max_steps`` should fall back to ``max_full_evals`` first."""

    class StubGEPA:
        def __init__(self, metric, max_full_evals=None, max_metric_calls=None):
            pass

    monkeypatch.setattr("evolution.skills.evolve_skill.dspy.GEPA", StubGEPA)

    built = build_gepa_kwargs(iterations=7)

    assert built["metric"] is skill_fitness_metric
    assert built["max_full_evals"] == 7
    assert "max_metric_calls" not in built


def test_build_gepa_kwargs_supports_legacy_max_steps(monkeypatch):
    """Older DSPy releases exposing ``max_steps`` should still get that knob."""

    class StubLegacyGEPA:
        def __init__(self, metric, max_steps=None):
            pass

    monkeypatch.setattr("evolution.skills.evolve_skill.dspy.GEPA", StubLegacyGEPA)

    built = build_gepa_kwargs(iterations=3)

    assert built["max_steps"] == 3


def test_gepa_metric_accepts_reflective_feedback_signature():
    """The metric must behave as a plain scorer and as a GEPA feedback function."""
    gold = dspy.Example(
        task_input="Summarize a source",
        expected_behavior="mention source URL and capture durable notes",
    )
    pred = dspy.Prediction(output="Capture notes and mention source URL.")

    # Plain two-argument call: MIPROv2 / normal evaluation path.
    plain = skill_fitness_metric(gold, pred)
    # Five-argument call: GEPA reflective path, which expects score+feedback.
    reflective = skill_fitness_metric(
        gold,
        pred,
        trace=[],
        pred_name="predictor",
        pred_trace=[],
    )

    assert isinstance(plain, float)
    assert reflective["score"] == plain
    assert "feedback" in reflective


def test_validate_evolved_skill_reassembles_full_skill_before_validation():
    """Validation must see the full SKILL.md, not just the mutated body."""
    checker = ConstraintValidator(EvolutionConfig())
    new_body = "# Regression Skill\n\nImproved procedure."

    full_text, outcomes = validate_evolved_skill(checker, SAMPLE_SKILL, new_body)

    assert full_text.startswith("---\nname: regression-skill")
    assert "Improved procedure" in full_text
    assert all(outcome.passed for outcome in outcomes)


def test_body_only_skill_validation_would_fail_structure():
    """Passing only the Markdown body (no frontmatter) should trip the structure check."""
    checker = ConstraintValidator(EvolutionConfig())

    outcomes = checker.validate_all(SAMPLE_BODY, "skill", baseline_text=SAMPLE_BODY)

    structure_hits = [r for r in outcomes if r.constraint_name == "skill_structure"]
    assert structure_hits and not structure_hits[0].passed


def test_skill_module_reads_mutated_signature_instructions():
    """``skill_text`` must round-trip through the predictor signature instructions."""
    mod = SkillModule("initial instructions")

    mod.skill_text = "mutated instructions"

    assert mod.skill_text == "mutated instructions"
    assert mod.predictor.predict.signature.instructions == "mutated instructions"