Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 32 additions & 5 deletions evolution/core/fitness.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,19 +104,31 @@ def score(
)


def skill_fitness_metric(example: dspy.Example, prediction: dspy.Prediction, trace=None) -> float:
def skill_fitness_metric(
example: dspy.Example,
prediction: dspy.Prediction,
trace=None,
pred_name: Optional[str] = None,
pred_trace=None,
) -> float | dict:
"""DSPy-compatible metric function for skill optimization.

This is what gets passed to dspy.GEPA(metric=...).
Returns a float 0-1 score.
DSPy GEPA 3.x requires metrics to accept five arguments:
``(gold, pred, trace, pred_name, pred_trace)``. Normal evaluation and
MIPROv2 still call this as a simple score function, so the extra GEPA
arguments stay optional. When GEPA asks for predictor-level feedback
(``pred_name`` is set), return score+feedback so reflection has something
actionable to optimize against.
"""
# The prediction should have an 'output' field with the agent's response
agent_output = getattr(prediction, "output", "") or ""
expected = getattr(example, "expected_behavior", "") or ""
task = getattr(example, "task_input", "") or ""

if not agent_output.strip():
return 0.0
score = 0.0
feedback = "Output was empty; produce a concrete response for the task."
return {"score": score, "feedback": feedback} if pred_name is not None else score

# Quick heuristic scoring (for speed during optimization)
# Full LLM-as-judge scoring is expensive — use it selectively
Expand All @@ -129,11 +141,26 @@ def skill_fitness_metric(example: dspy.Example, prediction: dspy.Prediction, tra
# Simple keyword overlap as a fast proxy
expected_words = set(expected_lower.split())
output_words = set(output_lower.split())
overlap = 0.0
if expected_words:
overlap = len(expected_words & output_words) / len(expected_words)
score = 0.3 + (0.7 * overlap)

return min(1.0, max(0.0, score))
score = min(1.0, max(0.0, score))

if pred_name is not None:
missing_terms = sorted(expected_words - output_words)[:8]
feedback_parts = [
f"Score: {score:.2f} for task: {task[:160]}",
f"Expected behavior: {expected[:240]}",
]
if missing_terms:
feedback_parts.append("Missing or underrepresented expected terms: " + ", ".join(missing_terms))
else:
feedback_parts.append("Output covered the expected keyword signal; preserve this behavior while improving clarity and procedure adherence.")
return {"score": score, "feedback": "\n".join(feedback_parts)}

return score


def _parse_score(value) -> float:
Expand Down
56 changes: 49 additions & 7 deletions evolution/skills/evolve_skill.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
python -m evolution.skills.evolve_skill --skill arxiv --eval-source golden --dataset datasets/skills/arxiv/
"""

import inspect
import json
import sys
import time
Expand Down Expand Up @@ -33,6 +34,51 @@
console = Console()


def build_gepa_kwargs(iterations: int, optimizer_model: Optional[str] = None) -> dict:
    """Assemble ``dspy.GEPA`` constructor kwargs across DSPy API versions.

    hermes-agent-self-evolution originally used ``max_steps``. DSPy 3.x
    removed that parameter in favor of budget-style knobs such as
    ``max_metric_calls`` / ``max_full_evals``. Keep compatibility without
    forcing a MIPROv2 fallback.

    Args:
        iterations: Requested optimization budget (clamped to at least 1).
        optimizer_model: Optional model id for GEPA's reflection LM, applied
            only when the installed GEPA accepts ``reflection_lm``.

    Returns:
        Keyword arguments safe to splat into ``dspy.GEPA(...)``.
    """
    accepted = inspect.signature(dspy.GEPA).parameters
    budget = max(1, int(iterations))
    kwargs: dict = {"metric": skill_fitness_metric}

    # Probe the installed GEPA for a budget knob, preferring the legacy
    # name first, then the newer budget-style alternatives. At most one
    # knob is set; an unknown signature simply gets no budget argument.
    for knob in ("max_steps", "max_full_evals", "max_metric_calls"):
        if knob in accepted:
            kwargs[knob] = budget
            break

    if optimizer_model and "reflection_lm" in accepted:
        kwargs["reflection_lm"] = dspy.LM(optimizer_model)

    return kwargs


def validate_evolved_skill(
    validator: ConstraintValidator,
    skill: dict,
    evolved_body: str,
) -> tuple[str, list]:
    """Reassemble and validate a complete SKILL.md candidate.

    The optimizer mutates only the Markdown body. Constraint validation for a
    skill, however, expects full YAML frontmatter plus body. Validate the full
    candidate and compare growth against the full baseline to avoid false
    failures for body-only text.

    Args:
        validator: Constraint validator configured for this run.
        skill: Loaded skill dict providing ``frontmatter`` and ``raw`` keys.
        evolved_body: Optimizer-produced Markdown body.

    Returns:
        The reassembled full skill text and the list of validation results.
    """
    candidate = reassemble_skill(skill["frontmatter"], evolved_body)
    results = validator.validate_all(candidate, "skill", baseline_text=skill["raw"])
    return candidate, results


def evolve(
skill_name: str,
iterations: int = 10,
Expand Down Expand Up @@ -119,7 +165,7 @@ def evolve(
# ── 3. Validate constraints on baseline ─────────────────────────────
console.print(f"\n[bold]Validating baseline constraints[/bold]")
validator = ConstraintValidator(config)
baseline_constraints = validator.validate_all(skill["body"], "skill")
baseline_constraints = validator.validate_all(skill["raw"], "skill")
all_pass = True
for c in baseline_constraints:
icon = "✓" if c.passed else "✗"
Expand Down Expand Up @@ -154,10 +200,7 @@ def evolve(
start_time = time.time()

try:
optimizer = dspy.GEPA(
metric=skill_fitness_metric,
max_steps=iterations,
)
optimizer = dspy.GEPA(**build_gepa_kwargs(iterations, optimizer_model=optimizer_model))

optimized_module = optimizer.compile(
baseline_module,
Expand All @@ -182,11 +225,10 @@ def evolve(
# ── 6. Extract evolved skill text ───────────────────────────────────
# The optimized module's instructions contain the evolved skill text
evolved_body = optimized_module.skill_text
evolved_full = reassemble_skill(skill["frontmatter"], evolved_body)
evolved_full, evolved_constraints = validate_evolved_skill(validator, skill, evolved_body)

# ── 7. Validate evolved skill ───────────────────────────────────────
console.print(f"\n[bold]Validating evolved skill[/bold]")
evolved_constraints = validator.validate_all(evolved_body, "skill", baseline_text=skill["body"])
all_pass = True
for c in evolved_constraints:
icon = "✓" if c.passed else "✗"
Expand Down
39 changes: 23 additions & 16 deletions evolution/skills/skill_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,33 +84,40 @@ def find_skill(skill_name: str, hermes_agent_path: Path) -> Optional[Path]:
class SkillModule(dspy.Module):
"""A DSPy module that wraps a skill file for optimization.

The skill text (body) is the parameter that GEPA optimizes.
On each forward pass, the module:
1. Uses the skill text as instructions
2. Processes the task input
3. Returns the agent's response
DSPy optimizers such as GEPA mutate predictor instructions, not arbitrary
runtime input values. The skill body therefore lives in the predictor
signature instructions so optimizer output can be recovered as the evolved
SKILL.md body.
"""

class TaskWithSkill(dspy.Signature):
"""Complete a task following the provided skill instructions.
"""Complete a task following the skill instructions."""

You are an AI agent following specific skill instructions to complete a task.
Read the skill instructions carefully and follow the procedure described.
"""
skill_instructions: str = dspy.InputField(desc="The skill instructions to follow")
task_input: str = dspy.InputField(desc="The task to complete")
output: str = dspy.OutputField(desc="Your response following the skill instructions")

def __init__(self, skill_text: str):
super().__init__()
self.skill_text = skill_text
self.predictor = dspy.ChainOfThought(self.TaskWithSkill)
signature = self.TaskWithSkill.with_instructions(skill_text)
self.predictor = dspy.ChainOfThought(signature)

@property
def skill_text(self) -> str:
"""Return the current optimizable skill instructions.

After GEPA/MIPRO compiles the module, this reflects any mutated
predictor signature instructions rather than the original constructor
argument.
"""
return self.predictor.predict.signature.instructions

@skill_text.setter
def skill_text(self, value: str) -> None:
"""Replace the optimizable skill instructions in-place."""
self.predictor.predict.signature = self.predictor.predict.signature.with_instructions(value)

def forward(self, task_input: str) -> dspy.Prediction:
result = self.predictor(
skill_instructions=self.skill_text,
task_input=task_input,
)
result = self.predictor(task_input=task_input)
return dspy.Prediction(output=result.output)


Expand Down
94 changes: 94 additions & 0 deletions tests/skills/test_evolution_pipeline_regressions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""Regression tests for the skill evolution pipeline."""

import dspy

from evolution.core.config import EvolutionConfig
from evolution.core.constraints import ConstraintValidator
from evolution.core.fitness import skill_fitness_metric
from evolution.skills.evolve_skill import build_gepa_kwargs, validate_evolved_skill
from evolution.skills.skill_module import SkillModule, reassemble_skill


# Minimal skill fixture shared by the regression tests below.
SAMPLE_FRONTMATTER = "name: regression-skill\ndescription: Regression test skill"
SAMPLE_BODY = "# Regression Skill\n\nFollow the documented procedure."
# Mirrors the keys the pipeline reads from a loaded skill: "frontmatter",
# "body", and the fully reassembled "raw" SKILL.md text.
SAMPLE_SKILL = {
    "frontmatter": SAMPLE_FRONTMATTER,
    "body": SAMPLE_BODY,
    "raw": reassemble_skill(SAMPLE_FRONTMATTER, SAMPLE_BODY),
}


def test_build_gepa_kwargs_prefers_full_eval_budget(monkeypatch):
    """GEPA without ``max_steps`` should fall back to ``max_full_evals`` first."""

    class StubGEPA:
        def __init__(self, metric, max_full_evals=None, max_metric_calls=None):
            pass

    monkeypatch.setattr("evolution.skills.evolve_skill.dspy.GEPA", StubGEPA)

    built = build_gepa_kwargs(iterations=7)

    assert built["metric"] is skill_fitness_metric
    assert built["max_full_evals"] == 7
    assert "max_metric_calls" not in built


def test_build_gepa_kwargs_supports_legacy_max_steps(monkeypatch):
    """Older DSPy releases exposing ``max_steps`` should still get that knob."""

    class StubLegacyGEPA:
        def __init__(self, metric, max_steps=None):
            pass

    monkeypatch.setattr("evolution.skills.evolve_skill.dspy.GEPA", StubLegacyGEPA)

    built = build_gepa_kwargs(iterations=3)

    assert built["max_steps"] == 3


def test_gepa_metric_accepts_reflective_feedback_signature():
    """The metric must behave as a plain scorer and as a GEPA feedback function."""
    gold = dspy.Example(
        task_input="Summarize a source",
        expected_behavior="mention source URL and capture durable notes",
    )
    pred = dspy.Prediction(output="Capture notes and mention source URL.")

    # Plain two-argument call: MIPROv2 / normal evaluation path.
    plain = skill_fitness_metric(gold, pred)
    # Five-argument call: GEPA reflective path, which expects score+feedback.
    reflective = skill_fitness_metric(
        gold,
        pred,
        trace=[],
        pred_name="predictor",
        pred_trace=[],
    )

    assert isinstance(plain, float)
    assert reflective["score"] == plain
    assert "feedback" in reflective


def test_validate_evolved_skill_reassembles_full_skill_before_validation():
    """Validation must see the full SKILL.md, not just the mutated body."""
    checker = ConstraintValidator(EvolutionConfig())
    new_body = "# Regression Skill\n\nImproved procedure."

    full_text, outcomes = validate_evolved_skill(checker, SAMPLE_SKILL, new_body)

    assert full_text.startswith("---\nname: regression-skill")
    assert "Improved procedure" in full_text
    assert all(outcome.passed for outcome in outcomes)


def test_body_only_skill_validation_would_fail_structure():
    """Passing only the Markdown body (no frontmatter) should trip the structure check."""
    checker = ConstraintValidator(EvolutionConfig())

    outcomes = checker.validate_all(SAMPLE_BODY, "skill", baseline_text=SAMPLE_BODY)

    structure_hits = [r for r in outcomes if r.constraint_name == "skill_structure"]
    assert structure_hits and not structure_hits[0].passed


def test_skill_module_reads_mutated_signature_instructions():
    """``skill_text`` must round-trip through the predictor signature instructions."""
    mod = SkillModule("initial instructions")

    mod.skill_text = "mutated instructions"

    assert mod.skill_text == "mutated instructions"
    assert mod.predictor.predict.signature.instructions == "mutated instructions"