From 0e45d2967123d02b6f8ea9b59e4ff4f2c86a40a5 Mon Sep 17 00:00:00 2001
From: Robin Salimans
Date: Sat, 22 Nov 2025 16:37:00 +0100
Subject: [PATCH 01/16] added gepa integrations folder and readme

---
 integrations/gepa/README.md | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 integrations/gepa/README.md

diff --git a/integrations/gepa/README.md b/integrations/gepa/README.md
new file mode 100644
index 000000000..e85fea091
--- /dev/null
+++ b/integrations/gepa/README.md
@@ -0,0 +1,10 @@
+# GEPA Integration for Verifiers
+
+Optimize system prompts and tool descriptions using GEPA (Genetic-Pareto evolutionary algorithm).
+
+## Learn More
+
+- **GEPA Paper**: [arXiv:2507.19457](https://arxiv.org/abs/2507.19457)
+- **GEPA Repository**: [github.com/gepa-ai/gepa](https://github.com/gepa-ai/gepa)
+- **DSPy GEPA**: [dspy.ai/tutorials/gepa_ai_program](https://dspy.ai/tutorials/gepa_ai_program/)
+- **Verifiers Documentation**: [verifiers.readthedocs.io](https://verifiers.readthedocs.io/)

From 03e189add0ac373d9e92a59cf15f4f8208810f54 Mon Sep 17 00:00:00 2001
From: Robin Salimans
Date: Sat, 22 Nov 2025 22:26:46 +0100
Subject: [PATCH 02/16] first pass on gepa integration and command

---
 README.md                           |  32 ++
 docs/source/gepa.md                 | 356 ++++++++++++++
 environments/wordle/wordle.py       |  20 +-
 integrations/gepa/README.md         | 218 ++++-
 pyproject.toml                      |   5 +
 tests/test_gepa.py                  | 375 +++++++++++++++
 tests/test_rubric.py                | 107 +++++
 verifiers/adapters/__init__.py      |   5 +
 verifiers/adapters/gepa/__init__.py |   5 +
 verifiers/adapters/gepa/adapter.py  | 464 +++++++++++++++++++
 verifiers/envs/environment.py       |   1 +
 verifiers/rubrics/rubric.py         | 105 ++++-
 verifiers/scripts/gepa.py           | 690 ++++++++++++++++++++++++++++
 verifiers/types.py                  |  13 +
 14 files changed, 2371 insertions(+), 25 deletions(-)
 create mode 100644 docs/source/gepa.md
 create mode 100644 tests/test_gepa.py
 create mode 100644 verifiers/adapters/__init__.py
 create mode 100644 verifiers/adapters/gepa/__init__.py
 create mode 100644 verifiers/adapters/gepa/adapter.py
 create mode 100644 verifiers/scripts/gepa.py

diff --git a/README.md b/README.md
index 01774ff73..0e94fa28c 100644
--- a/README.md
+++ b/README.md
@@ -75,6 +75,38 @@ uv run vf-eval wordle -m gpt-5-nano
 
 For advanced evaluation configurations with the `prime` [CLI](https://github.com/PrimeIntellect-ai/prime-cli), see [here](https://docs.primeintellect.ai/tutorials-environments/evaluating)
 
+## Prompt Optimization with GEPA
+
+Automatically improve your environment's prompts using GEPA (Genetic-Pareto evolutionary algorithm):
+
+```bash
+# Install GEPA extras
+uv add 'verifiers[gepa]'
+
+# Optimize system prompt
+vf-gepa wordle --auto medium
+
+# Optimize system prompt + tool descriptions
+vf-gepa wiki-search --auto heavy --components system_prompt tool_descriptions
+```
+
+GEPA analyzes your rubric's feedback and iteratively refines prompts. It works best when reward functions return rich textual feedback. See the [GEPA documentation](docs/source/gepa.md) for details.
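+
+As a minimal sketch (mirroring the wordle reward function updated in this PR; adapt the names to your own environment), a reward function can return a dict with a `score` and a `feedback` string instead of a bare float, which gives GEPA's reflection step something to work with:
+
+```python
+def check_answer(parser, completion, answer, **kwargs):
+    # Score the rollout and explain the result for GEPA's reflection step.
+    guess = parser.parse_answer(completion)
+    correct = guess == answer
+    return {
+        "score": 1.0 if correct else 0.0,
+        "feedback": f"{'✓ Correct.' if correct else '✗ Incorrect.'} Expected: {answer}, Got: {guess}",
+    }
+```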
+ +After a run completes, apply the saved components to an environment instance: + +```python +import json +import verifiers as vf + +with open("gepa_results/wordle//wordle_optimized.json") as f: + optimized = json.load(f) + +env = vf.load_environment("wordle") +env.system_prompt = optimized["system_prompt"] +if "tool_0_description" in optimized and hasattr(env, "oai_tools"): + env.oai_tools[0]["function"]["description"] = optimized["tool_0_description"] +``` + ## RL Training ### `prime-rl` diff --git a/docs/source/gepa.md b/docs/source/gepa.md new file mode 100644 index 000000000..44ba8dd68 --- /dev/null +++ b/docs/source/gepa.md @@ -0,0 +1,356 @@ +# GEPA: Prompt Optimization + +GEPA (Gradient-free Evolutionary Prompt Adaptation) is an automatic prompt optimization system that improves your environment's system prompts and tool descriptions based on rubric feedback. + +## Overview + +GEPA works by: +1. Testing your current prompts on examples +2. Analyzing failures using rubric feedback +3. Generating improved prompts through reflection +4. Iteratively refining until convergence + +This is particularly effective when combined with `FeedbackRubric`, which provides rich textual feedback explaining why rollouts succeeded or failed. + +## Installation + +GEPA is available as an optional dependency: + +```bash +uv add 'verifiers[gepa]' +``` + +This installs the `gepa` optimization engine. + +## Quick Start + +Optimize the system prompt for an environment: + +```bash +vf-gepa wordle --auto medium +``` + +This will: +- Load the `wordle` environment +- Use medium budget (~12 candidate prompts) +- Optimize the `system_prompt` component +- Save results to `./gepa_results/wordle//` + +## Budget Modes + +GEPA offers three auto budget levels: + +### Light (~6 candidates) +Fast iteration for testing: +```bash +vf-gepa my-env --auto light +``` +- Best for: Quick experiments, sanity checks +- Time: ~5-10 minutes for simple environments +- Use when: Testing GEPA setup, iterating rapidly + +### Medium (~12 candidates) +Balanced optimization: +```bash +vf-gepa my-env --auto medium +``` +- Best for: Most use cases, good improvements +- Time: ~15-30 minutes for simple environments +- Use when: Standard optimization runs + +### Heavy (~18 candidates) +Thorough exploration: +```bash +vf-gepa my-env --auto heavy +``` +- Best for: Final production prompts, critical environments +- Time: ~30-60 minutes for simple environments +- Use when: You need the best possible prompt + +### Custom Budget + +For fine control, specify exact metric calls: +```bash +vf-gepa my-env --max-metric-calls 1000 +``` + +## Component Selection + +By default, GEPA optimizes `system_prompt`. You can specify multiple components: + +### System Prompt Only +```bash +vf-gepa my-env --auto medium --components system_prompt +``` + +### Tool Descriptions +For environments with tools, optimize their descriptions: +```bash +vf-gepa wiki-search --auto medium --components tool_descriptions +``` + +### Both System Prompt and Tool Descriptions +```bash +vf-gepa wiki-search --auto heavy --components system_prompt tool_descriptions +``` + +When optimizing `tool_descriptions`, GEPA: +1. Extracts each tool's description from `oai_tools` +2. Treats each as a separate component to optimize +3. Uses separate reflection for each tool +4. 
Injects optimized descriptions back into tools + +## Model Configuration + +### Task Model +The model being optimized (default: `gpt-4o-mini`): +```bash +vf-gepa my-env --auto medium -m gpt-4o +``` + +### Reflection Model +The model generating improved prompts (default: `gpt-4o`): +```bash +vf-gepa my-env --auto medium --reflection-model gpt-4o +``` + +### Sampling Parameters +```bash +vf-gepa my-env --auto medium \ + -T 0.7 \ # Temperature for task model + -t 2048 \ # Max tokens + --reflection-temperature 1.0 # Temperature for reflection +``` + +## Dataset Configuration + +Control train/validation split sizes: + +```bash +vf-gepa my-env --auto medium \ + -n 100 \ # 100 training examples + --num-val 30 # 30 validation examples +``` + +**Guidelines**: +- Training: 50-100 examples (more = slower but potentially better) +- Validation: 20-30 examples (for measuring improvement) +- Use representative examples that cover your task's diversity + +## Output + +GEPA saves three files to `./gepa_results///`: + +### 1. `_optimized.json` +The optimized components: +```json +{ + "system_prompt": "You are a competitive Wordle player...", + "tool_0_description": "Search Wikipedia for..." +} +``` + +### 2. `_original.json` +The original components for comparison. + +### 3. `_metrics.json` +Optimization metrics: +```json +{ + "best_val_score": 0.85, + "initial_val_score": 0.62, + "improvement": 0.23, + "num_candidates": 12, + "candidates_history": [...] +} +``` + +## Rubric Feedback Support + +For best results, have your reward functions return feedback: + +```python +import verifiers as vf + +def accuracy_with_feedback(parser, completion, answer, **kwargs): + """Reward function that returns score + feedback.""" + guess = parser.parse_answer(completion) + correct = (guess == answer) + + return { + "score": 1.0 if correct else 0.0, + "feedback": ( + f"{'✓' if correct else '✗'} " + f"Expected: {answer}, Got: {guess}" + ) + } + +rubric = vf.Rubric(parser=parser) +rubric.add_reward_func(accuracy_with_feedback) +``` + +The `feedback` field is used by GEPA to understand *why* completions failed, enabling better prompt improvements. The base `Rubric` class automatically collects feedback via its `get_feedback()` method. + +## Advanced Usage + +### Multiple Rollouts Per Example +Increase robustness with multiple rollouts: +```bash +vf-gepa my-env --auto medium --rollouts-per-example 3 +``` + +### Custom Log Directory +```bash +vf-gepa my-env --auto medium --log-dir ./my_optimization_runs +``` + +### Track Detailed Statistics +Save full outputs for analysis: +```bash +vf-gepa my-env --auto medium --track-stats +``` + +### Verbose Logging +Debug optimization process: +```bash +vf-gepa my-env --auto medium -v +``` + +## Best Practices + +### 1. Provide Rich Feedback +GEPA works best when reward functions return textual feedback explaining scores. If your functions only return numbers, GEPA has less to work with. + +**Good**: +```python +return { + "score": 0.5, + "feedback": "Partially correct. Got step 1 right but step 2 is missing." +} +``` + +**OK but less effective**: +```python +return 0.5 # GEPA will only see the number +``` + +### 2. Use Representative Examples +Ensure your training and validation sets cover the full range of task difficulty and variety. + +### 3. Start Light, Then Scale Up +Begin with `--auto light` to verify everything works, then use `medium` or `heavy` for production. + +### 4. Iterate on Feedback Quality +If GEPA improvements are small, review your rubric's feedback. 
More specific feedback = better improvements. + +### 5. Version Control Prompts +Save optimized prompts in your repo and track which version is in production. + +## Troubleshooting + +### "Error: GEPA is not installed" +```bash +uv add 'verifiers[gepa]' +``` + +### "Environment does not have component 'X'" +Check that your environment exposes the component you're trying to optimize. Use `--components system_prompt` (default) if unsure. + +## Limitations + +### Unsupported Environment Types +- **EnvGroup**: GEPA operates on a single environment at a time. Optimize each member separately, then compose them with `EnvGroup`. +- **Dynamic tools**: Environments that mutate their tool list during `__init__` or per rollout may not preserve those changes across candidate reconstruction. + +### Requirements +- Components you optimize must be attributes on the environment object (e.g., `system_prompt`). +- `tool_descriptions` optimization requires `oai_tools` to be defined up front. +- Reward functions should emit textual feedback to unlock GEPA's reflection step. + +### Operational Constraints +- Multiple rollouts per example scale linearly in cost—start small before increasing `--rollouts-per-example`. +- Heavy budgets require high-quality validation datasets; under-sized eval sets can hide regressions. +- GEPA expects deterministic environment construction. Expensive setup code will re-run for every candidate. + +### Low Improvement +- Increase budget: Use `--auto heavy` or `--max-metric-calls 2000` +- Improve feedback: Make your rubric's feedback more specific +- Add more examples: Use `-n 100 --num-val 30` +- Check dataset quality: Ensure examples are representative + +### Out of Memory +- Reduce batch sizes: `--reflection-minibatch-size 2` +- Reduce examples: `-n 30 --num-val 10` +- Use smaller models: `-m gpt-4o-mini` + +## Examples + +### Basic Optimization +```bash +vf-gepa wordle --auto medium +``` + +### Tool-Using Environment +```bash +vf-gepa wiki-search --auto heavy \ + --components system_prompt tool_descriptions \ + -m gpt-4o +``` + +### Large-Scale Optimization +```bash +vf-gepa my-env --max-metric-calls 2000 \ + -n 200 --num-val 50 \ + --rollouts-per-example 3 \ + --track-stats +``` + +### Custom Models +```bash +vf-gepa my-env --auto medium \ + -m claude-3-5-sonnet-20241022 \ + --reflection-model gpt-4o +``` + +## API Usage + +For programmatic use: + +```python +import verifiers as vf +from verifiers.adapters import GEPAAdapter +from gepa import optimize + +# Load environment +env = vf.load_environment("wordle") + +# Create adapter +adapter = GEPAAdapter( + env=env, + client=client, + model="gpt-4o-mini", + sampling_args={"temperature": 1.0, "max_tokens": 8096}, + components_to_optimize=["system_prompt"], +) + +# Run optimization +result = optimize( + seed_candidate={"system_prompt": env.system_prompt}, + trainset=trainset, + valset=valset, + adapter=adapter, + max_metric_calls=500, + reflection_lm=reflection_function, +) + +# Access results +best_prompt = result.best_candidate["system_prompt"] +improvement = max(result.val_aggregate_scores) - result.val_aggregate_scores[0] +``` + +## Further Reading + +- [GEPA Paper](https://arxiv.org/abs/2507.19457) +- [GEPA Documentation](https://dspy.ai/api/optimizers/GEPA/overview/) +- [Creating Environments](environments.md) + diff --git a/environments/wordle/wordle.py b/environments/wordle/wordle.py index a1d7052c3..d14e63404 100644 --- a/environments/wordle/wordle.py +++ b/environments/wordle/wordle.py @@ -1,5 +1,6 @@ import verifiers as vf 
from verifiers.envs.textarena_env import TextArenaEnv +from verifiers.types import RewardResult ### prompt @@ -18,15 +19,26 @@ def wordle_feedback_fn(observation: str) -> str: ### reward functions -def check_answer_reward_func(parser, completion, answer, **kwargs) -> float: +def check_answer_reward_func(parser, completion, answer, **kwargs) -> RewardResult: + """Check if the guess is correct and provide feedback.""" guess = parser.parse_answer(completion) - return 1.0 if guess == "[" + answer + "]" else 0.0 + correct = guess == "[" + answer + "]" + + # Return dict with score and feedback (for GEPA optimization) + return { + "score": 1.0 if correct else 0.0, + "feedback": ( + f"{'✓ Correct!' if correct else '✗ Incorrect.'} " + f"Expected: {answer}, Got: {guess}" + ), + } def count_turns_reward_func(parser, completion, answer, **kwargs) -> float: num_turns = len([x for x in completion if x["role"] == "assistant"]) - is_correct = check_answer_reward_func(parser, completion, answer, **kwargs) - return is_correct / (num_turns + 1) + result = check_answer_reward_func(parser, completion, answer, **kwargs) + score = result["score"] if isinstance(result, dict) else result + return score / (num_turns + 1) def partial_credit_reward_func(parser, completion, answer, **kwargs) -> float: diff --git a/integrations/gepa/README.md b/integrations/gepa/README.md index e85fea091..2bd277e58 100644 --- a/integrations/gepa/README.md +++ b/integrations/gepa/README.md @@ -1,10 +1,214 @@ -# GEPA Integration for Verifiers +# GEPA Integration -Optimize system prompts and tool descriptions using GEPA (Genetic-Pareto evolutionary algorithm). +GEPA (Gradient-free Evolutionary Prompt Adaptation) integration for Verifiers environments. -## Learn More +## Overview -- **GEPA Paper**: [arXiv:2507.19457](https://arxiv.org/abs/2507.19457) -- **GEPA Repository**: [github.com/gepa-ai/gepa](https://github.com/gepa-ai/gepa) -- **DSPy GEPA**: [dspy.ai/tutorials/gepa_ai_program](https://dspy.ai/tutorials/gepa_ai_program/) -- **Verifiers Documentation**: [verifiers.readthedocs.io](https://verifiers.readthedocs.io/) +This integration enables automatic prompt optimization using GEPA, a reflection-based optimization system that improves prompts by analyzing rubric feedback. GEPA works by: + +1. Running your environment with current prompts +2. Collecting rich feedback from rubric evaluations +3. Using an LLM to reflect on failures and propose improvements +4. Iteratively refining prompts until convergence + +## Installation + +```bash +uv sync --extra gepa +``` + +This installs the `gepa` package (>=0.0.22). 
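+
+To confirm the extra resolved (an optional sanity check; any equivalent import test works):
+
+```python
+import importlib.metadata
+
+print(importlib.metadata.version("gepa"))  # expect 0.0.22 or newer
+```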
+ +## Quick Start + +Optimize a system prompt: + +```bash +vf-gepa wordle --auto medium +``` + +Optimize system prompt + tool descriptions: + +```bash +vf-gepa wiki-search --auto heavy --components system_prompt tool_descriptions +``` + +## Components + +### `adapter.py` + +The `GEPAAdapter` class bridges Verifiers environments to GEPA's optimization protocol: + +- **Component management**: Extracts and injects optimizable components (system prompts, tool descriptions) +- **Evaluation**: Runs rollouts and collects scores +- **Feedback generation**: Converts rubric feedback into reflection data +- **Tool optimization**: Splits tool descriptions into separate optimizable components + +### Key Methods + +```python +from verifiers.adapters.gepa import GEPAAdapter + +adapter = GEPAAdapter( + env=vf_env, + client=async_client, + model="gpt-4o-mini", + sampling_args={"temperature": 1.0}, + components_to_optimize=["system_prompt"], +) + +# Build new environment with optimized components +new_env = adapter.build_program({"system_prompt": "Optimized prompt..."}) + +# Evaluate candidate prompts +results = adapter.evaluate(batch, candidate, capture_traces=True) + +# Generate reflection dataset for GEPA +reflective_data = adapter.make_reflective_dataset(candidate, results, components) +``` + +## Rubric Feedback + +GEPA works best when reward functions return structured feedback: + +```python +def accuracy_with_feedback(parser, completion, answer, **kwargs): + guess = parser.parse_answer(completion) + correct = (guess == answer) + + return { + "score": 1.0 if correct else 0.0, + "feedback": f"Expected: {answer}, Got: {guess}. {explain_why(...)}" + } + +rubric = vf.Rubric(parser=parser) +rubric.add_reward_func(accuracy_with_feedback) +``` + +The `feedback` field provides context GEPA uses to understand failures and generate better prompts. Without it, GEPA only sees numeric scores. + +## Tool Description Optimization + +When optimizing `tool_descriptions`, the adapter: + +1. Extracts each tool's description from `env.oai_tools` +2. Creates separate components: `tool_0_description`, `tool_1_description`, etc. +3. Optimizes each independently through GEPA's reflection process +4. 
Reconstructs `oai_tools` with improved descriptions + +Example: + +```bash +vf-gepa my-env --components tool_descriptions --auto medium +``` + +## Architecture + +``` +┌─────────────────┐ +│ GEPA Engine │ +│ (reflection + │ +│ proposals) │ +└────────┬────────┘ + │ + ├─ evaluate() + ├─ make_reflective_dataset() + └─ build_program() + │ +┌────────▼────────┐ +│ GEPAAdapter │ +│ (integrations/ │ +│ gepa) │ +└────────┬────────┘ + │ + ├─ rollout() + ├─ score_rollout() + └─ get_feedback() + │ +┌────────▼────────┐ +│ Verifiers Env │ +│ (dataset + │ +│ rubric) │ +└─────────────────┘ +``` + +## Configuration + +### Budget Modes + +- **light** (~6 candidates): Fast iteration, ~5-10 min +- **medium** (~12 candidates): Balanced, ~15-30 min +- **heavy** (~18 candidates): Thorough, ~30-60 min + +### Dataset Sizes + +- Training: 50-100 examples (more = slower but potentially better) +- Validation: 20-30 examples (for measuring improvement) + +### Models + +- **Task model** (being optimized): `gpt-4o-mini`, `gpt-4o`, or custom +- **Reflection model** (generating proposals): `gpt-4o` recommended + +## Output + +GEPA saves results to `./gepa_results///`: + +- `_optimized.json` - Optimized components +- `_original.json` - Original components (for comparison) +- `_metrics.json` - Optimization metrics and history + +## Implementation Notes + +### Packaging + +The GEPA adapter ships inside the `verifiers.adapters` package so it is available to `pip install verifiers` users. The legacy `integrations/gepa` module re-exports the same class for backward compatibility inside this repository. + +### Feedback Collection + +The base `Rubric` class automatically collects feedback when reward functions return dicts with `"feedback"` keys. The adapter checks for `rubric.get_feedback(state)` to retrieve combined feedback from all functions. 
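+
+A rough sketch of the combined feedback string, based on the `Rubric.get_feedback` implementation added in this PR (the `state` dict below is a hand-built stand-in for a real scored rollout state):
+
+```python
+import verifiers as vf
+
+rubric = vf.Rubric()
+state = {
+    "reward": 0.5,
+    "feedbacks": ["check_answer: Expected: CRANE, Got: CRATE"],
+}
+print(rubric.get_feedback(state))
+# Score: 50.00%
+#
+# check_answer: Expected: CRANE, Got: CRATE
+```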
+ +### Error Handling + +The adapter validates: +- Environment has requested components (`system_prompt`, `oai_tools`) +- Tool descriptions can only be optimized if environment has tools +- Reflection datasets require `capture_traces=True` + +## CLI Reference + +Full documentation: [`docs/source/gepa.md`](../../docs/source/gepa.md) + +```bash +# Basic +vf-gepa ENV_ID --auto light|medium|heavy + +# Advanced +vf-gepa ENV_ID \ + --max-metric-calls 1000 \ + -n 100 --num-val 30 \ + --components system_prompt tool_descriptions \ + -m gpt-4o \ + --reflection-model gpt-4o \ + --rollouts-per-example 3 + +# Options + -n, --num-examples Training examples (default: 50) + --num-val Validation examples (default: 20) + --auto Budget: light/medium/heavy + --max-metric-calls Custom budget (total metric calls) + --components What to optimize (default: system_prompt) + -m, --model Task model (default: gpt-4o-mini) + --reflection-model Reflection model (default: gpt-4o) + -T, --temperature Task model temperature (default: 1.0) + -t, --max-tokens Max tokens (default: 8096) + --track-stats Save detailed statistics + -v, --verbose Verbose logging +``` + +## Links + +- [GEPA Documentation](../../docs/source/gepa.md) - Complete usage guide +- [GEPA Paper](https://arxiv.org/abs/2507.19457) - Original research +- [GEPA API Docs](https://dspy.ai/api/optimizers/GEPA/overview/) - DSPy reference +- [Creating Environments](../../docs/source/environments.md) - Build custom environments diff --git a/pyproject.toml b/pyproject.toml index 662a4b0d3..5ffb9f45b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,6 +80,9 @@ envs = [ "nltk", "textarena", ] +gepa = [ + "gepa>=0.0.22", +] all = [ "torch>=2.8.0", "transformers", @@ -95,6 +98,7 @@ all = [ "brave-search", "nltk", "textarena", + "gepa>=0.0.22", ] docs = [ "sphinx", @@ -110,6 +114,7 @@ flash-attn = { FLASH_ATTENTION_SKIP_CUDA_BUILD = "TRUE" } [project.scripts] vf-eval = "verifiers.scripts.eval:main" +vf-gepa = "verifiers.scripts.gepa:main" vf-init = "verifiers.scripts.init:main" vf-install = "verifiers.scripts.install:main" vf-setup = "verifiers.scripts.setup:main" diff --git a/tests/test_gepa.py b/tests/test_gepa.py new file mode 100644 index 000000000..7d4578ef2 --- /dev/null +++ b/tests/test_gepa.py @@ -0,0 +1,375 @@ +""" +Tests for GEPA integration: Rubric feedback support and GEPAAdapter. 
+""" + +import pytest +from unittest.mock import AsyncMock, MagicMock, patch + +import verifiers as vf +from verifiers.types import RewardResult, State + + +def require_gepa_adapter(): + """Import GEPAAdapter or skip tests if the module is unavailable.""" + module = pytest.importorskip("verifiers.adapters.gepa") + return module.GEPAAdapter + + +class TestRubricFeedback: + """Tests for Rubric class feedback support.""" + + def test_rubric_with_dict_return(self): + """Test Rubric with reward function returning dict.""" + + def reward_with_feedback(completion, answer, **kwargs) -> RewardResult: + correct = completion == answer + return { + "score": 1.0 if correct else 0.0, + "feedback": f"Expected: {answer}, Got: {completion}", + } + + rubric = vf.Rubric() + rubric.add_reward_func(reward_with_feedback) + + assert len(rubric.funcs) == 1 + assert rubric.funcs[0] == reward_with_feedback + + def test_rubric_with_float_return(self): + """Test Rubric with reward function returning float (backward compat).""" + + def simple_reward(completion, answer, **kwargs) -> float: + return 1.0 if completion == answer else 0.0 + + rubric = vf.Rubric() + rubric.add_reward_func(simple_reward) + + assert len(rubric.funcs) == 1 + assert rubric.funcs[0] == simple_reward + + def test_rubric_mixed_functions(self): + """Test Rubric with mix of dict and float returning functions.""" + + def reward_with_feedback(completion, answer, **kwargs) -> RewardResult: + return { + "score": 1.0 if completion == answer else 0.0, + "feedback": "Detailed feedback", + } + + def simple_reward(completion, **kwargs) -> float: + return 0.5 + + rubric = vf.Rubric() + rubric.add_reward_func(reward_with_feedback, weight=1.0) + rubric.add_reward_func(simple_reward, weight=0.5) + + assert len(rubric.funcs) == 2 + + @pytest.mark.asyncio + async def test_get_feedback_with_feedbacks(self): + """Test get_feedback when state has feedbacks.""" + rubric = vf.Rubric() + + state = State(input={}) + state["reward"] = 0.75 + state["feedbacks"] = [ + "reward_1: Good job!", + "reward_2: Could be better", + ] + + feedback = rubric.get_feedback(state) + + assert "0.75" in feedback or "75" in feedback # Score percentage + assert "Good job!" 
in feedback + assert "Could be better" in feedback + + @pytest.mark.asyncio + async def test_get_feedback_without_feedbacks(self): + """Test get_feedback when state has no feedbacks (fallback).""" + rubric = vf.Rubric() + + state = State(input={}) + state["reward"] = 0.5 + + feedback = rubric.get_feedback(state) + + assert "0.5" in feedback or "50" in feedback + assert "no detailed feedback" in feedback.lower() + + +class TestGEPAAdapter: + """Tests for GEPAAdapter class.""" + + def test_gepa_adapter_initialization(self): + """Test GEPAAdapter initializes correctly.""" + GEPAAdapter = require_gepa_adapter() + + # Create mock environment + env = MagicMock(spec=vf.SingleTurnEnv) + env.system_prompt = "Test prompt" + env.dataset = None + env.eval_dataset = None + env.parser = vf.Parser() + env.rubric = vf.Rubric() + env.sampling_args = {} + env.message_type = "chat" + env.max_workers = 512 + + client = AsyncMock() + + adapter = GEPAAdapter( + env=env, + client=client, + model="gpt-4o-mini", + sampling_args={"temperature": 1.0}, + components_to_optimize=["system_prompt"], + ) + + assert adapter.base_env == env + assert adapter.model == "gpt-4o-mini" + assert "system_prompt" in adapter.components_to_optimize + + def test_gepa_adapter_tool_descriptions_validation(self): + """Test GEPAAdapter validates tool_descriptions component.""" + GEPAAdapter = require_gepa_adapter() + + # Create mock environment WITHOUT tools + env = MagicMock(spec=vf.SingleTurnEnv) + env.system_prompt = "Test prompt" + env.oai_tools = None + + client = AsyncMock() + + # Should raise error when trying to optimize tool_descriptions without tools + with pytest.raises(ValueError, match="no tools"): + GEPAAdapter( + env=env, + client=client, + model="gpt-4o-mini", + sampling_args={}, + components_to_optimize=["tool_descriptions"], + ) + + def test_gepa_adapter_build_program(self): + """Test GEPAAdapter.build_program creates new environment with updated components.""" + GEPAAdapter = require_gepa_adapter() + + # Create real environment + dataset = vf.load_example_dataset(n=5) + env = vf.SingleTurnEnv( + dataset=dataset, + system_prompt="Original prompt", + rubric=vf.Rubric(), + ) + + client = AsyncMock() + + adapter = GEPAAdapter( + env=env, + client=client, + model="gpt-4o-mini", + sampling_args={}, + components_to_optimize=["system_prompt"], + ) + + # Build new program with updated system_prompt + candidate = {"system_prompt": "Optimized prompt"} + new_env = adapter.build_program(candidate) + + assert new_env.system_prompt == "Optimized prompt" + assert new_env.system_prompt != env.system_prompt + + def test_gepa_adapter_extract_seed_candidate(self): + """Test extracting seed candidate from environment.""" + dataset = vf.load_example_dataset(n=5) + env = vf.SingleTurnEnv( + dataset=dataset, + system_prompt="Test prompt", + rubric=vf.Rubric(), + ) + + # Verify we can extract the system_prompt + assert hasattr(env, "system_prompt") + assert env.system_prompt == "Test prompt" + + def test_gepa_adapter_evaluate_uses_generate(self): + """Integration test ensuring evaluate() calls env.generate correctly.""" + GEPAAdapter = require_gepa_adapter() + + base_env = MagicMock(spec=vf.Environment) + base_env.dataset = None + base_env.eval_dataset = None + base_env.parser = vf.Parser() + base_env.rubric = vf.Rubric() + base_env.sampling_args = {} + base_env.message_type = "chat" + base_env.max_workers = 1 + base_env.system_prompt = "Base system" + base_env.few_shot = None + base_env.env_id = "stub-env" + base_env.oai_tools = [] + + adapter 
= GEPAAdapter( + env=base_env, + client=AsyncMock(), + model="stub-model", + sampling_args={"temperature": 0.1}, + components_to_optimize=["system_prompt"], + num_rollouts_per_example=1, + ) + + class StubEnv: + def __init__(self): + self.dataset = None + self.eval_dataset = None + self.parser = base_env.parser + self.rubric = base_env.rubric + self.sampling_args = {} + self.message_type = "chat" + self.system_prompt = "Stub system" + self.few_shot = None + self.env_id = "stub-env" + self.max_workers = 1 + self.oai_tools = [] + self.last_inputs = None + + async def generate( + self, + inputs, + client, + model, + sampling_args=None, + max_concurrent=-1, + use_tqdm=True, + ): + self.last_inputs = inputs + return { + "completion": [[{"role": "assistant", "content": "42"}]], + "state": [ + { + "prompt": [ + {"role": "system", "content": "Stub system"}, + {"role": "user", "content": "What is 6*7?"}, + ], + "completion": [{"role": "assistant", "content": "42"}], + "reward": 0.9, + } + ], + "reward": [0.9], + } + + stub_env = StubEnv() + batch = [ + { + "question": "What is 6*7?", + "answer": "42", + "task": "math", + "info": {}, + } + ] + + with patch.object(adapter, "build_program", return_value=stub_env): + result = adapter.evaluate( + batch, candidate={"system_prompt": "Stub system"}, capture_traces=True + ) + + assert stub_env.last_inputs is not None + assert stub_env.last_inputs[0]["task"] == "math" + # Prompt should include system + user messages + assert isinstance(stub_env.last_inputs[0]["prompt"], list) + assert stub_env.last_inputs[0]["prompt"][-1]["content"] == "What is 6*7?" + + assert result.scores == [0.9] + assert result.outputs == [[{"role": "assistant", "content": "42"}]] + assert result.trajectories is not None + assert result.trajectories[0]["score"] == 0.9 + + +class TestRubricDictSupport: + """Tests for base Rubric class dict return support.""" + + @pytest.mark.asyncio + async def test_rubric_score_rollout_with_dict_return(self): + """Test that score_rollout handles dict returns from reward functions.""" + + def reward_with_feedback(completion, answer, **kwargs) -> RewardResult: + return { + "score": 0.8, + "feedback": "Good answer", + } + + rubric = vf.Rubric() + rubric.add_reward_func(reward_with_feedback) + + # Create minimal state + state = State( + input={ + "prompt": [{"role": "user", "content": "test"}], + "example_id": 0, + "task": "test", + "answer": "correct", + } + ) + state["prompt"] = [{"role": "user", "content": "test"}] + state["completion"] = [{"role": "assistant", "content": "response"}] + state["task"] = "test" + state["timing"] = {"scoring_ms": 0.0, "total_ms": 0.0} + + # Mock score_sem + from contextlib import asynccontextmanager + + @asynccontextmanager + async def mock_sem(): + yield + + await rubric.score_rollout(state, score_sem=mock_sem()) + + # Check that reward was extracted correctly + assert state["reward"] == 0.8 + assert "reward_with_feedback" in state["metrics"] + assert state["metrics"]["reward_with_feedback"] == 0.8 + + # Check that feedback was stored + assert "feedbacks" in state + assert len(state["feedbacks"]) == 1 + assert "Good answer" in state["feedbacks"][0] + + @pytest.mark.asyncio + async def test_rubric_score_rollout_with_float_return(self): + """Test that score_rollout still handles float returns (backward compat).""" + + def simple_reward(completion, answer, **kwargs) -> float: + return 0.5 + + rubric = vf.Rubric() + rubric.add_reward_func(simple_reward) + + # Create minimal state + state = State( + input={ + "prompt": 
[{"role": "user", "content": "test"}], + "example_id": 0, + "task": "test", + "answer": "correct", + } + ) + state["prompt"] = [{"role": "user", "content": "test"}] + state["completion"] = [{"role": "assistant", "content": "response"}] + state["task"] = "test" + state["timing"] = {"scoring_ms": 0.0, "total_ms": 0.0} + + from contextlib import asynccontextmanager + + @asynccontextmanager + async def mock_sem(): + yield + + await rubric.score_rollout(state, score_sem=mock_sem()) + + # Check that reward was extracted correctly + assert state["reward"] == 0.5 + assert "simple_reward" in state["metrics"] + assert state["metrics"]["simple_reward"] == 0.5 + + # Feedbacks should be empty for float returns + assert "feedbacks" in state + assert len(state["feedbacks"]) == 0 diff --git a/tests/test_rubric.py b/tests/test_rubric.py index a58b3c064..5d4d8f806 100644 --- a/tests/test_rubric.py +++ b/tests/test_rubric.py @@ -218,6 +218,36 @@ def list_func(completion, **kwargs): assert state["metrics"]["list_func"] == 2.0 # Length of completion list assert state["reward"] == 2.0 + @pytest.mark.asyncio + async def test_reward_result_missing_score_raises(self): + """RewardResult dicts must include a score key.""" + + def bad_reward(completion, **kwargs): + return {"feedback": "oops"} + + rubric = Rubric(funcs=[bad_reward]) + + state = State( + input=RolloutInput( + prompt="prompt", + answer="answer", + task="task", + example_id=0, + ) + ) + state["completion"] = "prediction" + state["trajectory"] = [] + state["timing"] = { + "generation_ms": 0.0, + "scoring_ms": 0.0, + "total_ms": 0.0, + "start_time": 0.0, + } + score_sem = NullAsyncContext() + + with pytest.raises(ValueError, match="missing required 'score'"): + await rubric.score_rollout(state, score_sem) + @pytest.mark.asyncio async def test_score_rollouts_multiple(self): """Test scoring multiple rollouts using score_group.""" @@ -276,6 +306,83 @@ def length_func(completion, **kwargs): assert states[1]["metrics"]["length_func"] == 7.0 assert states[2]["metrics"]["length_func"] == 5.0 + @pytest.mark.asyncio + async def test_score_group_handles_reward_result_dicts(self): + """Ensure score_group handles RewardResult outputs from individual funcs.""" + + def reward_with_feedback(completion, **kwargs): + return {"score": 0.25, "feedback": "ok"} + + rubric = Rubric(funcs=[reward_with_feedback], weights=[2.0]) + + state = State( + input=RolloutInput( + prompt="prompt", + answer="answer", + task="task", + example_id=0, + ) + ) + state["completion"] = "prediction" + state["trajectory"] = [] + state["timing"] = { + "generation_ms": 0.0, + "scoring_ms": 0.0, + "total_ms": 0.0, + "start_time": 0.0, + } + score_sem = NullAsyncContext() + + await rubric.score_group([state], score_sem) + + assert state["metrics"]["reward_with_feedback"] == pytest.approx(0.25) + assert state["reward"] == pytest.approx(0.5) + + @pytest.mark.asyncio + async def test_group_reward_func_handles_dict_scores(self): + """Ensure group-level reward functions can emit RewardResult dicts.""" + + def group_reward(states, **kwargs): + return [{"score": 0.1}, {"score": 0.2}] + + rubric = Rubric(funcs=[group_reward], weights=[1.0]) + + states = [ + State( + input=RolloutInput( + prompt="p1", + answer="a1", + task="t1", + example_id=0, + ) + ), + State( + input=RolloutInput( + prompt="p2", + answer="a2", + task="t2", + example_id=1, + ) + ), + ] + for state in states: + state["completion"] = "resp" + state["trajectory"] = [] + state["timing"] = { + "generation_ms": 0.0, + "scoring_ms": 0.0, + 
"total_ms": 0.0, + "start_time": 0.0, + } + + score_sem = NullAsyncContext() + await rubric.score_group(states, score_sem) + + assert states[0]["metrics"]["group_reward"] == pytest.approx(0.1) + assert states[1]["metrics"]["group_reward"] == pytest.approx(0.2) + assert states[0]["reward"] == pytest.approx(0.1) + assert states[1]["reward"] == pytest.approx(0.2) + @pytest.mark.asyncio async def test_score_rollouts_with_apply_weights(self): """Test scoring rollouts - weights always applied via score_group.""" diff --git a/verifiers/adapters/__init__.py b/verifiers/adapters/__init__.py new file mode 100644 index 000000000..9f02635fe --- /dev/null +++ b/verifiers/adapters/__init__.py @@ -0,0 +1,5 @@ +"""Adapters that bridge Verifiers with external optimization systems.""" + +from .gepa import GEPAAdapter + +__all__ = ["GEPAAdapter"] diff --git a/verifiers/adapters/gepa/__init__.py b/verifiers/adapters/gepa/__init__.py new file mode 100644 index 000000000..cdff1d841 --- /dev/null +++ b/verifiers/adapters/gepa/__init__.py @@ -0,0 +1,5 @@ +"""GEPA adapter packaged for verifiers installations.""" + +from .adapter import GEPAAdapter + +__all__ = ["GEPAAdapter"] diff --git a/verifiers/adapters/gepa/adapter.py b/verifiers/adapters/gepa/adapter.py new file mode 100644 index 000000000..15ab9b405 --- /dev/null +++ b/verifiers/adapters/gepa/adapter.py @@ -0,0 +1,464 @@ +""" +GEPAAdapter: Bridge between Verifiers Environment API and GEPA optimization. + +This adapter implements the GEPAAdapter protocol from the gepa package, +enabling automatic optimization of environment text components (system_prompt, +tool descriptions, etc.) through reflection-based evolution. +""" + +import asyncio +import inspect +import logging +from copy import deepcopy +from typing import Any + +from statistics import fmean +from gepa import EvaluationBatch, GEPAAdapter as BaseGEPAAdapter +from openai import AsyncOpenAI + +import verifiers as vf +from verifiers.types import Messages, RolloutInput + +logger = logging.getLogger(__name__) + + +class GEPAAdapter(BaseGEPAAdapter): + """ + Adapter bridging Verifiers Environment API to GEPA optimization. + + Key responsibilities: + - Component management: Extract/inject text components (system_prompt, tool descriptions) + - Evaluation: Run rollouts and collect scores + - Feedback generation: Convert rubric scores + state to GEPA feedback + - Dataset conversion: HF Dataset → GEPA format + + Args: + env: Base Verifiers Environment to optimize + client: AsyncOpenAI client for model inference + model: Model name to optimize + sampling_args: Sampling configuration (temperature, max_tokens, etc.) 
+ components_to_optimize: List of component names (e.g., ["system_prompt", "tool_descriptions"]) + num_rollouts_per_example: Number of rollouts per example for evaluation + max_concurrent: Maximum concurrent rollout evaluations + """ + + def __init__( + self, + env: vf.Environment, + client: AsyncOpenAI, + model: str, + sampling_args: dict[str, Any], + components_to_optimize: list[str] | None = None, + num_rollouts_per_example: int = 1, + max_concurrent: int = 32, + ): + self.base_env = env + self.client = client + self.model = model + self.sampling_args = sampling_args + self.components_to_optimize = components_to_optimize or ["system_prompt"] + self.num_rollouts_per_example = num_rollouts_per_example + self.max_concurrent = max_concurrent + + if self.num_rollouts_per_example < 1: + raise ValueError("num_rollouts_per_example must be at least 1") + if self.num_rollouts_per_example > 10: + logger.warning( + "num_rollouts_per_example=%s may be costly; " + "expect roughly %sx more rollouts per batch", + self.num_rollouts_per_example, + self.num_rollouts_per_example, + ) + + # Validate components + if "tool_descriptions" in self.components_to_optimize: + if not hasattr(env, "oai_tools") or not env.oai_tools: + raise ValueError( + "Cannot optimize tool_descriptions: environment has no tools" + ) + + for comp in self.components_to_optimize: + if comp not in ["system_prompt", "tool_descriptions"]: + if not hasattr(env, comp): + raise ValueError( + f"Environment does not have component '{comp}'. " + f"Available: system_prompt, tool_descriptions" + ) + + logger.info( + f"Initialized GEPAAdapter for {len(self.components_to_optimize)} components: " + f"{self.components_to_optimize}" + ) + + def build_program(self, candidate: dict[str, str]) -> vf.Environment: + """ + Reconstruct a fresh Environment instance with updated components. 
+ """ + env_class = self.base_env.__class__ + signature = inspect.signature(env_class.__init__) + accepts_kwargs = any( + param.kind == inspect.Parameter.VAR_KEYWORD + for param in signature.parameters.values() + ) + + init_kwargs: dict[str, Any] = {} + post_init_overrides: dict[str, Any] = {} + + # Preserve constructor arguments present on the base environment + for param_name in signature.parameters: + if param_name == "self": + continue + if hasattr(self.base_env, param_name): + value = getattr(self.base_env, param_name) + if isinstance(value, (dict, list)): + init_kwargs[param_name] = deepcopy(value) + else: + init_kwargs[param_name] = value + + # Ensure core Environment parameters are forwarded when available + # BUT only if they're explicitly in the specific environment's signature + # (Some envs like TextArenaEnv create dataset/eval_dataset internally) + env_signature = inspect.signature(vf.Environment.__init__) + env_param_names = [ + name for name in env_signature.parameters if name not in {"self", "kwargs"} + ] + for param_name in env_param_names: + if param_name in init_kwargs: + continue + # Only add if explicitly in the environment's signature + # Skip if only accepted via **kwargs + if param_name not in signature.parameters: + continue + if not hasattr(self.base_env, param_name): + continue + value = getattr(self.base_env, param_name) + if isinstance(value, (dict, list)): + init_kwargs[param_name] = deepcopy(value) + else: + init_kwargs[param_name] = value + + updated_oai_tools = None + if ( + "tool_descriptions" in self.components_to_optimize + and hasattr(self.base_env, "oai_tools") + and self.base_env.oai_tools + ): + updated_oai_tools = deepcopy(self.base_env.oai_tools) + for i, tool in enumerate(updated_oai_tools): + tool_desc_key = f"tool_{i}_description" + if tool_desc_key in candidate: + tool["function"]["description"] = candidate[tool_desc_key] + init_kwargs["oai_tools"] = updated_oai_tools + + # Override constructor args with candidate values when applicable + for comp_name, comp_value in candidate.items(): + if comp_name.startswith("tool_") and comp_name.endswith("_description"): + continue + if comp_name in signature.parameters or accepts_kwargs: + init_kwargs[comp_name] = comp_value + else: + post_init_overrides[comp_name] = comp_value + + try: + new_env = env_class(**init_kwargs) + except TypeError as exc: + raise ValueError( + f"Failed to reconstruct {env_class.__name__} with optimized components. " + f"Error: {exc}" + ) from exc + + for attr_name, attr_value in post_init_overrides.items(): + setattr(new_env, attr_name, attr_value) + + if updated_oai_tools is not None: + new_env.oai_tools = updated_oai_tools + + return new_env + + def evaluate( + self, + batch: list[dict], + candidate: dict[str, str], + capture_traces: bool = False, + ) -> EvaluationBatch: + """ + Evaluate candidate on batch of examples. 
+ + Args: + batch: List of examples (dicts with 'question', 'answer', 'info', 'task') + candidate: Dict of component values to evaluate + capture_traces: Whether to capture detailed execution traces + + Returns: + EvaluationBatch with outputs, scores, and optional trajectories + """ + # Build environment with candidate components + env = self.build_program(candidate) + + # Run evaluation using Environment's evaluate method + evaluation = self._evaluate_async(env, batch, capture_traces) + try: + asyncio.get_running_loop() + except RuntimeError: + return asyncio.run(evaluation) + + raise RuntimeError( + "GEPAAdapter.evaluate() cannot run inside an active asyncio loop. " + "Use 'await adapter.evaluate_async(...)' instead." + ) + + async def evaluate_async( + self, + batch: list[dict], + candidate: dict[str, str], + capture_traces: bool = False, + ) -> EvaluationBatch: + """ + Evaluate candidate asynchronously. + + Preferred when the caller already manages an asyncio loop (e.g., notebooks, + services). Mirrors the synchronous evaluate() contract. + """ + env = self.build_program(candidate) + return await self._evaluate_async(env, batch, capture_traces) + + async def _evaluate_async( + self, env: vf.Environment, batch: list[dict], capture_traces: bool + ) -> EvaluationBatch: + """Async helper for evaluation.""" + rollout_inputs = self._build_rollout_inputs(env, batch) + if not rollout_inputs: + logger.warning("Empty evaluation batch received by GEPAAdapter") + return EvaluationBatch( + outputs=[], scores=[], trajectories=[] if capture_traces else None + ) + + generate_outputs = await env.generate( + inputs=rollout_inputs, + client=self.client, + model=self.model, + sampling_args=self.sampling_args, + max_concurrent=self.max_concurrent, + use_tqdm=False, + ) + + completions = generate_outputs["completion"] + states = generate_outputs["state"] + rewards = generate_outputs["reward"] + + scores = [float(score) if score is not None else 0.0 for score in rewards] + trajectories = [] if capture_traces else None + + if capture_traces: + for completion, state, score in zip(completions, states, scores): + trajectories.append( + { + "completion": completion, + "state": state, + "score": score, + } + ) + + mean_score = fmean(scores) if scores else 0.0 + logger.debug( + f"Evaluation complete: {len(scores)} rollouts, " + f"mean={mean_score:.4f}, min={min(scores) if scores else 0:.4f}, " + f"max={max(scores) if scores else 0:.4f}" + ) + + return EvaluationBatch( + outputs=completions, + scores=scores, + trajectories=trajectories, + ) + + def _build_rollout_inputs( + self, env: vf.Environment, batch: list[dict] + ) -> list[RolloutInput]: + """ + Convert GEPA batch examples into Verifiers RolloutInput objects. + + Handles prompt normalization, example/task bookkeeping, answer passthrough, + and optional info payloads while duplicating entries according to + num_rollouts_per_example so downstream generate() calls receive independent + rollout inputs. 
+ """ + rollout_inputs: list[RolloutInput] = [] + + for example_idx, example in enumerate(batch): + raw_prompt = example.get("prompt") or example.get("question") or "" + formatted_prompt = self._format_prompt(env, raw_prompt) + task = str(example.get("task") or env.env_id or "default") + + example_id_value = example.get("example_id", example_idx) + try: + example_id = int(example_id_value) + except (TypeError, ValueError): + example_id = example_idx + + base_input: RolloutInput = { + "prompt": formatted_prompt, + "task": task, + "example_id": example_id, + } + + if "answer" in example and example["answer"] is not None: + base_input["answer"] = example["answer"] + + info = example.get("info") + if info is not None: + base_input["info"] = deepcopy(info) + + for _ in range(self.num_rollouts_per_example): + rollout_inputs.append(deepcopy(base_input)) + + return rollout_inputs + + def _format_prompt(self, env: vf.Environment, prompt: str | Messages) -> Messages: + """ + Ensure prompts match the environment's declared message_type. + + Completion environments expect raw strings, so chat-style prompts are + flattened into a single string. Chat environments expect structured + message lists, so bare strings are wrapped with system/few-shot context. + """ + if env.message_type == "completion": + if isinstance(prompt, str): + return prompt + if isinstance(prompt, list): + content_parts: list[str] = [] + for message in prompt: + if isinstance(message, dict): + content = message.get("content") + if isinstance(content, str): + content_parts.append(content) + return " ".join(content_parts) if content_parts else str(prompt) + return str(prompt) + + if isinstance(prompt, list): + return prompt + + messages: list[dict[str, str]] = [] + if env.system_prompt: + messages.append({"role": "system", "content": env.system_prompt}) + if env.few_shot: + messages.extend(deepcopy(env.few_shot)) + messages.append({"role": "user", "content": str(prompt)}) + return messages + + def make_reflective_dataset( + self, + candidate: dict[str, str], + eval_batch: EvaluationBatch, + components_to_update: list[str], + ) -> dict[str, list[dict]]: + """ + Generate reflective dataset for GEPA's proposal phase. 
+ + Each reflective example contains: + - Inputs: Original prompt/task context + - Generated_Outputs: Model completion + - Feedback: Textual explanation of score + + Args: + candidate: Current candidate being evaluated + eval_batch: Results from evaluate() + components_to_update: Which components to generate feedback for + + Returns: + Dict mapping component_name → list[ReflectiveExample] + """ + if not eval_batch.trajectories: + raise ValueError( + "make_reflective_dataset requires capture_traces=True in evaluate()" + ) + + reflective_data: dict[str, list[dict]] = {} + + # For environment-level components (like system_prompt), all examples + # reflect on the same component, so we aggregate feedback across examples + for comp_name in components_to_update: + if comp_name not in self.components_to_optimize: + continue + + examples = [] + + for traj in eval_batch.trajectories: + completion = traj["completion"] + state = traj["state"] + score = traj["score"] + + # Extract prompt for context + prompt = state.get("prompt", "") + if isinstance(prompt, list): + # Chat format - extract user message + user_msgs = [m for m in prompt if m.get("role") == "user"] + prompt_text = user_msgs[-1].get("content", "") if user_msgs else "" + else: + prompt_text = prompt + + # Extract completion text + if isinstance(completion, list): + # Chat format + asst_msgs = [m for m in completion if m.get("role") == "assistant"] + completion_text = ( + asst_msgs[-1].get("content", "") if asst_msgs else "" + ) + else: + completion_text = completion + + # Build inputs dict + inputs = { + "Task": prompt_text, + } + + # Build outputs + generated_outputs = completion_text + + # Generate feedback - use rubric's get_feedback if available + if hasattr(self.base_env.rubric, "get_feedback"): + feedback = self.base_env.rubric.get_feedback(state) + else: + # Default fallback for basic rubrics + feedback = f"Reward: {score:.3f}" + if score < 0.5: + feedback += " (Low score - needs improvement)" + elif score >= 0.8: + feedback += " (Good performance)" + + examples.append( + { + "Inputs": inputs, + "Generated Outputs": generated_outputs, + "Feedback": feedback, + } + ) + + reflective_data[comp_name] = examples + + if not reflective_data: + raise ValueError( + f"No reflective data generated for components: {components_to_update}" + ) + + # Log sample feedback for debugging + for comp_name, examples in reflective_data.items(): + logger.debug("\n%s\nComponent: %s", "=" * 80, comp_name) + logger.debug("Sample feedback (first example):") + if examples: + first_ex = examples[0] + logger.debug( + f" Task: {first_ex['Inputs'].get('Task', 'N/A')[:200]}..." 
+ ) + logger.debug(f" Output: {first_ex['Generated Outputs'][:200]}...") + logger.debug(f" Feedback: {first_ex['Feedback'][:500]}...") + + logger.info( + f"Generated reflective dataset with {sum(len(v) for v in reflective_data.values())} examples " + f"across {len(reflective_data)} components" + ) + + return reflective_data + + +__all__ = ["GEPAAdapter"] diff --git a/verifiers/envs/environment.py b/verifiers/envs/environment.py index db1f80399..f43c90d2a 100644 --- a/verifiers/envs/environment.py +++ b/verifiers/envs/environment.py @@ -431,6 +431,7 @@ async def init_state( total_ms=0.0, start_time=time.time(), ) + state["feedbacks"] = [] return state @abstractmethod diff --git a/verifiers/rubrics/rubric.py b/verifiers/rubrics/rubric.py index 327caf560..05a797a44 100644 --- a/verifiers/rubrics/rubric.py +++ b/verifiers/rubrics/rubric.py @@ -8,6 +8,7 @@ from verifiers.types import ( GroupRewardFunc, RewardFunc, + RewardResult, RolloutScore, State, ) @@ -98,15 +99,38 @@ def _get_individual_reward_weights(self) -> list[float]: if not self._is_group_func(func) ] + def _parse_reward_result( + self, func_name: str, result: float | RewardResult + ) -> tuple[float, str | None]: + """ + Normalize reward function outputs to (score, feedback). + + Raises: + ValueError: if a RewardResult dict omits the required "score" key. + """ + if isinstance(result, dict): + if "score" not in result: + raise ValueError( + f"RewardResult dict missing required 'score' key for {func_name}: {result}" + ) + score = float(result["score"]) + feedback = result.get("feedback") + return score, feedback + return float(result), None + async def _call_individual_reward_func( self, func: RewardFunc, state: State, score_sem: AsyncContextManager, - ) -> float: + ) -> float | RewardResult: """ Invoke `func` with only the required arguments. 
+ Reward functions can return either: + - float: backward compatible (no feedback) + - dict: {"score": float, "feedback": str} (for FeedbackRubric) + Example: ``` def func(completion, answer, **kwargs): @@ -128,22 +152,31 @@ async def _call(): merged.update(self.class_objects) if any(p.kind == p.VAR_KEYWORD for p in sig.parameters.values()): try: - ans = float(await maybe_await(func, **merged)) + result = await maybe_await(func, **merged) + # Handle both float and dict returns + if isinstance(result, dict): + return result + else: + return float(result) except Exception as e: self.logger.error( f"Error calling reward function {func.__name__}: {e}" # type: ignore[unresolved-attribute] ) - ans = 0.0 + return 0.0 else: allowed = {k: v for k, v in merged.items() if k in sig.parameters} try: - ans = float(await maybe_await(func, **allowed)) + result = await maybe_await(func, **allowed) + # Handle both float and dict returns + if isinstance(result, dict): + return result + else: + return float(result) except Exception as e: self.logger.error( f"Error calling reward function {func.__name__}: {e}" # type: ignore[unresolved-attribute] ) - ans = 0.0 - return ans + return 0.0 async with score_sem: return await _call() @@ -216,14 +249,20 @@ async def score_rollout(self, state: State, score_sem: AsyncContextManager): ) start_time = time.time() reward_scores = [] + feedbacks = [] # Collect feedback from functions that return dicts + for func in reward_funcs: - reward_scores.append( - await self._call_individual_reward_func( - func=func, - state=state, - score_sem=score_sem, - ) + result = await self._call_individual_reward_func( + func=func, + state=state, + score_sem=score_sem, ) + + score, feedback = self._parse_reward_result(func.__name__, result) + if feedback: + feedbacks.append(f"{func.__name__}: {feedback}") + reward_scores.append(score) + rewards = RolloutScore( metrics={ func.__name__: reward @@ -243,6 +282,32 @@ async def score_rollout(self, state: State, score_sem: AsyncContextManager): state["timing"]["total_ms"] += state["timing"]["scoring_ms"] state["reward"] = rewards["reward"] state["metrics"] = rewards["metrics"] + state["feedbacks"] = feedbacks # Store feedback for get_feedback() + + def get_feedback(self, state: State) -> str: + """ + Combine feedback from all reward functions into a single string. + + This method should be called after score_rollout() has been executed, + which populates state["feedbacks"]. 
+ + Args: + state: State dict containing execution results + + Returns: + Combined feedback string from all reward functions + """ + feedbacks = state.get("feedbacks", []) + + if not feedbacks: + # Fallback if no functions provided feedback + score = state.get("reward", 0.0) + return f"Score: {score:.2%} (no detailed feedback available)" + + # Combine all feedback with score summary + combined = f"Score: {state.get('reward', 0.0):.2%}\n\n" + combined += "\n\n".join(feedbacks) + return combined async def score_group(self, states: list[State], score_sem: AsyncContextManager): """ @@ -271,7 +336,13 @@ async def score_group(self, states: list[State], score_sem: AsyncContextManager) if func_name not in aggregated_metrics: aggregated_metrics[func_name] = [0.0] * num_states for i in range(num_states): - score_value = scores[i] + score_value, feedback = self._parse_reward_result( + func_name, scores[i] + ) + if feedback: + states[i].setdefault("feedbacks", []).append( + f"{func_name}: {feedback}" + ) aggregated_rewards[i] += score_value * weight aggregated_metrics[func_name][i] = score_value else: @@ -288,7 +359,13 @@ async def score_group(self, states: list[State], score_sem: AsyncContextManager) if func_name not in aggregated_metrics: aggregated_metrics[func_name] = [0.0] * num_states for i in range(num_states): - score_value = scores[i] + score_value, feedback = self._parse_reward_result( + func_name, scores[i] + ) + if feedback: + states[i].setdefault("feedbacks", []).append( + f"{func_name}: {feedback}" + ) aggregated_rewards[i] += score_value * weight aggregated_metrics[func_name][i] = score_value diff --git a/verifiers/scripts/gepa.py b/verifiers/scripts/gepa.py new file mode 100644 index 000000000..09cf6f0bc --- /dev/null +++ b/verifiers/scripts/gepa.py @@ -0,0 +1,690 @@ +#!/usr/bin/env python3 +""" +GEPA optimization script for Verifiers environments. + +Usage: + vf-gepa wordle --auto light + vf-gepa wiki-search --auto heavy --components system_prompt tool_descriptions + vf-gepa my-env --max-metric-calls 1000 -n 100 --num-val 30 +""" + +import argparse +import json +import logging +import math +import os +import sys +import textwrap +import uuid +from pathlib import Path + +try: + from gepa import optimize +except ImportError: + print("Error: GEPA is not installed.") + print("Install with: uv add 'verifiers[gepa]'") + sys.exit(1) + + +from openai import OpenAI + +import verifiers as vf +from verifiers.adapters.gepa import GEPAAdapter +from verifiers.types import ClientConfig +from verifiers.utils.client_utils import setup_client + +logger = logging.getLogger("gepa") + +# Auto-budget constants for clarity and tuning +AUTO_BUDGET_CANDIDATES = { + "light": 6, + "medium": 12, + "heavy": 18, +} +TRIAL_LOG_BASE_MULTIPLIER = 2.0 +TRIAL_COMPONENT_MULTIPLIER = 2 +TRIAL_LINEAR_MULTIPLIER = 1.5 +BOOTSTRAP_TRIALS_PER_CANDIDATE = 5 + + +def auto_budget_to_metric_calls( + auto: str, + num_components: int, + valset_size: int, + minibatch_size: int = 3, + full_eval_steps: int = 5, +) -> int: + """ + Convert auto budget (light/medium/heavy) to max_metric_calls. + + This replicates GEPA's auto_budget calculation for consistency. 
+ + Args: + auto: Budget level ('light', 'medium', or 'heavy') + num_components: Number of components being optimized + valset_size: Size of validation set + minibatch_size: Reflection minibatch size + full_eval_steps: Steps between full validations + + Returns: + Maximum number of metric calls + """ + num_candidates = AUTO_BUDGET_CANDIDATES[auto] + + # Calculate number of trials using log-growth vs. linear fallback + log_trials = ( + TRIAL_LOG_BASE_MULTIPLIER + * (num_components * TRIAL_COMPONENT_MULTIPLIER) + * math.log2(num_candidates) + ) + linear_trials = TRIAL_LINEAR_MULTIPLIER * num_candidates + num_trials = int(max(log_trials, linear_trials)) + + V = valset_size + N = num_trials + M = minibatch_size + m = full_eval_steps + + # Initial full evaluation on the default program + total = V + + # Assume a handful of bootstrap trials per candidate + total += num_candidates * BOOTSTRAP_TRIALS_PER_CANDIDATE + + # N minibatch evaluations + total += N * M + + if N == 0: + return total + + # Periodic full evals + periodic_fulls = (N + 1) // m + 1 + extra_final = 1 if N < m else 0 + + total += (periodic_fulls + extra_final) * V + + logger.info( + f"Auto budget '{auto}' → ~{num_candidates} candidates, " + f"~{total} metric calls (~{total // (V or 1)} full evals)" + ) + + return total + + +def prepare_gepa_dataset(dataset) -> list[dict]: + """ + Convert HuggingFace Dataset to GEPA format. + + GEPA expects a list of dicts with keys like 'question', 'answer', 'info', 'task'. + """ + if dataset is None: + return [] + + examples = [] + for item in dataset: + example = { + "question": item.get("question", item.get("prompt", "")), + "answer": item.get("answer", ""), + "task": item.get("task", "default"), + "info": item.get("info", {}), + } + examples.append(example) + + return examples + + +def call_reflection_model( + client: OpenAI, + prompt: str, + model: str, + temperature: float = 1.0, + max_tokens: int | None = None, +) -> str: + """ + Call reflection model to generate proposal. + + This is a wrapper around the API call for GEPA's reflection phase. 
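+
+    Illustrative call (a sketch; the prompt text is made up):
+
+        client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
+        proposal = call_reflection_model(
+            client, "Rewrite this system prompt: ...", model="gpt-4o"
+        )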
+ """ + try: + request_args = { + "model": model, + "messages": [{"role": "user", "content": prompt}], + "temperature": temperature, + } + if max_tokens is not None: + request_args["max_tokens"] = max_tokens + response = client.chat.completions.create(**request_args) + return response.choices[0].message.content or "" + except Exception as e: + logger.error(f"Error calling reflection model: {e}") + raise + + +def save_optimized_components( + env_id: str, + best_candidate: dict[str, str], + seed_candidate: dict[str, str], + output_dir: Path, +): + """Save optimized components to disk for future use.""" + output_file = output_dir / f"{env_id}_optimized.json" + output_file.parent.mkdir(parents=True, exist_ok=True) + + with open(output_file, "w") as f: + json.dump(best_candidate, f, indent=2) + + logger.info(f"Saved optimized components to: {output_file}") + + # Also save the original (seed) components for comparison + original_file = output_dir / f"{env_id}_original.json" + with open(original_file, "w") as f: + json.dump(seed_candidate, f, indent=2) + + logger.info(f"Saved original components to: {original_file}") + + +def save_optimization_metrics( + env_id: str, + result, + output_dir: Path, + run_config: dict, +): + """Save optimization metrics and configuration for analysis.""" + from datetime import datetime + + metrics_file = output_dir / f"{env_id}_metrics.json" + + metrics = { + # Run configuration + "config": run_config, + # Timestamps + "date": datetime.now().strftime("%Y-%m-%d"), + "timestamp": datetime.now().isoformat(), + # Results + "val_aggregate_scores": result.val_aggregate_scores, + "num_candidates": len(result.candidates), + "best_val_score": ( + float(max(result.val_aggregate_scores)) + if result.val_aggregate_scores + else 0.0 + ), + "initial_val_score": ( + float(result.val_aggregate_scores[0]) + if result.val_aggregate_scores + else 0.0 + ), + "improvement": ( + float(max(result.val_aggregate_scores) - result.val_aggregate_scores[0]) + if len(result.val_aggregate_scores) > 0 + else 0.0 + ), + "candidates_history": [ + { + "iteration": i, + "score": float(score), + } + for i, score in enumerate(result.val_aggregate_scores) + ], + } + + with open(metrics_file, "w") as f: + json.dump(metrics, f, indent=2) + + logger.info(f"Saved optimization metrics to: {metrics_file}") + + +def main(): + parser = argparse.ArgumentParser( + description="Run GEPA prompt optimization on Verifiers environments", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Light optimization (quick test) + vf-gepa wordle --auto light + + # Heavy optimization with tool descriptions + vf-gepa wiki-search --auto heavy --components system_prompt tool_descriptions + + # Custom configuration + vf-gepa my-env --max-metric-calls 1000 -n 100 --num-val 30 + """, + ) + + # Environment args + parser.add_argument( + "env_id", type=str, help="Environment ID (e.g., wordle, wiki-search)" + ) + parser.add_argument( + "--env-args", + "-a", + default="{}", + help="JSON dict of keyword args forwarded to vf.load_environment", + ) + + parser.add_argument( + "-n", + "--num-examples", + type=int, + default=50, + help="Number of training examples (default: 50)", + ) + + parser.add_argument( + "--num-val", + type=int, + default=20, + help="Number of validation examples (default: 20)", + ) + + # GEPA budget (mutually exclusive) + budget_group = parser.add_mutually_exclusive_group(required=True) + budget_group.add_argument( + "--auto", + choices=["light", "medium", "heavy"], + help="Auto budget: 
light (~6 candidates), medium (~12), heavy (~18)", + ) + budget_group.add_argument( + "--max-metric-calls", type=int, help="Maximum total metric calls budget" + ) + + # GEPA configuration + parser.add_argument( + "--reflection-model", + default="gpt-4o", + help="Model for reflection/proposal (default: gpt-4o)", + ) + + parser.add_argument( + "--reflection-temperature", + type=float, + default=1.0, + help="Temperature for reflection model (default: 1.0)", + ) + + parser.add_argument( + "--reflection-base-url", + default=None, + help="Base URL for reflection model API (default: task client base URL)", + ) + + parser.add_argument( + "--reflection-api-key-var", + default="OPENAI_API_KEY", + help="Env var that stores the reflection API key (default: OPENAI_API_KEY)", + ) + + parser.add_argument( + "--reflection-max-tokens", + type=int, + default=8000, + help="Max tokens for reflection completions (default: 8000)", + ) + + parser.add_argument( + "-m", + "--model", + default="gpt-4o-mini", + help="Model to optimize (default: gpt-4o-mini)", + ) + parser.add_argument( + "--api-key-var", + "-k", + default="OPENAI_API_KEY", + help="Environment variable containing the task model API key", + ) + parser.add_argument( + "--api-base-url", + "-b", + default="https://api.openai.com/v1", + help="Base URL for the task model API (default: https://api.openai.com/v1)", + ) + parser.add_argument( + "--header", + action="append", + dest="headers", + default=None, + help="Additional HTTP header for the task model client. Format: 'Name: Value'. Repeatable.", + ) + + parser.add_argument( + "--components", + nargs="+", + default=["system_prompt"], + help="Components to optimize (default: system_prompt)", + ) + + parser.add_argument( + "--reflection-minibatch-size", + type=int, + default=3, + help="Number of examples per reflection step (default: 3)", + ) + + parser.add_argument( + "--rollouts-per-example", + type=int, + default=1, + help="Number of rollouts per example (default: 1)", + ) + + # Model configuration + parser.add_argument( + "-T", + "--temperature", + type=float, + default=1.0, + help="Temperature for task model (default: 1.0)", + ) + + parser.add_argument( + "-t", + "--max-tokens", + type=int, + default=8096, + help="Max tokens for task model (default: 8096)", + ) + + # Logging + parser.add_argument( + "--log-dir", + help="Directory for GEPA logs (default: ./gepa_results//)", + ) + + parser.add_argument( + "--track-stats", + action="store_true", + help="Track detailed optimization statistics", + ) + + parser.add_argument( + "--verbose", "-v", action="store_true", help="Enable verbose logging" + ) + + parser.add_argument( + "--seed", + type=int, + default=42, + help="Random seed for reproducibility (default: 42)", + ) + + args = parser.parse_args() + + try: + env_args = json.loads(args.env_args) + if not isinstance(env_args, dict): + raise TypeError("env args must be a JSON object") + except (json.JSONDecodeError, TypeError) as exc: + raise ValueError( + "--env-args must be valid JSON representing a dictionary" + ) from exc + + task_client_headers: dict[str, str] | None = None + if args.headers: + task_client_headers = {} + for header in args.headers: + if ":" not in header: + raise ValueError( + "Headers must be provided in the format 'Name: Value'." 
+ ) + key, value = header.split(":", 1) + task_client_headers[key.strip()] = value.strip() + + # Setup logging + log_level = logging.DEBUG if args.verbose else logging.INFO + logging.basicConfig( + level=log_level, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + + # Silence noisy third-party loggers + logging.getLogger("openai").setLevel(logging.WARNING) + logging.getLogger("httpcore").setLevel(logging.WARNING) + logging.getLogger("httpx").setLevel(logging.WARNING) + + logger.info(f"Starting GEPA optimization for environment: {args.env_id}") + logger.info(f"Components to optimize: {args.components}") + + # Setup client + client_config_kwargs = { + "api_key_var": args.api_key_var, + "api_base_url": args.api_base_url, + } + if task_client_headers is not None: + client_config_kwargs["extra_headers"] = task_client_headers + + client_config = ClientConfig(**client_config_kwargs) + client = setup_client(client_config) + logger.debug("Initialized OpenAI client") + + # Load environment + vf_env = vf.load_environment(env_id=args.env_id, **env_args) + + if isinstance(vf_env, vf.EnvGroup): + raise ValueError( + "GEPA optimization is not supported for EnvGroup environments. " + "Optimize each environment individually, then combine them." + ) + + for component in args.components: + if component == "tool_descriptions": + if not getattr(vf_env, "oai_tools", None): + raise ValueError( + "Cannot optimize tool_descriptions: " + f"environment '{args.env_id}' has no tools configured." + ) + elif not hasattr(vf_env, component): + raise ValueError( + f"Environment '{args.env_id}' is missing component '{component}'. " + "Provide a component that exists on the environment." + ) + + # Setup sampling args + sampling_args = { + "temperature": args.temperature, + "max_tokens": args.max_tokens, + } + + # Create adapter + adapter = GEPAAdapter( + env=vf_env, + client=client, + model=args.model, + sampling_args=sampling_args, + components_to_optimize=args.components, + num_rollouts_per_example=args.rollouts_per_example, + max_concurrent=32, + ) + + # Prepare datasets + logger.info(f"Loading {args.num_examples} training examples") + logger.info(f"Loading {args.num_val} validation examples") + if vf_env.eval_dataset is not None: + train_dataset_raw = vf_env.get_dataset(n=args.num_examples, seed=args.seed) + val_dataset_raw = vf_env.get_eval_dataset(n=args.num_val, seed=args.seed + 1) + else: + total_requested = max(args.num_examples, 0) + max(args.num_val, 0) + base_dataset = vf_env.get_dataset(n=total_requested, seed=args.seed) + base_examples = ( + base_dataset.to_list() + if hasattr(base_dataset, "to_list") + else list(base_dataset) + ) + train_dataset_raw = ( + base_examples[: args.num_examples] + if args.num_examples > 0 + else base_examples + ) + val_dataset_raw = ( + base_examples[args.num_examples : args.num_examples + args.num_val] + if args.num_val > 0 + else [] + ) + logger.debug( + "Eval dataset missing; derived %s validation examples from train split", + len(val_dataset_raw), + ) + + trainset = prepare_gepa_dataset(train_dataset_raw) + valset = prepare_gepa_dataset(val_dataset_raw) + + if args.num_examples > 0 and not trainset: + raise ValueError( + "Training dataset is empty - check environment configuration and filters" + ) + if args.num_val > 0 and not valset: + raise ValueError( + "Validation dataset is empty - check environment configuration and filters" + ) + + logger.info(f"Training set: {len(trainset)} examples") + logger.info(f"Validation set: 
{len(valset)} examples") + + reflection_api_key_var = args.reflection_api_key_var or client_config.api_key_var + reflection_api_key = os.getenv(reflection_api_key_var) + if not reflection_api_key: + raise ValueError( + f"{reflection_api_key_var} environment variable not set for reflection client" + ) + reflection_base_url = args.reflection_base_url + if not reflection_base_url: + base_url = getattr(client, "base_url", None) + reflection_base_url = str(base_url) if base_url else "https://api.openai.com/v1" + + reflection_client_kwargs = { + "api_key": reflection_api_key, + "base_url": reflection_base_url, + } + if task_client_headers: + reflection_client_kwargs["default_headers"] = task_client_headers + reflection_client = OpenAI(**reflection_client_kwargs) + logger.debug( + "Reflection client configured for model %s at %s", + args.reflection_model, + reflection_base_url, + ) + + # Extract seed candidate (initial component values) + seed_candidate = {} + for comp in args.components: + if comp == "tool_descriptions": + # Extract tool descriptions + if hasattr(vf_env, "oai_tools") and vf_env.oai_tools: + for i, tool in enumerate(vf_env.oai_tools): + seed_candidate[f"tool_{i}_description"] = tool["function"][ + "description" + ] + elif hasattr(vf_env, comp): + seed_candidate[comp] = getattr(vf_env, comp) + else: + logger.warning(f"Environment doesn't have component '{comp}', skipping") + + if not seed_candidate: + logger.error("No valid components found to optimize!") + return + + logger.info("Initial component values:") + for comp, value in seed_candidate.items(): + preview = value[:200] + "..." if len(value) > 200 else value + logger.info(f" {comp}: {preview}") + + # Setup log directory + if args.log_dir: + log_dir = Path(args.log_dir) + else: + run_id = str(uuid.uuid4())[:8] + log_dir = Path(f"./gepa_results/{args.env_id}/{run_id}") + log_dir.mkdir(parents=True, exist_ok=True) + logger.info(f"Log directory: {log_dir}") + + # Convert auto budget to max_metric_calls if needed + if args.auto: + max_metric_calls = auto_budget_to_metric_calls( + auto=args.auto, + num_components=len(seed_candidate), + valset_size=len(valset), + minibatch_size=args.reflection_minibatch_size, + ) + else: + max_metric_calls = args.max_metric_calls + + logger.info(f"Budget: {max_metric_calls} metric calls total") + + # Run GEPA + logger.info("=" * 80) + logger.info("Starting GEPA optimization...") + logger.info("=" * 80) + + try: + result = optimize( + seed_candidate=seed_candidate, + trainset=trainset, + valset=valset, + adapter=adapter, + max_metric_calls=max_metric_calls, + reflection_lm=lambda x: call_reflection_model( + reflection_client, + x, + args.reflection_model, + args.reflection_temperature, + args.reflection_max_tokens, + ), + reflection_minibatch_size=args.reflection_minibatch_size, + run_dir=str(log_dir), + track_best_outputs=args.track_stats, + seed=args.seed, + display_progress_bar=True, + ) + except Exception as e: + logger.error(f"GEPA optimization failed: {e}", exc_info=True) + raise + + # Print results + print("\n" + "=" * 80) + print("GEPA OPTIMIZATION COMPLETE") + print("=" * 80) + print(f"Best validation score: {max(result.val_aggregate_scores):.3f}") + print(f"Initial validation score: {result.val_aggregate_scores[0]:.3f}") + print( + f"Improvement: {max(result.val_aggregate_scores) - result.val_aggregate_scores[0]:.3f}" + ) + print(f"Total candidates explored: {len(result.candidates)}") + print("\nOptimized components:") + print("-" * 80) + + for comp, text in result.best_candidate.items(): 
+ print(f"\n{comp}:") + print(textwrap.indent(text, " ")) + + # Prepare run configuration for saving + run_config = { + "env_id": args.env_id, + "model": args.model, + "reflection_model": args.reflection_model, + "reflection_temperature": args.reflection_temperature, + "components": args.components, + "trainset_size": len(trainset), + "valset_size": len(valset), + "rollouts_per_example": args.rollouts_per_example, + "max_metric_calls": max_metric_calls, + "reflection_minibatch_size": args.reflection_minibatch_size, + "seed": args.seed, + "temperature": args.temperature, + "max_tokens": args.max_tokens, + } + + # Save results + save_optimized_components( + args.env_id, result.best_candidate, seed_candidate, log_dir + ) + save_optimization_metrics(args.env_id, result, log_dir, run_config) + + print("\n" + "=" * 80) + print(f"Logs saved to: {log_dir}") + print("=" * 80) + + logger.info("GEPA optimization completed successfully!") + + +if __name__ == "__main__": + main() diff --git a/verifiers/types.py b/verifiers/types.py index 1a3125075..fef431c6f 100644 --- a/verifiers/types.py +++ b/verifiers/types.py @@ -104,6 +104,7 @@ class State(dict): reward: float | None advantage: float | None metrics: dict[str, float] | None + feedbacks: list[str] | None timing: RolloutTiming | None def __getitem__(self, key: str) -> Any: @@ -174,6 +175,18 @@ class RolloutScore(TypedDict): metrics: dict[str, float] +class RewardResult(TypedDict, total=False): + """Result from a reward function with optional feedback. + + Reward functions can return either: + - float: backward compatible (no feedback) + - RewardResult: {"score": float, "feedback": str} + """ + + score: float # required + feedback: str # optional + + class RolloutScores(TypedDict): """TypedDict for rubric outputs.""" From c050a81809ac13eb29507349852c2a1131ae8f9f Mon Sep 17 00:00:00 2001 From: Robin Salimans Date: Sat, 22 Nov 2025 22:29:51 +0100 Subject: [PATCH 03/16] fixed typo --- README.md | 2 +- docs/source/gepa.md | 2 +- integrations/gepa/README.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0e94fa28c..86b529d74 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ For advanced evaluation configurations with the `prime` [CLI](https://github.com ## Prompt Optimization with GEPA -Automatically improve your environment's prompts using GEPA (Gradient-free Evolutionary Prompt Adaptation): +Automatically improve your environment's prompts using GEPA (Genetic-Pareto): ```bash # Install GEPA extras diff --git a/docs/source/gepa.md b/docs/source/gepa.md index 44ba8dd68..d5bd8c0ec 100644 --- a/docs/source/gepa.md +++ b/docs/source/gepa.md @@ -1,6 +1,6 @@ # GEPA: Prompt Optimization -GEPA (Gradient-free Evolutionary Prompt Adaptation) is an automatic prompt optimization system that improves your environment's system prompts and tool descriptions based on rubric feedback. +GEPA (Genetic-Pareto) is an automatic prompt optimization system that improves your environment's system prompts and tool descriptions based on rubric feedback. ## Overview diff --git a/integrations/gepa/README.md b/integrations/gepa/README.md index 2bd277e58..efa57ac00 100644 --- a/integrations/gepa/README.md +++ b/integrations/gepa/README.md @@ -1,6 +1,6 @@ # GEPA Integration -GEPA (Gradient-free Evolutionary Prompt Adaptation) integration for Verifiers environments. +GEPA (Genetic-Pareto) integration for Verifiers environments. 
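+
+GEPA improves prompts fastest when reward functions surface textual feedback
+alongside their scores. A minimal sketch of such a reward function (the body
+and feedback strings are illustrative, assuming chat-style completions):
+
+```python
+def exact_match(completion, answer, **kwargs):
+    guess = completion[-1]["content"].strip().lower()
+    if guess == answer.strip().lower():
+        return {"score": 1.0, "feedback": "Correct answer."}
+    return {"score": 0.0, "feedback": f"Expected '{answer}', got '{guess}'."}
+```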
## Overview From 1d62237bb55300414bd66ec525af499dddff2ccd Mon Sep 17 00:00:00 2001 From: Robin Salimans Date: Mon, 24 Nov 2025 10:54:29 +0100 Subject: [PATCH 04/16] unify vf-gepa cli args with vf-eval --- tests/test_gepa.py | 13 +- verifiers/adapters/gepa/adapter.py | 14 +- verifiers/scripts/gepa.py | 683 +++++++++++------------------ verifiers/types.py | 39 ++ verifiers/utils/gepa_utils.py | 531 ++++++++++++++++++++++ 5 files changed, 849 insertions(+), 431 deletions(-) create mode 100644 verifiers/utils/gepa_utils.py diff --git a/tests/test_gepa.py b/tests/test_gepa.py index 7d4578ef2..1496fd59e 100644 --- a/tests/test_gepa.py +++ b/tests/test_gepa.py @@ -150,7 +150,11 @@ def test_gepa_adapter_tool_descriptions_validation(self): ) def test_gepa_adapter_build_program(self): - """Test GEPAAdapter.build_program creates new environment with updated components.""" + """Test GEPAAdapter.build_program creates new environment with updated components. + + Important: datasets should NOT be copied for efficiency (can be huge). + The adapter provides inputs directly via _build_rollout_inputs. + """ GEPAAdapter = require_gepa_adapter() # Create real environment @@ -175,9 +179,16 @@ def test_gepa_adapter_build_program(self): candidate = {"system_prompt": "Optimized prompt"} new_env = adapter.build_program(candidate) + # Verify component was updated assert new_env.system_prompt == "Optimized prompt" assert new_env.system_prompt != env.system_prompt + # Verify dataset was NOT copied (efficiency optimization) + # New env should have a minimal dummy dataset, not the original + assert new_env.dataset is not None # Has some dataset to satisfy init + assert len(new_env.dataset) == 1 # But it's minimal (dummy) + assert new_env.dataset is not env.dataset # Not the same reference + def test_gepa_adapter_extract_seed_candidate(self): """Test extracting seed candidate from environment.""" dataset = vf.load_example_dataset(n=5) diff --git a/verifiers/adapters/gepa/adapter.py b/verifiers/adapters/gepa/adapter.py index 15ab9b405..ff45e2d11 100644 --- a/verifiers/adapters/gepa/adapter.py +++ b/verifiers/adapters/gepa/adapter.py @@ -105,9 +105,13 @@ def build_program(self, candidate: dict[str, str]) -> vf.Environment: post_init_overrides: dict[str, Any] = {} # Preserve constructor arguments present on the base environment + # Skip dataset/eval_dataset as they are not needed (adapter provides inputs) + # and copying them would be hugely inefficient for large datasets for param_name in signature.parameters: if param_name == "self": continue + if param_name in ("dataset", "eval_dataset"): + continue if hasattr(self.base_env, param_name): value = getattr(self.base_env, param_name) if isinstance(value, (dict, list)): @@ -118,9 +122,12 @@ def build_program(self, candidate: dict[str, str]) -> vf.Environment: # Ensure core Environment parameters are forwarded when available # BUT only if they're explicitly in the specific environment's signature # (Some envs like TextArenaEnv create dataset/eval_dataset internally) + # Skip dataset/eval_dataset for efficiency (not needed by adapter) env_signature = inspect.signature(vf.Environment.__init__) env_param_names = [ - name for name in env_signature.parameters if name not in {"self", "kwargs"} + name + for name in env_signature.parameters + if name not in {"self", "kwargs", "dataset", "eval_dataset"} ] for param_name in env_param_names: if param_name in init_kwargs: @@ -159,6 +166,11 @@ def build_program(self, candidate: dict[str, str]) -> vf.Environment: else: 
post_init_overrides[comp_name] = comp_value + # Provide minimal dataset if none exists (adapter provides inputs directly) + # This avoids copying large datasets and improves performance + if "dataset" not in init_kwargs and "eval_dataset" not in init_kwargs: + init_kwargs["dataset"] = vf.load_example_dataset(n=1) + try: new_env = env_class(**init_kwargs) except TypeError as exc: diff --git a/verifiers/scripts/gepa.py b/verifiers/scripts/gepa.py index 09cf6f0bc..7dc96b1bb 100644 --- a/verifiers/scripts/gepa.py +++ b/verifiers/scripts/gepa.py @@ -9,228 +9,40 @@ """ import argparse +import asyncio import json import logging -import math import os import sys -import textwrap import uuid from pathlib import Path try: - from gepa import optimize + from gepa import optimize # noqa: F401 except ImportError: print("Error: GEPA is not installed.") print("Install with: uv add 'verifiers[gepa]'") sys.exit(1) - -from openai import OpenAI +from verifiers import setup_logging +from verifiers.types import ClientConfig, GEPAConfig +from verifiers.utils.eval_utils import load_endpoints +from verifiers.utils.gepa_utils import ( + auto_budget_to_metric_calls, + ensure_env_dir_on_path, + get_env_gepa_defaults, + prepare_gepa_dataset, + run_gepa_optimization, +) import verifiers as vf -from verifiers.adapters.gepa import GEPAAdapter -from verifiers.types import ClientConfig -from verifiers.utils.client_utils import setup_client logger = logging.getLogger("gepa") -# Auto-budget constants for clarity and tuning -AUTO_BUDGET_CANDIDATES = { - "light": 6, - "medium": 12, - "heavy": 18, -} -TRIAL_LOG_BASE_MULTIPLIER = 2.0 -TRIAL_COMPONENT_MULTIPLIER = 2 -TRIAL_LINEAR_MULTIPLIER = 1.5 -BOOTSTRAP_TRIALS_PER_CANDIDATE = 5 - - -def auto_budget_to_metric_calls( - auto: str, - num_components: int, - valset_size: int, - minibatch_size: int = 3, - full_eval_steps: int = 5, -) -> int: - """ - Convert auto budget (light/medium/heavy) to max_metric_calls. - - This replicates GEPA's auto_budget calculation for consistency. - - Args: - auto: Budget level ('light', 'medium', or 'heavy') - num_components: Number of components being optimized - valset_size: Size of validation set - minibatch_size: Reflection minibatch size - full_eval_steps: Steps between full validations - - Returns: - Maximum number of metric calls - """ - num_candidates = AUTO_BUDGET_CANDIDATES[auto] - - # Calculate number of trials using log-growth vs. linear fallback - log_trials = ( - TRIAL_LOG_BASE_MULTIPLIER - * (num_components * TRIAL_COMPONENT_MULTIPLIER) - * math.log2(num_candidates) - ) - linear_trials = TRIAL_LINEAR_MULTIPLIER * num_candidates - num_trials = int(max(log_trials, linear_trials)) - - V = valset_size - N = num_trials - M = minibatch_size - m = full_eval_steps - - # Initial full evaluation on the default program - total = V - - # Assume a handful of bootstrap trials per candidate - total += num_candidates * BOOTSTRAP_TRIALS_PER_CANDIDATE - - # N minibatch evaluations - total += N * M - - if N == 0: - return total - - # Periodic full evals - periodic_fulls = (N + 1) // m + 1 - extra_final = 1 if N < m else 0 - - total += (periodic_fulls + extra_final) * V - - logger.info( - f"Auto budget '{auto}' → ~{num_candidates} candidates, " - f"~{total} metric calls (~{total // (V or 1)} full evals)" - ) - - return total - - -def prepare_gepa_dataset(dataset) -> list[dict]: - """ - Convert HuggingFace Dataset to GEPA format. - - GEPA expects a list of dicts with keys like 'question', 'answer', 'info', 'task'. 
- """ - if dataset is None: - return [] - - examples = [] - for item in dataset: - example = { - "question": item.get("question", item.get("prompt", "")), - "answer": item.get("answer", ""), - "task": item.get("task", "default"), - "info": item.get("info", {}), - } - examples.append(example) - - return examples - - -def call_reflection_model( - client: OpenAI, - prompt: str, - model: str, - temperature: float = 1.0, - max_tokens: int | None = None, -) -> str: - """ - Call reflection model to generate proposal. - - This is a wrapper around the API call for GEPA's reflection phase. - """ - try: - request_args = { - "model": model, - "messages": [{"role": "user", "content": prompt}], - "temperature": temperature, - } - if max_tokens is not None: - request_args["max_tokens"] = max_tokens - response = client.chat.completions.create(**request_args) - return response.choices[0].message.content or "" - except Exception as e: - logger.error(f"Error calling reflection model: {e}") - raise - - -def save_optimized_components( - env_id: str, - best_candidate: dict[str, str], - seed_candidate: dict[str, str], - output_dir: Path, -): - """Save optimized components to disk for future use.""" - output_file = output_dir / f"{env_id}_optimized.json" - output_file.parent.mkdir(parents=True, exist_ok=True) - - with open(output_file, "w") as f: - json.dump(best_candidate, f, indent=2) - - logger.info(f"Saved optimized components to: {output_file}") - - # Also save the original (seed) components for comparison - original_file = output_dir / f"{env_id}_original.json" - with open(original_file, "w") as f: - json.dump(seed_candidate, f, indent=2) - - logger.info(f"Saved original components to: {original_file}") - - -def save_optimization_metrics( - env_id: str, - result, - output_dir: Path, - run_config: dict, -): - """Save optimization metrics and configuration for analysis.""" - from datetime import datetime - - metrics_file = output_dir / f"{env_id}_metrics.json" - - metrics = { - # Run configuration - "config": run_config, - # Timestamps - "date": datetime.now().strftime("%Y-%m-%d"), - "timestamp": datetime.now().isoformat(), - # Results - "val_aggregate_scores": result.val_aggregate_scores, - "num_candidates": len(result.candidates), - "best_val_score": ( - float(max(result.val_aggregate_scores)) - if result.val_aggregate_scores - else 0.0 - ), - "initial_val_score": ( - float(result.val_aggregate_scores[0]) - if result.val_aggregate_scores - else 0.0 - ), - "improvement": ( - float(max(result.val_aggregate_scores) - result.val_aggregate_scores[0]) - if len(result.val_aggregate_scores) > 0 - else 0.0 - ), - "candidates_history": [ - { - "iteration": i, - "score": float(score), - } - for i, score in enumerate(result.val_aggregate_scores) - ], - } - - with open(metrics_file, "w") as f: - json.dump(metrics, f, indent=2) - - logger.info(f"Saved optimization metrics to: {metrics_file}") +# Default constants +DEFAULT_NUM_EXAMPLES = 50 +DEFAULT_NUM_VAL = 20 +DEFAULT_ROLLOUTS_PER_EXAMPLE = 1 def main(): @@ -250,33 +62,120 @@ def main(): """, ) - # Environment args + # 1. Positional: env_id parser.add_argument( "env_id", type=str, help="Environment ID (e.g., wordle, wiki-search)" ) + + # 2. Environment config parser.add_argument( "--env-args", "-a", default="{}", help="JSON dict of keyword args forwarded to vf.load_environment", ) + parser.add_argument( + "--env-dir-path", + "-p", + type=str, + default="./environments", + help="Path to environments directory", + ) + # 3. 
Dataset parser.add_argument( "-n", "--num-examples", type=int, - default=50, - help="Number of training examples (default: 50)", + default=None, + help="Number of training examples", ) - parser.add_argument( "--num-val", type=int, - default=20, - help="Number of validation examples (default: 20)", + default=None, + help="Number of validation examples", + ) + + # 4. Endpoints/Model + parser.add_argument( + "--endpoints-path", + "-e", + type=str, + default="./configs/endpoints.py", + help="Path to API endpoints registry", + ) + parser.add_argument( + "-m", + "--model", + default="gpt-4o-mini", + help="Model to optimize (default: gpt-4o-mini)", + ) + parser.add_argument( + "--api-key-var", + "-k", + default="OPENAI_API_KEY", + help="Environment variable containing the task model API key", + ) + parser.add_argument( + "--api-base-url", + "-b", + default="https://api.openai.com/v1", + help="Base URL for the task model API (default: https://api.openai.com/v1)", + ) + parser.add_argument( + "--header", + action="append", + dest="headers", + default=None, + help="Additional HTTP header for the task model client. Format: 'Name: Value'. Repeatable.", + ) + + # 5. Sampling + parser.add_argument( + "-T", + "--temperature", + type=float, + default=1.0, + help="Temperature for task model (default: 1.0)", + ) + parser.add_argument( + "-t", + "--max-tokens", + type=int, + default=None, + help="Max tokens for task model (unset to use model default)", + ) + parser.add_argument( + "--sampling-args", + "-S", + type=json.loads, + default=None, + help=( + "Sampling arguments as JSON object. Keys here override --max-tokens/--temperature. " + 'Example: \'{"enable_thinking": false, "max_tokens": 256}\'' + ), ) - # GEPA budget (mutually exclusive) + # 6. Rollouts + parser.add_argument( + "--rollouts-per-example", + "-r", + type=int, + default=None, + help="Number of rollouts per example", + ) + + # 7. Concurrency + parser.add_argument( + "--max-concurrent", + "-c", + type=int, + default=32, + help="Maximum number of concurrent requests", + ) + + # 8. GEPA budget (mutually exclusive) budget_group = parser.add_mutually_exclusive_group(required=True) budget_group.add_argument( "--auto", @@ -287,72 +186,40 @@ def main(): "--max-metric-calls", type=int, help="Maximum total metric calls budget" ) - # GEPA configuration + # 9. 
GEPA configuration + parser.add_argument( + "--components", + nargs="+", + default=["system_prompt"], + help="Components to optimize (default: system_prompt)", + ) parser.add_argument( "--reflection-model", default="gpt-4o", help="Model for reflection/proposal (default: gpt-4o)", ) - parser.add_argument( "--reflection-temperature", type=float, default=1.0, help="Temperature for reflection model (default: 1.0)", ) - parser.add_argument( "--reflection-base-url", default=None, help="Base URL for reflection model API (default: task client base URL)", ) - parser.add_argument( "--reflection-api-key-var", default="OPENAI_API_KEY", help="Env var that stores the reflection API key (default: OPENAI_API_KEY)", ) - parser.add_argument( "--reflection-max-tokens", type=int, default=8000, help="Max tokens for reflection completions (default: 8000)", ) - - parser.add_argument( - "-m", - "--model", - default="gpt-4o-mini", - help="Model to optimize (default: gpt-4o-mini)", - ) - parser.add_argument( - "--api-key-var", - "-k", - default="OPENAI_API_KEY", - help="Environment variable containing the task model API key", - ) - parser.add_argument( - "--api-base-url", - "-b", - default="https://api.openai.com/v1", - help="Base URL for the task model API (default: https://api.openai.com/v1)", - ) - parser.add_argument( - "--header", - action="append", - dest="headers", - default=None, - help="Additional HTTP header for the task model client. Format: 'Name: Value'. Repeatable.", - ) - - parser.add_argument( - "--components", - nargs="+", - default=["system_prompt"], - help="Components to optimize (default: system_prompt)", - ) - parser.add_argument( "--reflection-minibatch-size", type=int, @@ -360,46 +227,33 @@ def main(): help="Number of examples per reflection step (default: 3)", ) + # 10. 
Output/Logging parser.add_argument( - "--rollouts-per-example", - type=int, - default=1, - help="Number of rollouts per example (default: 1)", - ) - - # Model configuration - parser.add_argument( - "-T", - "--temperature", - type=float, - default=1.0, - help="Temperature for task model (default: 1.0)", + "--save-results", + "-s", + default=False, + action="store_true", + help="Save rollout trajectories to disk", ) - parser.add_argument( - "-t", - "--max-tokens", + "--save-every", + "-f", type=int, - default=8096, - help="Max tokens for task model (default: 8096)", + default=-1, + help="Save rollout trajectories every n evaluations during optimization", ) - - # Logging parser.add_argument( "--log-dir", help="Directory for GEPA logs (default: ./gepa_results//)", ) - parser.add_argument( "--track-stats", action="store_true", help="Track detailed optimization statistics", ) - parser.add_argument( "--verbose", "-v", action="store_true", help="Enable verbose logging" ) - parser.add_argument( "--seed", type=int, @@ -409,6 +263,7 @@ def main(): args = parser.parse_args() + # Parse env_args try: env_args = json.loads(args.env_args) if not isinstance(env_args, dict): @@ -418,6 +273,7 @@ def main(): "--env-args must be valid JSON representing a dictionary" ) from exc + # Parse headers task_client_headers: dict[str, str] | None = None if args.headers: task_client_headers = {} @@ -430,12 +286,7 @@ def main(): task_client_headers[key.strip()] = value.strip() # Setup logging - log_level = logging.DEBUG if args.verbose else logging.INFO - logging.basicConfig( - level=log_level, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - ) + setup_logging("DEBUG" if args.verbose else "INFO") # Silence noisy third-party loggers logging.getLogger("openai").setLevel(logging.WARNING) @@ -445,65 +296,95 @@ def main(): logger.info(f"Starting GEPA optimization for environment: {args.env_id}") logger.info(f"Components to optimize: {args.components}") - # Setup client + if args.save_every > 0 and not args.save_results: + logger.warning("--save-every is ignored unless --save-results is set") + + # Apply defaults: CLI > env pyproject.toml > hardcoded + env_defaults = get_env_gepa_defaults(args.env_id) + num_examples = ( + args.num_examples + if args.num_examples is not None + else env_defaults.get("num_examples", DEFAULT_NUM_EXAMPLES) + ) + num_val = ( + args.num_val + if args.num_val is not None + else env_defaults.get("num_val", DEFAULT_NUM_VAL) + ) + rollouts_per_example = ( + args.rollouts_per_example + if args.rollouts_per_example is not None + else env_defaults.get("rollouts_per_example", DEFAULT_ROLLOUTS_PER_EXAMPLE) + ) + + # Log sources + if args.num_examples is None: + source = "pyproject.toml" if "num_examples" in env_defaults else "default" + logger.debug(f"Using num_examples={num_examples} from {source}") + if args.num_val is None: + source = "pyproject.toml" if "num_val" in env_defaults else "default" + logger.debug(f"Using num_val={num_val} from {source}") + if args.rollouts_per_example is None: + source = ( + "pyproject.toml" if "rollouts_per_example" in env_defaults else "default" + ) + logger.debug(f"Using rollouts_per_example={rollouts_per_example} from {source}") + + # Load endpoints and resolve model config + endpoints = load_endpoints(args.endpoints_path) + if args.model in endpoints: + task_api_key_var = endpoints[args.model]["key"] + task_api_base_url = endpoints[args.model]["url"] + args.model = endpoints[args.model]["model"] + logger.debug(f"Using 
endpoint configuration for task model '{args.model}'") + else: + logger.debug(f"Task model '{args.model}' not in registry, using CLI args") + task_api_key_var = args.api_key_var + task_api_base_url = args.api_base_url + + # Also check reflection model + if args.reflection_model in endpoints: + reflection_api_key_var = endpoints[args.reflection_model]["key"] + reflection_base_url = endpoints[args.reflection_model]["url"] + args.reflection_model = endpoints[args.reflection_model]["model"] + logger.debug(f"Using endpoint for reflection model '{args.reflection_model}'") + else: + reflection_api_key_var = args.reflection_api_key_var + reflection_base_url = args.reflection_base_url + + # Merge sampling args with precedence to JSON payload + merged_sampling_args: dict = {} + if args.sampling_args is not None: + merged_sampling_args.update(args.sampling_args) + if "max_tokens" not in merged_sampling_args: + merged_sampling_args["max_tokens"] = args.max_tokens + if args.temperature is not None and "temperature" not in merged_sampling_args: + merged_sampling_args["temperature"] = args.temperature + + # Ensure local environments directory is available for imports + ensure_env_dir_on_path(args.env_dir_path, args.env_id) + + # Setup client config client_config_kwargs = { - "api_key_var": args.api_key_var, - "api_base_url": args.api_base_url, + "api_key_var": task_api_key_var, + "api_base_url": task_api_base_url, } if task_client_headers is not None: client_config_kwargs["extra_headers"] = task_client_headers client_config = ClientConfig(**client_config_kwargs) - client = setup_client(client_config) - logger.debug("Initialized OpenAI client") # Load environment vf_env = vf.load_environment(env_id=args.env_id, **env_args) - if isinstance(vf_env, vf.EnvGroup): - raise ValueError( - "GEPA optimization is not supported for EnvGroup environments. " - "Optimize each environment individually, then combine them." - ) - - for component in args.components: - if component == "tool_descriptions": - if not getattr(vf_env, "oai_tools", None): - raise ValueError( - "Cannot optimize tool_descriptions: " - f"environment '{args.env_id}' has no tools configured." - ) - elif not hasattr(vf_env, component): - raise ValueError( - f"Environment '{args.env_id}' is missing component '{component}'. " - "Provide a component that exists on the environment." 
- ) - - # Setup sampling args - sampling_args = { - "temperature": args.temperature, - "max_tokens": args.max_tokens, - } - - # Create adapter - adapter = GEPAAdapter( - env=vf_env, - client=client, - model=args.model, - sampling_args=sampling_args, - components_to_optimize=args.components, - num_rollouts_per_example=args.rollouts_per_example, - max_concurrent=32, - ) - # Prepare datasets - logger.info(f"Loading {args.num_examples} training examples") - logger.info(f"Loading {args.num_val} validation examples") + logger.info(f"Loading {num_examples} training examples") + logger.info(f"Loading {num_val} validation examples") if vf_env.eval_dataset is not None: - train_dataset_raw = vf_env.get_dataset(n=args.num_examples, seed=args.seed) - val_dataset_raw = vf_env.get_eval_dataset(n=args.num_val, seed=args.seed + 1) + train_dataset_raw = vf_env.get_dataset(n=num_examples, seed=args.seed) + val_dataset_raw = vf_env.get_eval_dataset(n=num_val, seed=args.seed + 1) else: - total_requested = max(args.num_examples, 0) + max(args.num_val, 0) + total_requested = max(num_examples, 0) + max(num_val, 0) base_dataset = vf_env.get_dataset(n=total_requested, seed=args.seed) base_examples = ( base_dataset.to_list() @@ -511,14 +392,10 @@ def main(): else list(base_dataset) ) train_dataset_raw = ( - base_examples[: args.num_examples] - if args.num_examples > 0 - else base_examples + base_examples[:num_examples] if num_examples > 0 else base_examples ) val_dataset_raw = ( - base_examples[args.num_examples : args.num_examples + args.num_val] - if args.num_val > 0 - else [] + base_examples[num_examples : num_examples + num_val] if num_val > 0 else [] ) logger.debug( "Eval dataset missing; derived %s validation examples from train split", @@ -528,11 +405,11 @@ def main(): trainset = prepare_gepa_dataset(train_dataset_raw) valset = prepare_gepa_dataset(val_dataset_raw) - if args.num_examples > 0 and not trainset: + if num_examples > 0 and not trainset: raise ValueError( "Training dataset is empty - check environment configuration and filters" ) - if args.num_val > 0 and not valset: + if num_val > 0 and not valset: raise ValueError( "Validation dataset is empty - check environment configuration and filters" ) @@ -540,29 +417,16 @@ def main(): logger.info(f"Training set: {len(trainset)} examples") logger.info(f"Validation set: {len(valset)} examples") - reflection_api_key_var = args.reflection_api_key_var or client_config.api_key_var + # Get reflection API key reflection_api_key = os.getenv(reflection_api_key_var) if not reflection_api_key: raise ValueError( f"{reflection_api_key_var} environment variable not set for reflection client" ) - reflection_base_url = args.reflection_base_url - if not reflection_base_url: - base_url = getattr(client, "base_url", None) - reflection_base_url = str(base_url) if base_url else "https://api.openai.com/v1" - reflection_client_kwargs = { - "api_key": reflection_api_key, - "base_url": reflection_base_url, - } - if task_client_headers: - reflection_client_kwargs["default_headers"] = task_client_headers - reflection_client = OpenAI(**reflection_client_kwargs) - logger.debug( - "Reflection client configured for model %s at %s", - args.reflection_model, - reflection_base_url, - ) + # Use resolved reflection_base_url or fall back to task client base URL + if not reflection_base_url: + reflection_base_url = task_api_base_url # Extract seed candidate (initial component values) seed_candidate = {} @@ -583,11 +447,6 @@ def main(): logger.error("No valid components found to optimize!") 
return - logger.info("Initial component values:") - for comp, value in seed_candidate.items(): - preview = value[:200] + "..." if len(value) > 200 else value - logger.info(f" {comp}: {preview}") - # Setup log directory if args.log_dir: log_dir = Path(args.log_dir) @@ -610,80 +469,46 @@ def main(): logger.info(f"Budget: {max_metric_calls} metric calls total") - # Run GEPA - logger.info("=" * 80) - logger.info("Starting GEPA optimization...") - logger.info("=" * 80) - - try: - result = optimize( - seed_candidate=seed_candidate, - trainset=trainset, - valset=valset, - adapter=adapter, - max_metric_calls=max_metric_calls, - reflection_lm=lambda x: call_reflection_model( - reflection_client, - x, - args.reflection_model, - args.reflection_temperature, - args.reflection_max_tokens, - ), - reflection_minibatch_size=args.reflection_minibatch_size, - run_dir=str(log_dir), - track_best_outputs=args.track_stats, - seed=args.seed, - display_progress_bar=True, - ) - except Exception as e: - logger.error(f"GEPA optimization failed: {e}", exc_info=True) - raise - - # Print results - print("\n" + "=" * 80) - print("GEPA OPTIMIZATION COMPLETE") - print("=" * 80) - print(f"Best validation score: {max(result.val_aggregate_scores):.3f}") - print(f"Initial validation score: {result.val_aggregate_scores[0]:.3f}") - print( - f"Improvement: {max(result.val_aggregate_scores) - result.val_aggregate_scores[0]:.3f}" - ) - print(f"Total candidates explored: {len(result.candidates)}") - print("\nOptimized components:") - print("-" * 80) - - for comp, text in result.best_candidate.items(): - print(f"\n{comp}:") - print(textwrap.indent(text, " ")) - - # Prepare run configuration for saving - run_config = { - "env_id": args.env_id, - "model": args.model, - "reflection_model": args.reflection_model, - "reflection_temperature": args.reflection_temperature, - "components": args.components, - "trainset_size": len(trainset), - "valset_size": len(valset), - "rollouts_per_example": args.rollouts_per_example, - "max_metric_calls": max_metric_calls, - "reflection_minibatch_size": args.reflection_minibatch_size, - "seed": args.seed, - "temperature": args.temperature, - "max_tokens": args.max_tokens, - } - - # Save results - save_optimized_components( - args.env_id, result.best_candidate, seed_candidate, log_dir - ) - save_optimization_metrics(args.env_id, result, log_dir, run_config) - - print("\n" + "=" * 80) - print(f"Logs saved to: {log_dir}") - print("=" * 80) - - logger.info("GEPA optimization completed successfully!") + # Build GEPA config + gepa_config = GEPAConfig( + # environment + env_id=args.env_id, + env_args=env_args, + env_dir_path=args.env_dir_path, + # task model + model=args.model, + client_config=client_config, + sampling_args=merged_sampling_args, + # reflection model + reflection_model=args.reflection_model, + reflection_api_key=reflection_api_key, + reflection_base_url=reflection_base_url, + reflection_temperature=args.reflection_temperature, + reflection_max_tokens=args.reflection_max_tokens, + reflection_minibatch_size=args.reflection_minibatch_size, + # datasets + num_examples=num_examples, + num_val=num_val, + rollouts_per_example=rollouts_per_example, + trainset=trainset, + valset=valset, + # optimization + components_to_optimize=args.components, + seed_candidate=seed_candidate, + max_metric_calls=max_metric_calls, + # execution + max_concurrent=args.max_concurrent, + seed=args.seed, + # output + log_dir=log_dir, + save_results=args.save_results, + save_every=args.save_every, + 
track_stats=args.track_stats, + verbose=args.verbose, + ) + + # Run GEPA optimization + asyncio.run(run_gepa_optimization(gepa_config)) if __name__ == "__main__": diff --git a/verifiers/types.py b/verifiers/types.py index fef431c6f..30b95118e 100644 --- a/verifiers/types.py +++ b/verifiers/types.py @@ -247,3 +247,42 @@ class EvalConfig(BaseModel): save_every: int = -1 save_to_hf_hub: bool = False hf_hub_dataset_name: str | None = None + + +class GEPAConfig(BaseModel): + """Pydantic model for GEPA optimization configuration.""" + + # environment + env_id: str + env_args: dict + env_dir_path: str + # task model + model: str + client_config: ClientConfig + sampling_args: SamplingArgs + # reflection model + reflection_model: str + reflection_api_key: str + reflection_base_url: str + reflection_temperature: float + reflection_max_tokens: int + reflection_minibatch_size: int + # datasets + num_examples: int + num_val: int + rollouts_per_example: int + trainset: list[dict] + valset: list[dict] + # optimization + components_to_optimize: list[str] + seed_candidate: dict[str, str] + max_metric_calls: int + # execution + max_concurrent: int + seed: int + # output + log_dir: Path + save_results: bool + save_every: int + track_stats: bool + verbose: bool diff --git a/verifiers/utils/gepa_utils.py b/verifiers/utils/gepa_utils.py new file mode 100644 index 000000000..a8dda7c88 --- /dev/null +++ b/verifiers/utils/gepa_utils.py @@ -0,0 +1,531 @@ +"""Utility functions for GEPA optimization.""" + +import importlib.resources +import json +import logging +import math +import sys +import textwrap +from datetime import datetime +from pathlib import Path +from typing import Any, Dict + +try: + import tomllib # type: ignore[unresolved-import] +except ImportError: + import tomli as tomllib # type: ignore[unresolved-import] + +from openai import AsyncOpenAI, OpenAI + +import verifiers as vf +from verifiers.adapters.gepa import GEPAAdapter +from verifiers.types import GEPAConfig +from verifiers.utils.eval_utils import save_rollout_results + +logger = logging.getLogger(__name__) + +# Auto-budget constants for clarity and tuning +AUTO_BUDGET_CANDIDATES = { + "light": 6, + "medium": 12, + "heavy": 18, +} +TRIAL_LOG_BASE_MULTIPLIER = 2.0 +TRIAL_COMPONENT_MULTIPLIER = 2 +TRIAL_LINEAR_MULTIPLIER = 1.5 +BOOTSTRAP_TRIALS_PER_CANDIDATE = 5 + + +def get_env_gepa_defaults(env_id: str) -> Dict[str, Any]: + """Get GEPA config defaults from environment package's pyproject.toml. + + Returns dict with 'num_examples', 'num_val', and 'rollouts_per_example' keys if found, + otherwise returns empty dict. All errors are silently handled. 
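+
+    Illustrative [tool.verifiers.gepa] section in an environment's
+    pyproject.toml (the values are made up):
+
+        [tool.verifiers.gepa]
+        num_examples = 40
+        num_val = 10
+        rollouts_per_example = 2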
+ """ + defaults: Dict[str, Any] = {} + module_name = env_id.replace("-", "_").split("/")[-1] + + try: + # read pyproject.toml from installed package + package_ref = importlib.resources.files(module_name) + pyproject_file = package_ref / "pyproject.toml" + + if not pyproject_file.is_file(): + logger.debug(f"pyproject.toml not found in installed package {module_name}") + return defaults + + with pyproject_file.open("rb") as f: + pyproject_data = tomllib.load(f) + + # Extract [tool.verifiers.gepa] section + gepa_config = ( + pyproject_data.get("tool", {}).get("verifiers", {}).get("gepa", {}) + ) + + if "num_examples" in gepa_config: + defaults["num_examples"] = gepa_config["num_examples"] + if "num_val" in gepa_config: + defaults["num_val"] = gepa_config["num_val"] + if "rollouts_per_example" in gepa_config: + defaults["rollouts_per_example"] = gepa_config["rollouts_per_example"] + + if defaults: + logger.debug( + f"Loaded GEPA defaults from {module_name} pyproject.toml: {defaults}" + ) + except ModuleNotFoundError: + logger.debug(f"Package {module_name} not installed") + except Exception as e: + logger.debug( + f"Could not load GEPA defaults from {module_name} pyproject.toml: {e}" + ) + + return defaults + + +def ensure_env_dir_on_path(env_dir_path: str, env_id: str) -> None: + """Add local environment directory to sys.path if present.""" + env_dir = Path(env_dir_path).resolve() + if not env_dir.exists(): + return + module_name = env_id.replace("-", "_").split("/")[-1] + candidate = env_dir / module_name + if candidate.exists(): + env_dir_str = str(env_dir) + if env_dir_str not in sys.path: + sys.path.insert(0, env_dir_str) + logger.debug(f"Added {env_dir_str} to sys.path for environment loading") + + +async def save_candidate_rollouts( + adapter: GEPAAdapter, + candidate: dict[str, str], + label: str, + client: AsyncOpenAI, + model: str, + sampling_args: dict, + num_examples: int, + rollouts_per_example: int, + max_concurrent: int, + save_every: int, + log_dir: Path, +) -> None: + """ + Evaluate a candidate program and save rollout trajectories to disk. + """ + if num_examples <= 0: + logger.warning( + "Skipping rollout saving for %s candidate because num_examples<=0", label + ) + return + + env = adapter.build_program(candidate) + rollouts_dir = log_dir / "rollouts" / label + rollouts_dir.mkdir(parents=True, exist_ok=True) + logger.info( + "Saving %s candidate rollouts to %s (num_examples=%s, rollouts=%s)", + label, + rollouts_dir, + num_examples, + rollouts_per_example, + ) + results = await env.evaluate( + client=client, + model=model, + sampling_args=sampling_args, + num_examples=num_examples, + rollouts_per_example=rollouts_per_example, + max_concurrent=max_concurrent, + results_path=rollouts_dir, + save_results=False, + save_every=save_every, + ) + save_rollout_results(results) + + +def auto_budget_to_metric_calls( + auto: str, + num_components: int, + valset_size: int, + minibatch_size: int = 3, + full_eval_steps: int = 5, +) -> int: + """ + Convert auto budget (light/medium/heavy) to max_metric_calls. + + This replicates GEPA's auto_budget calculation for consistency. + + Args: + auto: Budget level ('light', 'medium', or 'heavy') + num_components: Number of components being optimized + valset_size: Size of validation set + minibatch_size: Reflection minibatch size + full_eval_steps: Steps between full validations + + Returns: + Maximum number of metric calls + """ + num_candidates = AUTO_BUDGET_CANDIDATES[auto] + + # Calculate number of trials using log-growth vs. 
linear fallback + log_trials = ( + TRIAL_LOG_BASE_MULTIPLIER + * (num_components * TRIAL_COMPONENT_MULTIPLIER) + * math.log2(num_candidates) + ) + linear_trials = TRIAL_LINEAR_MULTIPLIER * num_candidates + num_trials = int(max(log_trials, linear_trials)) + + V = valset_size + N = num_trials + M = minibatch_size + m = full_eval_steps + + # Initial full evaluation on the default program + total = V + + # Assume a handful of bootstrap trials per candidate + total += num_candidates * BOOTSTRAP_TRIALS_PER_CANDIDATE + + # N minibatch evaluations + total += N * M + + if N == 0: + return total + + # Periodic full evals + periodic_fulls = (N + 1) // m + 1 + extra_final = 1 if N < m else 0 + + total += (periodic_fulls + extra_final) * V + + logger.info( + f"Auto budget '{auto}' → ~{num_candidates} candidates, " + f"~{total} metric calls (~{total // (V or 1)} full evals)" + ) + + return total + + +def prepare_gepa_dataset(dataset) -> list[dict]: + """ + Convert HuggingFace Dataset to GEPA format. + + GEPA expects a list of dicts with keys like 'question', 'answer', 'info', 'task'. + """ + if dataset is None: + return [] + + examples = [] + for item in dataset: + example = { + "question": item.get("question", item.get("prompt", "")), + "answer": item.get("answer", ""), + "task": item.get("task", "default"), + "info": item.get("info", {}), + } + examples.append(example) + + return examples + + +def call_reflection_model( + client: OpenAI, + prompt: str, + model: str, + temperature: float = 1.0, + max_tokens: int | None = None, +) -> str: + """ + Call reflection model to generate proposal. + + This is a wrapper around the API call for GEPA's reflection phase. + """ + try: + request_args = { + "model": model, + "messages": [{"role": "user", "content": prompt}], + "temperature": temperature, + } + if max_tokens is not None: + request_args["max_tokens"] = max_tokens + response = client.chat.completions.create(**request_args) + return response.choices[0].message.content or "" + except Exception as e: + logger.error(f"Error calling reflection model: {e}") + raise + + +def save_optimized_components( + env_id: str, + best_candidate: dict[str, str], + seed_candidate: dict[str, str], + output_dir: Path, +): + """Save optimized components to disk for future use.""" + output_file = output_dir / f"{env_id}_optimized.json" + output_file.parent.mkdir(parents=True, exist_ok=True) + + with open(output_file, "w") as f: + json.dump(best_candidate, f, indent=2) + + logger.info(f"Saved optimized components to: {output_file}") + + # Also save the original (seed) components for comparison + original_file = output_dir / f"{env_id}_original.json" + with open(original_file, "w") as f: + json.dump(seed_candidate, f, indent=2) + + logger.info(f"Saved original components to: {original_file}") + + +def save_optimization_metrics( + env_id: str, + result, + output_dir: Path, + run_config: dict, +): + """Save optimization metrics and configuration for analysis.""" + metrics_file = output_dir / f"{env_id}_metrics.json" + + metrics = { + # Run configuration + "config": run_config, + # Timestamps + "date": datetime.now().strftime("%Y-%m-%d"), + "timestamp": datetime.now().isoformat(), + # Results + "val_aggregate_scores": result.val_aggregate_scores, + "num_candidates": len(result.candidates), + "best_val_score": ( + float(max(result.val_aggregate_scores)) + if result.val_aggregate_scores + else 0.0 + ), + "initial_val_score": ( + float(result.val_aggregate_scores[0]) + if result.val_aggregate_scores + else 0.0 + ), + "improvement": ( + 
float(max(result.val_aggregate_scores) - result.val_aggregate_scores[0]) + if len(result.val_aggregate_scores) > 0 + else 0.0 + ), + "candidates_history": [ + { + "iteration": i, + "score": float(score), + } + for i, score in enumerate(result.val_aggregate_scores) + ], + } + + with open(metrics_file, "w") as f: + json.dump(metrics, f, indent=2) + + logger.info(f"Saved optimization metrics to: {metrics_file}") + + +def print_optimization_results(result, log_dir: Path): + """Print GEPA optimization results to console.""" + print("\n" + "=" * 80) + print("GEPA OPTIMIZATION COMPLETE") + print("=" * 80) + print(f"Best validation score: {max(result.val_aggregate_scores):.3f}") + print(f"Initial validation score: {result.val_aggregate_scores[0]:.3f}") + print( + f"Improvement: {max(result.val_aggregate_scores) - result.val_aggregate_scores[0]:.3f}" + ) + print(f"Total candidates explored: {len(result.candidates)}") + print("\nOptimized components:") + print("-" * 80) + + for comp, text in result.best_candidate.items(): + print(f"\n{comp}:") + print(textwrap.indent(text, " ")) + + print("\n" + "=" * 80) + print(f"Logs saved to: {log_dir}") + print("=" * 80) + + +async def run_gepa_optimization(config: GEPAConfig): + """ + Run GEPA optimization with provided configuration. + + Handles: + - Adapter creation + - Reflection client setup + - GEPA optimize() call + - Result saving and output + + Args: + config: GEPAConfig with all optimization parameters + + Returns: + GEPA optimization result + """ + try: + from gepa import optimize + except ImportError: + print("Error: GEPA is not installed.") + print("Install with: uv add 'verifiers[gepa]'") + sys.exit(1) + + from verifiers.utils.client_utils import setup_client + + # Setup task client + client = setup_client(config.client_config) + logger.debug("Initialized OpenAI client") + + # Load environment + vf_env = vf.load_environment(env_id=config.env_id, **config.env_args) + + if isinstance(vf_env, vf.EnvGroup): + raise ValueError( + "GEPA optimization is not supported for EnvGroup environments. " + "Optimize each environment individually, then combine them." + ) + + # Validate components + for component in config.components_to_optimize: + if component == "tool_descriptions": + if not getattr(vf_env, "oai_tools", None): + raise ValueError( + "Cannot optimize tool_descriptions: " + f"environment '{config.env_id}' has no tools configured." + ) + elif not hasattr(vf_env, component): + raise ValueError( + f"Environment '{config.env_id}' is missing component '{component}'. " + "Provide a component that exists on the environment." + ) + + # Create adapter + adapter = GEPAAdapter( + env=vf_env, + client=client, + model=config.model, + sampling_args=config.sampling_args, + components_to_optimize=config.components_to_optimize, + num_rollouts_per_example=config.rollouts_per_example, + max_concurrent=config.max_concurrent, + ) + + # Setup reflection client + reflection_client_kwargs = { + "api_key": config.reflection_api_key, + "base_url": config.reflection_base_url, + } + if config.client_config.extra_headers: + reflection_client_kwargs["default_headers"] = config.client_config.extra_headers + reflection_client = OpenAI(**reflection_client_kwargs) + logger.debug( + "Reflection client configured for model %s at %s", + config.reflection_model, + config.reflection_base_url, + ) + + # Log initial component values + logger.info("Initial component values:") + for comp, value in config.seed_candidate.items(): + preview = value[:200] + "..." 
if len(value) > 200 else value + logger.info(f" {comp}: {preview}") + + # Run GEPA + logger.info("=" * 80) + logger.info("Starting GEPA optimization...") + logger.info("=" * 80) + + try: + result = optimize( + seed_candidate=config.seed_candidate, + trainset=config.trainset, + valset=config.valset, + adapter=adapter, + max_metric_calls=config.max_metric_calls, + reflection_lm=lambda x: call_reflection_model( + reflection_client, + x, + config.reflection_model, + config.reflection_temperature, + config.reflection_max_tokens, + ), + reflection_minibatch_size=config.reflection_minibatch_size, + run_dir=str(config.log_dir), + track_best_outputs=config.track_stats, + seed=config.seed, + display_progress_bar=True, + ) + except Exception as e: + logger.error(f"GEPA optimization failed: {e}", exc_info=True) + raise + + # Print results + print_optimization_results(result, config.log_dir) + + # Prepare run configuration for saving + run_config = { + "env_id": config.env_id, + "model": config.model, + "reflection_model": config.reflection_model, + "reflection_temperature": config.reflection_temperature, + "components": config.components_to_optimize, + "trainset_size": len(config.trainset), + "valset_size": len(config.valset), + "rollouts_per_example": config.rollouts_per_example, + "max_metric_calls": config.max_metric_calls, + "reflection_minibatch_size": config.reflection_minibatch_size, + "seed": config.seed, + "max_concurrent": config.max_concurrent, + } + + # Save results + save_optimized_components( + config.env_id, result.best_candidate, config.seed_candidate, config.log_dir + ) + save_optimization_metrics(config.env_id, result, config.log_dir, run_config) + + # Save rollouts if requested + if config.save_results: + save_every = config.save_every if config.save_every > 0 else -1 + val_examples_for_logging = ( + config.num_val if config.num_val > 0 else config.num_examples + ) + + async def save_all_candidates(): + await save_candidate_rollouts( + adapter=adapter, + candidate=config.seed_candidate, + label="seed", + client=client, + model=config.model, + sampling_args=config.sampling_args, + num_examples=val_examples_for_logging, + rollouts_per_example=config.rollouts_per_example, + max_concurrent=config.max_concurrent, + save_every=save_every, + log_dir=config.log_dir, + ) + await save_candidate_rollouts( + adapter=adapter, + candidate=result.best_candidate, + label="best", + client=client, + model=config.model, + sampling_args=config.sampling_args, + num_examples=val_examples_for_logging, + rollouts_per_example=config.rollouts_per_example, + max_concurrent=config.max_concurrent, + save_every=save_every, + log_dir=config.log_dir, + ) + + try: + await save_all_candidates() + except RuntimeError as exc: + logger.error(f"Failed to save rollout trajectories: {exc}") + + logger.info("GEPA optimization completed successfully!") + return result From 57d54394156bf591d2ffbff55edf0d9a4e6ba846 Mon Sep 17 00:00:00 2001 From: Robin Salimans Date: Mon, 24 Nov 2025 12:19:17 +0100 Subject: [PATCH 05/16] renamed '--auto' to '--budget', adjusted default minibatch size --- README.md | 4 +-- docs/source/gepa.md | 61 ++++++++++++++++++++--------------- integrations/gepa/README.md | 10 +++--- verifiers/scripts/gepa.py | 23 ++++++------- verifiers/utils/gepa_utils.py | 6 ++-- 5 files changed, 57 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index 86b529d74..dd053cb2e 100644 --- a/README.md +++ b/README.md @@ -84,10 +84,10 @@ Automatically improve your environment's prompts using GEPA 
(Genetic-Pareto): uv add 'verifiers[gepa]' # Optimize system prompt -vf-gepa wordle --auto medium +vf-gepa wordle --budget medium # Optimize system prompt + tool descriptions -vf-gepa wiki-search --auto heavy --components system_prompt tool_descriptions +vf-gepa wiki-search --budget heavy --components system_prompt tool_descriptions ``` GEPA analyzes your rubric's feedback and iteratively refines prompts. Works best when reward functions return rich textual feedback. See the [GEPA documentation](docs/source/gepa.md) for details. diff --git a/docs/source/gepa.md b/docs/source/gepa.md index d5bd8c0ec..67965e77d 100644 --- a/docs/source/gepa.md +++ b/docs/source/gepa.md @@ -27,7 +27,7 @@ This installs the `gepa` optimization engine. Optimize the system prompt for an environment: ```bash -vf-gepa wordle --auto medium +vf-gepa wordle --budget medium ``` This will: @@ -38,33 +38,33 @@ This will: ## Budget Modes -GEPA offers three auto budget levels: +GEPA offers three budget presets: ### Light (~6 candidates) Fast iteration for testing: ```bash -vf-gepa my-env --auto light +vf-gepa my-env --budget light ``` -- Best for: Quick experiments, sanity checks -- Time: ~5-10 minutes for simple environments -- Use when: Testing GEPA setup, iterating rapidly +- Best for: Quick experiments, initial testing +- Time: ~30-60 minutes for typical environments +- Use when: Testing GEPA setup, first optimization runs ### Medium (~12 candidates) Balanced optimization: ```bash -vf-gepa my-env --auto medium +vf-gepa my-env --budget medium ``` - Best for: Most use cases, good improvements -- Time: ~15-30 minutes for simple environments +- Time: ~1-2 hours for typical environments - Use when: Standard optimization runs ### Heavy (~18 candidates) Thorough exploration: ```bash -vf-gepa my-env --auto heavy +vf-gepa my-env --budget heavy ``` - Best for: Final production prompts, critical environments -- Time: ~30-60 minutes for simple environments +- Time: ~2-4 hours for typical environments - Use when: You need the best possible prompt ### Custom Budget @@ -74,24 +74,33 @@ For fine control, specify exact metric calls: vf-gepa my-env --max-metric-calls 1000 ``` +### Faster Iteration + +For quicker feedback cycles (at the cost of potentially noisier signals), reduce the minibatch size: +```bash +vf-gepa my-env --budget light --reflection-minibatch-size 10 +``` + +The default minibatch size is 35 examples per reflection step. Smaller values (5-15) trade stability for speed, useful during initial experimentation. + ## Component Selection By default, GEPA optimizes `system_prompt`. 
You can specify multiple components: ### System Prompt Only ```bash -vf-gepa my-env --auto medium --components system_prompt +vf-gepa my-env --budget medium --components system_prompt ``` ### Tool Descriptions For environments with tools, optimize their descriptions: ```bash -vf-gepa wiki-search --auto medium --components tool_descriptions +vf-gepa wiki-search --budget medium --components tool_descriptions ``` ### Both System Prompt and Tool Descriptions ```bash -vf-gepa wiki-search --auto heavy --components system_prompt tool_descriptions +vf-gepa wiki-search --budget heavy --components system_prompt tool_descriptions ``` When optimizing `tool_descriptions`, GEPA: @@ -105,18 +114,18 @@ When optimizing `tool_descriptions`, GEPA: ### Task Model The model being optimized (default: `gpt-4o-mini`): ```bash -vf-gepa my-env --auto medium -m gpt-4o +vf-gepa my-env --budget medium -m gpt-4o ``` ### Reflection Model The model generating improved prompts (default: `gpt-4o`): ```bash -vf-gepa my-env --auto medium --reflection-model gpt-4o +vf-gepa my-env --budget medium --reflection-model gpt-4o ``` ### Sampling Parameters ```bash -vf-gepa my-env --auto medium \ +vf-gepa my-env --budget medium \ -T 0.7 \ # Temperature for task model -t 2048 \ # Max tokens --reflection-temperature 1.0 # Temperature for reflection @@ -127,7 +136,7 @@ vf-gepa my-env --auto medium \ Control train/validation split sizes: ```bash -vf-gepa my-env --auto medium \ +vf-gepa my-env --budget medium \ -n 100 \ # 100 training examples --num-val 30 # 30 validation examples ``` @@ -196,24 +205,24 @@ The `feedback` field is used by GEPA to understand *why* completions failed, ena ### Multiple Rollouts Per Example Increase robustness with multiple rollouts: ```bash -vf-gepa my-env --auto medium --rollouts-per-example 3 +vf-gepa my-env --budget medium --rollouts-per-example 3 ``` ### Custom Log Directory ```bash -vf-gepa my-env --auto medium --log-dir ./my_optimization_runs +vf-gepa my-env --budget medium --log-dir ./my_optimization_runs ``` ### Track Detailed Statistics Save full outputs for analysis: ```bash -vf-gepa my-env --auto medium --track-stats +vf-gepa my-env --budget medium --track-stats ``` ### Verbose Logging Debug optimization process: ```bash -vf-gepa my-env --auto medium -v +vf-gepa my-env --budget medium -v ``` ## Best Practices @@ -238,7 +247,7 @@ return 0.5 # GEPA will only see the number Ensure your training and validation sets cover the full range of task difficulty and variety. ### 3. Start Light, Then Scale Up -Begin with `--auto light` to verify everything works, then use `medium` or `heavy` for production. +Begin with `--budget light` to verify everything works, then use `medium` or `heavy` for production. ### 4. Iterate on Feedback Quality If GEPA improvements are small, review your rubric's feedback. More specific feedback = better improvements. @@ -273,7 +282,7 @@ Check that your environment exposes the component you're trying to optimize. Use - GEPA expects deterministic environment construction. Expensive setup code will re-run for every candidate. ### Low Improvement -- Increase budget: Use `--auto heavy` or `--max-metric-calls 2000` +- Increase budget: Use `--budget heavy` or `--max-metric-calls 2000` - Improve feedback: Make your rubric's feedback more specific - Add more examples: Use `-n 100 --num-val 30` - Check dataset quality: Ensure examples are representative @@ -287,12 +296,12 @@ Check that your environment exposes the component you're trying to optimize. 
Use ### Basic Optimization ```bash -vf-gepa wordle --auto medium +vf-gepa wordle --budget medium ``` ### Tool-Using Environment ```bash -vf-gepa wiki-search --auto heavy \ +vf-gepa wiki-search --budget heavy \ --components system_prompt tool_descriptions \ -m gpt-4o ``` @@ -307,7 +316,7 @@ vf-gepa my-env --max-metric-calls 2000 \ ### Custom Models ```bash -vf-gepa my-env --auto medium \ +vf-gepa my-env --budget medium \ -m claude-3-5-sonnet-20241022 \ --reflection-model gpt-4o ``` diff --git a/integrations/gepa/README.md b/integrations/gepa/README.md index efa57ac00..a67e2f0c9 100644 --- a/integrations/gepa/README.md +++ b/integrations/gepa/README.md @@ -24,13 +24,13 @@ This installs the `gepa` package (>=0.0.22). Optimize a system prompt: ```bash -vf-gepa wordle --auto medium +vf-gepa wordle --budget medium ``` Optimize system prompt + tool descriptions: ```bash -vf-gepa wiki-search --auto heavy --components system_prompt tool_descriptions +vf-gepa wiki-search --budget heavy --components system_prompt tool_descriptions ``` ## Components @@ -99,7 +99,7 @@ When optimizing `tool_descriptions`, the adapter: Example: ```bash -vf-gepa my-env --components tool_descriptions --auto medium +vf-gepa my-env --components tool_descriptions --budget medium ``` ## Architecture @@ -181,7 +181,7 @@ Full documentation: [`docs/source/gepa.md`](../../docs/source/gepa.md) ```bash # Basic -vf-gepa ENV_ID --auto light|medium|heavy +vf-gepa ENV_ID --budget light|medium|heavy # Advanced vf-gepa ENV_ID \ @@ -195,7 +195,7 @@ vf-gepa ENV_ID \ # Options -n, --num-examples Training examples (default: 50) --num-val Validation examples (default: 20) - --auto Budget: light/medium/heavy + --budget Budget preset: light/medium/heavy --max-metric-calls Custom budget (total metric calls) --components What to optimize (default: system_prompt) -m, --model Task model (default: gpt-4o-mini) diff --git a/verifiers/scripts/gepa.py b/verifiers/scripts/gepa.py index 7dc96b1bb..27e473783 100644 --- a/verifiers/scripts/gepa.py +++ b/verifiers/scripts/gepa.py @@ -3,8 +3,8 @@ GEPA optimization script for Verifiers environments. Usage: - vf-gepa wordle --auto light - vf-gepa wiki-search --auto heavy --components system_prompt tool_descriptions + vf-gepa wordle --budget light + vf-gepa wiki-search --budget heavy --components system_prompt tool_descriptions vf-gepa my-env --max-metric-calls 1000 -n 100 --num-val 30 """ @@ -52,10 +52,10 @@ def main(): epilog=""" Examples: # Light optimization (quick test) - vf-gepa wordle --auto light + vf-gepa wordle --budget light # Heavy optimization with tool descriptions - vf-gepa wiki-search --auto heavy --components system_prompt tool_descriptions + vf-gepa wiki-search --budget heavy --components system_prompt tool_descriptions # Custom configuration vf-gepa my-env --max-metric-calls 1000 -n 100 --num-val 30 @@ -178,9 +178,10 @@ def main(): # 8. 
GEPA budget (mutually exclusive) budget_group = parser.add_mutually_exclusive_group(required=True) budget_group.add_argument( - "--auto", + "--budget", + "-B", choices=["light", "medium", "heavy"], - help="Auto budget: light (~6 candidates), medium (~12), heavy (~18)", + help="Budget preset: light (~6 candidates), medium (~12), heavy (~18)", ) budget_group.add_argument( "--max-metric-calls", type=int, help="Maximum total metric calls budget" @@ -223,8 +224,8 @@ def main(): parser.add_argument( "--reflection-minibatch-size", type=int, - default=3, - help="Number of examples per reflection step (default: 3)", + default=35, + help="Number of examples per reflection step (default: 35)", ) # 10. Output/Logging @@ -456,10 +457,10 @@ def main(): log_dir.mkdir(parents=True, exist_ok=True) logger.info(f"Log directory: {log_dir}") - # Convert auto budget to max_metric_calls if needed - if args.auto: + # Convert budget preset to max_metric_calls if needed + if args.budget: max_metric_calls = auto_budget_to_metric_calls( - auto=args.auto, + auto=args.budget, num_components=len(seed_candidate), valset_size=len(valset), minibatch_size=args.reflection_minibatch_size, diff --git a/verifiers/utils/gepa_utils.py b/verifiers/utils/gepa_utils.py index a8dda7c88..8e07fb451 100644 --- a/verifiers/utils/gepa_utils.py +++ b/verifiers/utils/gepa_utils.py @@ -147,19 +147,19 @@ def auto_budget_to_metric_calls( auto: str, num_components: int, valset_size: int, - minibatch_size: int = 3, + minibatch_size: int = 35, full_eval_steps: int = 5, ) -> int: """ Convert auto budget (light/medium/heavy) to max_metric_calls. - This replicates GEPA's auto_budget calculation for consistency. + This replicates DSPy's auto_budget calculation for consistency. Args: auto: Budget level ('light', 'medium', or 'heavy') num_components: Number of components being optimized valset_size: Size of validation set - minibatch_size: Reflection minibatch size + minibatch_size: Reflection minibatch size (default: 35, matching DSPy) full_eval_steps: Steps between full validations Returns: From 92af75e051510cbdabe75914a1ef38a34b049dbd Mon Sep 17 00:00:00 2001 From: Robin Salimans Date: Mon, 24 Nov 2025 12:36:36 +0100 Subject: [PATCH 06/16] a few bugfixes in gepa adapter --- verifiers/adapters/gepa/adapter.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/verifiers/adapters/gepa/adapter.py b/verifiers/adapters/gepa/adapter.py index ff45e2d11..3e939ca6a 100644 --- a/verifiers/adapters/gepa/adapter.py +++ b/verifiers/adapters/gepa/adapter.py @@ -9,6 +9,7 @@ import asyncio import inspect import logging +from concurrent.futures import ThreadPoolExecutor from copy import deepcopy from typing import Any @@ -161,6 +162,10 @@ def build_program(self, candidate: dict[str, str]) -> vf.Environment: for comp_name, comp_value in candidate.items(): if comp_name.startswith("tool_") and comp_name.endswith("_description"): continue + # Never pass dataset/eval_dataset - some envs create these internally + # and would get duplicate arguments + if comp_name in {"dataset", "eval_dataset"}: + continue if comp_name in signature.parameters or accepts_kwargs: init_kwargs[comp_name] = comp_value else: @@ -168,7 +173,13 @@ def build_program(self, candidate: dict[str, str]) -> vf.Environment: # Provide minimal dataset if none exists (adapter provides inputs directly) # This avoids copying large datasets and improves performance - if "dataset" not in init_kwargs and "eval_dataset" not in init_kwargs: + # Only add if dataset is an 
explicit parameter (not just accepted via **kwargs) + # Some envs like TextArenaEnv create dataset internally + if ( + "dataset" not in init_kwargs + and "eval_dataset" not in init_kwargs + and "dataset" in signature.parameters + ): init_kwargs["dataset"] = vf.load_example_dataset(n=1) try: @@ -212,12 +223,13 @@ def evaluate( try: asyncio.get_running_loop() except RuntimeError: + # No running loop - create one return asyncio.run(evaluation) - raise RuntimeError( - "GEPAAdapter.evaluate() cannot run inside an active asyncio loop. " - "Use 'await adapter.evaluate_async(...)' instead." - ) + # Already in an event loop - run in a thread pool to avoid blocking + with ThreadPoolExecutor(max_workers=1) as executor: + future = executor.submit(asyncio.run, evaluation) + return future.result() async def evaluate_async( self, From 7df2a43c19047c27a2e0f8771b01af6e21a7189d Mon Sep 17 00:00:00 2001 From: Robin Salimans Date: Mon, 24 Nov 2025 14:04:30 +0100 Subject: [PATCH 07/16] unified log path with vf-eval --- verifiers/adapters/gepa/adapter.py | 15 +++++++++++++++ verifiers/scripts/gepa.py | 16 ---------------- verifiers/types.py | 1 - verifiers/utils/gepa_utils.py | 21 +++++++++++++-------- verifiers/utils/path_utils.py | 22 ++++++++++++++++++---- 5 files changed, 46 insertions(+), 29 deletions(-) diff --git a/verifiers/adapters/gepa/adapter.py b/verifiers/adapters/gepa/adapter.py index 3e939ca6a..c5b5675de 100644 --- a/verifiers/adapters/gepa/adapter.py +++ b/verifiers/adapters/gepa/adapter.py @@ -60,6 +60,7 @@ def __init__( self.components_to_optimize = components_to_optimize or ["system_prompt"] self.num_rollouts_per_example = num_rollouts_per_example self.max_concurrent = max_concurrent + self._candidate_build_count = 0 # Track candidate environment builds if self.num_rollouts_per_example < 1: raise ValueError("num_rollouts_per_example must be at least 1") @@ -95,6 +96,12 @@ def build_program(self, candidate: dict[str, str]) -> vf.Environment: """ Reconstruct a fresh Environment instance with updated components. 
""" + self._candidate_build_count += 1 + logger.debug( + f"Building candidate environment #{self._candidate_build_count} " + f"with components: {list(candidate.keys())}" + ) + env_class = self.base_env.__class__ signature = inspect.signature(env_class.__init__) accepts_kwargs = any( @@ -196,6 +203,9 @@ def build_program(self, candidate: dict[str, str]) -> vf.Environment: if updated_oai_tools is not None: new_env.oai_tools = updated_oai_tools + logger.debug( + f"Successfully built {env_class.__name__} candidate #{self._candidate_build_count}" + ) return new_env def evaluate( @@ -218,6 +228,11 @@ def evaluate( # Build environment with candidate components env = self.build_program(candidate) + logger.debug( + f"Evaluating candidate on batch of {len(batch)} examples " + f"({self.num_rollouts_per_example} rollouts/example = {len(batch) * self.num_rollouts_per_example} total rollouts)" + ) + # Run evaluation using Environment's evaluate method evaluation = self._evaluate_async(env, batch, capture_traces) try: diff --git a/verifiers/scripts/gepa.py b/verifiers/scripts/gepa.py index 27e473783..64a27bc7f 100644 --- a/verifiers/scripts/gepa.py +++ b/verifiers/scripts/gepa.py @@ -14,8 +14,6 @@ import logging import os import sys -import uuid -from pathlib import Path try: from gepa import optimize # noqa: F401 @@ -243,10 +241,6 @@ def main(): default=-1, help="Save rollout trajectories every n evaluations during optimization", ) - parser.add_argument( - "--log-dir", - help="Directory for GEPA logs (default: ./gepa_results//)", - ) parser.add_argument( "--track-stats", action="store_true", @@ -448,15 +442,6 @@ def main(): logger.error("No valid components found to optimize!") return - # Setup log directory - if args.log_dir: - log_dir = Path(args.log_dir) - else: - run_id = str(uuid.uuid4())[:8] - log_dir = Path(f"./gepa_results/{args.env_id}/{run_id}") - log_dir.mkdir(parents=True, exist_ok=True) - logger.info(f"Log directory: {log_dir}") - # Convert budget preset to max_metric_calls if needed if args.budget: max_metric_calls = auto_budget_to_metric_calls( @@ -501,7 +486,6 @@ def main(): max_concurrent=args.max_concurrent, seed=args.seed, # output - log_dir=log_dir, save_results=args.save_results, save_every=args.save_every, track_stats=args.track_stats, diff --git a/verifiers/types.py b/verifiers/types.py index 30b95118e..ed867ae6d 100644 --- a/verifiers/types.py +++ b/verifiers/types.py @@ -281,7 +281,6 @@ class GEPAConfig(BaseModel): max_concurrent: int seed: int # output - log_dir: Path save_results: bool save_every: int track_stats: bool diff --git a/verifiers/utils/gepa_utils.py b/verifiers/utils/gepa_utils.py index 8e07fb451..aa04cd03d 100644 --- a/verifiers/utils/gepa_utils.py +++ b/verifiers/utils/gepa_utils.py @@ -20,7 +20,9 @@ import verifiers as vf from verifiers.adapters.gepa import GEPAAdapter from verifiers.types import GEPAConfig +from verifiers.utils.client_utils import setup_client from verifiers.utils.eval_utils import save_rollout_results +from verifiers.utils.path_utils import get_gepa_results_path logger = logging.getLogger(__name__) @@ -337,7 +339,7 @@ def print_optimization_results(result, log_dir: Path): print( f"Improvement: {max(result.val_aggregate_scores) - result.val_aggregate_scores[0]:.3f}" ) - print(f"Total candidates explored: {len(result.candidates)}") + print(f"Total candidates fully explored: {len(result.candidates)}") print("\nOptimized components:") print("-" * 80) @@ -373,7 +375,10 @@ async def run_gepa_optimization(config: GEPAConfig): print("Install with: uv 
add 'verifiers[gepa]'") sys.exit(1) - from verifiers.utils.client_utils import setup_client + # Setup log directory + log_dir = get_gepa_results_path(config) + log_dir.mkdir(parents=True, exist_ok=True) + logger.info(f"Log directory: {log_dir}") # Setup task client client = setup_client(config.client_config) @@ -453,7 +458,7 @@ async def run_gepa_optimization(config: GEPAConfig): config.reflection_max_tokens, ), reflection_minibatch_size=config.reflection_minibatch_size, - run_dir=str(config.log_dir), + run_dir=str(log_dir), track_best_outputs=config.track_stats, seed=config.seed, display_progress_bar=True, @@ -463,7 +468,7 @@ async def run_gepa_optimization(config: GEPAConfig): raise # Print results - print_optimization_results(result, config.log_dir) + print_optimization_results(result, log_dir) # Prepare run configuration for saving run_config = { @@ -483,9 +488,9 @@ async def run_gepa_optimization(config: GEPAConfig): # Save results save_optimized_components( - config.env_id, result.best_candidate, config.seed_candidate, config.log_dir + config.env_id, result.best_candidate, config.seed_candidate, log_dir ) - save_optimization_metrics(config.env_id, result, config.log_dir, run_config) + save_optimization_metrics(config.env_id, result, log_dir, run_config) # Save rollouts if requested if config.save_results: @@ -506,7 +511,7 @@ async def save_all_candidates(): rollouts_per_example=config.rollouts_per_example, max_concurrent=config.max_concurrent, save_every=save_every, - log_dir=config.log_dir, + log_dir=log_dir, ) await save_candidate_rollouts( adapter=adapter, @@ -519,7 +524,7 @@ async def save_all_candidates(): rollouts_per_example=config.rollouts_per_example, max_concurrent=config.max_concurrent, save_every=save_every, - log_dir=config.log_dir, + log_dir=log_dir, ) try: diff --git a/verifiers/utils/path_utils.py b/verifiers/utils/path_utils.py index 6ab89923b..70547e132 100644 --- a/verifiers/utils/path_utils.py +++ b/verifiers/utils/path_utils.py @@ -1,17 +1,18 @@ import uuid from pathlib import Path -from verifiers.types import EvalConfig +from verifiers.types import EvalConfig, GEPAConfig def get_results_path( env_id: str, model: str, base_path: Path = Path("./outputs"), + subdir: str = "evals", ) -> Path: uuid_str = str(uuid.uuid4())[:8] env_model_str = f"{env_id}--{model.replace('/', '--')}" - return base_path / "evals" / env_model_str / uuid_str + return base_path / subdir / env_model_str / uuid_str def get_eval_results_path(config: EvalConfig) -> Path: @@ -20,8 +21,21 @@ def get_eval_results_path(config: EvalConfig) -> Path: if local_env_dir.exists(): base_path = local_env_dir / "outputs" - results_path = get_results_path(config.env_id, config.model, base_path) + results_path = get_results_path(config.env_id, config.model, base_path, "evals") else: base_path = Path("./outputs") - results_path = get_results_path(config.env_id, config.model, base_path) + results_path = get_results_path(config.env_id, config.model, base_path, "evals") + return results_path + + +def get_gepa_results_path(config: GEPAConfig) -> Path: + module_name = config.env_id.replace("-", "_") + local_env_dir = Path(config.env_dir_path) / module_name + + if local_env_dir.exists(): + base_path = local_env_dir / "outputs" + results_path = get_results_path(config.env_id, config.model, base_path, "gepa") + else: + base_path = Path("./outputs") + results_path = get_results_path(config.env_id, config.model, base_path, "gepa") return results_path From dbbfff75c4678213baca8a09a58b5d3687e468a6 Mon Sep 17 00:00:00 2001 
From: Robin Salimans Date: Tue, 25 Nov 2025 09:44:10 +0100 Subject: [PATCH 08/16] changed location of gepa adapter --- verifiers/adapters/{gepa/adapter.py => gepa.py} | 0 verifiers/adapters/gepa/__init__.py | 5 ----- 2 files changed, 5 deletions(-) rename verifiers/adapters/{gepa/adapter.py => gepa.py} (100%) delete mode 100644 verifiers/adapters/gepa/__init__.py diff --git a/verifiers/adapters/gepa/adapter.py b/verifiers/adapters/gepa.py similarity index 100% rename from verifiers/adapters/gepa/adapter.py rename to verifiers/adapters/gepa.py diff --git a/verifiers/adapters/gepa/__init__.py b/verifiers/adapters/gepa/__init__.py deleted file mode 100644 index cdff1d841..000000000 --- a/verifiers/adapters/gepa/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""GEPA adapter packaged for verifiers installations.""" - -from .adapter import GEPAAdapter - -__all__ = ["GEPAAdapter"] From ffebc614e3d5a60db95250f867e3f4fa21209e73 Mon Sep 17 00:00:00 2001 From: Robin Salimans Date: Tue, 25 Nov 2025 10:23:21 +0100 Subject: [PATCH 09/16] fail fast and loud + fixed unit test --- tests/test_gepa.py | 172 ++++++++++++++++++++++++++++++++++ verifiers/adapters/gepa.py | 29 +++--- verifiers/scripts/gepa.py | 10 +- verifiers/utils/gepa_utils.py | 14 +-- 4 files changed, 201 insertions(+), 24 deletions(-) diff --git a/tests/test_gepa.py b/tests/test_gepa.py index 1496fd59e..9466500be 100644 --- a/tests/test_gepa.py +++ b/tests/test_gepa.py @@ -189,6 +189,178 @@ def test_gepa_adapter_build_program(self): assert len(new_env.dataset) == 1 # But it's minimal (dummy) assert new_env.dataset is not env.dataset # Not the same reference + def test_gepa_adapter_build_program_multiturn_env(self): + """Test build_program with MultiTurnEnv (uses **kwargs).""" + GEPAAdapter = require_gepa_adapter() + + # Create a simple MultiTurnEnv + dataset = vf.load_example_dataset(n=5) + + class TestMultiTurnEnv(vf.MultiTurnEnv): + async def env_response(self, messages, state, **kwargs): + return [{"role": "user", "content": "test"}] + + env = TestMultiTurnEnv( + dataset=dataset, + system_prompt="Original prompt", + rubric=vf.Rubric(), + max_turns=3, + ) + + client = AsyncMock() + adapter = GEPAAdapter( + env=env, + client=client, + model="gpt-4o-mini", + sampling_args={}, + components_to_optimize=["system_prompt"], + ) + + candidate = {"system_prompt": "Optimized prompt"} + new_env = adapter.build_program(candidate) + + # Verify component was updated + assert new_env.system_prompt == "Optimized prompt" + # Verify dataset was replaced with minimal dummy + assert new_env.dataset is not None + assert len(new_env.dataset) == 1 + assert new_env.dataset is not env.dataset + + def test_gepa_adapter_build_program_tool_env(self): + """Test build_program with ToolEnv.""" + GEPAAdapter = require_gepa_adapter() + + def example_tool(x: int) -> int: + return x * 2 + + dataset = vf.load_example_dataset(n=5) + + class TestToolEnv(vf.ToolEnv): + def __init__(self, **kwargs): + super().__init__(tools=[example_tool], **kwargs) + + env = TestToolEnv( + dataset=dataset, + system_prompt="Use the tool", + rubric=vf.Rubric(), + ) + + client = AsyncMock() + adapter = GEPAAdapter( + env=env, + client=client, + model="gpt-4o-mini", + sampling_args={}, + components_to_optimize=["system_prompt"], + ) + + candidate = {"system_prompt": "Use the tool wisely"} + new_env = adapter.build_program(candidate) + + # Verify component was updated + assert new_env.system_prompt == "Use the tool wisely" + # Verify dataset was replaced with minimal dummy + assert new_env.dataset is 
not None + assert len(new_env.dataset) == 1 + assert new_env.oai_tools is not None # Tools preserved + + def test_gepa_adapter_build_program_stateful_tool_env(self): + """Test build_program with StatefulToolEnv.""" + GEPAAdapter = require_gepa_adapter() + + def stateful_tool(x: int, state_val: int) -> int: + return x + state_val + + dataset = vf.load_example_dataset(n=5) + + class TestStatefulToolEnv(vf.StatefulToolEnv): + def __init__(self, **kwargs): + super().__init__(tools=[stateful_tool], **kwargs) + + def update_tool_args(self, tool_name, tool_args, messages, state, **kwargs): + return {**tool_args, "state_val": 10} + + env = TestStatefulToolEnv( + dataset=dataset, + system_prompt="Stateful tool env", + rubric=vf.Rubric(), + ) + + client = AsyncMock() + adapter = GEPAAdapter( + env=env, + client=client, + model="gpt-4o-mini", + sampling_args={}, + components_to_optimize=["system_prompt"], + ) + + candidate = {"system_prompt": "Updated stateful prompt"} + new_env = adapter.build_program(candidate) + + # Verify component was updated + assert new_env.system_prompt == "Updated stateful prompt" + # Verify dataset was replaced with minimal dummy + assert new_env.dataset is not None + assert len(new_env.dataset) == 1 + + def test_gepa_adapter_build_program_internal_dataset_env(self): + """Test build_program with env that creates dataset internally.""" + GEPAAdapter = require_gepa_adapter() + + class InternalDatasetEnv(vf.SingleTurnEnv): + """Mock env that creates dataset internally like TextArenaEnv.""" + + def __init__( + self, + num_train_examples: int = 10, + num_eval_examples: int = 0, + system_prompt: str | None = None, + **kwargs, + ): + # Create dataset internally (like TextArenaEnv does) + from datasets import Dataset + + rows = [ + {"question": f"q{i}", "answer": f"a{i}"} + for i in range(num_train_examples) + ] + dataset = Dataset.from_list(rows) + + self.num_train_examples = num_train_examples + self.num_eval_examples = num_eval_examples + + super().__init__( + dataset=dataset, + system_prompt=system_prompt, + rubric=vf.Rubric(), + **kwargs, + ) + + env = InternalDatasetEnv( + num_train_examples=100, + system_prompt="Internal dataset env", + ) + + client = AsyncMock() + adapter = GEPAAdapter( + env=env, + client=client, + model="gpt-4o-mini", + sampling_args={}, + components_to_optimize=["system_prompt"], + ) + + candidate = {"system_prompt": "Updated internal prompt"} + new_env = adapter.build_program(candidate) + + # Verify component was updated + assert new_env.system_prompt == "Updated internal prompt" + # Verify dataset was created internally (not the dummy one) + assert new_env.dataset is not None + assert len(new_env.dataset) == 100 # Created internally with num_train_examples + assert new_env.num_train_examples == 100 + def test_gepa_adapter_extract_seed_candidate(self): """Test extracting seed candidate from environment.""" dataset = vf.load_example_dataset(n=5) diff --git a/verifiers/adapters/gepa.py b/verifiers/adapters/gepa.py index c5b5675de..af9300132 100644 --- a/verifiers/adapters/gepa.py +++ b/verifiers/adapters/gepa.py @@ -180,13 +180,13 @@ def build_program(self, candidate: dict[str, str]) -> vf.Environment: # Provide minimal dataset if none exists (adapter provides inputs directly) # This avoids copying large datasets and improves performance - # Only add if dataset is an explicit parameter (not just accepted via **kwargs) - # Some envs like TextArenaEnv create dataset internally - if ( - "dataset" not in init_kwargs - and "eval_dataset" not in init_kwargs - 
and "dataset" in signature.parameters - ): + # Detect if env creates dataset internally (has num_train_examples or num_eval_examples params) + creates_internal_dataset = ( + "num_train_examples" in signature.parameters + or "num_eval_examples" in signature.parameters + ) + accepts_dataset = "dataset" in signature.parameters or accepts_kwargs + if accepts_dataset and not creates_internal_dataset: init_kwargs["dataset"] = vf.load_example_dataset(n=1) try: @@ -267,9 +267,8 @@ async def _evaluate_async( """Async helper for evaluation.""" rollout_inputs = self._build_rollout_inputs(env, batch) if not rollout_inputs: - logger.warning("Empty evaluation batch received by GEPAAdapter") - return EvaluationBatch( - outputs=[], scores=[], trajectories=[] if capture_traces else None + raise ValueError( + "Empty evaluation batch - no rollout inputs generated from batch" ) generate_outputs = await env.generate( @@ -285,7 +284,11 @@ async def _evaluate_async( states = generate_outputs["state"] rewards = generate_outputs["reward"] - scores = [float(score) if score is not None else 0.0 for score in rewards] + if any(r is None for r in rewards): + raise ValueError( + "Received None reward from environment - check rubric configuration" + ) + scores = [float(score) for score in rewards] trajectories = [] if capture_traces else None if capture_traces: @@ -459,6 +462,10 @@ def make_reflective_dataset( feedback = self.base_env.rubric.get_feedback(state) else: # Default fallback for basic rubrics + logger.warning( + "Rubric lacks get_feedback method - using generic feedback. " + "Consider implementing get_feedback for better GEPA reflection." + ) feedback = f"Reward: {score:.3f}" if score < 0.5: feedback += " (Low score - needs improvement)" diff --git a/verifiers/scripts/gepa.py b/verifiers/scripts/gepa.py index 64a27bc7f..98bdfc997 100644 --- a/verifiers/scripts/gepa.py +++ b/verifiers/scripts/gepa.py @@ -436,11 +436,15 @@ def main(): elif hasattr(vf_env, comp): seed_candidate[comp] = getattr(vf_env, comp) else: - logger.warning(f"Environment doesn't have component '{comp}', skipping") + raise ValueError( + f"Environment '{args.env_id}' does not have component '{comp}'. " + f"Available components: system_prompt, tool_descriptions" + ) if not seed_candidate: - logger.error("No valid components found to optimize!") - return + raise ValueError( + f"No valid components found to optimize for environment '{args.env_id}'" + ) # Convert budget preset to max_metric_calls if needed if args.budget: diff --git a/verifiers/utils/gepa_utils.py b/verifiers/utils/gepa_utils.py index aa04cd03d..72e42ee36 100644 --- a/verifiers/utils/gepa_utils.py +++ b/verifiers/utils/gepa_utils.py @@ -42,7 +42,7 @@ def get_env_gepa_defaults(env_id: str) -> Dict[str, Any]: """Get GEPA config defaults from environment package's pyproject.toml. Returns dict with 'num_examples', 'num_val', and 'rollouts_per_example' keys if found, - otherwise returns empty dict. All errors are silently handled. + otherwise returns empty dict. """ defaults: Dict[str, Any] = {} module_name = env_id.replace("-", "_").split("/")[-1] @@ -77,10 +77,6 @@ def get_env_gepa_defaults(env_id: str) -> Dict[str, Any]: ) except ModuleNotFoundError: logger.debug(f"Package {module_name} not installed") - except Exception as e: - logger.debug( - f"Could not load GEPA defaults from {module_name} pyproject.toml: {e}" - ) return defaults @@ -116,10 +112,7 @@ async def save_candidate_rollouts( Evaluate a candidate program and save rollout trajectories to disk. 
""" if num_examples <= 0: - logger.warning( - "Skipping rollout saving for %s candidate because num_examples<=0", label - ) - return + raise ValueError(f"num_examples must be positive, got {num_examples}") env = adapter.build_program(candidate) rollouts_dir = log_dir / "rollouts" / label @@ -216,7 +209,7 @@ def prepare_gepa_dataset(dataset) -> list[dict]: GEPA expects a list of dicts with keys like 'question', 'answer', 'info', 'task'. """ if dataset is None: - return [] + raise ValueError("dataset cannot be None") examples = [] for item in dataset: @@ -531,6 +524,7 @@ async def save_all_candidates(): await save_all_candidates() except RuntimeError as exc: logger.error(f"Failed to save rollout trajectories: {exc}") + raise logger.info("GEPA optimization completed successfully!") return result From 54e0e66c1210a0cb6b7b622afc6a684243001f30 Mon Sep 17 00:00:00 2001 From: Robin Salimans Date: Tue, 25 Nov 2025 10:54:54 +0100 Subject: [PATCH 10/16] added wandb and mlflow flags --- verifiers/scripts/gepa.py | 66 +++++++++++++++++++++++++++++++++++ verifiers/types.py | 10 ++++++ verifiers/utils/gepa_utils.py | 25 +++++++++++++ 3 files changed, 101 insertions(+) diff --git a/verifiers/scripts/gepa.py b/verifiers/scripts/gepa.py index 98bdfc997..fde67d3c3 100644 --- a/verifiers/scripts/gepa.py +++ b/verifiers/scripts/gepa.py @@ -256,6 +256,62 @@ def main(): help="Random seed for reproducibility (default: 42)", ) + # 11. Experiment tracking - wandb + parser.add_argument( + "--use-wandb", + action="store_true", + help="Enable wandb logging", + ) + parser.add_argument( + "--wandb-project", + type=str, + default=None, + help="Wandb project name", + ) + parser.add_argument( + "--wandb-entity", + type=str, + default=None, + help="Wandb entity/team name", + ) + parser.add_argument( + "--wandb-name", + type=str, + default=None, + help="Wandb run name (default: auto-generated from env_id)", + ) + parser.add_argument( + "--wandb-api-key-var", + type=str, + default="WANDB_API_KEY", + help="Environment variable containing wandb API key (default: WANDB_API_KEY)", + ) + parser.add_argument( + "--wandb-init-kwargs", + type=json.loads, + default=None, + help='Additional wandb.init() kwargs as JSON (e.g., \'{"tags": ["gepa"], "mode": "offline"}\')', + ) + + # 12. 
Experiment tracking - mlflow + parser.add_argument( + "--use-mlflow", + action="store_true", + help="Enable mlflow logging", + ) + parser.add_argument( + "--mlflow-tracking-uri", + type=str, + default=None, + help="MLflow tracking server URI", + ) + parser.add_argument( + "--mlflow-experiment-name", + type=str, + default=None, + help="MLflow experiment name", + ) + args = parser.parse_args() # Parse env_args @@ -494,6 +550,16 @@ def main(): save_every=args.save_every, track_stats=args.track_stats, verbose=args.verbose, + # experiment tracking + use_wandb=args.use_wandb, + wandb_api_key_var=args.wandb_api_key_var, + wandb_project=args.wandb_project, + wandb_entity=args.wandb_entity, + wandb_name=args.wandb_name, + wandb_init_kwargs=args.wandb_init_kwargs, + use_mlflow=args.use_mlflow, + mlflow_tracking_uri=args.mlflow_tracking_uri, + mlflow_experiment_name=args.mlflow_experiment_name, ) # Run GEPA optimization diff --git a/verifiers/types.py b/verifiers/types.py index ed867ae6d..f9e56c5a7 100644 --- a/verifiers/types.py +++ b/verifiers/types.py @@ -285,3 +285,13 @@ class GEPAConfig(BaseModel): save_every: int track_stats: bool verbose: bool + # experiment tracking + use_wandb: bool = False + wandb_api_key_var: str = "WANDB_API_KEY" + wandb_project: str | None = None + wandb_entity: str | None = None + wandb_name: str | None = None + wandb_init_kwargs: dict | None = None + use_mlflow: bool = False + mlflow_tracking_uri: str | None = None + mlflow_experiment_name: str | None = None diff --git a/verifiers/utils/gepa_utils.py b/verifiers/utils/gepa_utils.py index 72e42ee36..738900f6b 100644 --- a/verifiers/utils/gepa_utils.py +++ b/verifiers/utils/gepa_utils.py @@ -4,6 +4,7 @@ import json import logging import math +import os import sys import textwrap from datetime import datetime @@ -436,6 +437,23 @@ async def run_gepa_optimization(config: GEPAConfig): logger.info("Starting GEPA optimization...") logger.info("=" * 80) + # Build wandb_init_kwargs from config + wandb_init_kwargs = ( + config.wandb_init_kwargs.copy() if config.wandb_init_kwargs else {} + ) + if config.use_wandb: + if config.wandb_project: + wandb_init_kwargs["project"] = config.wandb_project + if config.wandb_entity: + wandb_init_kwargs["entity"] = config.wandb_entity + if config.wandb_name: + wandb_init_kwargs["name"] = config.wandb_name + else: + wandb_init_kwargs.setdefault("name", f"gepa-{config.env_id}") + + # Get wandb API key from env var + wandb_api_key = os.getenv(config.wandb_api_key_var) if config.use_wandb else None + try: result = optimize( seed_candidate=config.seed_candidate, @@ -455,6 +473,13 @@ async def run_gepa_optimization(config: GEPAConfig): track_best_outputs=config.track_stats, seed=config.seed, display_progress_bar=True, + # experiment tracking + use_wandb=config.use_wandb, + wandb_api_key=wandb_api_key, + wandb_init_kwargs=wandb_init_kwargs if config.use_wandb else None, + use_mlflow=config.use_mlflow, + mlflow_tracking_uri=config.mlflow_tracking_uri, + mlflow_experiment_name=config.mlflow_experiment_name, ) except Exception as e: logger.error(f"GEPA optimization failed: {e}", exc_info=True) From 23fd4471fe94c3d0b0330f9f427094d3a810a939 Mon Sep 17 00:00:00 2001 From: Robin Salimans Date: Tue, 25 Nov 2025 11:21:26 +0100 Subject: [PATCH 11/16] added vf-gepa cli tests --- tests/test_gepa_cli.py | 391 ++++++++++++++++++++++++++++++++++++++ verifiers/scripts/gepa.py | 8 +- 2 files changed, 395 insertions(+), 4 deletions(-) create mode 100644 tests/test_gepa_cli.py diff --git a/tests/test_gepa_cli.py 
b/tests/test_gepa_cli.py new file mode 100644 index 000000000..60ccc24a8 --- /dev/null +++ b/tests/test_gepa_cli.py @@ -0,0 +1,391 @@ +"""Tests for vf-gepa CLI argument parsing and configuration.""" + +import argparse +import os +from types import SimpleNamespace +from unittest.mock import MagicMock + +import pytest + +import verifiers as vf + + +def require_gepa_script(): + """Import gepa script or skip tests if module is unavailable.""" + return pytest.importorskip("verifiers.scripts.gepa") + + +def _make_mock_env(): + """Create a mock environment for testing.""" + env = MagicMock(spec=vf.Environment) + env.system_prompt = "Test system prompt" + env.eval_dataset = None + env.env_id = "test-env" + env.oai_tools = None + + # Mock dataset methods - return enough items for all tests + # Most tests use num_examples=10 and num_val=5, so we need at least 15 items + mock_dataset = MagicMock() + mock_dataset.to_list.return_value = [ + {"question": f"q{i}", "answer": f"a{i}", "task": "test", "info": {}} + for i in range(50) # Plenty of items for all tests + ] + env.get_dataset.return_value = mock_dataset + env.get_eval_dataset.return_value = mock_dataset + + return env + + +def _run_cli(monkeypatch, overrides, custom_env=None): + """ + Helper to run vf-gepa CLI with mocked dependencies. + + Args: + monkeypatch: pytest monkeypatch fixture + overrides: dict of CLI args to override + custom_env: optional custom mock environment (default: _make_mock_env()) + + Returns: + dict containing captured GEPAConfig passed to run_gepa_optimization + """ + gepa_script = require_gepa_script() + + base_args = { + "env_id": "test-env", + "env_args": "{}", + "env_dir_path": "./environments", + "num_examples": 10, + "num_val": 5, + "endpoints_path": "./configs/endpoints.py", + "model": "gpt-4o-mini", + "api_key_var": "OPENAI_API_KEY", + "api_base_url": "https://api.openai.com/v1", + "headers": None, + "temperature": 1.0, + "max_tokens": None, + "sampling_args": None, # Will be parsed by json.loads if not None + "rollouts_per_example": 1, + "max_concurrent": 32, + "budget": "light", # Required - mutually exclusive with max_metric_calls + "max_metric_calls": None, + "components": ["system_prompt"], + "reflection_model": "gpt-4o", + "reflection_temperature": 1.0, + "reflection_base_url": None, + "reflection_api_key_var": "OPENAI_API_KEY", + "reflection_max_tokens": 8000, + "reflection_minibatch_size": 35, + "save_results": False, + "save_every": -1, + "track_stats": False, + "verbose": False, + "seed": 42, + "use_wandb": False, + "wandb_project": None, + "wandb_entity": None, + "wandb_name": None, + "wandb_api_key_var": "WANDB_API_KEY", + "wandb_init_kwargs": None, + "use_mlflow": False, + "mlflow_tracking_uri": None, + "mlflow_experiment_name": None, + } + base_args.update(overrides) + args_namespace = SimpleNamespace(**base_args) + + captured = {} + + # Mock argparse + monkeypatch.setattr( + argparse.ArgumentParser, + "parse_args", + lambda self: args_namespace, + ) + + # Mock setup_logging + monkeypatch.setattr(vf, "setup_logging", lambda *_, **__: None) + + # Mock load_endpoints + from verifiers.utils import eval_utils + + monkeypatch.setattr(eval_utils, "load_endpoints", lambda *_: {}) + + # Mock get_env_gepa_defaults + from verifiers.utils import gepa_utils + + monkeypatch.setattr(gepa_utils, "get_env_gepa_defaults", lambda *_: {}) + + # Mock load_environment + mock_env = custom_env if custom_env is not None else _make_mock_env() + monkeypatch.setattr(vf, "load_environment", lambda **kwargs: mock_env) + + # Mock 
os.getenv for reflection API key + def mock_getenv(key, default=None): + if key in ("OPENAI_API_KEY", "WANDB_API_KEY"): + return "fake-api-key" + return default + + monkeypatch.setattr(os, "getenv", mock_getenv) + + # Mock prepare_gepa_dataset to return non-empty datasets + def mock_prepare_gepa_dataset(dataset): + if dataset is None: + raise ValueError("dataset cannot be None") + # Return hardcoded examples instead of relying on the mock dataset + # This ensures we always have data for the tests + return [ + { + "question": f"Question {i}", + "answer": f"Answer {i}", + "task": "test", + "info": {}, + } + for i in range(10) + ] + + monkeypatch.setattr( + gepa_utils, + "prepare_gepa_dataset", + mock_prepare_gepa_dataset, + ) + + # Mock run_gepa_optimization to capture config + # Must patch in the gepa script's namespace since it's imported at module level + async def fake_run_gepa_optimization(config): + captured["config"] = config + # Return immediately without running optimization + return None + + monkeypatch.setattr( + gepa_script, + "run_gepa_optimization", + fake_run_gepa_optimization, + ) + + # Run the CLI + gepa_script.main() + + return captured + + +def test_cli_sampling_args_precedence_over_flags(monkeypatch): + """Test that --sampling-args takes precedence over --temperature and --max-tokens.""" + captured = _run_cli( + monkeypatch, + { + "sampling_args": {"temperature": 0.5, "max_tokens": 100}, + "temperature": 0.9, + "max_tokens": 500, + }, + ) + + config = captured["config"] + assert config.sampling_args["temperature"] == 0.5 + assert config.sampling_args["max_tokens"] == 100 + + +def test_cli_sampling_args_fill_from_flags_when_missing(monkeypatch): + """Test that flags fill in when --sampling-args doesn't specify them.""" + captured = _run_cli( + monkeypatch, + { + "sampling_args": {"enable_thinking": True}, + "temperature": 0.7, + "max_tokens": 200, + }, + ) + + config = captured["config"] + assert config.sampling_args["temperature"] == 0.7 + assert config.sampling_args["max_tokens"] == 200 + assert config.sampling_args["enable_thinking"] is True + + +def test_cli_budget_light_conversion(monkeypatch): + """Test that --budget light converts to expected max_metric_calls.""" + captured = _run_cli( + monkeypatch, + { + "budget": "light", + "max_metric_calls": None, + "num_examples": 10, + "num_val": 5, + }, + ) + + config = captured["config"] + # Light budget should result in a positive number of metric calls + assert config.max_metric_calls > 0 + # Light budget (~6 candidates) should be in a reasonable range + assert config.max_metric_calls >= 300 # At least 300 + assert config.max_metric_calls <= 500 # At most 500 + + +def test_cli_budget_medium_conversion(monkeypatch): + """Test that --budget medium converts correctly.""" + captured = _run_cli( + monkeypatch, + { + "budget": "medium", + "max_metric_calls": None, + "num_examples": 10, + "num_val": 5, + }, + ) + + config = captured["config"] + # Medium budget should result in more calls than light (~12 candidates) + assert config.max_metric_calls >= 500 # At least 500 + assert config.max_metric_calls <= 1000 # At most 1000 + + +def test_cli_budget_heavy_conversion(monkeypatch): + """Test that --budget heavy converts correctly.""" + captured = _run_cli( + monkeypatch, + { + "budget": "heavy", + "max_metric_calls": None, + "num_examples": 10, + "num_val": 5, + }, + ) + + config = captured["config"] + # Heavy budget should result in the most calls + assert config.max_metric_calls > 200 + + +def 
test_cli_max_metric_calls_direct(monkeypatch): + """Test that --max-metric-calls is used directly when provided.""" + captured = _run_cli( + monkeypatch, + { + "budget": None, + "max_metric_calls": 1234, + }, + ) + + config = captured["config"] + assert config.max_metric_calls == 1234 + + +def test_cli_seed_candidate_extraction(monkeypatch): + """Test that seed_candidate is extracted from env's system_prompt.""" + captured = _run_cli( + monkeypatch, + { + "components": ["system_prompt"], + }, + ) + + config = captured["config"] + assert "system_prompt" in config.seed_candidate + assert config.seed_candidate["system_prompt"] == "Test system prompt" + assert config.components_to_optimize == ["system_prompt"] + + +def test_cli_defaults_fallback(monkeypatch): + """Test that CLI args are used when provided (not overridden by defaults).""" + captured = _run_cli( + monkeypatch, + { + "num_examples": 25, + "num_val": 10, + "rollouts_per_example": 3, + }, + ) + + config = captured["config"] + assert config.num_examples == 25 + assert config.num_val == 10 + assert config.rollouts_per_example == 3 + + +def test_cli_reflection_model_config(monkeypatch): + """Test that reflection model configuration is captured correctly.""" + captured = _run_cli( + monkeypatch, + { + "reflection_model": "gpt-4o", + "reflection_temperature": 0.8, + "reflection_max_tokens": 4000, + "reflection_minibatch_size": 20, + }, + ) + + config = captured["config"] + assert config.reflection_model == "gpt-4o" + assert config.reflection_temperature == 0.8 + assert config.reflection_max_tokens == 4000 + assert config.reflection_minibatch_size == 20 + + +def test_cli_experiment_tracking_config(monkeypatch): + """Test that experiment tracking (wandb/mlflow) configuration is captured.""" + captured = _run_cli( + monkeypatch, + { + "use_wandb": True, + "wandb_project": "test-project", + "wandb_entity": "test-entity", + "wandb_name": "test-run", + "use_mlflow": True, + "mlflow_tracking_uri": "http://localhost:5000", + "mlflow_experiment_name": "test-experiment", + }, + ) + + config = captured["config"] + assert config.use_wandb is True + assert config.wandb_project == "test-project" + assert config.wandb_entity == "test-entity" + assert config.wandb_name == "test-run" + assert config.use_mlflow is True + assert config.mlflow_tracking_uri == "http://localhost:5000" + assert config.mlflow_experiment_name == "test-experiment" + + +def test_cli_env_args_parsing(monkeypatch): + """Test that --env-args is a string that gets parsed to dict correctly.""" + # Note: env_args stays as a string in the CLI args, then gets parsed by json.loads + # But since we're passing through SimpleNamespace, we just verify the config receives it + captured = _run_cli( + monkeypatch, + { + "env_args": '{"custom_arg": "value", "num": 42}', + }, + ) + + config = captured["config"] + assert config.env_args["custom_arg"] == "value" + assert config.env_args["num"] == 42 + + +def test_cli_components_multiple(monkeypatch): + """Test that multiple components can be specified.""" + # Create a mock env with oai_tools + env_with_tools = _make_mock_env() + env_with_tools.oai_tools = [ + { + "function": { + "name": "test_tool", + "description": "A test tool", + "parameters": {}, + } + } + ] + + captured = _run_cli( + monkeypatch, + { + "components": ["system_prompt", "tool_descriptions"], + }, + custom_env=env_with_tools, + ) + + config = captured["config"] + assert config.components_to_optimize == ["system_prompt", "tool_descriptions"] + # Should have both system_prompt and 
tool descriptions in seed_candidate + assert "system_prompt" in config.seed_candidate + assert "tool_0_description" in config.seed_candidate diff --git a/verifiers/scripts/gepa.py b/verifiers/scripts/gepa.py index fde67d3c3..ee21e7831 100644 --- a/verifiers/scripts/gepa.py +++ b/verifiers/scripts/gepa.py @@ -106,8 +106,8 @@ def main(): parser.add_argument( "-m", "--model", - default="gpt-4o-mini", - help="Model to optimize (default: gpt-4o-mini)", + default="gpt-5-mini", + help="Model to optimize (default: gpt-5-mini)", ) parser.add_argument( "--api-key-var", @@ -194,8 +194,8 @@ def main(): ) parser.add_argument( "--reflection-model", - default="gpt-4o", - help="Model for reflection/proposal (default: gpt-4o)", + default="gpt-5-mini", + help="Model for reflection/proposal (default: gpt-5-mini)", ) parser.add_argument( "--reflection-temperature", From 0f1cd1fc0a0b919f994bb2b2b8efa4c6f65ae0d1 Mon Sep 17 00:00:00 2001 From: Robin Salimans Date: Tue, 25 Nov 2025 12:41:39 +0100 Subject: [PATCH 12/16] simplified `build_program` in `GEPAAdapter` --- environments/gsm8k/gsm8k.py | 21 ++++- tests/test_gepa.py | 31 ++++---- verifiers/adapters/gepa.py | 141 ++++++++-------------------------- verifiers/rubrics/rubric.py | 9 ++- verifiers/utils/gepa_utils.py | 16 ++-- 5 files changed, 88 insertions(+), 130 deletions(-) diff --git a/environments/gsm8k/gsm8k.py b/environments/gsm8k/gsm8k.py index dd8ac79e0..f77f52f85 100644 --- a/environments/gsm8k/gsm8k.py +++ b/environments/gsm8k/gsm8k.py @@ -1,4 +1,5 @@ import verifiers as vf +from verifiers.types import RewardResult from verifiers.utils.data_utils import ( BOXED_SYSTEM_PROMPT, extract_boxed_answer, @@ -20,9 +21,25 @@ def load_environment( parser = vf.Parser(extract_fn=extract_boxed_answer) - def correct_answer_reward_func(parser, completion, answer, **kwargs): + def correct_answer_reward_func( + parser, completion, answer, **kwargs + ) -> RewardResult: response = parser.parse_answer(completion) or "" - return 1.0 if response == answer else 0.0 + is_correct = response == answer + + # Build feedback for GEPA optimization + if is_correct: + feedback = f"Correct! The model correctly computed {answer}." + else: + if not response: + feedback = ( + f"Incorrect. The model did not provide an answer in \\boxed{{}}. " + f"Expected: {answer}" + ) + else: + feedback = f"Incorrect. The model answered {response} but the correct answer is {answer}." + + return {"score": 1.0 if is_correct else 0.0, "feedback": feedback} rubric = vf.Rubric( parser=parser, diff --git a/tests/test_gepa.py b/tests/test_gepa.py index 9466500be..a4963bd01 100644 --- a/tests/test_gepa.py +++ b/tests/test_gepa.py @@ -152,7 +152,7 @@ def test_gepa_adapter_tool_descriptions_validation(self): def test_gepa_adapter_build_program(self): """Test GEPAAdapter.build_program creates new environment with updated components. - Important: datasets should NOT be copied for efficiency (can be huge). + Important: datasets are shared (not copied) for efficiency via shallow copy. The adapter provides inputs directly via _build_rollout_inputs. 
""" GEPAAdapter = require_gepa_adapter() @@ -183,11 +183,12 @@ def test_gepa_adapter_build_program(self): assert new_env.system_prompt == "Optimized prompt" assert new_env.system_prompt != env.system_prompt - # Verify dataset was NOT copied (efficiency optimization) - # New env should have a minimal dummy dataset, not the original - assert new_env.dataset is not None # Has some dataset to satisfy init - assert len(new_env.dataset) == 1 # But it's minimal (dummy) - assert new_env.dataset is not env.dataset # Not the same reference + # Verify dataset is shared (shallow copy - most efficient) + assert new_env.dataset is not None + assert new_env.dataset is env.dataset # Same reference (shared) + + # Verify rubric is also shared (preserves feedback functions) + assert new_env.rubric is env.rubric def test_gepa_adapter_build_program_multiturn_env(self): """Test build_program with MultiTurnEnv (uses **kwargs).""" @@ -221,10 +222,9 @@ async def env_response(self, messages, state, **kwargs): # Verify component was updated assert new_env.system_prompt == "Optimized prompt" - # Verify dataset was replaced with minimal dummy + # Verify dataset is shared (shallow copy) assert new_env.dataset is not None - assert len(new_env.dataset) == 1 - assert new_env.dataset is not env.dataset + assert new_env.dataset is env.dataset def test_gepa_adapter_build_program_tool_env(self): """Test build_program with ToolEnv.""" @@ -259,9 +259,9 @@ def __init__(self, **kwargs): # Verify component was updated assert new_env.system_prompt == "Use the tool wisely" - # Verify dataset was replaced with minimal dummy + # Verify dataset is shared (shallow copy) assert new_env.dataset is not None - assert len(new_env.dataset) == 1 + assert new_env.dataset is env.dataset assert new_env.oai_tools is not None # Tools preserved def test_gepa_adapter_build_program_stateful_tool_env(self): @@ -300,9 +300,9 @@ def update_tool_args(self, tool_name, tool_args, messages, state, **kwargs): # Verify component was updated assert new_env.system_prompt == "Updated stateful prompt" - # Verify dataset was replaced with minimal dummy + # Verify dataset is shared (shallow copy) assert new_env.dataset is not None - assert len(new_env.dataset) == 1 + assert new_env.dataset is env.dataset def test_gepa_adapter_build_program_internal_dataset_env(self): """Test build_program with env that creates dataset internally.""" @@ -356,9 +356,10 @@ def __init__( # Verify component was updated assert new_env.system_prompt == "Updated internal prompt" - # Verify dataset was created internally (not the dummy one) + # Verify dataset is shared (shallow copy preserves all attributes) assert new_env.dataset is not None - assert len(new_env.dataset) == 100 # Created internally with num_train_examples + assert new_env.dataset is env.dataset # Shared reference + assert len(new_env.dataset) == 100 # Original dataset preserved assert new_env.num_train_examples == 100 def test_gepa_adapter_extract_seed_candidate(self): diff --git a/verifiers/adapters/gepa.py b/verifiers/adapters/gepa.py index af9300132..d9405c71e 100644 --- a/verifiers/adapters/gepa.py +++ b/verifiers/adapters/gepa.py @@ -7,7 +7,6 @@ """ import asyncio -import inspect import logging from concurrent.futures import ThreadPoolExecutor from copy import deepcopy @@ -93,118 +92,43 @@ def __init__( ) def build_program(self, candidate: dict[str, str]) -> vf.Environment: + """Create a candidate environment with updated components using shallow copy. 
+ + Shallow copy shares heavy objects (dataset, rubric, parser) while + allowing string attributes to be replaced. For oai_tools, we deep copy + only if tool descriptions are being updated. """ - Reconstruct a fresh Environment instance with updated components. - """ + import copy + self._candidate_build_count += 1 logger.debug( f"Building candidate environment #{self._candidate_build_count} " f"with components: {list(candidate.keys())}" ) - env_class = self.base_env.__class__ - signature = inspect.signature(env_class.__init__) - accepts_kwargs = any( - param.kind == inspect.Parameter.VAR_KEYWORD - for param in signature.parameters.values() - ) - - init_kwargs: dict[str, Any] = {} - post_init_overrides: dict[str, Any] = {} - - # Preserve constructor arguments present on the base environment - # Skip dataset/eval_dataset as they are not needed (adapter provides inputs) - # and copying them would be hugely inefficient for large datasets - for param_name in signature.parameters: - if param_name == "self": - continue - if param_name in ("dataset", "eval_dataset"): - continue - if hasattr(self.base_env, param_name): - value = getattr(self.base_env, param_name) - if isinstance(value, (dict, list)): - init_kwargs[param_name] = deepcopy(value) - else: - init_kwargs[param_name] = value - - # Ensure core Environment parameters are forwarded when available - # BUT only if they're explicitly in the specific environment's signature - # (Some envs like TextArenaEnv create dataset/eval_dataset internally) - # Skip dataset/eval_dataset for efficiency (not needed by adapter) - env_signature = inspect.signature(vf.Environment.__init__) - env_param_names = [ - name - for name in env_signature.parameters - if name not in {"self", "kwargs", "dataset", "eval_dataset"} - ] - for param_name in env_param_names: - if param_name in init_kwargs: - continue - # Only add if explicitly in the environment's signature - # Skip if only accepted via **kwargs - if param_name not in signature.parameters: - continue - if not hasattr(self.base_env, param_name): - continue - value = getattr(self.base_env, param_name) - if isinstance(value, (dict, list)): - init_kwargs[param_name] = deepcopy(value) - else: - init_kwargs[param_name] = value - - updated_oai_tools = None - if ( - "tool_descriptions" in self.components_to_optimize - and hasattr(self.base_env, "oai_tools") - and self.base_env.oai_tools - ): - updated_oai_tools = deepcopy(self.base_env.oai_tools) - for i, tool in enumerate(updated_oai_tools): - tool_desc_key = f"tool_{i}_description" - if tool_desc_key in candidate: - tool["function"]["description"] = candidate[tool_desc_key] - init_kwargs["oai_tools"] = updated_oai_tools - - # Override constructor args with candidate values when applicable - for comp_name, comp_value in candidate.items(): - if comp_name.startswith("tool_") and comp_name.endswith("_description"): - continue - # Never pass dataset/eval_dataset - some envs create these internally - # and would get duplicate arguments - if comp_name in {"dataset", "eval_dataset"}: - continue - if comp_name in signature.parameters or accepts_kwargs: - init_kwargs[comp_name] = comp_value - else: - post_init_overrides[comp_name] = comp_value - - # Provide minimal dataset if none exists (adapter provides inputs directly) - # This avoids copying large datasets and improves performance - # Detect if env creates dataset internally (has num_train_examples or num_eval_examples params) - creates_internal_dataset = ( - "num_train_examples" in signature.parameters - or 
"num_eval_examples" in signature.parameters - ) - accepts_dataset = "dataset" in signature.parameters or accepts_kwargs - if accepts_dataset and not creates_internal_dataset: - init_kwargs["dataset"] = vf.load_example_dataset(n=1) - - try: - new_env = env_class(**init_kwargs) - except TypeError as exc: - raise ValueError( - f"Failed to reconstruct {env_class.__name__} with optimized components. " - f"Error: {exc}" - ) from exc + # Create shallow copy - shares dataset, rubric, parser, etc. + new_env = copy.copy(self.base_env) - for attr_name, attr_value in post_init_overrides.items(): - setattr(new_env, attr_name, attr_value) + # Update system_prompt (assignment replaces reference, safe) + if "system_prompt" in candidate: + new_env.system_prompt = candidate["system_prompt"] - if updated_oai_tools is not None: - new_env.oai_tools = updated_oai_tools + # Update tool descriptions (need deep copy since we mutate nested dicts) + if hasattr(self.base_env, "oai_tools") and self.base_env.oai_tools: + tool_updates = { + k: v + for k, v in candidate.items() + if k.startswith("tool_") and k.endswith("_description") + } + if tool_updates: + new_env.oai_tools = copy.deepcopy(self.base_env.oai_tools) + for i, tool in enumerate(new_env.oai_tools): + key = f"tool_{i}_description" + if key in tool_updates: + tool["function"]["description"] = tool_updates[key] logger.debug( - f"Successfully built {env_class.__name__} candidate #{self._candidate_build_count}" + f"Successfully built {new_env.__class__.__name__} candidate #{self._candidate_build_count}" ) return new_env @@ -416,6 +340,7 @@ def make_reflective_dataset( ) reflective_data: dict[str, list[dict]] = {} + _warned_no_get_feedback = False # For environment-level components (like system_prompt), all examples # reflect on the same component, so we aggregate feedback across examples @@ -461,11 +386,13 @@ def make_reflective_dataset( if hasattr(self.base_env.rubric, "get_feedback"): feedback = self.base_env.rubric.get_feedback(state) else: - # Default fallback for basic rubrics - logger.warning( - "Rubric lacks get_feedback method - using generic feedback. " - "Consider implementing get_feedback for better GEPA reflection." - ) + # Default fallback for basic rubrics - warn once + if not _warned_no_get_feedback: + logger.warning( + "Rubric lacks get_feedback method - using generic feedback. " + "Consider implementing get_feedback for better GEPA reflection." + ) + _warned_no_get_feedback = True feedback = f"Reward: {score:.3f}" if score < 0.5: feedback += " (Low score - needs improvement)" diff --git a/verifiers/rubrics/rubric.py b/verifiers/rubrics/rubric.py index 05a797a44..07596cd24 100644 --- a/verifiers/rubrics/rubric.py +++ b/verifiers/rubrics/rubric.py @@ -48,6 +48,7 @@ def __init__( ) self.parser = parser or vf.Parser() + self._warned_no_feedback = False # class objects for reward functions self.class_objects = {} @@ -300,7 +301,13 @@ def get_feedback(self, state: State) -> str: feedbacks = state.get("feedbacks", []) if not feedbacks: - # Fallback if no functions provided feedback + # Fallback if no functions provided feedback - warn once + if not self._warned_no_feedback: + self.logger.warning( + "No detailed feedback from reward functions. For better GEPA optimization, " + "return RewardResult({'score': float, 'feedback': str}) from reward functions." 
+ ) + self._warned_no_feedback = True score = state.get("reward", 0.0) return f"Score: {score:.2%} (no detailed feedback available)" diff --git a/verifiers/utils/gepa_utils.py b/verifiers/utils/gepa_utils.py index 738900f6b..948e8719a 100644 --- a/verifiers/utils/gepa_utils.py +++ b/verifiers/utils/gepa_utils.py @@ -83,17 +83,23 @@ def get_env_gepa_defaults(env_id: str) -> Dict[str, Any]: def ensure_env_dir_on_path(env_dir_path: str, env_id: str) -> None: - """Add local environment directory to sys.path if present.""" + """Add local environment directory to sys.path if present. + + Adds the specific environment folder (e.g., environments/gsm8k/) to sys.path + so that `import gsm8k` finds gsm8k.py directly, avoiding namespace package issues. + """ env_dir = Path(env_dir_path).resolve() if not env_dir.exists(): return module_name = env_id.replace("-", "_").split("/")[-1] candidate = env_dir / module_name if candidate.exists(): - env_dir_str = str(env_dir) - if env_dir_str not in sys.path: - sys.path.insert(0, env_dir_str) - logger.debug(f"Added {env_dir_str} to sys.path for environment loading") + # Add the specific environment folder so Python finds the .py file directly + # e.g., add environments/gsm8k/ so `import gsm8k` finds gsm8k.py + env_folder_str = str(candidate) + if env_folder_str not in sys.path: + sys.path.insert(0, env_folder_str) + logger.debug(f"Added {env_folder_str} to sys.path for environment loading") async def save_candidate_rollouts( From 17c2a995a2a6474be57d30e369ad82f9d24ade5a Mon Sep 17 00:00:00 2001 From: Robin Salimans Date: Tue, 25 Nov 2025 13:26:51 +0100 Subject: [PATCH 13/16] Fix GEPA tool call handling and tool_test dict access --- environments/tool_test/tool_test.py | 2 +- verifiers/adapters/gepa.py | 69 +++++++++++++++++++++-------- 2 files changed, 52 insertions(+), 19 deletions(-) diff --git a/environments/tool_test/tool_test.py b/environments/tool_test/tool_test.py index b3f958b1b..61ec8c1d8 100644 --- a/environments/tool_test/tool_test.py +++ b/environments/tool_test/tool_test.py @@ -65,7 +65,7 @@ def tool_D(x: bool) -> bool: def tool_call_reward_func(completion, info): # check if completion tool calls exactly matches info tool calls tool_calls = completion[-1].get("tool_calls", []) - called_tool_names = sorted([call.function.name for call in tool_calls]) + called_tool_names = sorted([call["function"]["name"] for call in tool_calls]) expected_tool_names = sorted(info["tool_names"]) if called_tool_names == expected_tool_names: return 1.0 diff --git a/verifiers/adapters/gepa.py b/verifiers/adapters/gepa.py index d9405c71e..6b791dcf0 100644 --- a/verifiers/adapters/gepa.py +++ b/verifiers/adapters/gepa.py @@ -312,6 +312,16 @@ def _format_prompt(self, env: vf.Environment, prompt: str | Messages) -> Message messages.append({"role": "user", "content": str(prompt)}) return messages + def _format_tool_calls_text(self, tool_calls: list[dict]) -> str: + """Format tool calls as readable text for GEPA reflection.""" + parts = [] + for tc in tool_calls: + func = tc.get("function", {}) + name = func.get("name", "unknown") + args_str = func.get("arguments", "{}") + parts.append(f"Tool Call: {name}({args_str})") + return "\n".join(parts) + def make_reflective_dataset( self, candidate: dict[str, str], @@ -345,7 +355,24 @@ def make_reflective_dataset( # For environment-level components (like system_prompt), all examples # reflect on the same component, so we aggregate feedback across examples for comp_name in components_to_update: - if comp_name not in 
self.components_to_optimize: + # Check if component is in optimization list + # Support both exact matches (e.g., "system_prompt") and group patterns + # (e.g., "tool_0_description" matches "tool_descriptions") + is_optimizable = comp_name in self.components_to_optimize + + # Check if this is a tool description (tool_N_description pattern) + if ( + not is_optimizable + and "tool_descriptions" in self.components_to_optimize + ): + # Match pattern: tool_0_description, tool_1_description, etc. + if comp_name.startswith("tool_") and comp_name.endswith("_description"): + is_optimizable = True + + if not is_optimizable: + logger.debug( + f"Skipping component '{comp_name}' - not in components_to_optimize: {self.components_to_optimize}" + ) continue examples = [] @@ -364,15 +391,33 @@ def make_reflective_dataset( else: prompt_text = prompt - # Extract completion text + # Extract completion text - format entire conversation if isinstance(completion, list): - # Chat format - asst_msgs = [m for m in completion if m.get("role") == "assistant"] + # Chat format - include all messages (assistant + tool responses) + completion_parts = [] + for msg in completion: + role = msg.get("role", "") + content = msg.get("content", "") + + if role == "assistant": + # Include content if present + if content: + completion_parts.append(f"Assistant: {content}") + # Include tool calls + tool_calls = msg.get("tool_calls", []) + if tool_calls: + completion_parts.append( + self._format_tool_calls_text(tool_calls) + ) + elif role == "tool": + # Include tool responses + completion_parts.append(f"Tool Result: {content}") + completion_text = ( - asst_msgs[-1].get("content", "") if asst_msgs else "" + "\n\n".join(completion_parts) if completion_parts else "" ) else: - completion_text = completion + completion_text = str(completion) # Build inputs dict inputs = { @@ -414,18 +459,6 @@ def make_reflective_dataset( f"No reflective data generated for components: {components_to_update}" ) - # Log sample feedback for debugging - for comp_name, examples in reflective_data.items(): - logger.debug("\n%s\nComponent: %s", "=" * 80, comp_name) - logger.debug("Sample feedback (first example):") - if examples: - first_ex = examples[0] - logger.debug( - f" Task: {first_ex['Inputs'].get('Task', 'N/A')[:200]}..." 
- ) - logger.debug(f" Output: {first_ex['Generated Outputs'][:200]}...") - logger.debug(f" Feedback: {first_ex['Feedback'][:500]}...") - logger.info( f"Generated reflective dataset with {sum(len(v) for v in reflective_data.values())} examples " f"across {len(reflective_data)} components" From 1c21915191d62f23b793e4d93bce82fd617d94b2 Mon Sep 17 00:00:00 2001 From: Robin Salimans Date: Tue, 25 Nov 2025 14:13:18 +0100 Subject: [PATCH 14/16] improved tool description optimization, new folder structure --- integrations/gepa/README.md | 2 +- tests/test_gepa.py | 170 +++++++++++++++- tests/test_gepa_cli.py | 2 +- verifiers/adapters/__init__.py | 5 - verifiers/gepa/__init__.py | 45 +++++ .../{adapters/gepa.py => gepa/adapter.py} | 181 +++++++++++++++++- verifiers/gepa/templates.py | 41 ++++ .../{utils/gepa_utils.py => gepa/utils.py} | 19 +- verifiers/scripts/gepa.py | 2 +- 9 files changed, 449 insertions(+), 18 deletions(-) delete mode 100644 verifiers/adapters/__init__.py create mode 100644 verifiers/gepa/__init__.py rename verifiers/{adapters/gepa.py => gepa/adapter.py} (71%) create mode 100644 verifiers/gepa/templates.py rename verifiers/{utils/gepa_utils.py => gepa/utils.py} (97%) diff --git a/integrations/gepa/README.md b/integrations/gepa/README.md index a67e2f0c9..a9a33a05d 100644 --- a/integrations/gepa/README.md +++ b/integrations/gepa/README.md @@ -47,7 +47,7 @@ The `GEPAAdapter` class bridges Verifiers environments to GEPA's optimization pr ### Key Methods ```python -from verifiers.adapters.gepa import GEPAAdapter +from verifiers.gepa import GEPAAdapter adapter = GEPAAdapter( env=vf_env, diff --git a/tests/test_gepa.py b/tests/test_gepa.py index a4963bd01..099e434dd 100644 --- a/tests/test_gepa.py +++ b/tests/test_gepa.py @@ -11,7 +11,7 @@ def require_gepa_adapter(): """Import GEPAAdapter or skip tests if the module is unavailable.""" - module = pytest.importorskip("verifiers.adapters.gepa") + module = pytest.importorskip("verifiers.gepa.adapter") return module.GEPAAdapter @@ -467,6 +467,174 @@ async def generate( assert result.trajectories is not None assert result.trajectories[0]["score"] == 0.9 + def test_gepa_adapter_tool_metadata_extraction(self): + """Test that GEPAAdapter extracts tool metadata for tool_descriptions.""" + GEPAAdapter = require_gepa_adapter() + + def search_tool(query: str, max_results: int = 10) -> str: + """Search for information about a query. 
+ + Args: + query: The search query string + max_results: Maximum number of results to return + """ + return f"Results for: {query}" + + dataset = vf.load_example_dataset(n=5) + env = vf.ToolEnv( + dataset=dataset, + tools=[search_tool], + system_prompt="Use the search tool", + rubric=vf.Rubric(), + ) + + client = AsyncMock() + adapter = GEPAAdapter( + env=env, + client=client, + model="gpt-4o-mini", + sampling_args={}, + components_to_optimize=["tool_descriptions"], + ) + + # Verify tool metadata was extracted + assert "tool_0_description" in adapter._tool_metadata + assert adapter._tool_metadata["tool_0_description"]["name"] == "search_tool" + assert "parameters" in adapter._tool_metadata["tool_0_description"] + + # Verify parameters include the function arguments + params = adapter._tool_metadata["tool_0_description"]["parameters"] + assert "properties" in params + assert "query" in params["properties"] + assert "max_results" in params["properties"] + + def test_gepa_adapter_propose_new_texts_tool_descriptions(self): + """Test that propose_new_texts uses tool-specific template for tool descriptions.""" + GEPAAdapter = require_gepa_adapter() + + def calculate(x: int, y: int) -> int: + """Add two numbers together.""" + return x + y + + dataset = vf.load_example_dataset(n=5) + env = vf.ToolEnv( + dataset=dataset, + tools=[calculate], + system_prompt="Use the calculator", + rubric=vf.Rubric(), + ) + + client = AsyncMock() + adapter = GEPAAdapter( + env=env, + client=client, + model="gpt-4o-mini", + sampling_args={}, + components_to_optimize=["tool_descriptions"], + ) + + # Mock reflection_lm + reflection_output = "```\nImproved tool description that adds two numbers with better clarity.\n```" + adapter.reflection_lm = MagicMock(return_value=reflection_output) + + # Create mock candidate and reflective dataset + candidate = {"tool_0_description": "Add two numbers together."} + reflective_dataset = { + "tool_0_description": [ + { + "Inputs": {"Task": "Calculate 2 + 3"}, + "Generated Outputs": "Tool Call: calculate(x=2, y=3)", + "Feedback": "Correct usage", + } + ] + } + + # Call propose_new_texts + new_texts = adapter.propose_new_texts( + candidate=candidate, + reflective_dataset=reflective_dataset, + components_to_update=["tool_0_description"], + ) + + # Verify the reflection_lm was called + assert adapter.reflection_lm.called + called_prompt = adapter.reflection_lm.call_args[0][0] + + # Verify tool name is in the prompt + assert "calculate" in called_prompt + + # Verify tool parameters are in the prompt (JSON schema) + assert "parameters" in called_prompt.lower() + assert '"x"' in called_prompt or "'x'" in called_prompt + assert '"y"' in called_prompt or "'y'" in called_prompt + + # Verify current description is in the prompt + assert "Add two numbers together" in called_prompt + + # Verify new text was extracted correctly + assert "tool_0_description" in new_texts + assert "Improved tool description" in new_texts["tool_0_description"] + + def test_gepa_adapter_propose_new_texts_system_prompt(self): + """Test that propose_new_texts uses default GEPA template for system_prompt.""" + GEPAAdapter = require_gepa_adapter() + + dataset = vf.load_example_dataset(n=5) + env = vf.SingleTurnEnv( + dataset=dataset, + system_prompt="Original system prompt", + rubric=vf.Rubric(), + ) + + client = AsyncMock() + adapter = GEPAAdapter( + env=env, + client=client, + model="gpt-4o-mini", + sampling_args={}, + components_to_optimize=["system_prompt"], + ) + + # Mock reflection_lm + reflection_output = 
"```\nImproved system prompt with better instructions.\n```" + adapter.reflection_lm = MagicMock(return_value=reflection_output) + + # Create mock candidate and reflective dataset + candidate = {"system_prompt": "Original system prompt"} + reflective_dataset = { + "system_prompt": [ + { + "Inputs": {"Task": "Solve this problem"}, + "Generated Outputs": "Here's the solution", + "Feedback": "Good response", + } + ] + } + + # Call propose_new_texts + new_texts = adapter.propose_new_texts( + candidate=candidate, + reflective_dataset=reflective_dataset, + components_to_update=["system_prompt"], + ) + + # Verify the reflection_lm was called + assert adapter.reflection_lm.called + called_prompt = adapter.reflection_lm.call_args[0][0] + + # Verify it uses the default GEPA template (should NOT contain tool-specific language) + assert "TOOL NAME" not in called_prompt + assert "TOOL PARAMETERS" not in called_prompt + + # Should contain the default GEPA language about "assistant" and "instructions" + assert ( + "assistant" in called_prompt.lower() + or "instruction" in called_prompt.lower() + ) + + # Verify new text was extracted correctly + assert "system_prompt" in new_texts + class TestRubricDictSupport: """Tests for base Rubric class dict return support.""" diff --git a/tests/test_gepa_cli.py b/tests/test_gepa_cli.py index 60ccc24a8..7fb802143 100644 --- a/tests/test_gepa_cli.py +++ b/tests/test_gepa_cli.py @@ -111,7 +111,7 @@ def _run_cli(monkeypatch, overrides, custom_env=None): monkeypatch.setattr(eval_utils, "load_endpoints", lambda *_: {}) # Mock get_env_gepa_defaults - from verifiers.utils import gepa_utils + from verifiers import gepa as gepa_utils monkeypatch.setattr(gepa_utils, "get_env_gepa_defaults", lambda *_: {}) diff --git a/verifiers/adapters/__init__.py b/verifiers/adapters/__init__.py deleted file mode 100644 index 9f02635fe..000000000 --- a/verifiers/adapters/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Adapters that bridge Verifiers with external optimization systems.""" - -from .gepa import GEPAAdapter - -__all__ = ["GEPAAdapter"] diff --git a/verifiers/gepa/__init__.py b/verifiers/gepa/__init__.py new file mode 100644 index 000000000..aa2b4d99b --- /dev/null +++ b/verifiers/gepa/__init__.py @@ -0,0 +1,45 @@ +""" +GEPA (Genetic-Pareto) integration for Verifiers. + +This module provides adapter, utilities, and templates for optimizing +Verifiers environments using the GEPA reflection-based optimization algorithm. 
+ +Main components: +- GEPAAdapter: Bridges Verifiers environments with GEPA optimization +- run_gepa_optimization: High-level function to run GEPA on an environment +- TOOL_DESCRIPTION_PROMPT_TEMPLATE: Template for tool description optimization +""" + +from .adapter import GEPAAdapter +from .templates import TOOL_DESCRIPTION_PROMPT_TEMPLATE +from .utils import ( + auto_budget_to_metric_calls, + call_reflection_model, + ensure_env_dir_on_path, + get_env_gepa_defaults, + prepare_gepa_dataset, + print_optimization_results, + run_gepa_optimization, + save_candidate_rollouts, + save_optimized_components, + save_optimization_metrics, +) + +__all__ = [ + # Core adapter + "GEPAAdapter", + # Templates + "TOOL_DESCRIPTION_PROMPT_TEMPLATE", + # Main optimization function + "run_gepa_optimization", + # Utility functions + "auto_budget_to_metric_calls", + "call_reflection_model", + "ensure_env_dir_on_path", + "get_env_gepa_defaults", + "prepare_gepa_dataset", + "print_optimization_results", + "save_candidate_rollouts", + "save_optimized_components", + "save_optimization_metrics", +] diff --git a/verifiers/adapters/gepa.py b/verifiers/gepa/adapter.py similarity index 71% rename from verifiers/adapters/gepa.py rename to verifiers/gepa/adapter.py index 6b791dcf0..d80434b2c 100644 --- a/verifiers/adapters/gepa.py +++ b/verifiers/gepa/adapter.py @@ -7,7 +7,9 @@ """ import asyncio +import json import logging +from collections.abc import Mapping, Sequence from concurrent.futures import ThreadPoolExecutor from copy import deepcopy from typing import Any @@ -17,6 +19,7 @@ from openai import AsyncOpenAI import verifiers as vf +from verifiers.gepa.templates import TOOL_DESCRIPTION_PROMPT_TEMPLATE from verifiers.types import Messages, RolloutInput logger = logging.getLogger(__name__) @@ -60,6 +63,10 @@ def __init__( self.num_rollouts_per_example = num_rollouts_per_example self.max_concurrent = max_concurrent self._candidate_build_count = 0 # Track candidate environment builds + self._tool_metadata: dict[ + str, dict[str, Any] + ] = {} # Maps tool_N_description -> {name, parameters} + self.reflection_lm = None # Will be set before optimization starts if self.num_rollouts_per_example < 1: raise ValueError("num_rollouts_per_example must be at least 1") @@ -71,12 +78,19 @@ def __init__( self.num_rollouts_per_example, ) - # Validate components + # Validate components and extract tool metadata if "tool_descriptions" in self.components_to_optimize: if not hasattr(env, "oai_tools") or not env.oai_tools: raise ValueError( "Cannot optimize tool_descriptions: environment has no tools" ) + # Build metadata mapping for tool descriptions + for i, tool in enumerate(env.oai_tools): + comp_name = f"tool_{i}_description" + self._tool_metadata[comp_name] = { + "name": tool["function"]["name"], + "parameters": tool["function"].get("parameters", {}), + } for comp in self.components_to_optimize: if comp not in ["system_prompt", "tool_descriptions"]: @@ -466,5 +480,170 @@ def make_reflective_dataset( return reflective_data + def propose_new_texts( + self, + candidate: dict[str, str], + reflective_dataset: Mapping[str, Sequence[Mapping[str, Any]]], + components_to_update: list[str], + ) -> dict[str, str]: + """ + Propose new text for components using tool-aware templates. + + For tool descriptions (tool_N_description), uses a tool-specific template + that includes the tool name and parameter schema. For other components, + uses GEPA's default instruction proposal template. 
+ + Args: + candidate: Current candidate component values + reflective_dataset: Feedback data generated by make_reflective_dataset + components_to_update: List of component names to update + + Returns: + Dict mapping component names to newly proposed text + """ + if self.reflection_lm is None: + raise ValueError( + "reflection_lm must be set on GEPAAdapter before propose_new_texts can be called. " + "This should be set by run_gepa_optimization before calling gepa.optimize()." + ) + + from gepa.strategies.instruction_proposal import InstructionProposalSignature + + new_texts: dict[str, str] = {} + + for comp_name in components_to_update: + # Gracefully handle missing component data + if comp_name not in reflective_dataset or not reflective_dataset.get( + comp_name + ): + logger.warning( + f"Component '{comp_name}' not in reflective dataset. Skipping." + ) + continue + + current_text = candidate[comp_name] + feedback_data = reflective_dataset[comp_name] + + # Check if this is a tool description component + if comp_name in self._tool_metadata: + # Use tool-specific template + tool_info = self._tool_metadata[comp_name] + new_texts[comp_name] = self._propose_tool_description( + tool_name=tool_info["name"], + tool_parameters=tool_info["parameters"], + current_description=current_text, + feedback_data=feedback_data, + ) + logger.debug( + f"Proposed new tool description for {comp_name} (tool: {tool_info['name']})" + ) + else: + # Use default GEPA instruction proposal for system_prompt, etc. + new_texts[comp_name] = InstructionProposalSignature.run( + lm=self.reflection_lm, + input_dict={ + "current_instruction_doc": current_text, + "dataset_with_feedback": feedback_data, + "prompt_template": None, # Use default + }, + )["new_instruction"] + logger.debug(f"Proposed new instruction for {comp_name}") + + return new_texts + + def _propose_tool_description( + self, + tool_name: str, + tool_parameters: dict, + current_description: str, + feedback_data: Sequence[Mapping[str, Any]], + ) -> str: + """ + Propose a new tool description using the tool-specific template. 
+ + Args: + tool_name: Name of the tool being optimized + tool_parameters: JSON schema of tool parameters + current_description: Current tool description text + feedback_data: Reflective examples with feedback + + Returns: + Newly proposed tool description + """ + + # Format the feedback data using GEPA's standard markdown formatter + def format_samples(samples): + def render_value(value, level=3): + if isinstance(value, dict): + s = "" + for k, v in value.items(): + s += f"{'#' * level} {k}\n" + s += render_value(v, min(level + 1, 6)) + if not value: + s += "\n" + return s + elif isinstance(value, list | tuple): + s = "" + for i, item in enumerate(value): + s += f"{'#' * level} Item {i + 1}\n" + s += render_value(item, min(level + 1, 6)) + if not value: + s += "\n" + return s + else: + return f"{str(value).strip()}\n\n" + + def convert_sample_to_markdown(sample, examplenum): + s = f"# Example {examplenum}\n" + for key, val in sample.items(): + s += f"## {key}\n" + s += render_value(val, level=3) + return s + + return "\n\n".join( + convert_sample_to_markdown(sample, i + 1) + for i, sample in enumerate(samples) + ) + + # Build the tool-specific prompt + prompt = TOOL_DESCRIPTION_PROMPT_TEMPLATE + prompt = prompt.replace("", tool_name) + prompt = prompt.replace( + "", json.dumps(tool_parameters, indent=2) + ) + prompt = prompt.replace("", current_description) + prompt = prompt.replace( + "", format_samples(feedback_data) + ) + + # Call reflection LM + response = self.reflection_lm(prompt) + + # Extract the new description from code blocks using GEPA's standard extractor + import re + + def extract_instruction_text(lm_out: str) -> str: + start = lm_out.find("```") + 3 + end = lm_out.rfind("```") + + if start >= end: + stripped = lm_out.strip() + if stripped.startswith("```"): + match = re.match(r"^```\S*\n?", lm_out) + if match: + return lm_out[match.end() :].strip() + elif stripped.endswith("```"): + return stripped[:-3].strip() + return stripped + + content = lm_out[start:end] + match = re.match(r"^\S*\n", content) + if match: + content = content[match.end() :] + + return content.strip() + + return extract_instruction_text(response) + __all__ = ["GEPAAdapter"] diff --git a/verifiers/gepa/templates.py b/verifiers/gepa/templates.py new file mode 100644 index 000000000..6d09b9eb5 --- /dev/null +++ b/verifiers/gepa/templates.py @@ -0,0 +1,41 @@ +""" +Prompt templates for GEPA optimization in Verifiers. + +This module contains specialized templates for different component types +(tool descriptions, system prompts, etc.) used during GEPA's reflection phase. +""" + +# Tool-specific prompt template for GEPA reflection +TOOL_DESCRIPTION_PROMPT_TEMPLATE = """You are improving the description of a tool (function) that an AI assistant can call. + +TOOL NAME: + +TOOL PARAMETERS: +```json + +``` + +CURRENT DESCRIPTION: +``` + +``` + +The following are examples of how the assistant used this tool, along with feedback on the results: +``` + +``` + +Your task is to write an improved TOOL DESCRIPTION for the "" tool. + +A good tool description should: +- Clearly explain what the tool does and when to use it +- Match the parameter schema shown above +- Mention any important constraints, edge cases, or common mistakes +- Be concise but informative enough for the AI to decide when/how to call this tool + +Based on the feedback, identify patterns in tool misuse and improve the description to prevent them. 
+ +Provide the new tool description within ``` blocks.""" + + +__all__ = ["TOOL_DESCRIPTION_PROMPT_TEMPLATE"] diff --git a/verifiers/utils/gepa_utils.py b/verifiers/gepa/utils.py similarity index 97% rename from verifiers/utils/gepa_utils.py rename to verifiers/gepa/utils.py index 948e8719a..aae7485f2 100644 --- a/verifiers/utils/gepa_utils.py +++ b/verifiers/gepa/utils.py @@ -19,7 +19,7 @@ from openai import AsyncOpenAI, OpenAI import verifiers as vf -from verifiers.adapters.gepa import GEPAAdapter +from verifiers.gepa.adapter import GEPAAdapter from verifiers.types import GEPAConfig from verifiers.utils.client_utils import setup_client from verifiers.utils.eval_utils import save_rollout_results @@ -460,6 +460,15 @@ async def run_gepa_optimization(config: GEPAConfig): # Get wandb API key from env var wandb_api_key = os.getenv(config.wandb_api_key_var) if config.use_wandb else None + # Set reflection_lm on adapter for propose_new_texts method + adapter.reflection_lm = lambda x: call_reflection_model( + reflection_client, + x, + config.reflection_model, + config.reflection_temperature, + config.reflection_max_tokens, + ) + try: result = optimize( seed_candidate=config.seed_candidate, @@ -467,13 +476,7 @@ async def run_gepa_optimization(config: GEPAConfig): valset=config.valset, adapter=adapter, max_metric_calls=config.max_metric_calls, - reflection_lm=lambda x: call_reflection_model( - reflection_client, - x, - config.reflection_model, - config.reflection_temperature, - config.reflection_max_tokens, - ), + reflection_lm=adapter.reflection_lm, reflection_minibatch_size=config.reflection_minibatch_size, run_dir=str(log_dir), track_best_outputs=config.track_stats, diff --git a/verifiers/scripts/gepa.py b/verifiers/scripts/gepa.py index ee21e7831..d2ac7e541 100644 --- a/verifiers/scripts/gepa.py +++ b/verifiers/scripts/gepa.py @@ -25,7 +25,7 @@ from verifiers import setup_logging from verifiers.types import ClientConfig, GEPAConfig from verifiers.utils.eval_utils import load_endpoints -from verifiers.utils.gepa_utils import ( +from verifiers.gepa import ( auto_budget_to_metric_calls, ensure_env_dir_on_path, get_env_gepa_defaults, From 47d705b3f0013fece8e3afaf1efb5deff739f69d Mon Sep 17 00:00:00 2001 From: Robin Salimans Date: Tue, 25 Nov 2025 14:26:11 +0100 Subject: [PATCH 15/16] updated docs --- docs/source/gepa.md | 92 ++++++++++++++++++++++++++++++++----- integrations/gepa/README.md | 57 +++++++++++++++++++---- 2 files changed, 129 insertions(+), 20 deletions(-) diff --git a/docs/source/gepa.md b/docs/source/gepa.md index 67965e77d..d44aac2ab 100644 --- a/docs/source/gepa.md +++ b/docs/source/gepa.md @@ -112,22 +112,22 @@ When optimizing `tool_descriptions`, GEPA: ## Model Configuration ### Task Model -The model being optimized (default: `gpt-4o-mini`): +The model being optimized (default: `gpt-5-mini`): ```bash -vf-gepa my-env --budget medium -m gpt-4o +vf-gepa my-env --budget medium -m gpt-5-mini ``` ### Reflection Model -The model generating improved prompts (default: `gpt-4o`): +The model generating improved prompts (default: `gpt-5-mini`): ```bash -vf-gepa my-env --budget medium --reflection-model gpt-4o +vf-gepa my-env --budget medium --reflection-model gpt-5-mini ``` ### Sampling Parameters ```bash vf-gepa my-env --budget medium \ - -T 0.7 \ # Temperature for task model - -t 2048 \ # Max tokens + -T 0.7 \ # Temperature for task model + -t 2048 \ # Max tokens --reflection-temperature 1.0 # Temperature for reflection ``` @@ -225,6 +225,76 @@ Debug optimization process: vf-gepa my-env 
--budget medium -v ``` +## Experiment Tracking + +GEPA supports integration with popular experiment tracking platforms to monitor and analyze optimization runs. + +### Weights & Biases (wandb) + +Track GEPA runs in wandb: + +```bash +vf-gepa my-env --budget medium \ + --use-wandb \ + --wandb-project my-project \ + --wandb-entity my-team \ + --wandb-name "wordle-optimization" +``` + +**Configuration options**: +- `--use-wandb`: Enable wandb logging +- `--wandb-project PROJECT`: Wandb project name +- `--wandb-entity ENTITY`: Wandb entity/team name +- `--wandb-name NAME`: Run name (default: auto-generated from env_id) +- `--wandb-api-key-var VAR`: Environment variable containing API key (default: `WANDB_API_KEY`) +- `--wandb-init-kwargs JSON`: Additional `wandb.init()` kwargs as JSON + +**Example with additional kwargs**: +```bash +vf-gepa my-env --budget medium \ + --use-wandb \ + --wandb-project gepa-experiments \ + --wandb-init-kwargs '{"tags": ["baseline", "system-prompt"], "mode": "online"}' +``` + +**Logged metrics**: +- Validation scores per candidate +- Training scores per reflection step +- Component-level improvements +- Optimization progress over time +- Final best candidate components + +### MLflow + +Track GEPA runs in MLflow: + +```bash +vf-gepa my-env --budget medium \ + --use-mlflow \ + --mlflow-tracking-uri http://localhost:5000 \ + --mlflow-experiment-name gepa-wordle +``` + +**Configuration options**: +- `--use-mlflow`: Enable MLflow logging +- `--mlflow-tracking-uri URI`: MLflow tracking server URI +- `--mlflow-experiment-name NAME`: Experiment name + +**Logged data**: +- Parameters: model, budget, dataset sizes, components +- Metrics: validation scores, improvements +- Artifacts: optimized components, metrics JSON + +### Using Both Simultaneously + +You can enable both wandb and MLflow tracking in the same run: + +```bash +vf-gepa my-env --budget medium \ + --use-wandb --wandb-project my-project \ + --use-mlflow --mlflow-tracking-uri http://localhost:5000 +``` + ## Best Practices ### 1. Provide Rich Feedback @@ -290,7 +360,7 @@ Check that your environment exposes the component you're trying to optimize. 
Use ### Out of Memory - Reduce batch sizes: `--reflection-minibatch-size 2` - Reduce examples: `-n 30 --num-val 10` -- Use smaller models: `-m gpt-4o-mini` +- Use smaller models: `-m gpt-5-mini` ## Examples @@ -303,7 +373,7 @@ vf-gepa wordle --budget medium ```bash vf-gepa wiki-search --budget heavy \ --components system_prompt tool_descriptions \ - -m gpt-4o + -m gpt-5-mini ``` ### Large-Scale Optimization @@ -318,7 +388,7 @@ vf-gepa my-env --max-metric-calls 2000 \ ```bash vf-gepa my-env --budget medium \ -m claude-3-5-sonnet-20241022 \ - --reflection-model gpt-4o + --reflection-model gpt-5-mini ``` ## API Usage @@ -327,7 +397,7 @@ For programmatic use: ```python import verifiers as vf -from verifiers.adapters import GEPAAdapter +from verifiers.gepa import GEPAAdapter from gepa import optimize # Load environment @@ -337,7 +407,7 @@ env = vf.load_environment("wordle") adapter = GEPAAdapter( env=env, client=client, - model="gpt-4o-mini", + model="gpt-5-mini", sampling_args={"temperature": 1.0, "max_tokens": 8096}, components_to_optimize=["system_prompt"], ) diff --git a/integrations/gepa/README.md b/integrations/gepa/README.md index a9a33a05d..1d56c7401 100644 --- a/integrations/gepa/README.md +++ b/integrations/gepa/README.md @@ -52,7 +52,7 @@ from verifiers.gepa import GEPAAdapter adapter = GEPAAdapter( env=vf_env, client=async_client, - model="gpt-4o-mini", + model="gpt-5-mini", sampling_args={"temperature": 1.0}, components_to_optimize=["system_prompt"], ) @@ -60,13 +60,18 @@ adapter = GEPAAdapter( # Build new environment with optimized components new_env = adapter.build_program({"system_prompt": "Optimized prompt..."}) -# Evaluate candidate prompts +# Evaluate candidate prompts (sync wrapper) results = adapter.evaluate(batch, candidate, capture_traces=True) +# Evaluate candidate prompts (async - preferred in async contexts) +results = await adapter.evaluate_async(batch, candidate, capture_traces=True) + # Generate reflection dataset for GEPA reflective_data = adapter.make_reflective_dataset(candidate, results, components) ``` +**Note**: Use `evaluate_async()` when you're already in an async context (e.g., notebooks, async services). The sync `evaluate()` method is a convenience wrapper that manages the event loop for you. 
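For example, here is a minimal end-to-end sketch of the async path (assumptions: an OpenAI-compatible endpoint configured via the usual environment variables, the `gsm8k` environment installed locally, and the GEPA-style example schema — `question` / `answer` / `task` / `info` / `example_id` — that the adapter converts into rollout inputs):

```python
import asyncio

from openai import AsyncOpenAI

import verifiers as vf
from verifiers.gepa import GEPAAdapter


async def main():
    env = vf.load_environment("gsm8k")  # assumed to be installed locally
    client = AsyncOpenAI()  # reads OPENAI_API_KEY from the environment

    adapter = GEPAAdapter(
        env=env,
        client=client,
        model="gpt-5-mini",
        sampling_args={"temperature": 1.0},
        components_to_optimize=["system_prompt"],
    )

    # GEPA-style examples; the adapter maps "question" onto the env's prompt format
    batch = [
        {
            "question": "What is 2 + 3?",
            "answer": "5",
            "task": "gsm8k",
            "info": {},
            "example_id": 0,
        }
    ]
    candidate = {"system_prompt": env.system_prompt}

    # Preferred inside notebooks/async services; no nested event-loop juggling
    results = await adapter.evaluate_async(batch, candidate, capture_traces=True)

    # With capture_traces=True, each trajectory carries the rollout and its score
    print(results.trajectories[0]["score"])


asyncio.run(main())
```

In a plain synchronous script, the same call can go through `adapter.evaluate(batch, candidate, capture_traces=True)` and the adapter manages the event loop itself.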
+ ## Rubric Feedback GEPA works best when reward functions return structured feedback: @@ -147,8 +152,8 @@ vf-gepa my-env --components tool_descriptions --budget medium ### Models -- **Task model** (being optimized): `gpt-4o-mini`, `gpt-4o`, or custom -- **Reflection model** (generating proposals): `gpt-4o` recommended +- **Task model** (being optimized): `gpt-5-mini`, or custom +- **Reflection model** (generating proposals): `gpt-5-mini` (default) ## Output @@ -158,11 +163,40 @@ GEPA saves results to `./gepa_results///`: - `_original.json` - Original components (for comparison) - `_metrics.json` - Optimization metrics and history +## Experiment Tracking + +GEPA supports integration with Weights & Biases (wandb) and MLflow for tracking optimization runs: + +```bash +# Track with wandb +vf-gepa my-env --budget medium \ + --use-wandb \ + --wandb-project gepa-experiments + +# Track with MLflow +vf-gepa my-env --budget medium \ + --use-mlflow \ + --mlflow-tracking-uri http://localhost:5000 + +# Use both simultaneously +vf-gepa my-env --budget medium \ + --use-wandb --wandb-project my-project \ + --use-mlflow --mlflow-tracking-uri http://localhost:5000 +``` + +These integrations automatically log: +- Validation and training scores +- Component-level improvements +- Optimization configuration +- Final optimized components + +For detailed documentation on experiment tracking options, see [GEPA Documentation](../../docs/source/gepa.md#experiment-tracking). + ## Implementation Notes ### Packaging -The GEPA adapter ships inside the `verifiers.adapters` package so it is available to `pip install verifiers` users. The legacy `integrations/gepa` module re-exports the same class for backward compatibility inside this repository. +The GEPA adapter ships inside the `verifiers.gepa` package so it is available to `pip install verifiers` users. The `integrations/gepa` directory contains additional documentation and examples for reference. 
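As a quick sanity check that the packaged layout is what your code will import (a minimal sketch, assuming the `[gepa]` extra is installed):

```python
# Both the adapter and the high-level entry point ship inside the installed package;
# nothing under integrations/gepa/ needs to be on the path.
from verifiers.gepa import GEPAAdapter, run_gepa_optimization

print(GEPAAdapter.__module__)            # verifiers.gepa.adapter
print(run_gepa_optimization.__module__)  # verifiers.gepa.utils
```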
### Feedback Collection @@ -188,8 +222,8 @@ vf-gepa ENV_ID \ --max-metric-calls 1000 \ -n 100 --num-val 30 \ --components system_prompt tool_descriptions \ - -m gpt-4o \ - --reflection-model gpt-4o \ + -m gpt-5-mini \ + --reflection-model gpt-5-mini \ --rollouts-per-example 3 # Options @@ -198,12 +232,17 @@ vf-gepa ENV_ID \ --budget Budget preset: light/medium/heavy --max-metric-calls Custom budget (total metric calls) --components What to optimize (default: system_prompt) - -m, --model Task model (default: gpt-4o-mini) - --reflection-model Reflection model (default: gpt-4o) + -m, --model Task model (default: gpt-5-mini) + --reflection-model Reflection model (default: gpt-5-mini) -T, --temperature Task model temperature (default: 1.0) -t, --max-tokens Max tokens (default: 8096) --track-stats Save detailed statistics -v, --verbose Verbose logging + --use-wandb Enable wandb logging + --wandb-project Wandb project name + --wandb-entity Wandb entity/team name + --use-mlflow Enable MLflow logging + --mlflow-tracking-uri MLflow tracking server URI ``` ## Links From 18c71a3f8aabb0517fb7ded3e59542d051d067ed Mon Sep 17 00:00:00 2001 From: Robin Salimans Date: Tue, 25 Nov 2025 14:43:46 +0100 Subject: [PATCH 16/16] improved comments in code --- verifiers/gepa/adapter.py | 100 +++++++++++++++++++++++++++++++------- verifiers/gepa/utils.py | 52 ++++++++++++++++---- 2 files changed, 126 insertions(+), 26 deletions(-) diff --git a/verifiers/gepa/adapter.py b/verifiers/gepa/adapter.py index d80434b2c..62e012a02 100644 --- a/verifiers/gepa/adapter.py +++ b/verifiers/gepa/adapter.py @@ -108,9 +108,17 @@ def __init__( def build_program(self, candidate: dict[str, str]) -> vf.Environment: """Create a candidate environment with updated components using shallow copy. - Shallow copy shares heavy objects (dataset, rubric, parser) while - allowing string attributes to be replaced. For oai_tools, we deep copy - only if tool descriptions are being updated. + Why shallow copy instead of deep copy? + - Efficiency: Datasets can be large (100s of MB). Shallow copy shares the dataset + reference across all candidate environments, avoiding memory bloat and copy overhead. + - Safety: String attributes like system_prompt are immutable. Assignment (e.g., + new_env.system_prompt = "...") creates a new reference without affecting the original. + - Shared state: Rubric and parser objects are also shared, which is fine since they + don't get mutated during evaluation. + + Special case for oai_tools: + - When optimizing tool_descriptions, we need to mutate nested dicts in oai_tools + - We deep copy oai_tools in this case to avoid mutating the base environment's tools """ import copy @@ -121,13 +129,16 @@ def build_program(self, candidate: dict[str, str]) -> vf.Environment: ) # Create shallow copy - shares dataset, rubric, parser, etc. + # This is safe because we only replace immutable string attributes, + # not mutate shared objects (except oai_tools, handled below). 
new_env = copy.copy(self.base_env) - # Update system_prompt (assignment replaces reference, safe) + # Update system_prompt (assignment replaces reference, doesn't mutate original) if "system_prompt" in candidate: new_env.system_prompt = candidate["system_prompt"] # Update tool descriptions (need deep copy since we mutate nested dicts) + # We ONLY deep copy when actually updating tools to avoid unnecessary overhead if hasattr(self.base_env, "oai_tools") and self.base_env.oai_tools: tool_updates = { k: v @@ -155,6 +166,14 @@ def evaluate( """ Evaluate candidate on batch of examples. + This method provides a synchronous interface to evaluation, required by GEPA's + optimization loop. Since the verifiers Environment API is async, we bridge the gap: + - If no event loop is running: Use asyncio.run() to create one + - If already in an event loop: Use ThreadPoolExecutor to avoid blocking + + This allows GEPA to work in both sync contexts (normal scripts) and async contexts + (notebooks, services) without requiring callers to manage event loops. + Args: batch: List of examples (dicts with 'question', 'answer', 'info', 'task') candidate: Dict of component values to evaluate @@ -172,14 +191,17 @@ def evaluate( ) # Run evaluation using Environment's evaluate method + # Note: We cannot simply await here because GEPA's optimize() expects a + # synchronous evaluate() method. We handle both sync and async contexts: evaluation = self._evaluate_async(env, batch, capture_traces) try: asyncio.get_running_loop() except RuntimeError: - # No running loop - create one + # No running loop - create one and run the async evaluation return asyncio.run(evaluation) # Already in an event loop - run in a thread pool to avoid blocking + # This happens when GEPA is called from an already-async context with ThreadPoolExecutor(max_workers=1) as executor: future = executor.submit(asyncio.run, evaluation) return future.result() @@ -258,18 +280,29 @@ def _build_rollout_inputs( """ Convert GEPA batch examples into Verifiers RolloutInput objects. - Handles prompt normalization, example/task bookkeeping, answer passthrough, - and optional info payloads while duplicating entries according to - num_rollouts_per_example so downstream generate() calls receive independent - rollout inputs. + GEPA uses a different schema than verifiers: + - GEPA: {"question": str, "answer": Any, "task": str, "info": dict, "example_id": int} + - Verifiers: {"prompt": Messages, "answer": Any, "task": str, "info": dict, "example_id": int} + + This method: + 1. Maps "question" -> "prompt" (with format normalization via _format_prompt) + 2. Preserves "answer", "task", "info" fields + 3. Ensures "example_id" is an integer (falls back to index) + 4. Duplicates each input num_rollouts_per_example times for multiple evaluations + + Why deepcopy for each rollout? 
+ - Each rollout needs an independent RolloutInput to avoid state contamination + - Without deepcopy, modifying one rollout's state would affect all copies """ rollout_inputs: list[RolloutInput] = [] for example_idx, example in enumerate(batch): + # Extract prompt - GEPA uses "question", verifiers uses "prompt" raw_prompt = example.get("prompt") or example.get("question") or "" formatted_prompt = self._format_prompt(env, raw_prompt) task = str(example.get("task") or env.env_id or "default") + # Ensure example_id is an integer (GEPA may pass strings) example_id_value = example.get("example_id", example_idx) try: example_id = int(example_id_value) @@ -289,6 +322,7 @@ def _build_rollout_inputs( if info is not None: base_input["info"] = deepcopy(info) + # Create independent copies for each rollout to avoid state contamination for _ in range(self.num_rollouts_per_example): rollout_inputs.append(deepcopy(base_input)) @@ -298,14 +332,26 @@ def _format_prompt(self, env: vf.Environment, prompt: str | Messages) -> Message """ Ensure prompts match the environment's declared message_type. - Completion environments expect raw strings, so chat-style prompts are - flattened into a single string. Chat environments expect structured - message lists, so bare strings are wrapped with system/few-shot context. + Environments can be either "completion" (raw text) or "chat" (message lists). + We need to normalize GEPA's prompts (which can be either format) to match: + + For completion environments (message_type == "completion"): + - String prompts: Pass through as-is + - List prompts: Flatten message contents into a single string + + For chat environments (message_type == "chat"): + - List prompts: Pass through as-is + - String prompts: Wrap in chat structure with system prompt + few-shot examples + + This ensures the environment receives prompts in the format it expects, + regardless of how GEPA provides them. """ + # Completion environment: flatten everything to a string if env.message_type == "completion": if isinstance(prompt, str): return prompt if isinstance(prompt, list): + # Extract content from all messages and join content_parts: list[str] = [] for message in prompt: if isinstance(message, dict): @@ -315,9 +361,11 @@ def _format_prompt(self, env: vf.Environment, prompt: str | Messages) -> Message return " ".join(content_parts) if content_parts else str(prompt) return str(prompt) + # Chat environment: ensure we have a message list if isinstance(prompt, list): return prompt + # String prompt for chat env: wrap with system prompt + few-shot messages: list[dict[str, str]] = [] if env.system_prompt: messages.append({"role": "system", "content": env.system_prompt}) @@ -372,6 +420,12 @@ def make_reflective_dataset( # Check if component is in optimization list # Support both exact matches (e.g., "system_prompt") and group patterns # (e.g., "tool_0_description" matches "tool_descriptions") + # + # Why this complexity? + # When optimizing tool_descriptions, GEPA's propose_new_texts receives + # individual components like "tool_0_description", "tool_1_description" etc. + # But components_to_optimize contains the group name "tool_descriptions". + # We need to match the individual tool components to the group. is_optimizable = comp_name in self.components_to_optimize # Check if this is a tool description (tool_N_description pattern) @@ -489,9 +543,18 @@ def propose_new_texts( """ Propose new text for components using tool-aware templates. 
- For tool descriptions (tool_N_description), uses a tool-specific template - that includes the tool name and parameter schema. For other components, - uses GEPA's default instruction proposal template. + Why different templates for different components? + - Tool descriptions need context about the tool's name, parameters, and purpose + - System prompts are general instructions that don't need tool-specific context + + Template selection logic: + 1. Check if component is in self._tool_metadata (tool_N_description pattern) + -> Use TOOL_DESCRIPTION_PROMPT_TEMPLATE with tool name + parameters + 2. Otherwise (system_prompt, etc.) + -> Use GEPA's default InstructionProposalSignature + + Both templates receive the same reflective feedback data, but format it + differently for the reflection model to generate appropriate improvements. Args: candidate: Current candidate component values @@ -525,8 +588,10 @@ def propose_new_texts( feedback_data = reflective_dataset[comp_name] # Check if this is a tool description component + # Tool metadata is populated in __init__ when tool_descriptions is being optimized if comp_name in self._tool_metadata: - # Use tool-specific template + # Use tool-specific template that includes tool name and parameter schema + # This gives the reflection model context about what the tool does tool_info = self._tool_metadata[comp_name] new_texts[comp_name] = self._propose_tool_description( tool_name=tool_info["name"], @@ -538,7 +603,8 @@ def propose_new_texts( f"Proposed new tool description for {comp_name} (tool: {tool_info['name']})" ) else: - # Use default GEPA instruction proposal for system_prompt, etc. + # Use default GEPA instruction proposal template for system_prompt, etc. + # This is GEPA's standard prompt optimization template new_texts[comp_name] = InstructionProposalSignature.run( lm=self.reflection_lm, input_dict={ diff --git a/verifiers/gepa/utils.py b/verifiers/gepa/utils.py index aae7485f2..664c0dfb3 100644 --- a/verifiers/gepa/utils.py +++ b/verifiers/gepa/utils.py @@ -155,7 +155,27 @@ def auto_budget_to_metric_calls( """ Convert auto budget (light/medium/heavy) to max_metric_calls. - This replicates DSPy's auto_budget calculation for consistency. + This replicates DSPy's auto_budget calculation for consistency with GEPA's + expectations. The formula estimates total metric calls (rollout evaluations) by: + + 1. Mapping budget -> target number of candidates to explore: + - light: ~6 candidates + - medium: ~12 candidates + - heavy: ~18 candidates + + 2. Computing number of optimization trials (iterations) using: + - Log growth: 2.0 * (num_components * 2) * log2(num_candidates) + - Linear fallback: 1.5 * num_candidates + - Take the maximum to ensure sufficient exploration + + 3. Summing all evaluation costs: + - Initial validation: V (full eval on seed candidate) + - Bootstrap: num_candidates * 5 (small evals per candidate) + - Reflection minibatches: N * M (N trials on M examples each) + - Periodic full validations: (N // full_eval_steps + 1) * V + + This ensures the optimization has enough budget to explore candidates + while periodically measuring improvement on the full validation set. Args: auto: Budget level ('light', 'medium', or 'heavy') @@ -167,9 +187,11 @@ def auto_budget_to_metric_calls( Returns: Maximum number of metric calls """ + # Map budget name to target number of candidates num_candidates = AUTO_BUDGET_CANDIDATES[auto] # Calculate number of trials using log-growth vs. 
linear fallback + # Log-growth scales better with more candidates, linear ensures minimum trials log_trials = ( TRIAL_LOG_BASE_MULTIPLIER * (num_components * TRIAL_COMPONENT_MULTIPLIER) @@ -178,24 +200,26 @@ def auto_budget_to_metric_calls( linear_trials = TRIAL_LINEAR_MULTIPLIER * num_candidates num_trials = int(max(log_trials, linear_trials)) - V = valset_size - N = num_trials - M = minibatch_size - m = full_eval_steps + # Use shorter variable names for clarity in formula + V = valset_size # Validation set size + N = num_trials # Number of optimization trials + M = minibatch_size # Minibatch size for reflection + m = full_eval_steps # Steps between full validations - # Initial full evaluation on the default program + # Initial full evaluation on the seed (default) program total = V - # Assume a handful of bootstrap trials per candidate + # Bootstrap evaluations: quick evals to initialize each candidate total += num_candidates * BOOTSTRAP_TRIALS_PER_CANDIDATE - # N minibatch evaluations + # Reflection minibatch evaluations: N trials, each on M examples total += N * M if N == 0: return total - # Periodic full evals + # Periodic full validations to measure progress + # We do a full validation every m steps, plus potentially a final one periodic_fulls = (N + 1) // m + 1 extra_final = 1 if N < m else 0 @@ -461,6 +485,16 @@ async def run_gepa_optimization(config: GEPAConfig): wandb_api_key = os.getenv(config.wandb_api_key_var) if config.use_wandb else None # Set reflection_lm on adapter for propose_new_texts method + # GEPA's optimize() expects a simple reflection_lm(prompt) -> str callable. + # We create a lambda that captures the reflection client and config, + # allowing the adapter's propose_new_texts() to call the reflection model + # without needing to manage the client itself. + # + # Why set this on the adapter? + # The GEPAAdapter.propose_new_texts() method needs to call the reflection model, + # but GEPA's protocol doesn't pass reflection_lm to that method - it only passes + # it to optimize(). By setting it as an attribute, we make it accessible within + # propose_new_texts() while keeping the GEPA protocol interface clean. adapter.reflection_lm = lambda x: call_reflection_model( reflection_client, x,