From b2745b88c8b83c3a58fffa3c0389a08a7484610e Mon Sep 17 00:00:00 2001 From: Andres Morales Esquivel Date: Tue, 21 Apr 2026 15:26:20 -0600 Subject: [PATCH 1/8] Add agent skills --- .apm/skills/benchmark-qed-autoe/SKILL.md | 180 +++++++++ .apm/skills/benchmark-qed-autoq/SKILL.md | 164 ++++++++ .apm/skills/benchmark-qed-setup/SKILL.md | 170 +++++++++ .../references/config-reference.md | 349 ++++++++++++++++++ apm.yml | 8 + benchmark_qed/autoq/cli.py | 12 +- 6 files changed, 877 insertions(+), 6 deletions(-) create mode 100644 .apm/skills/benchmark-qed-autoe/SKILL.md create mode 100644 .apm/skills/benchmark-qed-autoq/SKILL.md create mode 100644 .apm/skills/benchmark-qed-setup/SKILL.md create mode 100644 .apm/skills/benchmark-qed-setup/references/config-reference.md create mode 100644 apm.yml diff --git a/.apm/skills/benchmark-qed-autoe/SKILL.md b/.apm/skills/benchmark-qed-autoe/SKILL.md new file mode 100644 index 0000000..1480976 --- /dev/null +++ b/.apm/skills/benchmark-qed-autoe/SKILL.md @@ -0,0 +1,180 @@ +--- +name: benchmark-qed-autoe +description: > + Evaluate RAG system outputs using benchmark-qed scoring methods. Use when: + running pairwise comparisons, reference-based scoring, assertion-based + evaluation (flat or hierarchical), retrieval metrics, or statistical + significance tests on RAG outputs. Also use when the user wants to score, + compare, or evaluate RAG methods, measure retrieval quality, or run + significance tests on benchmark results — even if they don't say "autoe" + explicitly. +--- + +# Benchmark-QED Evaluation (autoe) + +Evaluate and compare RAG system outputs using LLM-judged scoring, assertion-based evaluation, and retrieval metrics — all with built-in statistical significance testing. + +## Prerequisites + +- Generated questions/assertions from the autoq pipeline (or your own) +- RAG method answer files (JSON, one per method per question set) +- A valid `settings.yaml` for the evaluation type +- LLM API key configured + +Run all commands with: +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed +``` + +## Evaluation Methods Overview + +| Method | Command | Best for | +|--------|---------|----------| +| Pairwise comparison | `autoe pairwise-scores` | Comparing two RAG methods head-to-head | +| Reference scoring | `autoe reference-scores` | Scoring against gold-standard answers | +| Assertion scoring | `autoe assertion-scores` | Evaluating with ground-truth assertions (single or multi-RAG) | +| Hierarchical assertions | `autoe hierarchical-assertion-scores` | Global + local assertion hierarchies | +| Retrieval metrics | `autoe retrieval-scores` | Precision, recall, fidelity of retrieval | +| Significance tests | `autoe assertion-significance` | Post-hoc significance on existing scores | + +## Commands + +### 1. Pairwise Scores + +Compare RAG methods using LLM-judged pairwise comparisons. 
+ +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe pairwise-scores [OPTIONS] +``` + +**Options:** +| Option | Default | Description | +|--------|---------|-------------| +| `--alpha` | `0.05` | P-value threshold for significance | +| `--exclude-criteria` | `[]` | Criteria to exclude (repeatable) | +| `--print-model-usage` | `false` | Print LLM token usage | + +**Config requires**: `base` (reference method), `others` (methods to compare), `question_sets`, `criteria`, `trials` (must be even), `llm_config`, `prompt_config` + +Default criteria: `comprehensiveness`, `diversity`, `empowerment`, `relevance` + +**Output**: `{question_set}_{base}--{other}.csv`, `win_rates.csv`, `winrates_sig_tests.csv` + +### 2. Reference Scores + +Score generated answers against reference (gold-standard) answers. + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe reference-scores [OPTIONS] +``` + +**Config requires**: `reference`, `generated` (list), `criteria`, `score_min`/`score_max`, `trials`, `llm_config` + +Default criteria: `correctness`, `completeness`. Default score range: 1–10. + +**Output**: `reference_scores-{name}.csv`, `model_usage.json` + +### 3. Assertion Scores + +Evaluate RAG methods using assertion-based scoring. Auto-detects single-RAG vs multi-RAG config. + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe assertion-scores [OPTIONS] +``` + +**Options:** +| Option | Default | Description | +|--------|---------|-------------| +| `--alpha` | `0.05` | Significance threshold (multi-RAG) | +| `--print-model-usage` | `false` | Print LLM token usage | + +**Auto-detection**: If the YAML contains a `rag_methods` key, it runs in multi-RAG mode with automated significance testing. Otherwise, single-RAG mode. + +**Single-RAG output**: `assertion_scores.csv`, `assertion_summary_by_question.csv`, `eval_summary.json` + +**Multi-RAG output**: Per-method scores + significance tests in structured `output_dir/` + +### 4. Hierarchical Assertion Scores + +Score hierarchical assertions (global assertions with supporting local assertions). + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe hierarchical-assertion-scores [OPTIONS] +``` + +**Modes**: `staged` (default — evaluate local first, then global) or `joint` (evaluate together) + +**Extra field**: `detect_discovery: true` enables detection of novel findings not covered by assertions. + +Also auto-detects single vs multi-RAG config (same as assertion-scores). + +### 5. Assertion Significance + +Run statistical significance tests on existing assertion scores (no LLM calls). + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe assertion-significance +``` + +**Config requires**: `output_dir`, `rag_methods`, `question_sets`, `alpha`, `correction_method` + +**Correction methods**: `holm` (default, recommended), `bonferroni`, `fdr_bh` + +### 6. Hierarchical Assertion Significance + +Significance tests on hierarchical assertion scores. + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe hierarchical-assertion-significance +``` + +**Config requires**: `scores_dir`, `rag_methods`, `scores_filename_template`, `alpha`, `correction_method`, `output_dir` + +### 7. Generate Retrieval Reference + +Generate cluster relevance reference data for retrieval evaluation (one-off prep step). 
+ +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe generate-retrieval-reference +``` + +**Config requires**: `llm_config`, `embedding_config`, question source (`questions_path` or `question_sets`), `text_units_path` + +**Key settings**: `num_clusters`, `assessor_type` (`rationale` or `bing`), `semantic_neighbors`, `centroid_neighbors` + +### 8. Retrieval Scores + +Evaluate retrieval precision, recall, and fidelity for RAG methods. + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe retrieval-scores +``` + +**Config requires**: `rag_methods`, `question_sets`, `reference_dir`, `text_units_path`, `output_dir` + +**Fidelity metrics**: `js` (Jensen-Shannon divergence) or `tvd` (total variation distance) + +## Workflow + +### Quick Evaluation (Assertion-Based) + +- [ ] Step 1: Verify questions and answers exist — list the workspace and confirm a `settings.yaml` (or `config.yaml`), question JSON files (typically under `output/`), and your RAG method answer JSONs are present. +- [ ] Step 2: Initialize eval config — `uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed config init autoe_assertion ./eval_workspace` +- [ ] Step 3: Configure settings.yaml with answer paths and assertion paths +- [ ] Step 4: Run evaluation — `uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe assertion-scores ./eval_workspace/settings.yaml ./eval_output` +- [ ] Step 5: Summarize results — read the CSVs in `` (e.g. `assertion_scores.csv`, `assertion_summary_by_question.csv`) and `eval_summary.json` directly. + +### Multi-RAG Comparison + +For comparing multiple RAG methods, use multi-RAG config format (include `rag_methods` key in YAML). This gives you automated pairwise significance testing. + +For the full config reference with all fields, read the config reference in the `/benchmark-qed-setup` skill: [../benchmark-qed-setup/references/config-reference.md](../benchmark-qed-setup/references/config-reference.md). + +## Gotchas + +- **Config auto-detection**: `assertion-scores` and `hierarchical-assertion-scores` detect single vs multi-RAG based on the `rag_methods` key in YAML. Ensure your config matches your intent. +- **Trials must be even**: For pairwise scores, `trials` must be even (for counterbalancing). Use 4 as default. +- **Stale outputs**: Several commands skip existing output files. Use a fresh output directory or delete specific files to force re-evaluation. +- **Output is in files**: All scores are written to CSV/JSON files. Parse output files, not CLI stdout. +- **Long-running**: Evaluation with many questions and trials can take hours. Use background execution. +- **No `config init` for hierarchical/retrieval**: `config init` only supports `autoe_assertion`, `autoe_pairwise`, and `autoe_reference`. For hierarchical and retrieval configs, create YAML manually using the config reference. diff --git a/.apm/skills/benchmark-qed-autoq/SKILL.md b/.apm/skills/benchmark-qed-autoq/SKILL.md new file mode 100644 index 0000000..90fe9bd --- /dev/null +++ b/.apm/skills/benchmark-qed-autoq/SKILL.md @@ -0,0 +1,164 @@ +--- +name: benchmark-qed-autoq +description: > + Generate benchmark questions and assertions from input data using + benchmark-qed. Use when: generating local, global, linked, or activity + questions for RAG benchmarking, creating assertions for existing questions, + computing assertion statistics, or running the autoq question generation + pipeline. 
Also use when the user wants to create a benchmark question set, + build evaluation questions from a dataset, or generate ground-truth + assertions — even if they don't say "autoq" explicitly. +--- + +# Benchmark-QED Question Generation (autoq) + +Generate benchmark questions and assertions from input data for RAG evaluation. + +## Prerequisites + +- A configured workspace with valid `settings.yaml` (use the `/benchmark-qed-setup` skill first) +- Input data (CSV or JSON) in the workspace `input/` directory +- Valid LLM API key in `.env` + +Run all commands with: +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed +``` + +## Commands + +### 1. Generate Questions (`autoq`) + +The main question generation pipeline. Generates benchmark questions from input data. + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoq [OPTIONS] +``` + +**Options:** +| Option | Description | +|--------|-------------| +| `--generation-types` | Specific types to generate (repeatable). CLI default: all except `data_linked`, but this skill always includes `data_linked` | +| `--print-model-usage` | Print LLM token usage stats | + +**Generation types and dependencies:** + +``` +data_local ← runs first (no dependencies) + ├── data_global ← requires data_local candidates + └── data_linked ← requires data_local candidates (not in CLI default, but this skill always includes it) + +activity_local ← auto-generates activity_context first + └── activity_global ← requires activity_local +``` + +> **Important**: `data_linked` is NOT included in the CLI's default generation types, but this skill always generates it by passing all types explicitly. If running the CLI manually, you must add `--generation-types data_linked`. + +> **Gotcha**: `data_global` and `data_linked` silently return empty results if `data_local` hasn't been run first. Always run `data_local` before these types. + +**Examples:** +```bash +# Run from the workspace directory (paths resolve relative to settings.yaml location) +cd ./workspace + +# Generate all types including data_linked (skill default) +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoq settings.yaml ./output \ + --generation-types data_local --generation-types data_global --generation-types data_linked \ + --generation-types activity_local --generation-types activity_global + +# Generate only local questions +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoq settings.yaml ./output --generation-types data_local + +# Generate local + linked questions +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoq settings.yaml ./output \ + --generation-types data_local --generation-types data_linked +``` + +**Output structure:** +``` +output_dir/ +├── sample_texts.parquet # Intermediate: clustered text samples +├── data_local_questions/ +│ ├── selected_questions.json # Final curated questions +│ ├── selected_questions_text.json # Human-readable version +│ └── candidate_questions.json # All generated candidates +├── data_global_questions/ # Same structure +├── data_linked_questions/ # Same structure + question_stats.json +├── activity_local_questions/ # Same structure +├── activity_global_questions/ # Same structure +├── context/ +│ └── activity_context_full.json # Generated activity context +└── model_usage.json # LLM token/cost tracking +``` + +### 2. 
Generate Assertions (`generate-assertions`) + +Generate ground-truth assertions for existing questions (decoupled from question generation). This is a **top-level** command, not a subcommand of `autoq`. + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed generate-assertions [OPTIONS] +``` + +**Options:** +| Option | Description | +|--------|-------------| +| `--type` / `-t` | Assertion type: `local`, `global`, or `linked` (default: `local`) | +| `--print-model-usage` | Print LLM token usage stats | + +**Examples:** +```bash +# Run from the workspace directory (paths resolve relative to settings.yaml location) +cd ./workspace + +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed generate-assertions \ + settings.yaml \ + ./output/data_local_questions/candidate_questions.json \ + ./output/data_local_questions/ \ + --type local + +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed generate-assertions \ + settings.yaml \ + ./output/data_global_questions/candidate_questions.json \ + ./output/data_global_questions/ \ + --type global +``` + +### 3. Assertion Statistics (`assertion-stats`) + +Compute quality statistics for assertion files. This is a **top-level** command, not a subcommand of `autoq`. + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed assertion-stats [OPTIONS] +``` + +**Options:** +| Option | Description | +|--------|-------------| +| `--output` / `-o` | Output path for stats JSON (auto-generated if omitted) | +| `--type` / `-t` | `global`, `map`, or `local` (auto-inferred if omitted) | +| `--quiet` / `-q` | Suppress console output | + +**Examples:** +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed assertion-stats ./output/assertions.json +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed assertion-stats ./output/data_global_questions/ -q +``` + +## Workflow + +### Standard Question Generation Flow + +- [ ] Step 1: Verify workspace is ready — confirm `settings.yaml`, `.env`, and `input/` exist in `` (the CLI will fail fast if anything is misconfigured). +- [ ] Step 2: `cd ` then run question generation — `uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoq settings.yaml ./output --generation-types data_local --generation-types data_global --generation-types data_linked --generation-types activity_local --generation-types activity_global` +- [ ] Step 3: Verify output artifacts — list `` and confirm the per-type `selected_questions.json` files (see "Output structure" above) plus `model_usage.json` exist. +- [ ] Step 4: (Optional) Generate additional assertions — use `generate-assertions` +- [ ] Step 5: (Optional) Check assertion quality — use `assertion-stats` + +## Gotchas + +- **Path resolution**: The `autoq` and `generate-assertions` commands resolve `output_dir` (and other relative paths) **relative to the settings.yaml file's directory**, not the current working directory. Always `cd` into the workspace directory first, or use absolute paths. For example, running `benchmark-qed autoq workspace/settings.yaml workspace/output` from the repo root creates output at `workspace/workspace/output/` (not `workspace/output/`). +- **Stale outputs**: The pipeline skips steps if output files already exist (`sample_texts.parquet`, `activity_context_full.json`). Use a fresh output directory for clean runs, or delete specific files to re-run a step. 
+- **Long-running**: Question generation with large datasets can take hours. Use background execution and monitor via `model_usage.json` presence. +- **Output is in files, not stdout**: All results are written to JSON/CSV/Parquet files. Parse the output files, not CLI stdout. +- **Generation ordering**: `data_global` and `data_linked` depend on `data_local`. `activity_global` depends on `activity_local`. Running dependent types without their prerequisites produces silent empty results. +- **`data_linked` CLI opt-in**: The CLI excludes `data_linked` by default, but this skill always includes it. If running the CLI manually outside this skill, add `--generation-types data_linked`. diff --git a/.apm/skills/benchmark-qed-setup/SKILL.md b/.apm/skills/benchmark-qed-setup/SKILL.md new file mode 100644 index 0000000..464732c --- /dev/null +++ b/.apm/skills/benchmark-qed-setup/SKILL.md @@ -0,0 +1,170 @@ +--- +name: benchmark-qed-setup +description: > + Initialize and configure benchmark-qed workspaces for RAG benchmarking. + Use when: setting up a new benchmarking project, initializing config files + for question generation or evaluation, downloading sample datasets, + or modifying benchmark-qed settings.yaml configuration. Also use when + the user mentions "benchmark-qed config", workspace setup, or needs to + prepare a benchmarking environment — even if they don't say "setup" explicitly. +--- + +# Benchmark-QED Workspace Setup + +Initialize workspaces, generate configuration files, download datasets, and manage settings for the benchmark-qed RAG benchmarking tool. + +## Prerequisites + +benchmark-qed requires Python 3.11+ and uv. Run commands with `uvx` to avoid installing globally: + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed +``` + +Pin a specific version for reproducibility: +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed@v1.2.3" benchmark-qed +``` + +If `uvx` is unavailable, install uv first: +```bash +pip install uv && uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed +``` + +## Procedure + +### Step 1 — Initialize a Workspace + +Generate a configuration workspace for the desired workflow type: + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed config init +``` + +**Config types** (pick one): +| Type | Purpose | +|------|---------| +| `autoq` | Question generation (includes all prompt templates) | +| `autoe_pairwise` | Pairwise comparison evaluation | +| `autoe_reference` | Reference-based scoring | +| `autoe_assertion` | Assertion-based scoring | + +Example: +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed config init autoq ./my_workspace +``` + +This creates: +``` +root/ +├── .env # API key placeholder +├── input/ # Place your data here +├── settings.yaml # Main configuration file +└── prompts/ # LLM prompt templates +``` + +### Step 2 — Download Sample Data (Optional) + +Download sample datasets for testing. 
This command has an interactive confirmation prompt with no `--yes` flag — use one of these approaches to avoid hanging:

**Bash/Linux/macOS:**
```bash
echo y | uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed data download
```

**PowerShell:**
```powershell
"y" | uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed data download
```

**Available datasets**: `AP_news`, `podcast`, `example_answers`

### Step 3 — Gather Configuration Choices from the User

Before writing any values into `settings.yaml`, **prompt the user with `ask_user`** to collect the LLM / auth / endpoint settings. Do not guess — these decisions are environment-specific and getting them wrong wastes downstream LLM calls. Use enum/boolean fields whenever possible so the user picks from a known set rather than typing free-form text.

Ask in **a single `ask_user` form** (split into two if the workflow is autoq, since autoq also needs an embedding model). Tailor the follow-up fields based on the provider/auth choice — if the first answer reveals an Azure provider, ask the Azure-only fields in a second form.

#### LLM (chat) fields to collect

| Field | Type | Options / examples | Notes |
|-------|------|--------------------|-------|
| `llm_provider` | enum | `openai.chat`, `azure.openai.chat`, `azure.inference.chat` | See provider table in [references/config-reference.md](references/config-reference.md). |
| `model` | string | `gpt-4.1`, `gpt-4o`, `o3-mini`, an Azure deployment name | For Azure providers this is the **deployment name**, not the base model id. |
| `auth_type` | enum | `api_key` (default), `azure_managed_identity` | `azure_managed_identity` is only valid for `azure.*` providers. |
| `api_key_env_var` | string | `OPENAI_API_KEY` (default), `AZURE_OPENAI_API_KEY`, … | Only ask when `auth_type=api_key`. The skill writes `${VAR_NAME}` into YAML and adds the variable to `.env`. |
| `azure_endpoint` | string (uri) | e.g. `https://my-resource.openai.azure.com/` | Only ask for `azure.*` providers. |
| `api_version` | string | e.g. `2024-06-01` | Only ask for `azure.openai.*` providers. |
| `concurrent_requests` | integer | default `4` | Optional; offer the default. |

#### Embedding fields to collect (autoq only)

Ask the same shape of questions for the embedding model:

| Field | Type | Notes |
|-------|------|-------|
| `embedding_provider` | enum (`openai.embedding`, `azure.openai.embedding`, `azure.inference.embedding`) | Must be an *embedding* provider. |
| `embedding_model` | string | e.g. `text-embedding-3-large`, or an Azure deployment name. |

Reuse `auth_type` / `api_key_env_var` / `azure_endpoint` / `api_version` from the chat answers unless the user wants different values — ask a yes/no `reuse_chat_auth` boolean first.

#### Input data fields (autoq only)

| Field | Type | Notes |
|-------|------|-------|
| `dataset_path` | string | Path to CSV/JSON dataset, e.g. `./input/data.csv`. |
| `input_type` | enum (`csv`, `json`) | |
| `text_column` | string | Column/key containing the text content. |

#### Eval-config-specific fields (autoe_*)

Only ask the questions relevant to the chosen `config_type`:
- `autoe_pairwise`: `base.name` + `base.answer_base_path`, plus a list of `others` (each with `name` and `answer_base_path`), and `question_sets`.
- `autoe_reference`: `reference.name` + `reference.answer_base_path`, list of `generated`, and `question_sets`.
- `autoe_assertion`: in single-RAG mode, `generated.name` + `generated.answer_base_path` and `assertions.assertions_path`. In multi-RAG mode (`rag_methods` provided), ask for `input_dir`, `output_dir`, `rag_methods` list, and `question_sets`.

If the user declines a field, fall back to the documented default and call out the assumption in your response.

### Step 4 — Apply the Answers

Use the answers from Step 3 to edit `settings.yaml` and `.env` directly:

```yaml
# LLM configuration (template — substitute values from ask_user answers)
chat_model:
  model: <model>
  llm_provider: <llm_provider>
  auth_type: <auth_type>
  api_key: ${<api_key_env_var>} # only when auth_type=api_key
  concurrent_requests: <concurrent_requests>
  init_args: # only for azure.* providers
    azure_endpoint: <azure_endpoint>
    api_version: "<api_version>" # azure.openai.* only

# Input data (autoq only)
input:
  dataset_path: <dataset_path>
  input_type: <input_type>
  text_column: <text_column>
```

**Rules when writing the YAML:**
- Omit `api_key` entirely when `auth_type=azure_managed_identity` — do not leave `${OPENAI_API_KEY}` in place.
- Omit `init_args` for non-Azure providers.
- Quote `api_version` (it would otherwise be parsed as a date).
- For `azure_managed_identity`, do **not** add anything to `.env` for that key.
- For `api_key` auth, append `<api_key_env_var>=` to `.env` if the variable is missing, and tell the user to replace the placeholder with their real key before running any command.

For the full set of optional fields, read [references/config-reference.md](references/config-reference.md).

### Step 5 — Validate Configuration

The benchmark-qed CLI validates `settings.yaml` via pydantic at startup, so any missing or malformed fields are reported when you run a command. After applying the answers, run the actual target command (e.g. `benchmark-qed autoq …`) — config errors surface immediately, before any LLM calls.

## Gotchas

- The `data download` command blocks on `typer.confirm()`. Always use `echo y | uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed data download ...` to prevent hanging.
- Environment variables in YAML use `${VAR_NAME}` syntax (resolved at load time via python-dotenv).
- The `.env` file must be in the workspace root directory, not the project root.
- Config types `autoe_pairwise`, `autoe_reference`, and `autoe_assertion` generate different settings.yaml templates — use the correct type for your evaluation method.
- Prompts are copied as `.txt` files using Python `string.Template` syntax (`$variable` or `${variable}`).
- **`prompts_config` vs `prompt_config`**: Some generated autoe configs may use `prompts_config`, but the runtime expects `prompt_config`. If you get config validation errors, rename the key.
diff --git a/.apm/skills/benchmark-qed-setup/references/config-reference.md b/.apm/skills/benchmark-qed-setup/references/config-reference.md
new file mode 100644
index 0000000..08a34a6
--- /dev/null
+++ b/.apm/skills/benchmark-qed-setup/references/config-reference.md
@@ -0,0 +1,349 @@
+# Configuration Reference
+
+Reference for benchmark-qed configuration fields. Load this file when you need to understand or modify specific config settings. Default values shown are from the source code Pydantic models.
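
Values such as `api_key: ${OPENAI_API_KEY}` use `${VAR_NAME}` substitution, resolved from the workspace `.env` file at load time (see the setup skill's gotchas). A minimal sketch — the model name and variable name below are just the defaults from this reference:

```yaml
chat_model:
  model: gpt-4.1              # default chat model from this reference
  llm_provider: openai.chat
  auth_type: api_key
  api_key: ${OPENAI_API_KEY}  # resolved from .env at load time; never hardcode the key
```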
+ +## autoq Configuration (`QuestionGenerationConfig`) + +### Input Configuration +```yaml +input: + dataset_path: ./input/data.csv # Path to input dataset (REQUIRED) + input_type: csv # csv or json + text_column: text # Column containing text content + metadata_columns: null # Optional list of metadata columns (e.g., [headline, date]) + file_encoding: utf-8 # File encoding (template uses utf-8-sig) +``` + +### Encoding Configuration +```yaml +encoding: + model_name: o200k_base # Tokenizer model + chunk_size: 600 # Tokens per chunk + chunk_overlap: 100 # Overlap between chunks +``` + +### Sampling Configuration +```yaml +sampling: + num_clusters: 50 # Number of clusters for sampling + num_samples_per_cluster: 10 # Samples per cluster + random_seed: 42 # Reproducibility seed +``` + +### LLM Configuration (shared across all commands) +```yaml +chat_model: + model: gpt-4.1 # Model name + auth_type: api_key # api_key | azure_managed_identity + api_key: ${OPENAI_API_KEY} # Required for api_key auth + llm_provider: openai.chat # Provider (see table below) + concurrent_requests: 4 # Parallel LLM requests + init_args: {} # Extra model init args (e.g., api_version, azure_endpoint) + call_args: # Extra model call args + temperature: 0.0 + seed: 42 + custom_providers: [] # Custom provider registrations + +embedding_model: + model: text-embedding-3-large # Embedding model (template default; code default is gpt-4.1) + llm_provider: openai.embedding # Must use an embedding provider + api_key: ${OPENAI_API_KEY} +``` + +### Question Generation Types + +All question types share a base config with `num_questions` (default: `50`) and `oversample_factor` (default: `2.0`). Type-specific fields are listed below. + +```yaml +data_local: + num_questions: 50 # Number of questions to generate + oversample_factor: 2.0 # Generate oversample_factor × num_questions candidates + +data_global: # Requires data_local to be run first + num_questions: 50 + oversample_factor: 2.0 + min_questions_in_context: 2 # Min local questions required to form global context + min_claim_count: 2 # Min claims required for global question + min_relevant_reference_count: 10 # Min relevant references for global question + enable_question_validation: true # Validate generated global questions + +data_linked: # Requires data_local; opt-in (not generated by default) + num_questions: 50 + oversample_factor: 2.0 + min_questions_per_entity: 2 # Min local questions sharing an entity to form a group + max_questions_per_entity: 3 # Max local questions per entity group + type_balance_weight: 0.5 # Weight for balancing linked question types + max_questions_to_generate: 2 # Max linked questions per entity group + entity_frequency_threshold: 2 # Min entity frequency to be considered + +activity_local: # Auto-generates activity_context first + num_questions: 50 + oversample_factor: 2.0 + num_personas: 5 # Number of personas to generate + num_tasks_per_persona: 5 # Tasks per persona + num_entities_per_task: 10 # Entities per task + +activity_global: # Requires activity_local + num_questions: 50 + oversample_factor: 2.0 + num_personas: 5 + num_tasks_per_persona: 5 + num_entities_per_task: 10 +``` + +### Assertion Configuration +```yaml +assertions: + local: + max_assertions: 20 # Max assertions per question (null = unlimited, 0 = disable) + enable_validation: true # Quality filtering via LLM validation + min_validation_score: 3 # Min score (1-5) to pass validation + max_source_count: 500 # Max source chunks to consider + concurrent_llm_calls: 8 # Concurrent 
LLM calls for validation + max_concurrent_questions: 8 # Parallel questions for assertion generation + global: + max_assertions: 20 + enable_validation: true + min_validation_score: 3 + max_source_count: 500 + batch_size: 100 # Batch size for map-reduce processing + map_data_tokens: 8000 # Max tokens per cluster in map step + reduce_data_tokens: 32000 # Max input tokens for reduce step + enable_semantic_grouping: true # Group similar claims before map step + validate_map_assertions: true # Validate map assertions before reduce + validate_reduce_assertions: true # Validate final assertions after reduce + concurrent_llm_calls: 8 + max_concurrent_questions: 2 + linked: + max_assertions: 20 + enable_validation: true + min_validation_score: 3 + max_source_count: 500 + concurrent_llm_calls: 8 + max_concurrent_questions: 2 + +concurrent_requests: 8 # Top-level concurrency for autoq pipeline +``` + +## autoe Pairwise Configuration (`PairwiseConfig`) +```yaml +base: + name: method_a # REQUIRED + answer_base_path: ./answers/method_a/ # REQUIRED + +others: + - name: method_b + answer_base_path: ./answers/method_b/ + +question_sets: + - data_local_questions + - data_global_questions + +criteria: # Default: comprehensiveness, diversity, empowerment, relevance + - name: comprehensiveness # Each criterion requires both name and description + description: "..." + - name: diversity + description: "..." + +trials: 4 # Must be even (counterbalancing) +llm_config: ... # Same LLM config structure as above +prompt_config: + user_prompt: prompts/pairwise_user.txt + system_prompt: prompts/pairwise_system.txt +``` + +## autoe Reference Configuration (`ReferenceConfig`) +```yaml +reference: + name: gold_standard # REQUIRED + answer_base_path: ./answers/reference/ # REQUIRED + +generated: + - name: method_a + answer_base_path: ./answers/method_a/ + +criteria: # Default: correctness, completeness + - name: correctness + description: "..." 
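  - name: completeness                   # second default criterion, added to complete the example
    description: "..."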
+ +score_min: 1 +score_max: 10 +trials: 4 # Default is 4 (not 3) +``` + +## autoe Assertion Configuration + +### Single-RAG (`AssertionConfig`) +```yaml +generated: + name: method_a # REQUIRED + answer_base_path: ./answers/method_a/ # REQUIRED + +assertions: + assertions_path: ./questions/assertions.json # REQUIRED + +pass_threshold: 0.5 +trials: 4 # Default is 4 (not 3) +``` + +### Multi-RAG (`MultiRAGAssertionConfig`) +```yaml +input_dir: ./data # REQUIRED +output_dir: ./eval_output # REQUIRED +rag_methods: # REQUIRED + - method_a + - method_b + +question_sets: # REQUIRED + - data_local_questions + +assertions_filename_template: "{question_set}_assertions.json" +answers_path_template: "{input_dir}/{rag_method}/{question_set}.json" +question_text_key: question_text # Key for question text in JSON +answer_text_key: answer # Key for answer text in JSON + +pass_threshold: 0.5 +top_k_assertions: null # null = use all +trials: 4 + +run_significance_test: true +significance_alpha: 0.05 +significance_correction: holm # holm | bonferroni | fdr_bh + +run_clustered_permutation: false +n_permutations: 10000 # Number of permutations for clustered test +permutation_seed: null # null = random seed +``` + +## autoe Hierarchical Assertion Configuration + +### Single-RAG (`HierarchicalAssertionConfig`) +```yaml +generated: + name: method_a # REQUIRED + answer_base_path: ./answers/method_a/ # REQUIRED + +assertions: + assertions_path: ./assertions.json # REQUIRED + +mode: staged # staged (default) or joint +detect_discovery: true # Detect novel findings not in assertions +pass_threshold: 0.5 +trials: 4 +``` + +### Multi-RAG (`MultiRAGHierarchicalAssertionConfig`) +```yaml +input_dir: ./data # REQUIRED +output_dir: ./eval_output # REQUIRED +rag_methods: # REQUIRED + - method_a + - method_b +assertions_file: assertions.json # REQUIRED — assertions filename + +answers_path_template: "{input_dir}/{rag_method}/data_global.json" +question_id_key: question_id +question_text_key: question_text +answer_text_key: answer +supporting_assertions_key: supporting_assertions + +mode: staged # staged | joint +pass_threshold: 0.5 +trials: 4 + +run_significance_test: true +significance_alpha: 0.05 +significance_correction: holm + +run_clustered_permutation: false +n_permutations: 10000 +permutation_seed: null +``` + +## autoe Retrieval Reference Configuration (`RetrievalReferenceConfig`) +```yaml +# Provide EITHER questions_path OR question_sets (not both) +questions_path: ./questions/selected_questions.json +# OR for multiple question sets: +question_sets: + - name: data_local + questions_path: ./questions/data_local/selected_questions.json + +text_units_path: ./data/text_units.parquet # REQUIRED +output_dir: ./retrieval_reference # REQUIRED +clusters_path: null # Optional pre-computed clusters +num_clusters: null # int, list of ints, or null (auto) +save_clusters: true + +semantic_neighbors: 10 +centroid_neighbors: 5 +relevance_threshold: 2 # Min relevance score for a text unit +assessor_type: rationale # rationale or bing +concurrent_requests: 16 +max_questions: null # null = process all questions +cache_dir: null # Optional cache directory + +embedding_config: ... 
# LLM config for generating embeddings (if needed) + +text_unit_fields: + id_col: id + text_col: text + embedding_col: text_embedding # Set to null to auto-generate embeddings + short_id_col: short_id # Set to null to auto-generate from index +``` + +## autoe Retrieval Scores Configuration (`RetrievalScoresConfig`) +```yaml +rag_methods: + - name: method_a + retrieval_results_path: ./results/method_a/ + +question_sets: + - data_local_questions + +reference_dir: ./retrieval_reference # REQUIRED +reference_filename: reference.json # Filename within reference_dir subdirectories +clusters_path: ./clusters.parquet # REQUIRED +text_units_path: ./text_units.parquet # REQUIRED +output_dir: ./retrieval_eval # REQUIRED + +relevance_threshold: 2 +assessor_type: rationale # rationale or bing +fidelity_metric: js # js (Jensen-Shannon) or tvd +context_id_key: chunk_id # Key for chunk ID in retrieval results +context_text_key: text # Key for chunk text in retrieval results +cluster_match_by: text # Field to match clusters + +cache_dir: null # Optional cache directory + +run_significance_test: true +significance_alpha: 0.05 +significance_correction: holm +``` + +## LLM Providers Reference + +| Provider | Value | Use for | +|----------|-------|---------| +| OpenAI Chat | `openai.chat` | Chat/generation models | +| OpenAI Embedding | `openai.embedding` | Embedding models | +| Azure OpenAI Chat | `azure.openai.chat` | Azure-hosted chat models | +| Azure OpenAI Embedding | `azure.openai.embedding` | Azure-hosted embeddings | +| Azure Inference Chat | `azure.inference.chat` | Azure AI Inference chat | +| Azure Inference Embedding | `azure.inference.embedding` | Azure AI Inference embeddings | + +## Custom LLM Providers +```yaml +custom_providers: + - model_type: chat # chat or embedding + name: custom.chat # Matches llm_provider value + module: my_module.provider # Python module path + model_class: MyCustomChatModel # Class name +``` + +## Significance Test Options +| Correction | Description | +|------------|-------------| +| `holm` | Holm-Bonferroni (default, recommended) | +| `bonferroni` | Bonferroni (conservative) | +| `fdr_bh` | Benjamini-Hochberg FDR | diff --git a/apm.yml b/apm.yml new file mode 100644 index 0000000..27a7e3d --- /dev/null +++ b/apm.yml @@ -0,0 +1,8 @@ +name: benchmark-qed +version: 0.1.0 +description: APM project for benchmark-qed +author: Andres Morales Esquivel +dependencies: + apm: [] + mcp: [] +scripts: {} diff --git a/benchmark_qed/autoq/cli.py b/benchmark_qed/autoq/cli.py index 2c3f8ea..04f468d 100644 --- a/benchmark_qed/autoq/cli.py +++ b/benchmark_qed/autoq/cli.py @@ -741,13 +741,13 @@ def assertion_stats( Examples -------- # Generate stats for a single assertion file - benchmark-qed autoq assertion-stats output/assertions.json + benchmark-qed assertion-stats output/assertions.json # Generate stats for all assertion files in a directory - benchmark-qed autoq assertion-stats output/data_global_questions/ + benchmark-qed assertion-stats output/data_global_questions/ # Specify output path - benchmark-qed autoq assertion-stats assertions.json -o stats/my_stats.json + benchmark-qed assertion-stats assertions.json -o stats/my_stats.json """ from benchmark_qed.autoq.question_gen.data_questions.assertion_gen.stats import ( generate_stats_for_assertion_file, @@ -940,17 +940,17 @@ def generate_assertions( Examples -------- # Generate local assertions for candidate questions - benchmark-qed autoq generate-assertions settings.yaml \ + benchmark-qed generate-assertions settings.yaml \ 
        output/data_local_questions/candidate_questions.json \
        output/data_local_questions/ --type local

    # Generate global assertions
-    benchmark-qed autoq generate-assertions settings.yaml \
+    benchmark-qed generate-assertions settings.yaml \
        output/data_global_questions/candidate_questions.json \
        output/data_global_questions/ --type global

    # Generate linked assertions
-    benchmark-qed autoq generate-assertions settings.yaml \
+    benchmark-qed generate-assertions settings.yaml \
        output/data_linked_questions/candidate_questions.json \
        output/data_linked_questions/ --type linked
    """

From d747707397c15af0b0efde21d1568742cd51b161 Mon Sep 17 00:00:00 2001
From: Andres Morales Esquivel
Date: Wed, 22 Apr 2026 15:45:26 -0600
Subject: [PATCH 2/8] Update skills + interactive setup

---
 .apm/skills/benchmark-qed-autoe/SKILL.md |   8 +-
 .apm/skills/benchmark-qed-autoq/SKILL.md |   3 +-
 .apm/skills/benchmark-qed-setup/SKILL.md |  29 +-
 .../references/config-reference.md       |  37 +
 .../minor-20260422214721711139.json      |   4 +
 benchmark_qed/__main__.py                |   4 +
 benchmark_qed/autoe/retrieval/scores.py  |   2 +-
 benchmark_qed/autoq/config.py            |   2 +-
 benchmark_qed/cli/init_config.py         |  30 +-
 benchmark_qed/cli/interactive.py         | 792 ++++++++++++++++++
 benchmark_qed/cli/scaffold.py            |  40 +
 benchmark_qed/cli/yaml_renderer.py       | 545 ++++++++++++
 tests/autoe/assertion/pipeline_test.py   |   4 +-
 tests/test_interactive_init.py           | 693 +++++++++++++++
 14 files changed, 2160 insertions(+), 33 deletions(-)
 create mode 100644 .semversioner/next-release/minor-20260422214721711139.json
 create mode 100644 benchmark_qed/cli/interactive.py
 create mode 100644 benchmark_qed/cli/scaffold.py
 create mode 100644 benchmark_qed/cli/yaml_renderer.py
 create mode 100644 tests/test_interactive_init.py

diff --git a/.apm/skills/benchmark-qed-autoe/SKILL.md b/.apm/skills/benchmark-qed-autoe/SKILL.md
index 1480976..432cf67 100644
--- a/.apm/skills/benchmark-qed-autoe/SKILL.md
+++ b/.apm/skills/benchmark-qed-autoe/SKILL.md
@@ -19,6 +19,6 @@ Evaluate and compare RAG system outputs using LLM-judged scoring, assertion-base
 - Generated questions/assertions from the autoq pipeline (or your own)
 - RAG method answer files (JSON, one per method per question set)
-- A valid `settings.yaml` for the evaluation type
+- A valid `settings.yaml` for the evaluation type (use the `benchmark-qed-setup` skill to initialize and configure the workspace)
 - LLM API key configured
 
 Run all commands with:
@@ -159,7 +160,7 @@ uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe
 ### Quick Evaluation (Assertion-Based)
 
 - [ ] Step 1: Verify questions and answers exist — list the workspace and confirm a `settings.yaml` (or `config.yaml`), question JSON files (typically under `output/`), and your RAG method answer JSONs are present.
-- [ ] Step 2: Initialize eval config — `uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed config init autoe_assertion ./eval_workspace`
+- [ ] Step 2: Initialize eval config — use the `benchmark-qed-setup` skill to create and configure an assertion evaluation workspace.
 - [ ] Step 3: Configure settings.yaml with answer paths and assertion paths
 - [ ] Step 4: Run evaluation — `uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe assertion-scores ./eval_workspace/settings.yaml ./eval_output`
 - [ ] Step 5: Summarize results — read the CSVs in `` (e.g. `assertion_scores.csv`, `assertion_summary_by_question.csv`) and `eval_summary.json` directly.
@@ -168,8 +169,6 @@ uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoe
 
 ### Multi-RAG Comparison
 
 For comparing multiple RAG methods, use multi-RAG config format (include `rag_methods` key in YAML). This gives you automated pairwise significance testing.
 
-For the full config reference with all fields, read the config reference in the `/benchmark-qed-setup` skill: [../benchmark-qed-setup/references/config-reference.md](../benchmark-qed-setup/references/config-reference.md).
-
 ## Gotchas
 
 - **Config auto-detection**: `assertion-scores` and `hierarchical-assertion-scores` detect single vs multi-RAG based on the `rag_methods` key in YAML. Ensure your config matches your intent.
@@ -177,4 +176,4 @@ For the full config reference with all fields, read the config reference in the
 - **Stale outputs**: Several commands skip existing output files. Use a fresh output directory or delete specific files to force re-evaluation.
 - **Output is in files**: All scores are written to CSV/JSON files. Parse output files, not CLI stdout.
 - **Long-running**: Evaluation with many questions and trials can take hours. Use background execution.
-- **No `config init` for hierarchical/retrieval**: `config init` only supports `autoe_assertion`, `autoe_pairwise`, and `autoe_reference`. For hierarchical and retrieval configs, create YAML manually using the config reference.
+- **No `config init` for hierarchical/retrieval**: The `benchmark-qed-setup` skill only supports `autoe_assertion`, `autoe_pairwise`, and `autoe_reference`. For hierarchical, multi-RAG, and other advanced config types, create YAML manually, consulting the `benchmark-qed-setup` skill for configuration guidance.
diff --git a/.apm/skills/benchmark-qed-autoq/SKILL.md b/.apm/skills/benchmark-qed-autoq/SKILL.md
index 90fe9bd..4443213 100644
--- a/.apm/skills/benchmark-qed-autoq/SKILL.md
+++ b/.apm/skills/benchmark-qed-autoq/SKILL.md
@@ -17,6 +17,6 @@ Generate benchmark questions and assertions from input data for RAG evaluation.
 ## Prerequisites
 
-- A configured workspace with valid `settings.yaml` (use the `/benchmark-qed-setup` skill first)
+- A configured workspace with valid `settings.yaml` (use the `benchmark-qed-setup` skill to initialize and configure)
 - Input data (CSV or JSON) in the workspace `input/` directory
 - Valid LLM API key in `.env`
 
@@ -148,7 +149,7 @@ uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed assert
 ### Standard Question Generation Flow
 
-- [ ] Step 1: Verify workspace is ready — confirm `settings.yaml`, `.env`, and `input/` exist in `` (the CLI will fail fast if anything is misconfigured).
+- [ ] Step 1: Initialize workspace if needed — use the `benchmark-qed-setup` skill to create and configure the workspace. Verify `settings.yaml`, `.env`, and `input/` exist.
 - [ ] Step 2: `cd ` then run question generation — `uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed autoq settings.yaml ./output --generation-types data_local --generation-types data_global --generation-types data_linked --generation-types activity_local --generation-types activity_global`
 - [ ] Step 3: Verify output artifacts — list `` and confirm the per-type `selected_questions.json` files (see "Output structure" above) plus `model_usage.json` exist.
- [ ] Step 4: (Optional) Generate additional assertions — use `generate-assertions` diff --git a/.apm/skills/benchmark-qed-setup/SKILL.md b/.apm/skills/benchmark-qed-setup/SKILL.md index 464732c..e2c382b 100644 --- a/.apm/skills/benchmark-qed-setup/SKILL.md +++ b/.apm/skills/benchmark-qed-setup/SKILL.md @@ -35,7 +35,23 @@ pip install uv && uvx --from "git+https://github.com/microsoft/benchmark-qed" be ### Step 1 — Initialize a Workspace -Generate a configuration workspace for the desired workflow type: +**Option A (Recommended): Interactive wizard** + +The interactive wizard guides you through configuration with sensible defaults: + +```bash +uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed init +``` + +This walks through: +- Config type selection (autoq, autoe_pairwise, autoe_reference, autoe_assertion) +- LLM provider selection with Azure-specific prompts (endpoint, API version) +- Section-by-section customization (press Enter to accept defaults) +- Automatic YAML validation before writing + +**Option B: Non-interactive (template-based)** + +Generate a static template and edit manually: ```bash uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed config init @@ -160,6 +176,15 @@ For the full set of optional fields, read [references/config-reference.md](refer The benchmark-qed CLI validates `settings.yaml` via pydantic at startup, so any missing or malformed fields are reported when you run a command. After applying the answers, run the actual target command (e.g. `benchmark-qed autoq …`) — config errors surface immediately, before any LLM calls. +## Best Practices + +See [references/config-reference.md](references/config-reference.md) for detailed best practices covering LLM configuration, prompts, question generation, assertion generation, evaluation, and retrieval. + +Key highlights: +- Use `${OPENAI_API_KEY}` env var substitution — never hardcode secrets +- Use `benchmark-qed init` (interactive wizard) to avoid manual YAML errors +- Pin a specific version of benchmark-qed for reproducibility in CI/CD + ## Gotchas - The `data download` command blocks on `typer.confirm()`. Always use `echo y | uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed data download ...` to prevent hanging. @@ -167,4 +192,4 @@ The benchmark-qed CLI validates `settings.yaml` via pydantic at startup, so any - The `.env` file must be in the workspace root directory, not the project root. - Config types `autoe_pairwise`, `autoe_reference`, and `autoe_assertion` generate different settings.yaml templates — use the correct type for your evaluation method. - Prompts are copied as `.txt` files using Python `string.Template` syntax (`$variable` or `${variable}`). -- **`prompts_config` vs `prompt_config`**: Some generated autoe configs may use `prompts_config`, but the runtime expects `prompt_config`. If you get config validation errors, rename the key. +- **`prompts_config` vs `prompt_config`**: The non-interactive `config init` for some autoe types generates `prompts_config`, but the runtime expects `prompt_config`. The interactive `benchmark-qed init` wizard avoids this issue. If using `config init`, rename the key if you get validation errors. 
diff --git a/.apm/skills/benchmark-qed-setup/references/config-reference.md b/.apm/skills/benchmark-qed-setup/references/config-reference.md index 08a34a6..e1f6ff2 100644 --- a/.apm/skills/benchmark-qed-setup/references/config-reference.md +++ b/.apm/skills/benchmark-qed-setup/references/config-reference.md @@ -347,3 +347,40 @@ custom_providers: | `holm` | Holm-Bonferroni (default, recommended) | | `bonferroni` | Bonferroni (conservative) | | `fdr_bh` | Benjamini-Hochberg FDR | + +## Best Practices + +### LLM Configuration +- Use `${OPENAI_API_KEY}` environment variable substitution — never hardcode secrets in YAML +- Use `azure_managed_identity` for production Azure deployments (omit `api_key` entirely) +- Set `temperature: 0.0` and `seed: 42` for reproducible LLM outputs +- Start with `concurrent_requests: 4`; increase based on your rate limit budget +- For Azure providers, always set `azure_endpoint` and `api_version` in `init_args` +- Quote `api_version` values: `"2024-12-01-preview"` (YAML would otherwise parse as a date) + +### Question Generation (autoq) +- **Wizard defaults vs model defaults**: The interactive wizard uses curated starter values (e.g., `num_questions: 10`, `num_clusters: 20`) suitable for initial exploration. The Pydantic model defaults (e.g., `num_questions: 50`, `num_clusters: 50`) are for production runs. Adjust based on your dataset size and budget. +- Keep `chunk_overlap` at 15–20% of `chunk_size` (default: 100/600 ≈ 17%) +- Use `oversample_factor: 2.0` to generate 2× candidates before filtering — lower values risk insufficient quality diversity +- Enable `enable_semantic_grouping: true` for global assertions to improve claim consolidation +- Set `max_concurrent_questions` lower for global (2) than local (8) — global processing is heavier per question + +### Assertion Generation +- Keep `max_assertions: 20` as a reasonable limit per question +- Enable validation (`enable_validation: true`) for production benchmarks — it filters low-quality assertions +- `min_validation_score: 3` (scale 1–5) provides a good baseline quality threshold +- Setting `max_assertions: 0` disables assertion generation entirely for that question type +- `max_source_count: 500` drops entire questions when exceeded — monitor for unexpected question drops + +### Evaluation (autoe) +- Trials must be **even** for pairwise and reference evaluation (counterbalancing) — the config validator rejects odd values +- Assertion and hierarchical evaluation do NOT require even trials +- Use `staged` mode for hierarchical assertions (more accurate); `joint` mode is cheaper but risks anchoring bias +- Use `holm` correction for significance testing (default) — balances power and error control +- Set `pass_threshold: 0.5` as the default quality bar; adjust based on assertion strictness + +### Retrieval Evaluation +- `assessor_type: rationale` (default) provides structured JSON with reasoning; `bing` uses the UMBRELA DNA prompt +- Match the assessor type between `generate-retrieval-reference` and `retrieval-scores` to share the cache +- `relevance_threshold: 2` on a 0–3 scale is a reasonable default — lower values include marginal matches +- Use `cache_dir` for iterative development to avoid redundant LLM calls across runs diff --git a/.semversioner/next-release/minor-20260422214721711139.json b/.semversioner/next-release/minor-20260422214721711139.json new file mode 100644 index 0000000..30272b6 --- /dev/null +++ b/.semversioner/next-release/minor-20260422214721711139.json @@ -0,0 +1,4 @@ +{ + "type": 
"minor", + "description": "Add benchmark-qed agentic skills with interactive configuration wizard and best practices" +} diff --git a/benchmark_qed/__main__.py b/benchmark_qed/__main__.py index d517689..3f5f9c5 100644 --- a/benchmark_qed/__main__.py +++ b/benchmark_qed/__main__.py @@ -9,6 +9,7 @@ from benchmark_qed.autoe.cli import app as autoe_cli from benchmark_qed.autoq.cli import app as autoq_cli from benchmark_qed.cli.init_config import app as init_cli +from benchmark_qed.cli.interactive import interactive_init from benchmark_qed.data.cli import app as data_cli app: typer.Typer = typer.Typer(pretty_exceptions_show_locals=False) @@ -16,6 +17,9 @@ app.add_typer(autoe_cli, name="autoe", help="Relative scores CLI.") app.add_typer(autoq_cli, help="Question generation CLI.") app.add_typer(init_cli, name="config", help="Configuration initialization CLI.") +app.command(name="init", help="Interactively create a benchmark-qed configuration.")( + interactive_init +) app.add_typer(data_cli, name="data", help="Dataset downloader CLI.") diff --git a/benchmark_qed/autoe/retrieval/scores.py b/benchmark_qed/autoe/retrieval/scores.py index 42d92f4..647e174 100644 --- a/benchmark_qed/autoe/retrieval/scores.py +++ b/benchmark_qed/autoe/retrieval/scores.py @@ -654,7 +654,7 @@ async def run_retrieval_evaluation( retrieval_path = Path(rag_method["retrieval_results_path"]) # Check if path includes question_set placeholder - if "{question_set}" in str(retrieval_path): # noqa: RUF027 + if "{question_set}" in str(retrieval_path): retrieval_path = Path( str(retrieval_path).format(question_set=question_set) ) diff --git a/benchmark_qed/autoq/config.py b/benchmark_qed/autoq/config.py index d5fedfb..8d04722 100644 --- a/benchmark_qed/autoq/config.py +++ b/benchmark_qed/autoq/config.py @@ -682,7 +682,7 @@ class QuestionGenerationConfig(BaseModel): ) -class QuestionType(str): # noqa: FURB189 +class QuestionType(str): """Enumeration for question types that support assertion regeneration.""" __slots__ = () diff --git a/benchmark_qed/cli/init_config.py b/benchmark_qed/cli/init_config.py index 32f7884..a113ea3 100644 --- a/benchmark_qed/cli/init_config.py +++ b/benchmark_qed/cli/init_config.py @@ -33,6 +33,7 @@ from benchmark_qed.autoq.prompts.data_questions import ( local_questions as data_local_prompts, ) +from benchmark_qed.cli.scaffold import copy_prompts, ensure_input_folder, write_env_file app: typer.Typer = typer.Typer(pretty_exceptions_show_locals=False) @@ -314,15 +315,11 @@ class ConfigType(StrEnum): def __copy_prompts(prompts_path: Path, output_path: Path) -> None: - """Copy prompts from the prompts directory to the output directory.""" - if not output_path.exists(): - output_path.mkdir(parents=True, exist_ok=True) - for prompt_file in prompts_path.iterdir(): - if prompt_file.is_file() and prompt_file.suffix == ".txt": - target_file = output_path / prompt_file.name - target_file.write_text( - prompt_file.read_text(encoding="utf-8"), encoding="utf-8" - ) + """Copy prompts from the prompts directory to the output directory. + + Delegates to the shared scaffold utility. + """ + copy_prompts(prompts_path, output_path) @app.command() @@ -338,13 +335,7 @@ def init( ], ) -> None: """Generate settings file.""" - input_folder = root / "input" - if not input_folder.exists(): - input_folder.mkdir(parents=True, exist_ok=True) - typer.echo(f"Input folder created at {input_folder}") - typer.echo( - "Please place your input files in the 'input' folder before running, or modify the settings.yaml to point to your input files." 
- ) + ensure_input_folder(root) settings = root / "settings.yaml" prompts_folder = root / "prompts" @@ -399,9 +390,4 @@ def init( typer.echo(f"Configuration file created at {settings}") - env_file = root / ".env" - if not env_file.exists(): - env_file.write_text("OPENAI_API_KEY=", encoding="utf-8") - typer.echo( - f"Change the OPENAI_API_KEY placeholder at {env_file} with your actual OPENAI_API_KEY." - ) + write_env_file(root) diff --git a/benchmark_qed/cli/interactive.py b/benchmark_qed/cli/interactive.py new file mode 100644 index 0000000..a9f00e1 --- /dev/null +++ b/benchmark_qed/cli/interactive.py @@ -0,0 +1,792 @@ +# Copyright (c) 2025 Microsoft Corporation. +"""Interactive configuration wizard for benchmark-qed.""" + +from __future__ import annotations + +import sys +from dataclasses import dataclass, field +from pathlib import Path +from typing import Annotated, Any + +import typer +from rich import print as rich_print +from rich.panel import Panel +from rich.table import Table + +app: typer.Typer = typer.Typer(pretty_exceptions_show_locals=False) + + +# --------------------------------------------------------------------------- +# Data types +# --------------------------------------------------------------------------- + + +@dataclass +class FieldDef: + """Definition of a configurable field shown to the user.""" + + name: str + description: str + default: Any + field_type: type = str + choices: list[str] | None = None + + +@dataclass +class ProviderResult: + """Result from the provider selection flow.""" + + llm_provider: str + model: str + auth_type: str + init_args: dict[str, Any] = field(default_factory=dict) + + +# --------------------------------------------------------------------------- +# Provider metadata +# --------------------------------------------------------------------------- + +CHAT_PROVIDERS: list[tuple[str, str, str]] = [ + ("openai.chat", "OpenAI", "OpenAI API (default)"), + ("azure.openai.chat", "Azure OpenAI", "Azure-hosted OpenAI models"), + ("azure.inference.chat", "Azure Inference", "Azure AI Inference endpoint"), +] + +EMBEDDING_PROVIDERS: list[tuple[str, str, str]] = [ + ("openai.embedding", "OpenAI", "OpenAI API (default)"), + ("azure.openai.embedding", "Azure OpenAI", "Azure-hosted OpenAI embeddings"), + ( + "azure.inference.embedding", + "Azure Inference", + "Azure AI Inference endpoint", + ), +] + +AUTH_TYPES: list[tuple[str, str]] = [ + ("api_key", "API Key"), + ("azure_managed_identity", "Azure Managed Identity"), +] + +DEFAULT_CHAT_MODEL = "gpt-4.1" +DEFAULT_EMBEDDING_MODEL = "text-embedding-3-large" +DEFAULT_API_VERSION = "2024-12-01-preview" + + +# --------------------------------------------------------------------------- +# Guard helpers +# --------------------------------------------------------------------------- + + +def check_tty() -> None: + """Abort if stdin is not a terminal (non-interactive context).""" + if not sys.stdin.isatty(): + typer.echo( + "Error: Interactive mode requires a terminal. " + "Use 'benchmark-qed config init' for non-interactive setup.", + err=True, + ) + raise typer.Exit(code=1) + + +def confirm_overwrite(path: typer.Path | Any) -> None: + """Ask for confirmation before overwriting an existing settings file.""" + from pathlib import Path as _Path + + p = _Path(str(path)) + if p.exists(): + typer.confirm( + f"{p} already exists. 
Overwrite?", + abort=True, + ) + + +# --------------------------------------------------------------------------- +# Selection / display primitives +# --------------------------------------------------------------------------- + + +def select_option( + title: str, + options: list[tuple[str, str]], +) -> str: + """Display numbered options and return the selected value. + + Parameters + ---------- + title: + Prompt title shown to the user. + options: + List of ``(value, label)`` tuples. + + Returns + ------- + The *value* string of the chosen option. + """ + rich_print(f"\n[bold]{title}[/bold]") + for idx, (_value, label) in enumerate(options, 1): + rich_print(f" [cyan][{idx}][/cyan] {label}") + + choice = typer.prompt( + "Select", + type=int, + default=1, + ) + if choice < 1 or choice > len(options): + typer.echo("Invalid choice. Defaulting to 1.") + choice = 1 + return options[choice - 1][0] + + +def show_section_defaults(title: str, fields: list[FieldDef]) -> None: + """Render a Rich table showing current default values for a section.""" + table = Table(title=title, show_header=False, padding=(0, 2)) + table.add_column("Field", style="cyan", min_width=28) + table.add_column("Default", style="green") + for f in fields: + table.add_row(f.name, str(f.default)) + rich_print(table) + + +def prompt_section( + title: str, + fields: list[FieldDef], +) -> dict[str, Any]: + """Show section defaults and optionally let the user customise them. + + Returns a dict mapping field names to their (possibly user-overridden) values. + """ + show_section_defaults(title, fields) + + if not typer.confirm("Customize this section?", default=False): + return {f.name: f.default for f in fields} + + result: dict[str, Any] = {} + for f in fields: + if f.choices: + value = select_option(f.description, [(c, c) for c in f.choices]) + else: + raw = typer.prompt( + f.name, + default=f.default, + type=f.field_type, + ) + value = raw + result[f.name] = value + return result + + +# --------------------------------------------------------------------------- +# Provider selection +# --------------------------------------------------------------------------- + + +def _prompt_azure_init_args(provider_value: str) -> dict[str, Any]: + """Prompt for Azure-specific init_args based on provider type.""" + init_args: dict[str, Any] = {} + + if "azure" in provider_value: + endpoint = typer.prompt("Azure endpoint URL") + init_args["azure_endpoint"] = endpoint + + if "azure.openai" in provider_value: + api_version = typer.prompt("API version", default=DEFAULT_API_VERSION) + init_args["api_version"] = api_version + + return init_args + + +def prompt_provider( + purpose: str = "chat", + *, + default_model: str | None = None, +) -> ProviderResult: + """Guide the user through LLM provider selection. + + Parameters + ---------- + purpose: + ``"chat"`` or ``"embedding"`` — determines available providers and default model. + default_model: + Override the default model name. If *None*, uses the standard default for the purpose. 
+ """ + providers = CHAT_PROVIDERS if purpose == "chat" else EMBEDDING_PROVIDERS + model_default = default_model or ( + DEFAULT_CHAT_MODEL if purpose == "chat" else DEFAULT_EMBEDDING_MODEL + ) + + provider_value = select_option( + f"Select {purpose} LLM provider", + [(val, label) for val, label, _desc in providers], + ) + + # Auth type + auth_type = select_option("Authentication type", AUTH_TYPES) + + # Provider-specific init args + init_args = _prompt_azure_init_args(provider_value) + + # Model name + model = typer.prompt("Model name", default=model_default) + + return ProviderResult( + llm_provider=provider_value, + model=model, + auth_type=auth_type, + init_args=init_args, + ) + + +def prompt_embedding_provider( + chat_result: ProviderResult, +) -> ProviderResult: + """Ask whether to reuse the chat provider for embeddings, or configure separately.""" + if typer.confirm("Use the same provider for embeddings?", default=True): + # Derive the embedding provider from the chat provider + mapping = { + "openai.chat": "openai.embedding", + "azure.openai.chat": "azure.openai.embedding", + "azure.inference.chat": "azure.inference.embedding", + } + emb_provider = mapping.get(chat_result.llm_provider, "openai.embedding") + return ProviderResult( + llm_provider=emb_provider, + model=DEFAULT_EMBEDDING_MODEL, + auth_type=chat_result.auth_type, + init_args=dict(chat_result.init_args), + ) + return prompt_provider("embedding") + + +# --------------------------------------------------------------------------- +# List collection +# --------------------------------------------------------------------------- + + +def prompt_list_items( + item_name: str, + field_defs: list[FieldDef], + *, + min_items: int = 1, +) -> list[dict[str, Any]]: + """Collect a list of items by prompting the user in a loop. + + Each iteration prompts for each field in *field_defs*, then asks + "Add another ?". + """ + items: list[dict[str, Any]] = [] + while True: + rich_print(f"\n[bold] {item_name} #{len(items) + 1}[/bold]") + item: dict[str, Any] = {} + for f in field_defs: + raw = typer.prompt(f" {f.name}", default=f.default, type=f.field_type) + item[f.name] = raw + items.append(item) + + if len(items) >= min_items: + if not typer.confirm(f"Add another {item_name}?", default=False): + break + else: + typer.echo(f" (need at least {min_items})") + return items + + +def prompt_comma_list(prompt_text: str, default: str = "") -> list[str]: + """Prompt for a comma-separated list and return split values.""" + raw = typer.prompt(prompt_text, default=default) + return [s.strip() for s in raw.split(",") if s.strip()] + + +# --------------------------------------------------------------------------- +# AutoQ interactive configuration +# --------------------------------------------------------------------------- + +_QUESTION_TYPES = [ + "data_local", + "data_global", + "data_linked", + "activity_local", + "activity_global", +] + + +def build_autoq_config() -> dict[str, Any]: + """Walk the user through AutoQ configuration and return a render-ready dict. + + The returned dictionary contains every value needed to render the AutoQ + YAML settings file. Keys are organised into logical sections that mirror + the wizard steps shown to the user. + """ + rich_print(Panel("[bold]AutoQ — Question Generation[/bold]", expand=False)) + + # ── 1. Chat LLM provider ────────────────────────────────────────────── + chat_result = prompt_provider("chat") + + # ── 2. 
Embedding LLM ────────────────────────────────────────────────── + embedding_result = prompt_embedding_provider(chat_result) + + # ── 3. Input section ────────────────────────────────────────────────── + input_fields = [ + FieldDef("dataset_path", "Path to input dataset", "./input"), + FieldDef("input_type", "Input file type", "json", choices=["csv", "json"]), + FieldDef("text_column", "Column containing text", "text"), + FieldDef( + "metadata_columns", + "Metadata columns (comma-separated)", + "", + ), + FieldDef("file_encoding", "File encoding", "utf-8-sig"), + ] + input_values = prompt_section("Input", input_fields) + + # Normalise metadata_columns to a list or None + raw_meta = input_values.get("metadata_columns", "") + if isinstance(raw_meta, str): + parts = [s.strip() for s in raw_meta.split(",") if s.strip()] + input_values["metadata_columns"] = parts or None + + # ── 4. Encoding section ─────────────────────────────────────────────── + encoding_fields = [ + FieldDef("model_name", "Tokeniser model name", "o200k_base"), + FieldDef("chunk_size", "Chunk size (tokens)", 600, field_type=int), + FieldDef("chunk_overlap", "Chunk overlap (tokens)", 100, field_type=int), + ] + encoding_values = prompt_section("Encoding", encoding_fields) + + # ── 5. Sampling section ─────────────────────────────────────────────── + sampling_fields = [ + FieldDef("num_clusters", "Number of clusters", 20, field_type=int), + FieldDef( + "num_samples_per_cluster", + "Samples per cluster", + 10, + field_type=int, + ), + FieldDef("random_seed", "Random seed", 42, field_type=int), + ] + sampling_values = prompt_section("Sampling", sampling_fields) + + # ── 6. Question Types section ───────────────────────────────────────── + qt_fields: list[FieldDef] = [] + for qt in _QUESTION_TYPES: + qt_fields.extend([ + FieldDef( + f"{qt}_num_questions", + f"{qt} — number of questions", + 10, + field_type=int, + ), + FieldDef( + f"{qt}_oversample_factor", + f"{qt} — oversample factor", + 2.0, + field_type=float, + ), + ]) + + qt_values = prompt_section("Question Types", qt_fields) + + # Reshape flat values into nested per-type dicts + question_types: dict[str, dict[str, Any]] = {} + customised_qt = qt_values != {f.name: f.default for f in qt_fields} + for qt in _QUESTION_TYPES: + question_types[qt] = { + "num_questions": qt_values[f"{qt}_num_questions"], + "oversample_factor": qt_values[f"{qt}_oversample_factor"], + } + + # ── 7. Activity question params (only when QT section was customised) ─ + activity_defaults = { + "num_personas": 5, + "num_tasks_per_persona": 2, + "num_entities_per_task": 5, + } + if customised_qt: + activity_fields = [ + FieldDef("num_personas", "Number of personas", 5, field_type=int), + FieldDef( + "num_tasks_per_persona", + "Tasks per persona", + 2, + field_type=int, + ), + FieldDef( + "num_entities_per_task", + "Entities per task", + 5, + field_type=int, + ), + ] + activity_values = prompt_section("Activity Question Params", activity_fields) + else: + activity_values = dict(activity_defaults) + + # ── 8. Assertions section ───────────────────────────────────────────── + assertions_fields = [ + FieldDef("max_assertions", "Max assertions per question", 20, field_type=int), + FieldDef( + "enable_validation", + "Enable assertion validation", + default=True, + field_type=bool, + ), + FieldDef( + "min_validation_score", + "Minimum validation score", + 3, + field_type=int, + ), + ] + + # Display defaults, then offer customisation. + # We handle the bool field (enable_validation) specially via typer.confirm. 
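+    # With the defaults accepted this returns, e.g.,
+    # {"max_assertions": 20, "enable_validation": True, "min_validation_score": 3}.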
+ show_section_defaults("Assertions", assertions_fields) + if not typer.confirm("Customize this section?", default=False): + assertions_values = {f.name: f.default for f in assertions_fields} + else: + assertions_values: dict[str, Any] = {} + for f in assertions_fields: + if f.field_type is bool: + assertions_values[f.name] = typer.confirm(f.name, default=f.default) + elif f.choices: + assertions_values[f.name] = select_option( + f.description, [(c, c) for c in f.choices] + ) + else: + assertions_values[f.name] = typer.prompt( + f.name, default=f.default, type=f.field_type + ) + + # ── 9. Concurrency ──────────────────────────────────────────────────── + concurrent_requests = typer.prompt("Concurrent requests", default=8, type=int) + + # ── Build final config dict ─────────────────────────────────────────── + return { + "chat_provider": chat_result, + "embedding_provider": embedding_result, + "input": input_values, + "encoding": encoding_values, + "sampling": sampling_values, + "question_types": question_types, + "activity_params": activity_values, + "assertions": assertions_values, + "concurrent_requests": concurrent_requests, + } + + +# --------------------------------------------------------------------------- +# AutoE interactive configuration flows +# --------------------------------------------------------------------------- + +_CONDITION_FIELDS = [ + FieldDef( + name="name", + description="Condition name", + default="", + field_type=str, + ), + FieldDef( + name="answer_base_path", + description="Path to answer files", + default="input/method_name", + field_type=str, + ), +] + + +def _prompt_condition(label: str) -> dict[str, Any]: + """Prompt for a single condition (name + answer_base_path).""" + rich_print(f"\n[bold] {label}[/bold]") + name = typer.prompt(" name", default="") + answer_base_path = typer.prompt(" answer_base_path", default="input/method_name") + return {"name": name, "answer_base_path": answer_base_path} + + +def _prompt_even_trials(default: int = 4) -> int: + """Prompt for a trial count and ensure it is even.""" + trials = typer.prompt("Number of trials (must be even)", default=default, type=int) + if trials % 2 != 0: + trials += 1 + typer.echo(f" Trials must be even — rounded up to {trials}.") + return trials + + +def build_autoe_pairwise_config() -> dict[str, Any]: + """Interactive flow for AutoE pairwise evaluation configuration.""" + rich_print(Panel("AutoE — Pairwise Evaluation")) + + # LLM provider + chat_provider = prompt_provider("chat") + + # Base condition + base = _prompt_condition("Base condition") + + # Other conditions + rich_print("\n[bold]Other conditions to compare against the base:[/bold]") + others = prompt_list_items("condition", _CONDITION_FIELDS, min_items=1) + + # Question sets + question_sets = prompt_comma_list( + "Question sets (comma-separated)", "activity_global, activity_local" + ) + + # Trials + trials = _prompt_even_trials() + + # Custom criteria + criteria: list[dict[str, Any]] | None = None + if typer.confirm("Add custom scoring criteria?", default=False): + criteria = prompt_list_items( + "criterion", + [ + FieldDef( + name="name", + description="Criterion name", + default="", + field_type=str, + ), + FieldDef( + name="description", + description="Criterion description", + default="", + field_type=str, + ), + ], + ) + + return { + "chat_provider": chat_provider, + "base": base, + "others": others, + "question_sets": question_sets, + "trials": trials, + "criteria": criteria, + } + + +def build_autoe_reference_config() -> dict[str, 
Any]: + """Interactive flow for AutoE reference evaluation configuration.""" + rich_print(Panel("AutoE — Reference Evaluation")) + + # LLM provider + chat_provider = prompt_provider("chat") + + # Reference condition + reference = _prompt_condition("Reference condition") + + # Generated conditions + rich_print("\n[bold]Generated conditions to evaluate:[/bold]") + generated = prompt_list_items("generated condition", _CONDITION_FIELDS, min_items=1) + + # Score range + score_min = typer.prompt("Score minimum", default=1, type=int) + score_max = typer.prompt("Score maximum", default=10, type=int) + + # Trials + trials = _prompt_even_trials() + + return { + "chat_provider": chat_provider, + "reference": reference, + "generated": generated, + "score_min": score_min, + "score_max": score_max, + "trials": trials, + } + + +def build_autoe_assertion_config() -> dict[str, Any]: + """Interactive flow for AutoE assertion evaluation configuration.""" + rich_print(Panel("AutoE — Assertion Evaluation")) + + # LLM provider + chat_provider = prompt_provider("chat") + + # Generated condition + generated = _prompt_condition("Generated condition") + + # Assertions path + assertions_path = typer.prompt( + "Path to assertions file", default="input/assertions.json" + ) + + # Pass threshold + pass_threshold = typer.prompt("Pass threshold", default=0.5, type=float) + + # Trials + trials = typer.prompt("Number of trials", default=4, type=int) + + return { + "chat_provider": chat_provider, + "generated": generated, + "assertions": {"assertions_path": assertions_path}, + "pass_threshold": pass_threshold, + "trials": trials, + } + + +# --------------------------------------------------------------------------- +# Config type metadata +# --------------------------------------------------------------------------- + +CONFIG_TYPE_OPTIONS: list[tuple[str, str]] = [ + ("autoq", "AutoQ — Question Generation"), + ("autoe_pairwise", "AutoE — Pairwise Evaluation"), + ("autoe_reference", "AutoE — Reference Evaluation"), + ("autoe_assertion", "AutoE — Assertion Evaluation"), +] + + +# --------------------------------------------------------------------------- +# Prompt copying orchestration +# --------------------------------------------------------------------------- + + +def _copy_prompts_for_config(config_type: str, prompts_folder: Path) -> None: + """Copy the appropriate prompt templates for the given config type.""" + from benchmark_qed.autod.prompts import summarization + from benchmark_qed.autoe.prompts import assertion as assertion_prompts + from benchmark_qed.autoe.prompts import pairwise as pairwise_prompts + from benchmark_qed.autoe.prompts import reference as reference_prompts + from benchmark_qed.autoq.prompts import data_questions as data_questions_prompts + from benchmark_qed.autoq.prompts.activity_questions import ( + activity_context as activity_context_prompts, + ) + from benchmark_qed.autoq.prompts.activity_questions import ( + global_questions as activity_global_prompts, + ) + from benchmark_qed.autoq.prompts.activity_questions import ( + local_questions as activity_local_prompts, + ) + from benchmark_qed.autoq.prompts.data_questions import ( + assertions as autoq_assertion_prompts, + ) + from benchmark_qed.autoq.prompts.data_questions import ( + global_questions as data_global_prompts, + ) + from benchmark_qed.autoq.prompts.data_questions import ( + linked_questions as data_linked_prompts, + ) + from benchmark_qed.autoq.prompts.data_questions import ( + local_questions as data_local_prompts, + ) + from 
benchmark_qed.cli.scaffold import copy_prompts + + match config_type: + case "autoq": + copy_prompts( + Path(summarization.__file__).parent, + prompts_folder / "summarization", + ) + copy_prompts( + Path(activity_context_prompts.__file__).parent, + prompts_folder / "activity_questions" / "activity_context", + ) + copy_prompts( + Path(activity_global_prompts.__file__).parent, + prompts_folder / "activity_questions" / "activity_global", + ) + copy_prompts( + Path(activity_local_prompts.__file__).parent, + prompts_folder / "activity_questions" / "activity_local", + ) + copy_prompts( + Path(data_global_prompts.__file__).parent, + prompts_folder / "data_questions" / "data_global", + ) + copy_prompts( + Path(data_local_prompts.__file__).parent, + prompts_folder / "data_questions" / "data_local", + ) + copy_prompts( + Path(data_linked_prompts.__file__).parent, + prompts_folder / "data_questions" / "data_linked", + ) + copy_prompts( + Path(data_questions_prompts.__file__).parent, + prompts_folder / "data_questions", + ) + copy_prompts( + Path(autoq_assertion_prompts.__file__).parent, + prompts_folder / "data_questions" / "assertions", + ) + case "autoe_pairwise": + copy_prompts(Path(pairwise_prompts.__file__).parent, prompts_folder) + case "autoe_reference": + copy_prompts(Path(reference_prompts.__file__).parent, prompts_folder) + case "autoe_assertion": + copy_prompts(Path(assertion_prompts.__file__).parent, prompts_folder) + + +# --------------------------------------------------------------------------- +# Main init command +# --------------------------------------------------------------------------- + + +@app.command() +def interactive_init( + root: Annotated[ + Path, + typer.Argument(help="The root directory for the new benchmark project."), + ], +) -> None: + """Interactively create a benchmark-qed configuration.""" + from benchmark_qed.cli.scaffold import ensure_input_folder, write_env_file + from benchmark_qed.cli.yaml_renderer import ( + render_autoe_assertion_yaml, + render_autoe_pairwise_yaml, + render_autoe_reference_yaml, + render_autoq_yaml, + validate_config, + ) + + check_tty() + + rich_print( + Panel( + "[bold]benchmark-qed[/bold] — Interactive Configuration Wizard", + subtitle="Press Enter to accept defaults", + ) + ) + + # 1. Select config type + config_type = select_option("Select configuration type", CONFIG_TYPE_OPTIONS) + + # 2. Run the appropriate builder + builders = { + "autoq": build_autoq_config, + "autoe_pairwise": build_autoe_pairwise_config, + "autoe_reference": build_autoe_reference_config, + "autoe_assertion": build_autoe_assertion_config, + } + config_dict = builders[config_type]() + + # 3. Render YAML + renderers = { + "autoq": render_autoq_yaml, + "autoe_pairwise": render_autoe_pairwise_yaml, + "autoe_reference": render_autoe_reference_yaml, + "autoe_assertion": render_autoe_assertion_yaml, + } + yaml_content = renderers[config_type](config_dict) + + # 4. Validate against Pydantic model + validate_config(yaml_content, config_type) + + # 5. Write files + root.mkdir(parents=True, exist_ok=True) + settings_path = root / "settings.yaml" + confirm_overwrite(settings_path) + settings_path.write_text(yaml_content, encoding="utf-8") + + prompts_folder = root / "prompts" + _copy_prompts_for_config(config_type, prompts_folder) + + ensure_input_folder(root) + write_env_file(root) + + # 6. 
Success summary + rich_print(f"\n[green]✅ Configuration created at {settings_path}[/green]") + rich_print(f"[green]✅ Prompt templates copied to {prompts_folder}/[/green]") + rich_print( + "[green]✅ .env file created — update OPENAI_API_KEY before running[/green]" + ) diff --git a/benchmark_qed/cli/scaffold.py b/benchmark_qed/cli/scaffold.py new file mode 100644 index 0000000..794a67e --- /dev/null +++ b/benchmark_qed/cli/scaffold.py @@ -0,0 +1,40 @@ +# Copyright (c) 2025 Microsoft Corporation. +"""Shared scaffolding utilities for config initialization.""" + +from pathlib import Path + +import typer + + +def copy_prompts(prompts_path: Path, output_path: Path) -> None: + """Copy prompt template files from a source directory to an output directory.""" + if not output_path.exists(): + output_path.mkdir(parents=True, exist_ok=True) + for prompt_file in prompts_path.iterdir(): + if prompt_file.is_file() and prompt_file.suffix == ".txt": + target_file = output_path / prompt_file.name + target_file.write_text( + prompt_file.read_text(encoding="utf-8"), encoding="utf-8" + ) + + +def write_env_file(root: Path) -> None: + """Create a .env file with placeholder API key if it doesn't exist.""" + env_file = root / ".env" + if not env_file.exists(): + env_file.write_text("OPENAI_API_KEY=", encoding="utf-8") + typer.echo( + f"Change the OPENAI_API_KEY placeholder at {env_file} with your actual OPENAI_API_KEY." + ) + + +def ensure_input_folder(root: Path) -> None: + """Create the input folder if it doesn't exist.""" + input_folder = root / "input" + if not input_folder.exists(): + input_folder.mkdir(parents=True, exist_ok=True) + typer.echo(f"Input folder created at {input_folder}") + typer.echo( + "Please place your input files in the 'input' folder before running, " + "or modify the settings.yaml to point to your input files." + ) diff --git a/benchmark_qed/cli/yaml_renderer.py b/benchmark_qed/cli/yaml_renderer.py new file mode 100644 index 0000000..00166fa --- /dev/null +++ b/benchmark_qed/cli/yaml_renderer.py @@ -0,0 +1,545 @@ +# Copyright (c) 2025 Microsoft Corporation. +"""YAML renderer for the interactive config wizard. + +Transforms structured dicts (from the interactive wizard) into well-formatted, +commented YAML strings using a template-based approach to preserve inline +comments and consistent formatting. +""" + +from __future__ import annotations + +from typing import Any + +import typer +import yaml + +# --------------------------------------------------------------------------- +# Helper +# --------------------------------------------------------------------------- + + +def _render_llm_section(provider_dict: dict[str, Any], indent: int = 2) -> str: + """Render an LLM configuration section as a YAML fragment. + + Parameters + ---------- + provider_dict: + Dict (or dataclass) with keys ``llm_provider``, ``model``, + ``auth_type``, ``init_args`` (from :class:`ProviderResult`). + indent: + Number of leading spaces for each line. 
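+
+    Examples
+    --------
+    A minimal sketch assuming an OpenAI ``api_key`` provider (Azure providers
+    additionally emit an ``init_args`` block):
+
+    >>> print(_render_llm_section({
+    ...     "llm_provider": "openai.chat",
+    ...     "model": "gpt-4.1",
+    ...     "auth_type": "api_key",
+    ...     "init_args": {},
+    ... }))
+      model: gpt-4.1
+      auth_type: api_key
+      api_key: ${OPENAI_API_KEY}
+      llm_provider: openai.chat
+      concurrent_requests: 4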
+ """ + import dataclasses + + if dataclasses.is_dataclass(provider_dict) and not isinstance(provider_dict, type): + provider_dict = dataclasses.asdict(provider_dict) + + pad = " " * indent + lines: list[str] = [] + + lines.extend([ + f"{pad}model: {provider_dict['model']}", + f"{pad}auth_type: {provider_dict['auth_type']}", + ]) + + if provider_dict["auth_type"] == "api_key": + lines.append(f"{pad}api_key: ${{OPENAI_API_KEY}}") + + lines.extend([ + f"{pad}llm_provider: {provider_dict['llm_provider']}", + f"{pad}concurrent_requests: 4", + ]) + + init_args = provider_dict.get("init_args") or {} + if init_args: + lines.append(f"{pad}init_args:") + for key, value in init_args.items(): + lines.append(f"{pad} {key}: {value}") + + return "\n".join(lines) + + +# --------------------------------------------------------------------------- +# AutoQ +# --------------------------------------------------------------------------- + + +def render_autoq_yaml(config: dict[str, Any]) -> str: + """Render a complete AutoQ ``settings.yaml`` from wizard configuration. + + Parameters + ---------- + config: + Dict with keys ``chat_provider``, ``embedding_provider``, ``input``, + ``encoding``, ``sampling``, ``question_types``, ``activity_params``, + ``assertions``, ``concurrent_requests``. + """ + inp = config["input"] + enc = config["encoding"] + samp = config["sampling"] + qt = config["question_types"] + ap = config["activity_params"] + assrt = config["assertions"] + concurrent = config["concurrent_requests"] + + # Metadata columns + meta = inp.get("metadata_columns") + if meta is not None and isinstance(meta, list) and len(meta) > 0: + meta_line = f" metadata_columns: [{', '.join(meta)}]" + else: + meta_line = "" + + chat_section = _render_llm_section(config["chat_provider"]) + embedding_section = _render_llm_section(config["embedding_provider"]) + + # Build metadata_columns block (include line only if present) + metadata_block = f"\n{meta_line}" if meta_line else "" + + return f"""\ +## Input Configuration +input: + dataset_path: {inp["dataset_path"]} + input_type: {inp["input_type"]} + text_column: {inp["text_column"]}{metadata_block} + file_encoding: {inp["file_encoding"]} + +## Encoder configuration +encoding: + model_name: {enc["model_name"]} + chunk_size: {enc["chunk_size"]} + chunk_overlap: {enc["chunk_overlap"]} + +## Sampling Configuration +sampling: + num_clusters: {samp["num_clusters"]} + num_samples_per_cluster: {samp["num_samples_per_cluster"]} + random_seed: {samp["random_seed"]} + +## LLM Configuration +chat_model: +{chat_section} + +embedding_model: +{embedding_section} + +## Question Generation Configuration +data_local: + num_questions: {qt["data_local"]["num_questions"]} + oversample_factor: {_fmt_float(qt["data_local"]["oversample_factor"])} +data_global: + num_questions: {qt["data_global"]["num_questions"]} + oversample_factor: {_fmt_float(qt["data_global"]["oversample_factor"])} +data_linked: + num_questions: {qt["data_linked"]["num_questions"]} + oversample_factor: {_fmt_float(qt["data_linked"]["oversample_factor"])} + min_questions_per_entity: 2 + max_questions_per_entity: 10 +activity_local: + num_questions: {qt["activity_local"]["num_questions"]} + oversample_factor: {_fmt_float(qt["activity_local"]["oversample_factor"])} + num_personas: {ap["num_personas"]} + num_tasks_per_persona: {ap["num_tasks_per_persona"]} + num_entities_per_task: {ap["num_entities_per_task"]} +activity_global: + num_questions: {qt["activity_global"]["num_questions"]} + oversample_factor: 
{_fmt_float(qt["activity_global"]["oversample_factor"])} + num_personas: {ap["num_personas"]} + num_tasks_per_persona: {ap["num_tasks_per_persona"]} + num_entities_per_task: {ap["num_entities_per_task"]} + +concurrent_requests: {concurrent} + +activity_questions_prompt_config: + activity_context_prompt_config: + data_summary_prompt_config: + summary_map_system_prompt: + prompt: prompts/summarization/summary_map_system_prompt.txt + summary_map_user_prompt: + prompt: prompts/summarization/summary_map_user_prompt.txt + summary_reduce_system_prompt: + prompt: prompts/summarization/summary_reduce_system_prompt.txt + summary_reduce_user_prompt: + prompt: prompts/summarization/summary_reduce_user_prompt.txt + activity_identification_prompt: + prompt: prompts/activity_questions/activity_context/activity_identification_prompt.txt + entity_extraction_map_system_prompt: + prompt: prompts/activity_questions/activity_context/entity_extraction_map_system_prompt.txt + entity_extraction_map_user_prompt: + prompt: prompts/activity_questions/activity_context/entity_extraction_map_user_prompt.txt + entity_extraction_reduce_system_prompt: + prompt: prompts/activity_questions/activity_context/entity_extraction_reduce_system_prompt.txt + entity_extraction_reduce_user_prompt: + prompt: prompts/activity_questions/activity_context/entity_extraction_reduce_user_prompt.txt + activity_global_prompt_config: + activity_global_gen_system_prompt: + prompt: prompts/activity_questions/activity_global/activity_global_gen_system_prompt.txt + activity_global_gen_user_prompt: + prompt: prompts/activity_questions/activity_global/activity_global_gen_user_prompt.txt + activity_local_prompt_config: + activity_local_gen_system_prompt: + prompt: prompts/activity_questions/activity_local/activity_local_gen_system_prompt.txt + activity_local_gen_user_prompt: + prompt: prompts/activity_questions/activity_local/activity_local_gen_user_prompt.txt + +data_questions_prompt_config: + claim_extraction_system_prompt: + prompt: prompts/data_questions/claim_extraction_system_prompt.txt + data_global_prompt_config: + data_global_gen_user_prompt: + prompt: prompts/data_questions/data_global/data_global_gen_user_prompt.txt + data_global_gen_system_prompt: + prompt: prompts/data_questions/data_global/data_global_gen_system_prompt.txt + data_local_prompt_config: + data_local_gen_system_prompt: + prompt: prompts/data_questions/data_local/data_local_gen_system_prompt.txt + data_local_expansion_system_prompt: + prompt: prompts/data_questions/data_local/data_local_expansion_system_prompt.txt + data_local_gen_user_prompt: + prompt: prompts/data_questions/data_local/data_local_gen_user_prompt.txt + data_linked_prompt_config: + bridge_question_system_prompt: + prompt: prompts/data_questions/data_linked/bridge_question_system_prompt.txt + comparison_question_system_prompt: + prompt: prompts/data_questions/data_linked/comparison_question_system_prompt.txt + intersection_question_system_prompt: + prompt: prompts/data_questions/data_linked/intersection_question_system_prompt.txt + linked_question_user_prompt: + prompt: prompts/data_questions/data_linked/linked_question_user_prompt.txt + batch_validation_prompt: + prompt: prompts/data_questions/data_linked/batch_validation_prompt.txt + +## Assertion Generation Configuration +assertions: + local: + max_assertions: {assrt["max_assertions"]} + enable_validation: {_fmt_bool(assrt["enable_validation"])} + min_validation_score: {assrt["min_validation_score"]} + concurrent_llm_calls: 8 + max_concurrent_questions: 8 + 
global: + max_assertions: {assrt["max_assertions"]} + enable_validation: {_fmt_bool(assrt["enable_validation"])} + min_validation_score: {assrt["min_validation_score"]} + batch_size: 100 + map_data_tokens: 8000 + reduce_data_tokens: 32000 + enable_semantic_grouping: true + validate_map_assertions: true + validate_reduce_assertions: true + concurrent_llm_calls: 8 + max_concurrent_questions: 2 + linked: + max_assertions: {assrt["max_assertions"]} + enable_validation: {_fmt_bool(assrt["enable_validation"])} + min_validation_score: {assrt["min_validation_score"]} + concurrent_llm_calls: 8 + max_concurrent_questions: 2 + +assertion_prompts: + local_assertion_gen_prompt: + prompt: prompts/data_questions/assertions/local_claim_assertion_gen_prompt.txt + global_assertion_map_prompt: + prompt: prompts/data_questions/assertions/global_claim_assertion_map_prompt.txt + global_assertion_reduce_prompt: + prompt: prompts/data_questions/assertions/global_claim_assertion_reduce_prompt.txt + local_validation_prompt: + prompt: prompts/data_questions/assertions/local_validation_prompt.txt + global_validation_prompt: + prompt: prompts/data_questions/assertions/global_validation_prompt.txt +""" + + +# --------------------------------------------------------------------------- +# AutoE - Pairwise +# --------------------------------------------------------------------------- + + +def render_autoe_pairwise_yaml(config: dict[str, Any]) -> str: + """Render a pairwise evaluation ``settings.yaml`` from wizard configuration. + + Parameters + ---------- + config: + Dict with keys ``chat_provider``, ``base``, ``others``, + ``question_sets``, ``trials``, ``criteria``. + """ + base = config["base"] + others = config["others"] + question_sets = config["question_sets"] + trials = config["trials"] + criteria = config.get("criteria") + + llm_section = _render_llm_section(config["chat_provider"]) + + # others entries + others_lines = "\n".join( + f" - name: {o['name']}\n answer_base_path: {o['answer_base_path']}" + for o in others + ) + + # question sets + qsets_lines = "\n".join(f" - {qs}" for qs in question_sets) + + # criteria block + if criteria is not None: + criteria_lines = "criteria:\n" + "\n".join( + f' - name: "{c["name"]}"\n description: "{c["description"]}"' + for c in criteria + ) + else: + criteria_lines = ( + "# criteria:\n" + '# - name: "criteria name"\n' + '# description: "criteria description"' + ) + + return f"""\ +## Input Configuration +base: + name: {base["name"]} + answer_base_path: {base["answer_base_path"]} +others: +{others_lines} +question_sets: +{qsets_lines} + +## Scoring Configuration +{criteria_lines} +trials: {trials} + +## LLM Configuration +llm_config: +{llm_section} + +prompts_config: + user_prompt: + prompt: prompts/pairwise_user_prompt.txt + system_prompt: + prompt: prompts/pairwise_system_prompt.txt +""" + + +# --------------------------------------------------------------------------- +# AutoE - Reference +# --------------------------------------------------------------------------- + + +def render_autoe_reference_yaml(config: dict[str, Any]) -> str: + """Render a reference evaluation ``settings.yaml`` from wizard configuration. + + Parameters + ---------- + config: + Dict with keys ``chat_provider``, ``reference``, ``generated``, + ``score_min``, ``score_max``, ``trials``. 
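+
+    Examples
+    --------
+    Illustrative only; the names and paths below are placeholders:
+
+    >>> cfg = {
+    ...     "chat_provider": {"llm_provider": "openai.chat", "model": "gpt-4.1",
+    ...                       "auth_type": "api_key", "init_args": {}},
+    ...     "reference": {"name": "golden", "answer_base_path": "input/golden"},
+    ...     "generated": [{"name": "method_a", "answer_base_path": "input/method_a"}],
+    ...     "score_min": 1,
+    ...     "score_max": 10,
+    ...     "trials": 4,
+    ... }
+    >>> yaml.safe_load(render_autoe_reference_yaml(cfg))["score_max"]
+    10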
+ """ + ref = config["reference"] + generated = config["generated"] + trials = config["trials"] + score_min = config.get("score_min", 1) + score_max = config.get("score_max", 10) + + llm_section = _render_llm_section(config["chat_provider"]) + + generated_lines = "\n".join( + f" - name: {g['name']}\n answer_base_path: {g['answer_base_path']}" + for g in generated + ) + + return f"""\ +## Input Configuration +reference: + name: {ref["name"]} + answer_base_path: {ref["answer_base_path"]} +generated: +{generated_lines} + +## Scoring Configuration +score_min: {score_min} +score_max: {score_max} +trials: {trials} + +## LLM Configuration +llm_config: +{llm_section} + +prompts_config: + user_prompt: + prompt: prompts/reference_user_prompt.txt + system_prompt: + prompt: prompts/reference_system_prompt.txt +""" + + +# --------------------------------------------------------------------------- +# AutoE - Assertion +# --------------------------------------------------------------------------- + + +def render_autoe_assertion_yaml(config: dict[str, Any]) -> str: + """Render an assertion evaluation ``settings.yaml`` from wizard configuration. + + Parameters + ---------- + config: + Dict with keys ``chat_provider``, ``generated``, ``assertions``, + ``pass_threshold``, ``trials``. + """ + gen = config["generated"] + assertions = config["assertions"] + pass_threshold = config.get("pass_threshold", 0.5) + trials = config["trials"] + + llm_section = _render_llm_section(config["chat_provider"]) + + return f"""\ +## Input Configuration +generated: + name: {gen["name"]} + answer_base_path: {gen["answer_base_path"]} +assertions: + assertions_path: {assertions["assertions_path"]} + +pass_threshold: {pass_threshold} +trials: {trials} + +## LLM Configuration +llm_config: +{llm_section} + +prompts_config: + user_prompt: + prompt: prompts/assertion_user_prompt.txt + system_prompt: + prompt: prompts/assertion_system_prompt.txt +""" + + +# --------------------------------------------------------------------------- +# Validation +# --------------------------------------------------------------------------- + +_REQUIRED_KEYS: dict[str, list[str]] = { + "autoq": [ + "input", + "encoding", + "sampling", + "chat_model", + "embedding_model", + "data_local", + "data_global", + "data_linked", + "activity_local", + "activity_global", + "assertions", + ], + "autoe_pairwise": [ + "base", + "others", + "question_sets", + "trials", + "llm_config", + ], + "autoe_reference": [ + "reference", + "generated", + "score_min", + "score_max", + "trials", + "llm_config", + ], + "autoe_assertion": [ + "generated", + "assertions", + "pass_threshold", + "trials", + "llm_config", + ], +} + + +def validate_config(yaml_content: str, config_type: str) -> None: + """Validate generated YAML against expected structure. + + Parses the YAML and checks that required top-level keys exist and have + the correct types. Prompt file paths are **not** validated because they + are written to disk *after* the settings file is generated. + + Raises :class:`typer.BadParameter` on validation failure. + """ + try: + data = yaml.safe_load(yaml_content) + except yaml.YAMLError as exc: + msg = f"Generated YAML is not valid: {exc}" + raise typer.BadParameter(msg) from exc + + if not isinstance(data, dict): + msg = "Generated YAML root must be a mapping." 
+ raise typer.BadParameter(msg) + + required = _REQUIRED_KEYS.get(config_type) + if required is None: + msg = f"Unknown config type: {config_type!r}" + raise typer.BadParameter(msg) + + missing = [k for k in required if k not in data] + if missing: + msg = f"Missing required keys for {config_type}: {', '.join(missing)}" + raise typer.BadParameter(msg) + + # Type-check a few critical fields + try: + if config_type == "autoq": + _check_type(data, "input", dict) + _check_type(data, "encoding", dict) + _check_type(data, "sampling", dict) + _check_type(data, "chat_model", dict) + _check_type(data, "embedding_model", dict) + _check_type(data, "assertions", dict) + elif config_type == "autoe_pairwise": + _check_type(data, "base", dict) + _check_type(data, "others", list) + _check_type(data, "question_sets", list) + _check_type(data, "trials", int) + _check_type(data, "llm_config", dict) + elif config_type == "autoe_reference": + _check_type(data, "reference", dict) + _check_type(data, "generated", list) + _check_type(data, "trials", int) + _check_type(data, "llm_config", dict) + elif config_type == "autoe_assertion": + _check_type(data, "generated", dict) + _check_type(data, "assertions", dict) + _check_type(data, "trials", int) + _check_type(data, "llm_config", dict) + except typer.BadParameter: + raise + except Exception as exc: + msg = f"Validation error for {config_type}: {exc}" + raise typer.BadParameter(msg) from exc + + +# --------------------------------------------------------------------------- +# Internal helpers +# --------------------------------------------------------------------------- + + +def _check_type(data: dict[str, Any], key: str, expected: type) -> None: + """Raise :class:`typer.BadParameter` if *data[key]* is not *expected* type.""" + value = data[key] + if not isinstance(value, expected): + msg = f"Key '{key}' should be {expected.__name__}, got {type(value).__name__}" + raise typer.BadParameter(msg) + + +def _fmt_bool(value: Any) -> str: + """Format a Python boolean as a YAML boolean literal.""" + return "true" if value else "false" + + +def _fmt_float(value: Any) -> str: + """Format a number, ensuring floats keep a decimal point.""" + if isinstance(value, float): + return str(value) + # Integers that should display as floats (e.g. 2 -> 2.0) + return f"{float(value)}" diff --git a/tests/autoe/assertion/pipeline_test.py b/tests/autoe/assertion/pipeline_test.py index 774e51e..b129ad6 100644 --- a/tests/autoe/assertion/pipeline_test.py +++ b/tests/autoe/assertion/pipeline_test.py @@ -177,14 +177,14 @@ def test_non_dict_assertions_renamed(self, tmp_path: Path) -> None: """Non-dict assertions use the assertions_key rename path.""" # Build a DataFrame where assertions are plain strings but # supporting_assertions is a separate column - df = pd.DataFrame({ + test_df = pd.DataFrame({ "question_id": ["q1"], "question_text": ["What?"], "assertions": ["A plain assertion."], "supporting_assertions": [["SA1"]], }) path = tmp_path / "assertions.json" - df.to_json(path, orient="records") + test_df.to_json(path, orient="records") result = load_and_normalize_hierarchical_assertions(path) diff --git a/tests/test_interactive_init.py b/tests/test_interactive_init.py new file mode 100644 index 0000000..f515d4a --- /dev/null +++ b/tests/test_interactive_init.py @@ -0,0 +1,693 @@ +# Copyright (c) 2025 Microsoft Corporation. 
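+# These tests drive the wizard through scripted stdin using
+# typer.testing.CliRunner; check_tty is monkeypatched where needed.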
+"""Tests for the interactive init wizard and YAML renderers.""" + +from __future__ import annotations + +from typing import Any + +import pytest +import typer +import yaml +from typer.testing import CliRunner + +from benchmark_qed.__main__ import app +from benchmark_qed.cli.interactive import ( + prompt_comma_list, + select_option, +) +from benchmark_qed.cli.yaml_renderer import ( + _render_llm_section, + render_autoe_assertion_yaml, + render_autoe_pairwise_yaml, + render_autoe_reference_yaml, + render_autoq_yaml, + validate_config, +) + +# --------------------------------------------------------------------------- +# Shared factory helpers +# --------------------------------------------------------------------------- + + +def _openai_chat_provider() -> dict[str, Any]: + return { + "llm_provider": "openai.chat", + "model": "gpt-4.1", + "auth_type": "api_key", + "init_args": {}, + } + + +def _openai_embedding_provider() -> dict[str, Any]: + return { + "llm_provider": "openai.embedding", + "model": "text-embedding-3-large", + "auth_type": "api_key", + "init_args": {}, + } + + +def _azure_chat_provider() -> dict[str, Any]: + return { + "llm_provider": "azure.openai.chat", + "model": "gpt-4.1", + "auth_type": "api_key", + "init_args": { + "azure_endpoint": "https://example.openai.azure.com", + "api_version": "2024-12-01-preview", + }, + } + + +def _azure_managed_identity_provider() -> dict[str, Any]: + return { + "llm_provider": "azure.openai.chat", + "model": "gpt-4.1", + "auth_type": "azure_managed_identity", + "init_args": { + "azure_endpoint": "https://example.openai.azure.com", + "api_version": "2024-12-01-preview", + }, + } + + +def _default_autoq_config() -> dict[str, Any]: + return { + "chat_provider": _openai_chat_provider(), + "embedding_provider": _openai_embedding_provider(), + "input": { + "dataset_path": "./input", + "input_type": "json", + "text_column": "text", + "metadata_columns": None, + "file_encoding": "utf-8", + }, + "encoding": { + "model_name": "o200k_base", + "chunk_size": 600, + "chunk_overlap": 100, + }, + "sampling": { + "num_clusters": 20, + "num_samples_per_cluster": 10, + "random_seed": 42, + }, + "question_types": { + qt: {"num_questions": 10, "oversample_factor": 2.0} + for qt in [ + "data_local", + "data_global", + "data_linked", + "activity_local", + "activity_global", + ] + }, + "activity_params": { + "num_personas": 5, + "num_tasks_per_persona": 2, + "num_entities_per_task": 5, + }, + "assertions": { + "max_assertions": 20, + "enable_validation": True, + "min_validation_score": 3, + }, + "concurrent_requests": 8, + } + + +def _default_pairwise_config() -> dict[str, Any]: + return { + "chat_provider": _openai_chat_provider(), + "base": {"name": "baseline", "answer_base_path": "input/baseline"}, + "others": [ + {"name": "method_a", "answer_base_path": "input/method_a"}, + ], + "question_sets": ["activity_global", "activity_local"], + "trials": 4, + "criteria": None, + } + + +def _default_reference_config() -> dict[str, Any]: + return { + "chat_provider": _openai_chat_provider(), + "reference": {"name": "golden", "answer_base_path": "input/golden"}, + "generated": [ + {"name": "method_a", "answer_base_path": "input/method_a"}, + ], + "score_min": 1, + "score_max": 10, + "trials": 4, + } + + +def _default_assertion_config() -> dict[str, Any]: + return { + "chat_provider": _openai_chat_provider(), + "generated": {"name": "method_a", "answer_base_path": "input/method_a"}, + "assertions": {"assertions_path": "input/assertions.json"}, + "pass_threshold": 0.5, + 
"trials": 4, + } + + +# ═══════════════════════════════════════════════════════════════════════════ +# 1. YAML Renderer Tests +# ═══════════════════════════════════════════════════════════════════════════ + + +class TestYamlRenderers: + """Verify each renderer produces valid, parseable YAML.""" + + def test_render_autoq_yaml_produces_valid_yaml(self): + """render_autoq_yaml returns parseable YAML with all expected sections.""" + config = _default_autoq_config() + yaml_content = render_autoq_yaml(config) + parsed = yaml.safe_load(yaml_content) + assert parsed is not None + assert "input" in parsed + assert "chat_model" in parsed + assert "embedding_model" in parsed + assert "sampling" in parsed + assert parsed["concurrent_requests"] == 8 + + def test_render_autoq_yaml_includes_question_types(self): + """AutoQ YAML includes all five question type sections.""" + config = _default_autoq_config() + yaml_content = render_autoq_yaml(config) + parsed = yaml.safe_load(yaml_content) + for qt in [ + "data_local", + "data_global", + "data_linked", + "activity_local", + "activity_global", + ]: + assert qt in parsed, f"Missing question type section: {qt}" + assert "num_questions" in parsed[qt] + + def test_render_autoq_yaml_includes_assertions(self): + """AutoQ YAML includes assertions section with local/global/linked.""" + config = _default_autoq_config() + yaml_content = render_autoq_yaml(config) + parsed = yaml.safe_load(yaml_content) + assert "assertions" in parsed + for section in ["local", "global", "linked"]: + assert section in parsed["assertions"] + + def test_render_autoq_yaml_with_metadata_columns(self): + """Metadata columns are included when provided.""" + config = _default_autoq_config() + config["input"]["metadata_columns"] = ["source", "date"] + yaml_content = render_autoq_yaml(config) + parsed = yaml.safe_load(yaml_content) + assert parsed["input"]["metadata_columns"] == ["source", "date"] + + def test_render_autoq_yaml_without_metadata_columns(self): + """No metadata_columns key when None is provided.""" + config = _default_autoq_config() + config["input"]["metadata_columns"] = None + yaml_content = render_autoq_yaml(config) + parsed = yaml.safe_load(yaml_content) + assert "metadata_columns" not in parsed["input"] + + def test_render_autoq_yaml_includes_prompt_configs(self): + """AutoQ YAML includes prompt configuration sections.""" + config = _default_autoq_config() + yaml_content = render_autoq_yaml(config) + parsed = yaml.safe_load(yaml_content) + assert "activity_questions_prompt_config" in parsed + assert "data_questions_prompt_config" in parsed + assert "assertion_prompts" in parsed + + def test_render_autoe_pairwise_yaml(self): + """Pairwise YAML includes base, others, question_sets.""" + config = _default_pairwise_config() + yaml_content = render_autoe_pairwise_yaml(config) + parsed = yaml.safe_load(yaml_content) + assert parsed is not None + assert "base" in parsed + assert "others" in parsed + assert "question_sets" in parsed + assert parsed["trials"] == 4 + assert "llm_config" in parsed + + def test_render_autoe_pairwise_yaml_with_criteria(self): + """Pairwise YAML with custom criteria includes them.""" + config = _default_pairwise_config() + config["criteria"] = [ + {"name": "accuracy", "description": "Is the answer correct?"}, + ] + yaml_content = render_autoe_pairwise_yaml(config) + parsed = yaml.safe_load(yaml_content) + assert "criteria" in parsed + assert len(parsed["criteria"]) == 1 + + def test_render_autoe_pairwise_yaml_no_criteria_is_commented(self): + """Pairwise 
YAML without criteria has commented-out criteria block.""" + config = _default_pairwise_config() + config["criteria"] = None + yaml_content = render_autoe_pairwise_yaml(config) + assert "# criteria:" in yaml_content + + def test_render_autoe_reference_yaml(self): + """Reference YAML includes score range.""" + config = _default_reference_config() + yaml_content = render_autoe_reference_yaml(config) + parsed = yaml.safe_load(yaml_content) + assert parsed is not None + assert "reference" in parsed + assert "generated" in parsed + assert parsed["score_min"] == 1 + assert parsed["score_max"] == 10 + assert "llm_config" in parsed + + def test_render_autoe_reference_yaml_multiple_generated(self): + """Reference YAML with multiple generated conditions.""" + config = _default_reference_config() + config["generated"].append({ + "name": "method_b", + "answer_base_path": "input/method_b", + }) + yaml_content = render_autoe_reference_yaml(config) + parsed = yaml.safe_load(yaml_content) + assert len(parsed["generated"]) == 2 + + def test_render_autoe_assertion_yaml(self): + """Assertion YAML includes pass_threshold.""" + config = _default_assertion_config() + yaml_content = render_autoe_assertion_yaml(config) + parsed = yaml.safe_load(yaml_content) + assert parsed is not None + assert "generated" in parsed + assert "assertions" in parsed + assert parsed["pass_threshold"] == 0.5 + assert parsed["trials"] == 4 + assert "llm_config" in parsed + + +# ═══════════════════════════════════════════════════════════════════════════ +# 2. LLM Section Rendering Tests +# ═══════════════════════════════════════════════════════════════════════════ + + +class TestLlmRendering: + """Verify _render_llm_section output for different provider configs.""" + + def test_openai_provider_includes_api_key(self): + """OpenAI provider with api_key auth includes ${OPENAI_API_KEY}.""" + provider = _openai_chat_provider() + section = _render_llm_section(provider) + assert "${OPENAI_API_KEY}" in section + assert "model: gpt-4.1" in section + assert "llm_provider: openai.chat" in section + + def test_azure_provider_includes_init_args(self): + """Azure provider renders azure_endpoint and api_version in init_args.""" + provider = _azure_chat_provider() + section = _render_llm_section(provider) + assert "init_args:" in section + assert "azure_endpoint: https://example.openai.azure.com" in section + assert "api_version: 2024-12-01-preview" in section + + def test_managed_identity_omits_api_key(self): + """azure_managed_identity auth type does NOT include api_key line.""" + provider = _azure_managed_identity_provider() + section = _render_llm_section(provider) + assert "api_key:" not in section + assert "auth_type: azure_managed_identity" in section + + def test_openai_provider_no_init_args_block(self): + """OpenAI provider with empty init_args omits init_args block.""" + provider = _openai_chat_provider() + section = _render_llm_section(provider) + assert "init_args:" not in section + + def test_section_includes_concurrent_requests(self): + """Every LLM section includes concurrent_requests.""" + provider = _openai_chat_provider() + section = _render_llm_section(provider) + assert "concurrent_requests: 4" in section + + def test_custom_indent(self): + """Custom indent produces properly indented output.""" + provider = _openai_chat_provider() + section = _render_llm_section(provider, indent=4) + for line in section.split("\n"): + assert line.startswith(" ") + + +# ═══════════════════════════════════════════════════════════════════════════ +# 
3. Validation Tests +# ═══════════════════════════════════════════════════════════════════════════ + + +class TestValidation: + """Verify validate_config accepts valid YAML and rejects invalid YAML.""" + + def test_validate_config_passes_valid_autoq(self): + """Valid AutoQ YAML passes validation.""" + yaml_content = render_autoq_yaml(_default_autoq_config()) + validate_config(yaml_content, "autoq") + + def test_validate_config_passes_valid_pairwise(self): + """Valid pairwise YAML passes validation.""" + yaml_content = render_autoe_pairwise_yaml(_default_pairwise_config()) + validate_config(yaml_content, "autoe_pairwise") + + def test_validate_config_passes_valid_reference(self): + """Valid reference YAML passes validation.""" + yaml_content = render_autoe_reference_yaml(_default_reference_config()) + validate_config(yaml_content, "autoe_reference") + + def test_validate_config_passes_valid_assertion(self): + """Valid assertion YAML passes validation.""" + yaml_content = render_autoe_assertion_yaml(_default_assertion_config()) + validate_config(yaml_content, "autoe_assertion") + + def test_validate_config_rejects_invalid_yaml(self): + """Malformed YAML is rejected.""" + with pytest.raises(typer.BadParameter): + validate_config(": :\n bad: [", "autoq") + + def test_validate_config_rejects_missing_keys(self): + """YAML missing required keys is rejected.""" + with pytest.raises(typer.BadParameter, match="Missing required keys"): + validate_config("foo: bar\n", "autoq") + + def test_validate_config_rejects_wrong_type(self): + """YAML with wrong types is rejected.""" + yaml_content = render_autoq_yaml(_default_autoq_config()) + parsed = yaml.safe_load(yaml_content) + parsed["input"] = "not_a_dict" + bad_yaml = yaml.dump(parsed) + with pytest.raises(typer.BadParameter, match="should be dict"): + validate_config(bad_yaml, "autoq") + + def test_validate_config_rejects_unknown_type(self): + """Unknown config type is rejected.""" + with pytest.raises(typer.BadParameter, match="Unknown config type"): + validate_config("foo: bar\n", "unknown_type") + + def test_validate_config_rejects_non_mapping_root(self): + """YAML whose root is not a mapping is rejected.""" + with pytest.raises(typer.BadParameter, match="root must be a mapping"): + validate_config("- item1\n- item2\n", "autoq") + + +# ═══════════════════════════════════════════════════════════════════════════ +# 4. CLI Integration Tests +# ═══════════════════════════════════════════════════════════════════════════ + + +class TestCliIntegration: + """End-to-end CLI tests using typer.testing.CliRunner.""" + + @pytest.fixture(autouse=True) + def _patch_tty_check(self, monkeypatch): + """Disable the TTY check so CliRunner can drive the wizard.""" + monkeypatch.setattr("benchmark_qed.cli.interactive.check_tty", lambda: None) + + def test_init_autoq_creates_files(self, tmp_path): + """benchmark-qed init creates settings.yaml, prompts/, .env.""" + runner = CliRunner() + # Input sequence: see build_autoq_config for prompt order. 
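+        # Order: config type, chat provider, auth type, model, the
+        # embeddings-reuse confirm, one "Customize this section?" confirm
+        # per section, then the concurrent-requests prompt.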
+ input_lines = [ + "1", # config type: autoq + "1", # chat provider: OpenAI + "1", # auth type: api_key + "", # model: accept default gpt-4.1 + "Y", # use same provider for embeddings + "N", # customize input section + "N", # customize encoding section + "N", # customize sampling section + "N", # customize question types section + "N", # customize assertions section + "8", # concurrent requests + ] + input_text = "\n".join(input_lines) + "\n" + result = runner.invoke( + app, + ["init", str(tmp_path)], + input=input_text, + ) + assert result.exit_code == 0, ( + f"CLI failed (code={result.exit_code}):\n{result.output}" + ) + assert (tmp_path / "settings.yaml").exists() + assert (tmp_path / "prompts").exists() + assert (tmp_path / ".env").exists() + + def test_init_autoq_settings_yaml_is_valid(self, tmp_path): + """The generated settings.yaml is parseable and contains expected keys.""" + runner = CliRunner() + input_lines = [ + "1", + "1", + "1", + "", + "Y", + "N", + "N", + "N", + "N", + "N", + "8", + ] + input_text = "\n".join(input_lines) + "\n" + result = runner.invoke( + app, + ["init", str(tmp_path)], + input=input_text, + ) + assert result.exit_code == 0, result.output + + settings = yaml.safe_load( + (tmp_path / "settings.yaml").read_text(encoding="utf-8") + ) + assert "chat_model" in settings + assert "embedding_model" in settings + assert "input" in settings + + def test_init_autoe_pairwise_creates_files(self, tmp_path): + """Pairwise init creates correct files.""" + runner = CliRunner() + input_lines = [ + "2", # config type: autoe_pairwise + "1", # chat provider: OpenAI + "1", # auth type: api_key + "", # model: default + "baseline", # base condition name + "input/baseline", # base answer_base_path + "method_a", # other condition #1 name + "input/method_a", # other condition #1 answer_base_path + "N", # add another condition? no + "", # question sets: accept default + "4", # trials (even) + "N", # add custom criteria? no + ] + input_text = "\n".join(input_lines) + "\n" + result = runner.invoke( + app, + ["init", str(tmp_path)], + input=input_text, + ) + assert result.exit_code == 0, f"CLI failed:\n{result.output}" + assert (tmp_path / "settings.yaml").exists() + assert (tmp_path / "prompts").exists() + + def test_init_autoe_reference_creates_files(self, tmp_path): + """Reference init creates correct files.""" + runner = CliRunner() + input_lines = [ + "3", # config type: autoe_reference + "1", # chat provider: OpenAI + "1", # auth type: api_key + "", # model: default + "golden", # reference condition name + "input/golden", # reference answer_base_path + "method_a", # generated condition #1 name + "input/method_a", # generated condition #1 answer_base_path + "N", # add another generated? 
no + "1", # score min + "10", # score max + "4", # trials + ] + input_text = "\n".join(input_lines) + "\n" + result = runner.invoke( + app, + ["init", str(tmp_path)], + input=input_text, + ) + assert result.exit_code == 0, f"CLI failed:\n{result.output}" + assert (tmp_path / "settings.yaml").exists() + + def test_init_autoe_assertion_creates_files(self, tmp_path): + """Assertion init creates correct files.""" + runner = CliRunner() + input_lines = [ + "4", # config type: autoe_assertion + "1", # chat provider: OpenAI + "1", # auth type: api_key + "", # model: default + "method_a", # generated condition name + "input/method_a", # generated answer_base_path + "", # assertions path: accept default + "0.5", # pass threshold + "4", # trials + ] + input_text = "\n".join(input_lines) + "\n" + result = runner.invoke( + app, + ["init", str(tmp_path)], + input=input_text, + ) + assert result.exit_code == 0, f"CLI failed:\n{result.output}" + assert (tmp_path / "settings.yaml").exists() + + +# ═══════════════════════════════════════════════════════════════════════════ +# 5. Overwrite Protection Tests +# ═══════════════════════════════════════════════════════════════════════════ + + +class TestOverwriteProtection: + """Verify overwrite protection for existing settings files.""" + + @pytest.fixture(autouse=True) + def _patch_tty_check(self, monkeypatch): + monkeypatch.setattr("benchmark_qed.cli.interactive.check_tty", lambda: None) + + def _autoq_default_input(self) -> str: + """Input sequence that walks through autoq with all defaults.""" + lines = ["1", "1", "1", "", "Y", "N", "N", "N", "N", "N", "8"] + return "\n".join(lines) + "\n" + + def test_init_warns_on_existing_settings(self, tmp_path): + """If settings.yaml exists, overwrite confirmation is prompted.""" + (tmp_path / "settings.yaml").write_text("existing", encoding="utf-8") + runner = CliRunner() + # Same autoq defaults + "y" for overwrite confirmation + input_text = self._autoq_default_input() + "y\n" + result = runner.invoke( + app, + ["init", str(tmp_path)], + input=input_text, + ) + assert result.exit_code == 0, f"CLI failed:\n{result.output}" + content = (tmp_path / "settings.yaml").read_text(encoding="utf-8") + assert content != "existing" + + def test_init_aborts_on_overwrite_decline(self, tmp_path): + """Declining overwrite aborts the command.""" + (tmp_path / "settings.yaml").write_text("existing", encoding="utf-8") + runner = CliRunner() + input_text = self._autoq_default_input() + "N\n" + result = runner.invoke( + app, + ["init", str(tmp_path)], + input=input_text, + ) + assert result.exit_code != 0 + content = (tmp_path / "settings.yaml").read_text(encoding="utf-8") + assert content == "existing" + + +# ═══════════════════════════════════════════════════════════════════════════ +# 6. 
Helper Function Tests +# ═══════════════════════════════════════════════════════════════════════════ + + +class TestHelpers: + """Tests for individual interactive helper functions.""" + + def test_prompt_comma_list_splits(self): + """prompt_comma_list splits comma-separated input.""" + result_holder: list[str] = [] + test_app = typer.Typer() + + @test_app.command() + def _cmd() -> None: + result_holder.extend(prompt_comma_list("Enter items", default="")) + + runner = CliRunner() + runner.invoke(test_app, input="alpha, beta, gamma\n") + assert result_holder == ["alpha", "beta", "gamma"] + + def test_prompt_comma_list_uses_default(self): + """prompt_comma_list uses default when input is empty.""" + result_holder: list[str] = [] + test_app = typer.Typer() + + @test_app.command() + def _cmd() -> None: + result_holder.extend(prompt_comma_list("Enter items", default="a, b")) + + runner = CliRunner() + runner.invoke(test_app, input="\n") + assert result_holder == ["a", "b"] + + def test_prompt_comma_list_strips_whitespace(self): + """prompt_comma_list strips whitespace from items.""" + result_holder: list[str] = [] + test_app = typer.Typer() + + @test_app.command() + def _cmd() -> None: + result_holder.extend(prompt_comma_list("Enter", default="")) + + runner = CliRunner() + runner.invoke(test_app, input=" x , y , z \n") + assert result_holder == ["x", "y", "z"] + + def test_select_option_returns_correct_value(self): + """select_option returns the value of the chosen option.""" + result_holder: list[str] = [] + test_app = typer.Typer() + + @test_app.command() + def _cmd() -> None: + val = select_option( + "Pick one", + [("val_a", "Label A"), ("val_b", "Label B")], + ) + result_holder.append(val) + + runner = CliRunner() + runner.invoke(test_app, input="2\n") + assert result_holder == ["val_b"] + + def test_select_option_out_of_range_defaults(self): + """Out-of-range selection defaults to option 1.""" + result_holder: list[str] = [] + test_app = typer.Typer() + + @test_app.command() + def _cmd() -> None: + val = select_option( + "Pick one", + [("first", "First"), ("second", "Second")], + ) + result_holder.append(val) + + runner = CliRunner() + runner.invoke(test_app, input="99\n") + assert result_holder == ["first"] + + def test_select_option_default_is_one(self): + """Empty input (Enter) defaults to option 1.""" + result_holder: list[str] = [] + test_app = typer.Typer() + + @test_app.command() + def _cmd() -> None: + val = select_option( + "Pick one", + [("default_val", "Default"), ("other", "Other")], + ) + result_holder.append(val) + + runner = CliRunner() + runner.invoke(test_app, input="\n") + assert result_holder == ["default_val"] From bb5ab56e0da3adfea8817e9dba54d1c5379a624c Mon Sep 17 00:00:00 2001 From: Andres Morales Esquivel Date: Wed, 22 Apr 2026 15:56:20 -0600 Subject: [PATCH 3/8] Undo change --- benchmark_qed/autoe/retrieval/scores.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark_qed/autoe/retrieval/scores.py b/benchmark_qed/autoe/retrieval/scores.py index 647e174..42d92f4 100644 --- a/benchmark_qed/autoe/retrieval/scores.py +++ b/benchmark_qed/autoe/retrieval/scores.py @@ -654,7 +654,7 @@ async def run_retrieval_evaluation( retrieval_path = Path(rag_method["retrieval_results_path"]) # Check if path includes question_set placeholder - if "{question_set}" in str(retrieval_path): + if "{question_set}" in str(retrieval_path): # noqa: RUF027 retrieval_path = Path( str(retrieval_path).format(question_set=question_set) ) From 
1c4382461b0ff4ae55a4e8d15c6b566cead69b80 Mon Sep 17 00:00:00 2001 From: Andres Morales Esquivel Date: Fri, 24 Apr 2026 11:54:09 -0600 Subject: [PATCH 4/8] Ask for confirmation settings --- .apm/skills/benchmark-qed-setup/SKILL.md | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/.apm/skills/benchmark-qed-setup/SKILL.md b/.apm/skills/benchmark-qed-setup/SKILL.md index e2c382b..f6f76f6 100644 --- a/.apm/skills/benchmark-qed-setup/SKILL.md +++ b/.apm/skills/benchmark-qed-setup/SKILL.md @@ -172,7 +172,26 @@ input: For the full set of optional fields, read [references/config-reference.md](references/config-reference.md). -### Step 5 — Validate Configuration +### Step 5 — Review Settings with the User + +After writing `settings.yaml`, **show the user the generated configuration** and ask if they want to customize anything. This is critical — the generated config uses sensible defaults, but users often need to tune dataset-specific or environment-specific values. + +1. Read the generated `settings.yaml` and display its contents to the user (use `show_file`). +2. Use `ask_user` with a boolean field: *"Would you like to customize any settings before proceeding?"* +3. If the user wants changes, use `ask_user` with a **free-text string field**: *"Describe what you'd like to change"* — let them say it in their own words (e.g., "increase num_questions to 50 for all types", "change the model to gpt-4o", "set trials to 6 and add a custom criterion"). Then apply the requested changes to `settings.yaml`. +4. After applying changes, show the updated file and ask again: *"Any other changes?"* (boolean). Repeat until the user says no. + +Do **not** limit the user to predefined sections — they should be able to modify any field in `settings.yaml` by describing what they want. + +**Sections the user is most likely to customize** (call these out): +- **autoq**: `num_questions` per type, `num_clusters`, `chunk_size`, assertion settings, `concurrent_requests` +- **autoe_pairwise**: `trials`, `criteria`, `question_sets` +- **autoe_reference**: `score_min`/`score_max`, `trials` +- **autoe_assertion**: `pass_threshold`, `trials` + +For the full set of optional fields and best practices, read [references/config-reference.md](references/config-reference.md). + +### Step 6 — Validate Configuration The benchmark-qed CLI validates `settings.yaml` via pydantic at startup, so any missing or malformed fields are reported when you run a command. After applying the answers, run the actual target command (e.g. `benchmark-qed autoq …`) — config errors surface immediately, before any LLM calls. 
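To make Step 6 concrete: a minimal smoke run, sketched under the assumption that the autoq command accepts the generated config path and an output directory as positional arguments (the exact signature is not confirmed here; check `--help`). Pydantic rejects missing or malformed fields before any LLM call is made.

```bash
# Hypothetical smoke run after `benchmark-qed init` (arguments are
# illustrative; confirm with `benchmark-qed autoq --help`).
uvx --from "git+https://github.com/microsoft/benchmark-qed" benchmark-qed \
  autoq ./settings.yaml ./output
```

If the config is valid, the run proceeds to question generation; if not, the validation error names the offending field, so you can loop back to the review step above.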
From 3115fa326311fea04912472e81d5bc765c25e5c5 Mon Sep 17 00:00:00 2001 From: Andres Morales Esquivel Date: Fri, 24 Apr 2026 16:18:05 -0600 Subject: [PATCH 5/8] Update skills to match config --- .../references/config-reference.md | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/.apm/skills/benchmark-qed-setup/references/config-reference.md b/.apm/skills/benchmark-qed-setup/references/config-reference.md index e1f6ff2..b1d693e 100644 --- a/.apm/skills/benchmark-qed-setup/references/config-reference.md +++ b/.apm/skills/benchmark-qed-setup/references/config-reference.md @@ -38,6 +38,8 @@ chat_model: api_key: ${OPENAI_API_KEY} # Required for api_key auth llm_provider: openai.chat # Provider (see table below) concurrent_requests: 4 # Parallel LLM requests + azure_identity_scopes: # Azure identity scopes (azure_managed_identity only) + - https://cognitiveservices.azure.com/.default init_args: {} # Extra model init args (e.g., api_version, azure_endpoint) call_args: # Extra model call args temperature: 0.0 @@ -50,6 +52,24 @@ embedding_model: api_key: ${OPENAI_API_KEY} ``` +### Azure Identity Scopes + +When using `auth_type: azure_managed_identity`, the `azure_identity_scopes` field controls which OAuth scopes are requested from Azure Active Directory via `get_bearer_token_provider`. + +```yaml +azure_identity_scopes: + - https://cognitiveservices.azure.com/.default # Default — Azure Cognitive Services +``` + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `azure_identity_scopes` | `list[str]` | `["https://cognitiveservices.azure.com/.default"]` | OAuth scopes passed to `get_bearer_token_provider`. Only used when `auth_type` is `azure_managed_identity`. | + +**When to change this:** +- The default scope (`https://cognitiveservices.azure.com/.default`) works for standard Azure OpenAI deployments +- Use a custom scope if your Azure resource requires a different audience (e.g., private endpoints, sovereign clouds) +- Multiple scopes can be listed if your deployment requires more than one + ### Question Generation Types All question types share a base config with `num_questions` (default: `50`) and `oversample_factor` (default: `2.0`). Type-specific fields are listed below. From 520ad5567ed692478fade74c8a2be46c511ffb26 Mon Sep 17 00:00:00 2001 From: Andres Morales Esquivel Date: Mon, 27 Apr 2026 16:04:18 -0600 Subject: [PATCH 6/8] Update skills to match config --- .../references/config-reference.md | 51 ++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/.apm/skills/benchmark-qed-setup/references/config-reference.md b/.apm/skills/benchmark-qed-setup/references/config-reference.md index b1d693e..83282c3 100644 --- a/.apm/skills/benchmark-qed-setup/references/config-reference.md +++ b/.apm/skills/benchmark-qed-setup/references/config-reference.md @@ -7,11 +7,12 @@ Reference for benchmark-qed configuration fields. 
Load this file when you need t

### Input Configuration

```yaml
input:
-  dataset_path: ./input/data.csv # Path to input dataset (REQUIRED)
+  dataset_path: ./input/data.csv # Path to input dataset (when storage is configured, path within the container)
  input_type: csv # csv or json
  text_column: text # Column containing text content
  metadata_columns: null # Optional list of metadata columns (e.g., [headline, date])
  file_encoding: utf-8 # File encoding (template uses utf-8-sig)
+  storage: null # Optional StorageConfig for cloud storage (Azure Blob or Cosmos DB)
```

### Encoding Configuration
@@ -70,6 +71,47 @@ azure_identity_scopes:
- Use a custom scope if your Azure resource requires a different audience (e.g., private endpoints, sovereign clouds)
- Multiple scopes can be listed if your deployment requires more than one

+### Storage Configuration (Optional)
+
+All config types support optional cloud storage backends for reading input and writing output. When omitted, the local filesystem is used (default behavior).
+
+```yaml
+# AutoQ — input storage (inside the 'input' block)
+input:
+  dataset_path: ./input # When storage is set, this is the path within the container
+  storage: # Optional: read input from Azure Blob Storage
+    type: blob
+    container_name: my-datasets
+    connection_string: ${AZURE_STORAGE_CONNECTION_STRING} # Or use account_url for managed identity
+    # account_url: https://<account>.blob.core.windows.net
+    # base_dir: path/within/container
+
+# AutoQ/AutoE — output storage (top-level)
+output_storage: # Optional: write output to Azure Blob Storage
+  type: blob
+  container_name: my-output
+  connection_string: ${AZURE_STORAGE_CONNECTION_STRING}
+  # base_dir: experiments/run1
+
+# AutoE — input storage (top-level, for reading answers/assertions)
+input_storage: # Optional: read input from Azure Blob Storage
+  type: blob
+  container_name: my-datasets
+  account_url: https://<account>.blob.core.windows.net
+```
+
+#### StorageConfig Fields
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `type` | `str` | `"file"` | Storage backend: `file` (local), `blob` (Azure Blob Storage), `cosmosdb` (Azure Cosmos DB) |
+| `container_name` | `str \| null` | `null` | Azure Blob container or Cosmos DB container name |
+| `connection_string` | `str \| null` | `null` | Connection string for Azure (auth option 1) |
+| `account_url` | `str \| null` | `null` | Account URL for Azure managed identity (auth option 2) |
+| `base_dir` | `str \| null` | `null` | Base directory/prefix within the container |
+| `database_name` | `str \| null` | `null` | Database name (Cosmos DB only) |
+| `encoding` | `str \| null` | `null` | File encoding (file storage only) |
+
### Question Generation Types

All question types share a base config with `num_questions` (default: `50`) and `oversample_factor` (default: `2.0`). Type-specific fields are listed below.
@@ -143,6 +185,7 @@ assertions: max_concurrent_questions: 2 concurrent_requests: 8 # Top-level concurrency for autoq pipeline +output_storage: null # Optional StorageConfig for writing output to cloud storage ``` ## autoe Pairwise Configuration (`PairwiseConfig`) @@ -404,3 +447,9 @@ custom_providers: - Match the assessor type between `generate-retrieval-reference` and `retrieval-scores` to share the cache - `relevance_threshold: 2` on a 0–3 scale is a reasonable default — lower values include marginal matches - Use `cache_dir` for iterative development to avoid redundant LLM calls across runs + +### Storage Configuration +- Use `connection_string` with `${AZURE_STORAGE_CONNECTION_STRING}` for development; use `account_url` with managed identity for production +- `base_dir` is optional — use it to organize multiple experiments within a single container +- When `storage` is set on `input`, `dataset_path` becomes relative to the container/base_dir, not the local filesystem +- Cosmos DB storage requires `database_name` in addition to `container_name` From 9b3bb35da1062fe95c20da198721522c6054fd35 Mon Sep 17 00:00:00 2001 From: Andres Morales Esquivel Date: Mon, 27 Apr 2026 16:13:49 -0600 Subject: [PATCH 7/8] Formatting --- benchmark_qed/cli/interactive.py | 78 ++++++++++++++---------------- benchmark_qed/cli/yaml_renderer.py | 3 +- 2 files changed, 38 insertions(+), 43 deletions(-) diff --git a/benchmark_qed/cli/interactive.py b/benchmark_qed/cli/interactive.py index a9f00e1..2afc5ec 100644 --- a/benchmark_qed/cli/interactive.py +++ b/benchmark_qed/cli/interactive.py @@ -13,6 +13,41 @@ from rich.panel import Panel from rich.table import Table +from benchmark_qed.autod.prompts import summarization +from benchmark_qed.autoe.prompts import assertion as assertion_prompts +from benchmark_qed.autoe.prompts import pairwise as pairwise_prompts +from benchmark_qed.autoe.prompts import reference as reference_prompts +from benchmark_qed.autoq.prompts import data_questions as data_questions_prompts +from benchmark_qed.autoq.prompts.activity_questions import ( + activity_context as activity_context_prompts, +) +from benchmark_qed.autoq.prompts.activity_questions import ( + global_questions as activity_global_prompts, +) +from benchmark_qed.autoq.prompts.activity_questions import ( + local_questions as activity_local_prompts, +) +from benchmark_qed.autoq.prompts.data_questions import ( + assertions as autoq_assertion_prompts, +) +from benchmark_qed.autoq.prompts.data_questions import ( + global_questions as data_global_prompts, +) +from benchmark_qed.autoq.prompts.data_questions import ( + linked_questions as data_linked_prompts, +) +from benchmark_qed.autoq.prompts.data_questions import ( + local_questions as data_local_prompts, +) +from benchmark_qed.cli.scaffold import copy_prompts, ensure_input_folder, write_env_file +from benchmark_qed.cli.yaml_renderer import ( + render_autoe_assertion_yaml, + render_autoe_pairwise_yaml, + render_autoe_reference_yaml, + render_autoq_yaml, + validate_config, +) + app: typer.Typer = typer.Typer(pretty_exceptions_show_locals=False) @@ -88,11 +123,9 @@ def check_tty() -> None: raise typer.Exit(code=1) -def confirm_overwrite(path: typer.Path | Any) -> None: +def confirm_overwrite(path: Path | str) -> None: """Ask for confirmation before overwriting an existing settings file.""" - from pathlib import Path as _Path - - p = _Path(str(path)) + p = Path(path) if not isinstance(path, Path) else path if p.exists(): typer.confirm( f"{p} already exists. 
Overwrite?", @@ -643,34 +676,6 @@ def build_autoe_assertion_config() -> dict[str, Any]: def _copy_prompts_for_config(config_type: str, prompts_folder: Path) -> None: """Copy the appropriate prompt templates for the given config type.""" - from benchmark_qed.autod.prompts import summarization - from benchmark_qed.autoe.prompts import assertion as assertion_prompts - from benchmark_qed.autoe.prompts import pairwise as pairwise_prompts - from benchmark_qed.autoe.prompts import reference as reference_prompts - from benchmark_qed.autoq.prompts import data_questions as data_questions_prompts - from benchmark_qed.autoq.prompts.activity_questions import ( - activity_context as activity_context_prompts, - ) - from benchmark_qed.autoq.prompts.activity_questions import ( - global_questions as activity_global_prompts, - ) - from benchmark_qed.autoq.prompts.activity_questions import ( - local_questions as activity_local_prompts, - ) - from benchmark_qed.autoq.prompts.data_questions import ( - assertions as autoq_assertion_prompts, - ) - from benchmark_qed.autoq.prompts.data_questions import ( - global_questions as data_global_prompts, - ) - from benchmark_qed.autoq.prompts.data_questions import ( - linked_questions as data_linked_prompts, - ) - from benchmark_qed.autoq.prompts.data_questions import ( - local_questions as data_local_prompts, - ) - from benchmark_qed.cli.scaffold import copy_prompts - match config_type: case "autoq": copy_prompts( @@ -730,15 +735,6 @@ def interactive_init( ], ) -> None: """Interactively create a benchmark-qed configuration.""" - from benchmark_qed.cli.scaffold import ensure_input_folder, write_env_file - from benchmark_qed.cli.yaml_renderer import ( - render_autoe_assertion_yaml, - render_autoe_pairwise_yaml, - render_autoe_reference_yaml, - render_autoq_yaml, - validate_config, - ) - check_tty() rich_print( diff --git a/benchmark_qed/cli/yaml_renderer.py b/benchmark_qed/cli/yaml_renderer.py index 00166fa..fb1f09f 100644 --- a/benchmark_qed/cli/yaml_renderer.py +++ b/benchmark_qed/cli/yaml_renderer.py @@ -8,6 +8,7 @@ from __future__ import annotations +import dataclasses from typing import Any import typer @@ -29,8 +30,6 @@ def _render_llm_section(provider_dict: dict[str, Any], indent: int = 2) -> str: indent: Number of leading spaces for each line. """ - import dataclasses - if dataclasses.is_dataclass(provider_dict) and not isinstance(provider_dict, type): provider_dict = dataclasses.asdict(provider_dict) From 7f404629e17a9c94f6118994bad18a492932d3e0 Mon Sep 17 00:00:00 2001 From: Andres Morales Esquivel Date: Mon, 27 Apr 2026 16:53:17 -0600 Subject: [PATCH 8/8] Address comments --- .apm/skills/benchmark-qed-setup/SKILL.md | 2 +- benchmark_qed/cli/interactive.py | 6 +----- benchmark_qed/cli/yaml_renderer.py | 15 +++++++++++---- tests/test_interactive_init.py | 12 +++++++++--- 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/.apm/skills/benchmark-qed-setup/SKILL.md b/.apm/skills/benchmark-qed-setup/SKILL.md index f6f76f6..b1b8443 100644 --- a/.apm/skills/benchmark-qed-setup/SKILL.md +++ b/.apm/skills/benchmark-qed-setup/SKILL.md @@ -211,4 +211,4 @@ Key highlights: - The `.env` file must be in the workspace root directory, not the project root. - Config types `autoe_pairwise`, `autoe_reference`, and `autoe_assertion` generate different settings.yaml templates — use the correct type for your evaluation method. - Prompts are copied as `.txt` files using Python `string.Template` syntax (`$variable` or `${variable}`). 
-- **`prompts_config` vs `prompt_config`**: The non-interactive `config init` for some autoe types generates `prompts_config`, but the runtime expects `prompt_config`. The interactive `benchmark-qed init` wizard avoids this issue. If using `config init`, rename the key if you get validation errors. +- **`prompt_config` key**: The runtime expects `prompt_config` (singular) for all autoe config types. Both `benchmark-qed init` and `config init` now generate the correct key. If you hand-edit YAML, ensure you use `prompt_config`, not `prompts_config`. diff --git a/benchmark_qed/cli/interactive.py b/benchmark_qed/cli/interactive.py index 2afc5ec..fb8f254 100644 --- a/benchmark_qed/cli/interactive.py +++ b/benchmark_qed/cli/interactive.py @@ -48,9 +48,6 @@ validate_config, ) -app: typer.Typer = typer.Typer(pretty_exceptions_show_locals=False) - - # --------------------------------------------------------------------------- # Data types # --------------------------------------------------------------------------- @@ -727,7 +724,6 @@ def _copy_prompts_for_config(config_type: str, prompts_folder: Path) -> None: # --------------------------------------------------------------------------- -@app.command() def interactive_init( root: Annotated[ Path, @@ -765,7 +761,7 @@ def interactive_init( } yaml_content = renderers[config_type](config_dict) - # 4. Validate against Pydantic model + # 4. Validate rendered YAML structure and basic config fields validate_config(yaml_content, config_type) # 5. Write files diff --git a/benchmark_qed/cli/yaml_renderer.py b/benchmark_qed/cli/yaml_renderer.py index fb1f09f..4b75cbc 100644 --- a/benchmark_qed/cli/yaml_renderer.py +++ b/benchmark_qed/cli/yaml_renderer.py @@ -53,7 +53,11 @@ def _render_llm_section(provider_dict: dict[str, Any], indent: int = 2) -> str: if init_args: lines.append(f"{pad}init_args:") for key, value in init_args.items(): - lines.append(f"{pad} {key}: {value}") + # Quote string values to prevent YAML coercion (e.g., api_version dates) + if isinstance(value, str): + lines.append(f'{pad} {key}: "{value}"') + else: + lines.append(f"{pad} {key}: {value}") return "\n".join(lines) @@ -310,7 +314,7 @@ def render_autoe_pairwise_yaml(config: dict[str, Any]) -> str: llm_config: {llm_section} -prompts_config: +prompt_config: user_prompt: prompt: prompts/pairwise_user_prompt.txt system_prompt: @@ -362,7 +366,7 @@ def render_autoe_reference_yaml(config: dict[str, Any]) -> str: llm_config: {llm_section} -prompts_config: +prompt_config: user_prompt: prompt: prompts/reference_user_prompt.txt system_prompt: @@ -406,7 +410,7 @@ def render_autoe_assertion_yaml(config: dict[str, Any]) -> str: llm_config: {llm_section} -prompts_config: +prompt_config: user_prompt: prompt: prompts/assertion_user_prompt.txt system_prompt: @@ -438,6 +442,7 @@ def render_autoe_assertion_yaml(config: dict[str, Any]) -> str: "question_sets", "trials", "llm_config", + "prompt_config", ], "autoe_reference": [ "reference", @@ -446,6 +451,7 @@ def render_autoe_assertion_yaml(config: dict[str, Any]) -> str: "score_max", "trials", "llm_config", + "prompt_config", ], "autoe_assertion": [ "generated", @@ -453,6 +459,7 @@ def render_autoe_assertion_yaml(config: dict[str, Any]) -> str: "pass_threshold", "trials", "llm_config", + "prompt_config", ], } diff --git a/tests/test_interactive_init.py b/tests/test_interactive_init.py index f515d4a..dc37f10 100644 --- a/tests/test_interactive_init.py +++ b/tests/test_interactive_init.py @@ -232,6 +232,8 @@ def test_render_autoe_pairwise_yaml(self): assert 
"question_sets" in parsed assert parsed["trials"] == 4 assert "llm_config" in parsed + assert "prompt_config" in parsed + assert isinstance(parsed["prompt_config"], dict) def test_render_autoe_pairwise_yaml_with_criteria(self): """Pairwise YAML with custom criteria includes them.""" @@ -262,6 +264,8 @@ def test_render_autoe_reference_yaml(self): assert parsed["score_min"] == 1 assert parsed["score_max"] == 10 assert "llm_config" in parsed + assert "prompt_config" in parsed + assert isinstance(parsed["prompt_config"], dict) def test_render_autoe_reference_yaml_multiple_generated(self): """Reference YAML with multiple generated conditions.""" @@ -282,9 +286,11 @@ def test_render_autoe_assertion_yaml(self): assert parsed is not None assert "generated" in parsed assert "assertions" in parsed - assert parsed["pass_threshold"] == 0.5 + assert parsed["pass_threshold"] == pytest.approx(0.5) assert parsed["trials"] == 4 assert "llm_config" in parsed + assert "prompt_config" in parsed + assert isinstance(parsed["prompt_config"], dict) # ═══════════════════════════════════════════════════════════════════════════ @@ -308,8 +314,8 @@ def test_azure_provider_includes_init_args(self): provider = _azure_chat_provider() section = _render_llm_section(provider) assert "init_args:" in section - assert "azure_endpoint: https://example.openai.azure.com" in section - assert "api_version: 2024-12-01-preview" in section + assert 'azure_endpoint: "https://example.openai.azure.com"' in section + assert 'api_version: "2024-12-01-preview"' in section def test_managed_identity_omits_api_key(self): """azure_managed_identity auth type does NOT include api_key line."""